sys_pipe.c revision 76364
113675Sdyson/*
213675Sdyson * Copyright (c) 1996 John S. Dyson
313675Sdyson * All rights reserved.
413675Sdyson *
513675Sdyson * Redistribution and use in source and binary forms, with or without
613675Sdyson * modification, are permitted provided that the following conditions
713675Sdyson * are met:
813675Sdyson * 1. Redistributions of source code must retain the above copyright
913675Sdyson *    notice immediately at the beginning of the file, without modification,
1013675Sdyson *    this list of conditions, and the following disclaimer.
1113675Sdyson * 2. Redistributions in binary form must reproduce the above copyright
1213675Sdyson *    notice, this list of conditions and the following disclaimer in the
1313675Sdyson *    documentation and/or other materials provided with the distribution.
1413675Sdyson * 3. Absolutely no warranty of function or purpose is made by the author
1513675Sdyson *    John S. Dyson.
1614037Sdyson * 4. Modifications may be freely made to this file if the above conditions
1713675Sdyson *    are met.
1813675Sdyson *
1950477Speter * $FreeBSD: head/sys/kern/sys_pipe.c 76364 2001-05-08 09:09:18Z alfred $
2013675Sdyson */
2113675Sdyson
2213675Sdyson/*
2313675Sdyson * This file contains a high-performance replacement for the socket-based
2413675Sdyson * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
2513675Sdyson * all features of sockets, but does do everything that pipes normally
2613675Sdyson * do.
2713675Sdyson */
2813675Sdyson
2913907Sdyson/*
3013907Sdyson * This code has two modes of operation, a small write mode and a large
3113907Sdyson * write mode.  The small write mode acts like conventional pipes with
3213907Sdyson * a kernel buffer.  If the buffer is less than PIPE_MINDIRECT, then the
3313907Sdyson * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
3413907Sdyson * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and
3513907Sdyson * the receiving process can copy it directly from the pages in the sending
3613907Sdyson * process.
3713907Sdyson *
3813907Sdyson * If the sending process receives a signal, it is possible that it will
3913913Sdyson * go away, and certainly its address space can change, because control
4013907Sdyson * is returned back to the user-mode side.  In that case, the pipe code
4113907Sdyson * arranges to copy the buffer supplied by the user process, to a pageable
4213907Sdyson * kernel buffer, and the receiving process will grab the data from the
4313907Sdyson * pageable kernel buffer.  Since signals don't happen all that often,
4413907Sdyson * the copy operation is normally eliminated.
4513907Sdyson *
4613907Sdyson * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
4713907Sdyson * happen for small transfers so that the system will not spend all of
4813913Sdyson * its time context switching.  PIPE_SIZE is constrained by the
4913907Sdyson * amount of kernel virtual memory.
5013907Sdyson */
5113907Sdyson
5213675Sdyson#include <sys/param.h>
5313675Sdyson#include <sys/systm.h>
5424131Sbde#include <sys/fcntl.h>
5513675Sdyson#include <sys/file.h>
5613675Sdyson#include <sys/filedesc.h>
5724206Sbde#include <sys/filio.h>
5876166Smarkm#include <sys/lock.h>
5924206Sbde#include <sys/ttycom.h>
6013675Sdyson#include <sys/stat.h>
6129356Speter#include <sys/poll.h>
6270834Swollman#include <sys/selinfo.h>
6313675Sdyson#include <sys/signalvar.h>
6413675Sdyson#include <sys/sysproto.h>
6513675Sdyson#include <sys/pipe.h>
6676166Smarkm#include <sys/proc.h>
6755112Sbde#include <sys/vnode.h>
6834924Sbde#include <sys/uio.h>
6959288Sjlemon#include <sys/event.h>
7013675Sdyson
7113675Sdyson#include <vm/vm.h>
7213675Sdyson#include <vm/vm_param.h>
7313675Sdyson#include <vm/vm_object.h>
7413675Sdyson#include <vm/vm_kern.h>
7513675Sdyson#include <vm/vm_extern.h>
7613675Sdyson#include <vm/pmap.h>
7713675Sdyson#include <vm/vm_map.h>
7813907Sdyson#include <vm/vm_page.h>
7927899Sdyson#include <vm/vm_zone.h>
8013675Sdyson
8114037Sdyson/*
8214037Sdyson * Use this define if you want to disable *fancy* VM things.  Expect an
8314037Sdyson * approx 30% decrease in transfer rate.  This could be useful for
8414037Sdyson * NetBSD or OpenBSD.
8514037Sdyson */
8614037Sdyson/* #define PIPE_NODIRECT */
8714037Sdyson
8814037Sdyson/*
8914037Sdyson * interfaces to the outside world
9014037Sdyson */
9113675Sdysonstatic int pipe_read __P((struct file *fp, struct uio *uio,
9251418Sgreen		struct ucred *cred, int flags, struct proc *p));
9313675Sdysonstatic int pipe_write __P((struct file *fp, struct uio *uio,
9451418Sgreen		struct ucred *cred, int flags, struct proc *p));
9513675Sdysonstatic int pipe_close __P((struct file *fp, struct proc *p));
9629356Speterstatic int pipe_poll __P((struct file *fp, int events, struct ucred *cred,
9729356Speter		struct proc *p));
9872521Sjlemonstatic int pipe_kqfilter __P((struct file *fp, struct knote *kn));
9952983Speterstatic int pipe_stat __P((struct file *fp, struct stat *sb, struct proc *p));
10036735Sdfrstatic int pipe_ioctl __P((struct file *fp, u_long cmd, caddr_t data, struct proc *p));
10113675Sdyson
10272521Sjlemonstatic struct fileops pipeops = {
10372521Sjlemon	pipe_read, pipe_write, pipe_ioctl, pipe_poll, pipe_kqfilter,
10472521Sjlemon	pipe_stat, pipe_close
10572521Sjlemon};
10613675Sdyson
10759288Sjlemonstatic void	filt_pipedetach(struct knote *kn);
10859288Sjlemonstatic int	filt_piperead(struct knote *kn, long hint);
10959288Sjlemonstatic int	filt_pipewrite(struct knote *kn, long hint);
11059288Sjlemon
11172521Sjlemonstatic struct filterops pipe_rfiltops =
11272521Sjlemon	{ 1, NULL, filt_pipedetach, filt_piperead };
11372521Sjlemonstatic struct filterops pipe_wfiltops =
11472521Sjlemon	{ 1, NULL, filt_pipedetach, filt_pipewrite };
11559288Sjlemon
11672521Sjlemon
11713675Sdyson/*
11813675Sdyson * Default pipe buffer size(s), this can be kind-of large now because pipe
11913675Sdyson * space is pageable.  The pipe code will try to maintain locality of
12013675Sdyson * reference for performance reasons, so small amounts of outstanding I/O
12113675Sdyson * will not wipe the cache.
12213675Sdyson */
12313907Sdyson#define MINPIPESIZE (PIPE_SIZE/3)
12413907Sdyson#define MAXPIPESIZE (2*PIPE_SIZE/3)
12513675Sdyson
12613907Sdyson/*
12713907Sdyson * Maximum amount of kva for pipes -- this is kind-of a soft limit, but
12813907Sdyson * is there so that on large systems, we don't exhaust it.
12913907Sdyson */
13013907Sdyson#define MAXPIPEKVA (8*1024*1024)
13113907Sdyson
13213907Sdyson/*
13313907Sdyson * Limit for direct transfers, we cannot, of course limit
13413907Sdyson * the amount of kva for pipes in general though.
13513907Sdyson */
13613907Sdyson#define LIMITPIPEKVA (16*1024*1024)
13717163Sdyson
13817163Sdyson/*
13917163Sdyson * Limit the number of "big" pipes
14017163Sdyson */
14117163Sdyson#define LIMITBIGPIPES	32
14233181Seivindstatic int nbigpipe;
14317163Sdyson
14417124Sbdestatic int amountpipekva;
14513907Sdyson
14613675Sdysonstatic void pipeclose __P((struct pipe *cpipe));
14776364Salfredstatic void pipe_free_kmem __P((struct pipe *cpipe));
14876364Salfredstatic int pipe_create __P((struct pipe **cpipep));
14913907Sdysonstatic __inline int pipelock __P((struct pipe *cpipe, int catch));
15013675Sdysonstatic __inline void pipeunlock __P((struct pipe *cpipe));
15114122Speterstatic __inline void pipeselwakeup __P((struct pipe *cpipe));
15214037Sdyson#ifndef PIPE_NODIRECT
15313907Sdysonstatic int pipe_build_write_buffer __P((struct pipe *wpipe, struct uio *uio));
15413907Sdysonstatic void pipe_destroy_write_buffer __P((struct pipe *wpipe));
15513907Sdysonstatic int pipe_direct_write __P((struct pipe *wpipe, struct uio *uio));
15613907Sdysonstatic void pipe_clone_write_buffer __P((struct pipe *wpipe));
15714037Sdyson#endif
15876364Salfredstatic int pipespace __P((struct pipe *cpipe, int size));
15913675Sdyson
16033181Seivindstatic vm_zone_t pipe_zone;
16127899Sdyson
16213675Sdyson/*
16313675Sdyson * The pipe system call for the DTYPE_PIPE type of pipes
16413675Sdyson */
16513675Sdyson
16613675Sdyson/* ARGSUSED */
16713675Sdysonint
16830994Sphkpipe(p, uap)
16913675Sdyson	struct proc *p;
17013675Sdyson	struct pipe_args /* {
17113675Sdyson		int	dummy;
17213675Sdyson	} */ *uap;
17313675Sdyson{
17476364Salfred	struct filedesc *fdp = p->p_fd;
17513675Sdyson	struct file *rf, *wf;
17613675Sdyson	struct pipe *rpipe, *wpipe;
17713675Sdyson	int fd, error;
17813675Sdyson
17927899Sdyson	if (pipe_zone == NULL)
18027923Sdyson		pipe_zone = zinit("PIPE", sizeof (struct pipe), 0, 0, 4);
18127899Sdyson
18276364Salfred	if (pipe_create(&rpipe) || pipe_create(&wpipe)) {
18376364Salfred		pipeclose(rpipe);
18476364Salfred		pipeclose(wpipe);
18576364Salfred		return (ENFILE);
18676364Salfred	}
18776364Salfred
18813907Sdyson	rpipe->pipe_state |= PIPE_DIRECTOK;
18913907Sdyson	wpipe->pipe_state |= PIPE_DIRECTOK;
19013675Sdyson
19170915Sdwmalone	error = falloc(p, &rf, &fd);
19270915Sdwmalone	if (error) {
19370915Sdwmalone		pipeclose(rpipe);
19470915Sdwmalone		pipeclose(wpipe);
19570915Sdwmalone		return (error);
19670915Sdwmalone	}
19770915Sdwmalone	fhold(rf);
19870915Sdwmalone	p->p_retval[0] = fd;
19970915Sdwmalone
20070803Sdwmalone	/*
20170803Sdwmalone	 * Warning: once we've gotten past allocation of the fd for the
20270803Sdwmalone	 * read-side, we can only drop the read side via fdrop() in order
20370803Sdwmalone	 * to avoid races against processes which manage to dup() the read
20470803Sdwmalone	 * side while we are blocked trying to allocate the write side.
20570803Sdwmalone	 */
20613675Sdyson	rf->f_flag = FREAD | FWRITE;
20713675Sdyson	rf->f_type = DTYPE_PIPE;
20849413Sgreen	rf->f_data = (caddr_t)rpipe;
20913675Sdyson	rf->f_ops = &pipeops;
21013675Sdyson	error = falloc(p, &wf, &fd);
21170915Sdwmalone	if (error) {
21270915Sdwmalone		if (fdp->fd_ofiles[p->p_retval[0]] == rf) {
21370915Sdwmalone			fdp->fd_ofiles[p->p_retval[0]] = NULL;
21470915Sdwmalone			fdrop(rf, p);
21570915Sdwmalone		}
21670915Sdwmalone		fdrop(rf, p);
21770915Sdwmalone		/* rpipe has been closed by fdrop(). */
21870915Sdwmalone		pipeclose(wpipe);
21970915Sdwmalone		return (error);
22070915Sdwmalone	}
22113675Sdyson	wf->f_flag = FREAD | FWRITE;
22213675Sdyson	wf->f_type = DTYPE_PIPE;
22349413Sgreen	wf->f_data = (caddr_t)wpipe;
22413675Sdyson	wf->f_ops = &pipeops;
22530994Sphk	p->p_retval[1] = fd;
22613675Sdyson
22713675Sdyson	rpipe->pipe_peer = wpipe;
22813675Sdyson	wpipe->pipe_peer = rpipe;
22968883Sdillon	fdrop(rf, p);
23013675Sdyson
23113675Sdyson	return (0);
23213675Sdyson}
23313675Sdyson
23413909Sdyson/*
23513909Sdyson * Allocate kva for pipe circular buffer, the space is pageable
23676364Salfred * This routine will 'realloc' the size of a pipe safely, if it fails
23776364Salfred * it will retain the old buffer.
23876364Salfred * If it fails it will return ENOMEM.
23913909Sdyson */
24076364Salfredstatic int
24176364Salfredpipespace(cpipe, size)
24213675Sdyson	struct pipe *cpipe;
24376364Salfred	int size;
24413675Sdyson{
24576364Salfred	struct vm_object *object;
24676364Salfred	caddr_t buffer;
24713688Sdyson	int npages, error;
24813675Sdyson
24976364Salfred	npages = round_page(size)/PAGE_SIZE;
25013675Sdyson	/*
25113675Sdyson	 * Create an object, I don't like the idea of paging to/from
25213675Sdyson	 * kernel_object.
25314037Sdyson	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
25413675Sdyson	 */
25576364Salfred	object = vm_object_allocate(OBJT_DEFAULT, npages);
25676364Salfred	buffer = (caddr_t) vm_map_min(kernel_map);
25713675Sdyson
25813675Sdyson	/*
25913675Sdyson	 * Insert the object into the kernel map, and allocate kva for it.
26013675Sdyson	 * The map entry is, by default, pageable.
26114037Sdyson	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
26213675Sdyson	 */
26376364Salfred	error = vm_map_find(kernel_map, object, 0,
26476364Salfred		(vm_offset_t *) &buffer, size, 1,
26513688Sdyson		VM_PROT_ALL, VM_PROT_ALL, 0);
26613675Sdyson
26776364Salfred	if (error != KERN_SUCCESS) {
26876364Salfred		vm_object_deallocate(object);
26976364Salfred		return (ENOMEM);
27076364Salfred	}
27176364Salfred
27276364Salfred	/* free old resources if we're resizing */
27376364Salfred	pipe_free_kmem(cpipe);
27476364Salfred	cpipe->pipe_buffer.object = object;
27576364Salfred	cpipe->pipe_buffer.buffer = buffer;
27676364Salfred	cpipe->pipe_buffer.size = size;
27776364Salfred	cpipe->pipe_buffer.in = 0;
27876364Salfred	cpipe->pipe_buffer.out = 0;
27976364Salfred	cpipe->pipe_buffer.cnt = 0;
28013907Sdyson	amountpipekva += cpipe->pipe_buffer.size;
28176364Salfred	return (0);
28213907Sdyson}
28313688Sdyson
28413907Sdyson/*
28513907Sdyson * initialize and allocate VM and memory for pipe
28613907Sdyson */
28776364Salfredstatic int
28876364Salfredpipe_create(cpipep)
28976364Salfred	struct pipe **cpipep;
29076364Salfred{
29113907Sdyson	struct pipe *cpipe;
29276364Salfred	int error;
29313907Sdyson
29476364Salfred	*cpipep = zalloc(pipe_zone);
29576364Salfred	if (*cpipep == NULL)
29676364Salfred		return (ENOMEM);
29717163Sdyson
29876364Salfred	cpipe = *cpipep;
29976364Salfred
30076364Salfred	/* so pipespace()->pipe_free_kmem() doesn't follow junk pointer */
30176364Salfred	cpipe->pipe_buffer.object = NULL;
30276364Salfred#ifndef PIPE_NODIRECT
30376364Salfred	cpipe->pipe_map.kva = NULL;
30476364Salfred#endif
30576364Salfred	/*
30676364Salfred	 * protect so pipeclose() doesn't follow a junk pointer
30776364Salfred	 * if pipespace() fails.
30876364Salfred	 */
30913675Sdyson	cpipe->pipe_state = 0;
31013675Sdyson	cpipe->pipe_peer = NULL;
31113675Sdyson	cpipe->pipe_busy = 0;
31213907Sdyson
31314037Sdyson#ifndef PIPE_NODIRECT
31413907Sdyson	/*
31513907Sdyson	 * pipe data structure initializations to support direct pipe I/O
31613907Sdyson	 */
31713907Sdyson	cpipe->pipe_map.cnt = 0;
31813907Sdyson	cpipe->pipe_map.kva = 0;
31913907Sdyson	cpipe->pipe_map.pos = 0;
32013907Sdyson	cpipe->pipe_map.npages = 0;
32117124Sbde	/* cpipe->pipe_map.ms[] = invalid */
32214037Sdyson#endif
32376364Salfred
32476364Salfred	error = pipespace(cpipe, PIPE_SIZE);
32576364Salfred	if (error) {
32676364Salfred		return (error);
32776364Salfred	}
32876364Salfred
32976364Salfred	vfs_timestamp(&cpipe->pipe_ctime);
33076364Salfred	cpipe->pipe_atime = cpipe->pipe_ctime;
33176364Salfred	cpipe->pipe_mtime = cpipe->pipe_ctime;
33276364Salfred	bzero(&cpipe->pipe_sel, sizeof cpipe->pipe_sel);
33376364Salfred
33476364Salfred	return (0);
33513675Sdyson}
33613675Sdyson
33713675Sdyson
33813675Sdyson/*
33913675Sdyson * lock a pipe for I/O, blocking other access
34013675Sdyson */
34113675Sdysonstatic __inline int
34213907Sdysonpipelock(cpipe, catch)
34313675Sdyson	struct pipe *cpipe;
34413907Sdyson	int catch;
34513675Sdyson{
34613776Sdyson	int error;
34776364Salfred
34813675Sdyson	while (cpipe->pipe_state & PIPE_LOCK) {
34913675Sdyson		cpipe->pipe_state |= PIPE_LWANT;
35043301Sdillon		if ((error = tsleep( cpipe,
35143301Sdillon			catch?(PRIBIO|PCATCH):PRIBIO, "pipelk", 0)) != 0) {
35213776Sdyson			return error;
35313675Sdyson		}
35413675Sdyson	}
35513675Sdyson	cpipe->pipe_state |= PIPE_LOCK;
35613675Sdyson	return 0;
35713675Sdyson}
35813675Sdyson
35913675Sdyson/*
36013675Sdyson * unlock a pipe I/O lock
36113675Sdyson */
36213675Sdysonstatic __inline void
36313675Sdysonpipeunlock(cpipe)
36413675Sdyson	struct pipe *cpipe;
36513675Sdyson{
36676364Salfred
36713675Sdyson	cpipe->pipe_state &= ~PIPE_LOCK;
36813675Sdyson	if (cpipe->pipe_state & PIPE_LWANT) {
36913675Sdyson		cpipe->pipe_state &= ~PIPE_LWANT;
37014177Sdyson		wakeup(cpipe);
37113675Sdyson	}
37213675Sdyson}
37313675Sdyson
37414037Sdysonstatic __inline void
37514037Sdysonpipeselwakeup(cpipe)
37614037Sdyson	struct pipe *cpipe;
37714037Sdyson{
37876364Salfred
37914037Sdyson	if (cpipe->pipe_state & PIPE_SEL) {
38014037Sdyson		cpipe->pipe_state &= ~PIPE_SEL;
38114037Sdyson		selwakeup(&cpipe->pipe_sel);
38214037Sdyson	}
38341086Struckman	if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
38441086Struckman		pgsigio(cpipe->pipe_sigio, SIGIO, 0);
38559288Sjlemon	KNOTE(&cpipe->pipe_sel.si_note, 0);
38614037Sdyson}
38714037Sdyson
/*
 * pipe_read: read entry point installed in pipeops.
 *
 * Moves data from the pipe stored in fp->f_data to the caller's uio.  Data
 * is taken from the pipe's circular kernel buffer first; unless
 * PIPE_NODIRECT is defined, an empty buffer falls back to the pages mapped
 * at pipe_map.kva while PIPE_DIRECTW is set.  When neither source holds
 * data the routine returns what it has read so far, returns at PIPE_EOF,
 * fails with EAGAIN if the file is FNONBLOCK, or sleeps on the pipe until
 * it is woken and retries.
 *
 * pipe_busy is raised for the duration of the call.  On the way out,
 * sleepers waiting on PIPE_WANT / PIPE_WANTW are woken, and select, SIGIO
 * and kqueue listeners are notified once at least PIPE_BUF bytes of the
 * kernel buffer are free.
 *
 * Returns 0 or the errno value produced by pipelock(), uiomove() or
 * tsleep().  cred, flags and p are not used.
 */
38813675Sdyson/* ARGSUSED */
38913675Sdysonstatic int
39051418Sgreenpipe_read(fp, uio, cred, flags, p)
39113675Sdyson	struct file *fp;
39213675Sdyson	struct uio *uio;
39313675Sdyson	struct ucred *cred;
39451418Sgreen	struct proc *p;
39545311Sdt	int flags;
39613675Sdyson{
39713675Sdyson	struct pipe *rpipe = (struct pipe *) fp->f_data;
39847748Salc	int error;
39913675Sdyson	int nread = 0;
40018863Sdyson	u_int size;
40113675Sdyson
40213675Sdyson	++rpipe->pipe_busy;
40347748Salc	error = pipelock(rpipe, 1);
40447748Salc	if (error)
40547748Salc		goto unlocked_error;
40647748Salc
40713675Sdyson	while (uio->uio_resid) {
40813907Sdyson		/*
40913907Sdyson		 * normal pipe buffer receive
41013907Sdyson		 */
41113675Sdyson		if (rpipe->pipe_buffer.cnt > 0) {
			/*
			 * One contiguous run: from 'out' to the end of the
			 * buffer, clipped to the bytes available and to
			 * what the caller still wants.
			 */
41218863Sdyson			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
41313675Sdyson			if (size > rpipe->pipe_buffer.cnt)
41413675Sdyson				size = rpipe->pipe_buffer.cnt;
41518863Sdyson			if (size > (u_int) uio->uio_resid)
41618863Sdyson				size = (u_int) uio->uio_resid;
41747748Salc
41847748Salc			error = uiomove(&rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
41913675Sdyson					size, uio);
42013675Sdyson			if (error) {
42113675Sdyson				break;
42213675Sdyson			}
42313675Sdyson			rpipe->pipe_buffer.out += size;
42413675Sdyson			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
42513675Sdyson				rpipe->pipe_buffer.out = 0;
42613675Sdyson
42713675Sdyson			rpipe->pipe_buffer.cnt -= size;
42847748Salc
42947748Salc			/*
43047748Salc			 * If there is no more to read in the pipe, reset
43147748Salc			 * its pointers to the beginning.  This improves
43247748Salc			 * cache hit stats.
43347748Salc			 */
43447748Salc			if (rpipe->pipe_buffer.cnt == 0) {
43547748Salc				rpipe->pipe_buffer.in = 0;
43647748Salc				rpipe->pipe_buffer.out = 0;
43747748Salc			}
43813675Sdyson			nread += size;
43914037Sdyson#ifndef PIPE_NODIRECT
44013907Sdyson		/*
44113907Sdyson		 * Direct copy, bypassing a kernel buffer.
44213907Sdyson		 */
44313907Sdyson		} else if ((size = rpipe->pipe_map.cnt) &&
44447748Salc			   (rpipe->pipe_state & PIPE_DIRECTW)) {
44547748Salc			caddr_t	va;
44618863Sdyson			if (size > (u_int) uio->uio_resid)
44718863Sdyson				size = (u_int) uio->uio_resid;
44847748Salc
44947748Salc			va = (caddr_t) rpipe->pipe_map.kva + rpipe->pipe_map.pos;
45047748Salc			error = uiomove(va, size, uio);
45113907Sdyson			if (error)
45213907Sdyson				break;
45313907Sdyson			nread += size;
45413907Sdyson			rpipe->pipe_map.pos += size;
45513907Sdyson			rpipe->pipe_map.cnt -= size;
			/*
			 * Mapping fully consumed: clear PIPE_DIRECTW and
			 * wake whoever sleeps on the pipe.
			 */
45613907Sdyson			if (rpipe->pipe_map.cnt == 0) {
45713907Sdyson				rpipe->pipe_state &= ~PIPE_DIRECTW;
45813907Sdyson				wakeup(rpipe);
45913907Sdyson			}
46014037Sdyson#endif
46113675Sdyson		} else {
46213675Sdyson			/*
46313675Sdyson			 * detect EOF condition
46413675Sdyson			 */
46513675Sdyson			if (rpipe->pipe_state & PIPE_EOF) {
46614802Sdyson				/* XXX error = ? */
46713675Sdyson				break;
46813675Sdyson			}
46943623Sdillon
47013675Sdyson			/*
47113675Sdyson			 * If the "write-side" has been blocked, wake it up now.
47213675Sdyson			 */
47313675Sdyson			if (rpipe->pipe_state & PIPE_WANTW) {
47413675Sdyson				rpipe->pipe_state &= ~PIPE_WANTW;
47513675Sdyson				wakeup(rpipe);
47613675Sdyson			}
47743623Sdillon
47843623Sdillon			/*
47947748Salc			 * Break if some data was read.
48043623Sdillon			 */
48147748Salc			if (nread > 0)
48213675Sdyson				break;
48316960Sdyson
48443623Sdillon			/*
48547748Salc			 * Unlock the pipe buffer for our remaining processing.  We
48647748Salc			 * will either break out with an error or we will sleep and
48747748Salc			 * relock to loop.
48843623Sdillon			 */
48947748Salc			pipeunlock(rpipe);
49043623Sdillon
49113675Sdyson			/*
49247748Salc			 * Handle non-blocking mode operation or
49347748Salc			 * wait for more data.
49413675Sdyson			 */
49547748Salc			if (fp->f_flag & FNONBLOCK)
49647748Salc				error = EAGAIN;
49747748Salc			else {
49847748Salc				rpipe->pipe_state |= PIPE_WANTR;
49947748Salc				if ((error = tsleep(rpipe, PRIBIO|PCATCH, "piperd", 0)) == 0)
50047748Salc					error = pipelock(rpipe, 1);
50113675Sdyson			}
50247748Salc			if (error)
50347748Salc				goto unlocked_error;
50413675Sdyson		}
50513675Sdyson	}
50647748Salc	pipeunlock(rpipe);
50713675Sdyson
50824101Sbde	if (error == 0)
50955112Sbde		vfs_timestamp(&rpipe->pipe_atime);
/* Every path that arrives here has already given up the pipe lock. */
51047748Salcunlocked_error:
51147748Salc	--rpipe->pipe_busy;
51213913Sdyson
51347748Salc	/*
51447748Salc	 * PIPE_WANT processing only makes sense if pipe_busy is 0.
51547748Salc	 */
51613675Sdyson	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
51713675Sdyson		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
51813675Sdyson		wakeup(rpipe);
51913675Sdyson	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
52013675Sdyson		/*
52147748Salc		 * Handle write blocking hysteresis.
52213675Sdyson		 */
52313675Sdyson		if (rpipe->pipe_state & PIPE_WANTW) {
52413675Sdyson			rpipe->pipe_state &= ~PIPE_WANTW;
52513675Sdyson			wakeup(rpipe);
52613675Sdyson		}
52713675Sdyson	}
52814037Sdyson
	/* Notify select/SIGIO/kqueue once PIPE_BUF bytes of room exist. */
52914802Sdyson	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
53014037Sdyson		pipeselwakeup(rpipe);
53114037Sdyson
53213675Sdyson	return error;
53313675Sdyson}
53413675Sdyson
53514037Sdyson#ifndef PIPE_NODIRECT
53613907Sdyson/*
53713907Sdyson * Map the sending processes' buffer into kernel space and wire it.
53813907Sdyson * This is similar to a physical write operation.
53913907Sdyson */
54013675Sdysonstatic int
54113907Sdysonpipe_build_write_buffer(wpipe, uio)
54213907Sdyson	struct pipe *wpipe;
54313675Sdyson	struct uio *uio;
54413675Sdyson{
54518863Sdyson	u_int size;
54613907Sdyson	int i;
54713907Sdyson	vm_offset_t addr, endaddr, paddr;
54813907Sdyson
54918863Sdyson	size = (u_int) uio->uio_iov->iov_len;
55013907Sdyson	if (size > wpipe->pipe_buffer.size)
55113907Sdyson		size = wpipe->pipe_buffer.size;
55213907Sdyson
55340286Sdg	endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
55440286Sdg	for(i = 0, addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
55513907Sdyson		addr < endaddr;
55613907Sdyson		addr += PAGE_SIZE, i+=1) {
55713907Sdyson
55813907Sdyson		vm_page_t m;
55913907Sdyson
56051474Sdillon		if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0 ||
56151474Sdillon		    (paddr = pmap_kextract(addr)) == 0) {
56213907Sdyson			int j;
56313907Sdyson			for(j=0;j<i;j++)
56440700Sdg				vm_page_unwire(wpipe->pipe_map.ms[j], 1);
56513907Sdyson			return EFAULT;
56613907Sdyson		}
56713907Sdyson
56813907Sdyson		m = PHYS_TO_VM_PAGE(paddr);
56913907Sdyson		vm_page_wire(m);
57013907Sdyson		wpipe->pipe_map.ms[i] = m;
57113907Sdyson	}
57213907Sdyson
57313907Sdyson/*
57413907Sdyson * set up the control block
57513907Sdyson */
57613907Sdyson	wpipe->pipe_map.npages = i;
57713907Sdyson	wpipe->pipe_map.pos = ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
57813907Sdyson	wpipe->pipe_map.cnt = size;
57913907Sdyson
58013907Sdyson/*
58113907Sdyson * and map the buffer
58213907Sdyson */
58313907Sdyson	if (wpipe->pipe_map.kva == 0) {
58413912Sdyson		/*
58513912Sdyson		 * We need to allocate space for an extra page because the
58613912Sdyson		 * address range might (will) span pages at times.
58713912Sdyson		 */
58813907Sdyson		wpipe->pipe_map.kva = kmem_alloc_pageable(kernel_map,
58913912Sdyson			wpipe->pipe_buffer.size + PAGE_SIZE);
59013912Sdyson		amountpipekva += wpipe->pipe_buffer.size + PAGE_SIZE;
59113907Sdyson	}
59213907Sdyson	pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms,
59313907Sdyson		wpipe->pipe_map.npages);
59413907Sdyson
59513907Sdyson/*
59613907Sdyson * and update the uio data
59713907Sdyson */
59813907Sdyson
59913907Sdyson	uio->uio_iov->iov_len -= size;
60013907Sdyson	uio->uio_iov->iov_base += size;
60113907Sdyson	if (uio->uio_iov->iov_len == 0)
60213907Sdyson		uio->uio_iov++;
60313907Sdyson	uio->uio_resid -= size;
60413907Sdyson	uio->uio_offset += size;
60513907Sdyson	return 0;
60613907Sdyson}
60713907Sdyson
60813907Sdyson/*
60913907Sdyson * unmap and unwire the process buffer
61013907Sdyson */
61113907Sdysonstatic void
61213907Sdysonpipe_destroy_write_buffer(wpipe)
61313907Sdysonstruct pipe *wpipe;
61413907Sdyson{
61513907Sdyson	int i;
61676364Salfred
61717163Sdyson	if (wpipe->pipe_map.kva) {
61817163Sdyson		pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages);
61913907Sdyson
62013907Sdyson		if (amountpipekva > MAXPIPEKVA) {
62113907Sdyson			vm_offset_t kva = wpipe->pipe_map.kva;
62213907Sdyson			wpipe->pipe_map.kva = 0;
62313907Sdyson			kmem_free(kernel_map, kva,
62413912Sdyson				wpipe->pipe_buffer.size + PAGE_SIZE);
62513912Sdyson			amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE;
62613907Sdyson		}
62713907Sdyson	}
62813907Sdyson	for (i=0;i<wpipe->pipe_map.npages;i++)
62940700Sdg		vm_page_unwire(wpipe->pipe_map.ms[i], 1);
63013907Sdyson}
63113907Sdyson
63213907Sdyson/*
63313907Sdyson * In the case of a signal, the writing process might go away.  This
63413907Sdyson * code copies the data into the circular buffer so that the source
63513907Sdyson * pages can be freed without loss of data.
63613907Sdyson */
63713907Sdysonstatic void
63813907Sdysonpipe_clone_write_buffer(wpipe)
63976364Salfred	struct pipe *wpipe;
64013907Sdyson{
64113907Sdyson	int size;
64213907Sdyson	int pos;
64313907Sdyson
64413907Sdyson	size = wpipe->pipe_map.cnt;
64513907Sdyson	pos = wpipe->pipe_map.pos;
64613907Sdyson	bcopy((caddr_t) wpipe->pipe_map.kva+pos,
64713907Sdyson			(caddr_t) wpipe->pipe_buffer.buffer,
64813907Sdyson			size);
64913907Sdyson
65013907Sdyson	wpipe->pipe_buffer.in = size;
65113907Sdyson	wpipe->pipe_buffer.out = 0;
65213907Sdyson	wpipe->pipe_buffer.cnt = size;
65313907Sdyson	wpipe->pipe_state &= ~PIPE_DIRECTW;
65413907Sdyson
65513907Sdyson	pipe_destroy_write_buffer(wpipe);
65613907Sdyson}
65713907Sdyson
65813907Sdyson/*
65913907Sdyson * This implements the pipe buffer write mechanism.  Note that only
66013907Sdyson * a direct write OR a normal pipe write can be pending at any given time.
66113907Sdyson * If there are any characters in the pipe buffer, the direct write will
66213907Sdyson * be deferred until the receiving process grabs all of the bytes from
66313907Sdyson * the pipe buffer.  Then the direct mapping write is set-up.
66413907Sdyson */
/*
 * Sequence, as implemented below:
 *  1. Sleep (with PIPE_WANTW set) while another direct write is pending
 *     or the circular buffer still holds data, starting over after each
 *     wait for buffered data.
 *  2. Set PIPE_DIRECTW and wire/map the caller's pages with
 *     pipe_build_write_buffer().
 *  3. Sleep until PIPE_DIRECTW is cleared again (pipe_read() does that
 *     once the mapping has been fully consumed), the sleep fails, or
 *     PIPE_EOF shows up.
 *  4. Under the pipe lock, either copy what is left into the circular
 *     buffer (PIPE_DIRECTW still set) or just drop the mapping.
 *
 * Returns 0, the tsleep() or pipe_build_write_buffer() error, or EPIPE
 * when PIPE_EOF is seen.  Note that step 4 still runs when the sleep in
 * step 3 fails, so that error can be returned after the data has been
 * saved in the circular buffer.
 */
66513907Sdysonstatic int
66613907Sdysonpipe_direct_write(wpipe, uio)
66713907Sdyson	struct pipe *wpipe;
66813907Sdyson	struct uio *uio;
66913907Sdyson{
67013907Sdyson	int error;
67176364Salfred
67213951Sdysonretry:
	/* Wait for a direct write that is already in progress. */
67313907Sdyson	while (wpipe->pipe_state & PIPE_DIRECTW) {
67413951Sdyson		if ( wpipe->pipe_state & PIPE_WANTR) {
67513951Sdyson			wpipe->pipe_state &= ~PIPE_WANTR;
67613951Sdyson			wakeup(wpipe);
67713951Sdyson		}
67813992Sdyson		wpipe->pipe_state |= PIPE_WANTW;
67913907Sdyson		error = tsleep(wpipe,
68013907Sdyson				PRIBIO|PCATCH, "pipdww", 0);
68114802Sdyson		if (error)
68213907Sdyson			goto error1;
68314802Sdyson		if (wpipe->pipe_state & PIPE_EOF) {
68414802Sdyson			error = EPIPE;
68514802Sdyson			goto error1;
68614802Sdyson		}
68713907Sdyson	}
68813907Sdyson	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
	/* Buffered data has to be read first; wait, then start over. */
68913951Sdyson	if (wpipe->pipe_buffer.cnt > 0) {
69013951Sdyson		if ( wpipe->pipe_state & PIPE_WANTR) {
69113951Sdyson			wpipe->pipe_state &= ~PIPE_WANTR;
69213951Sdyson			wakeup(wpipe);
69313951Sdyson		}
69413951Sdyson
69513992Sdyson		wpipe->pipe_state |= PIPE_WANTW;
69613907Sdyson		error = tsleep(wpipe,
69713907Sdyson				PRIBIO|PCATCH, "pipdwc", 0);
69814802Sdyson		if (error)
69913907Sdyson			goto error1;
70014802Sdyson		if (wpipe->pipe_state & PIPE_EOF) {
70114802Sdyson			error = EPIPE;
70214802Sdyson			goto error1;
70313907Sdyson		}
70413951Sdyson		goto retry;
70513907Sdyson	}
70613907Sdyson
	/*
	 * Claim the direct-write slot before the pages are wired and
	 * mapped; the claim is given up again if that fails.
	 */
70713951Sdyson	wpipe->pipe_state |= PIPE_DIRECTW;
70813951Sdyson
70913907Sdyson	error = pipe_build_write_buffer(wpipe, uio);
71013907Sdyson	if (error) {
71113907Sdyson		wpipe->pipe_state &= ~PIPE_DIRECTW;
71213907Sdyson		goto error1;
71313907Sdyson	}
71413907Sdyson
71513907Sdyson	error = 0;
	/* Wait until PIPE_DIRECTW is cleared or the sleep fails. */
71613907Sdyson	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
71713907Sdyson		if (wpipe->pipe_state & PIPE_EOF) {
71813907Sdyson			pipelock(wpipe, 0);
71913907Sdyson			pipe_destroy_write_buffer(wpipe);
72013907Sdyson			pipeunlock(wpipe);
72114037Sdyson			pipeselwakeup(wpipe);
72214802Sdyson			error = EPIPE;
72314802Sdyson			goto error1;
72413907Sdyson		}
72513992Sdyson		if (wpipe->pipe_state & PIPE_WANTR) {
72613992Sdyson			wpipe->pipe_state &= ~PIPE_WANTR;
72713992Sdyson			wakeup(wpipe);
72813992Sdyson		}
72914037Sdyson		pipeselwakeup(wpipe);
73013907Sdyson		error = tsleep(wpipe, PRIBIO|PCATCH, "pipdwt", 0);
73113907Sdyson	}
73213907Sdyson
73313907Sdyson	pipelock(wpipe,0);
73413907Sdyson	if (wpipe->pipe_state & PIPE_DIRECTW) {
73513907Sdyson		/*
73613907Sdyson		 * this bit of trickery substitutes a kernel buffer for
73713907Sdyson		 * the process that might be going away.
73813907Sdyson		 */
73913907Sdyson		pipe_clone_write_buffer(wpipe);
74013907Sdyson	} else {
74113907Sdyson		pipe_destroy_write_buffer(wpipe);
74213907Sdyson	}
74313907Sdyson	pipeunlock(wpipe);
74413907Sdyson	return error;
74513907Sdyson
/* Failure exit: wake sleepers on the pipe before returning. */
74613907Sdysonerror1:
74713907Sdyson	wakeup(wpipe);
74813907Sdyson	return error;
74913907Sdyson}
75014037Sdyson#endif
75113907Sdyson
75216960Sdysonstatic int
75351418Sgreenpipe_write(fp, uio, cred, flags, p)
75416960Sdyson	struct file *fp;
75513907Sdyson	struct uio *uio;
75616960Sdyson	struct ucred *cred;
75751418Sgreen	struct proc *p;
75845311Sdt	int flags;
75913907Sdyson{
76013675Sdyson	int error = 0;
76113913Sdyson	int orig_resid;
76216960Sdyson	struct pipe *wpipe, *rpipe;
76316960Sdyson
76416960Sdyson	rpipe = (struct pipe *) fp->f_data;
76516960Sdyson	wpipe = rpipe->pipe_peer;
76616960Sdyson
76713675Sdyson	/*
76813675Sdyson	 * detect loss of pipe read side, issue SIGPIPE if lost.
76913675Sdyson	 */
77016960Sdyson	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
77113774Sdyson		return EPIPE;
77213675Sdyson	}
77313675Sdyson
77417163Sdyson	/*
77517163Sdyson	 * If it is advantageous to resize the pipe buffer, do
77617163Sdyson	 * so.
77717163Sdyson	 */
77817163Sdyson	if ((uio->uio_resid > PIPE_SIZE) &&
77917163Sdyson		(nbigpipe < LIMITBIGPIPES) &&
78017163Sdyson		(wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
78117163Sdyson		(wpipe->pipe_buffer.size <= PIPE_SIZE) &&
78217163Sdyson		(wpipe->pipe_buffer.cnt == 0)) {
78317163Sdyson
78413907Sdyson		if ((error = pipelock(wpipe,1)) == 0) {
78576364Salfred			if (pipespace(wpipe, BIG_PIPE_SIZE) == 0)
78676364Salfred				nbigpipe++;
78713907Sdyson			pipeunlock(wpipe);
78813907Sdyson		} else {
78913907Sdyson			return error;
79013907Sdyson		}
79113907Sdyson	}
79276364Salfred
79376364Salfred	KASSERT(wpipe->pipe_buffer.buffer != NULL, ("pipe buffer gone"));
79413907Sdyson
79513675Sdyson	++wpipe->pipe_busy;
79613913Sdyson	orig_resid = uio->uio_resid;
79713675Sdyson	while (uio->uio_resid) {
79813907Sdyson		int space;
79914037Sdyson#ifndef PIPE_NODIRECT
80013907Sdyson		/*
80113907Sdyson		 * If the transfer is large, we can gain performance if
80213907Sdyson		 * we do process-to-process copies directly.
80316416Sdyson		 * If the write is non-blocking, we don't use the
80416416Sdyson		 * direct write mechanism.
80558505Sdillon		 *
80658505Sdillon		 * The direct write mechanism will detect the reader going
80758505Sdillon		 * away on us.
80813907Sdyson		 */
80917163Sdyson		if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
81017163Sdyson		    (fp->f_flag & FNONBLOCK) == 0 &&
81117163Sdyson			(wpipe->pipe_map.kva || (amountpipekva < LIMITPIPEKVA)) &&
81213907Sdyson			(uio->uio_iov->iov_len >= PIPE_MINDIRECT)) {
81313907Sdyson			error = pipe_direct_write( wpipe, uio);
81413907Sdyson			if (error) {
81513907Sdyson				break;
81613907Sdyson			}
81713907Sdyson			continue;
81813907Sdyson		}
81914037Sdyson#endif
82013907Sdyson
82113907Sdyson		/*
82213907Sdyson		 * Pipe buffered writes cannot be coincidental with
82313907Sdyson		 * direct writes.  We wait until the currently executing
82413907Sdyson		 * direct write is completed before we start filling the
82558505Sdillon		 * pipe buffer.  We break out if a signal occurs or the
82658505Sdillon		 * reader goes away.
82713907Sdyson		 */
82813907Sdyson	retrywrite:
82913907Sdyson		while (wpipe->pipe_state & PIPE_DIRECTW) {
83013992Sdyson			if (wpipe->pipe_state & PIPE_WANTR) {
83113992Sdyson				wpipe->pipe_state &= ~PIPE_WANTR;
83213992Sdyson				wakeup(wpipe);
83313992Sdyson			}
83458505Sdillon			error = tsleep(wpipe, PRIBIO|PCATCH, "pipbww", 0);
83558505Sdillon			if (wpipe->pipe_state & PIPE_EOF)
83658505Sdillon				break;
83713907Sdyson			if (error)
83813907Sdyson				break;
83913907Sdyson		}
84058505Sdillon		if (wpipe->pipe_state & PIPE_EOF) {
84158505Sdillon			error = EPIPE;
84258505Sdillon			break;
84358505Sdillon		}
84413907Sdyson
84513907Sdyson		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
84614644Sdyson
84714644Sdyson		/* Writes of size <= PIPE_BUF must be atomic. */
84813913Sdyson		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
84913913Sdyson			space = 0;
85013907Sdyson
85117163Sdyson		if (space > 0 && (wpipe->pipe_buffer.cnt < PIPE_SIZE)) {
85213907Sdyson			if ((error = pipelock(wpipe,1)) == 0) {
85354534Stegge				int size;	/* Transfer size */
85454534Stegge				int segsize;	/* first segment to transfer */
85513907Sdyson				/*
85613907Sdyson				 * It is possible for a direct write to
85713907Sdyson				 * slip in on us... handle it here...
85813907Sdyson				 */
85913907Sdyson				if (wpipe->pipe_state & PIPE_DIRECTW) {
86013907Sdyson					pipeunlock(wpipe);
86113907Sdyson					goto retrywrite;
86213907Sdyson				}
86354534Stegge				/*
86454534Stegge				 * If a process blocked in uiomove, our
86554534Stegge				 * value for space might be bad.
86658505Sdillon				 *
86758505Sdillon				 * XXX will we be ok if the reader has gone
86858505Sdillon				 * away here?
86954534Stegge				 */
87054534Stegge				if (space > wpipe->pipe_buffer.size -
87154534Stegge				    wpipe->pipe_buffer.cnt) {
87254534Stegge					pipeunlock(wpipe);
87354534Stegge					goto retrywrite;
87454534Stegge				}
87554534Stegge
87654534Stegge				/*
87754534Stegge				 * Transfer size is minimum of uio transfer
87854534Stegge				 * and free space in pipe buffer.
87954534Stegge				 */
88054534Stegge				if (space > uio->uio_resid)
88154534Stegge					size = uio->uio_resid;
88254534Stegge				else
88354534Stegge					size = space;
88454534Stegge				/*
88554534Stegge				 * First segment to transfer is minimum of
88654534Stegge				 * transfer size and contiguous space in
88754534Stegge				 * pipe buffer.  If first segment to transfer
88854534Stegge				 * is less than the transfer size, we've got
88954534Stegge				 * a wraparound in the buffer.
89054534Stegge				 */
89154534Stegge				segsize = wpipe->pipe_buffer.size -
89254534Stegge					wpipe->pipe_buffer.in;
89354534Stegge				if (segsize > size)
89454534Stegge					segsize = size;
89554534Stegge
89654534Stegge				/* Transfer first segment */
89754534Stegge
89854534Stegge				error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
89954534Stegge						segsize, uio);
90054534Stegge
90154534Stegge				if (error == 0 && segsize < size) {
90254534Stegge					/*
90354534Stegge					 * Transfer remaining part now, to
90454534Stegge					 * support atomic writes.  Wraparound
90554534Stegge					 * happened.
90654534Stegge					 */
90754534Stegge					if (wpipe->pipe_buffer.in + segsize !=
90854534Stegge					    wpipe->pipe_buffer.size)
90954534Stegge						panic("Expected pipe buffer wraparound disappeared");
91054534Stegge
91154534Stegge					error = uiomove(&wpipe->pipe_buffer.buffer[0],
91254534Stegge							size - segsize, uio);
91354534Stegge				}
91454534Stegge				if (error == 0) {
91554534Stegge					wpipe->pipe_buffer.in += size;
91654534Stegge					if (wpipe->pipe_buffer.in >=
91754534Stegge					    wpipe->pipe_buffer.size) {
91854534Stegge						if (wpipe->pipe_buffer.in != size - segsize + wpipe->pipe_buffer.size)
91954534Stegge							panic("Expected wraparound bad");
92054534Stegge						wpipe->pipe_buffer.in = size - segsize;
92154534Stegge					}
92254534Stegge
92354534Stegge					wpipe->pipe_buffer.cnt += size;
92454534Stegge					if (wpipe->pipe_buffer.cnt > wpipe->pipe_buffer.size)
92554534Stegge						panic("Pipe buffer overflow");
92654534Stegge
92754534Stegge				}
92813675Sdyson				pipeunlock(wpipe);
92913675Sdyson			}
93013675Sdyson			if (error)
93113675Sdyson				break;
93213675Sdyson
93313675Sdyson		} else {
93413675Sdyson			/*
93513675Sdyson			 * If the "read-side" has been blocked, wake it up now.
93613675Sdyson			 */
93713675Sdyson			if (wpipe->pipe_state & PIPE_WANTR) {
93813675Sdyson				wpipe->pipe_state &= ~PIPE_WANTR;
93913675Sdyson				wakeup(wpipe);
94013675Sdyson			}
94114037Sdyson
94213675Sdyson			/*
94313675Sdyson			 * don't block on non-blocking I/O
94413675Sdyson			 */
94516960Sdyson			if (fp->f_flag & FNONBLOCK) {
94613907Sdyson				error = EAGAIN;
94713675Sdyson				break;
94813675Sdyson			}
94913907Sdyson
95014037Sdyson			/*
95114037Sdyson			 * We have no more space and have something to offer,
95229356Speter			 * wake up select/poll.
95314037Sdyson			 */
95414037Sdyson			pipeselwakeup(wpipe);
95514037Sdyson
95613675Sdyson			wpipe->pipe_state |= PIPE_WANTW;
95743301Sdillon			if ((error = tsleep(wpipe, (PRIBIO+1)|PCATCH, "pipewr", 0)) != 0) {
95813675Sdyson				break;
95913675Sdyson			}
96013675Sdyson			/*
96113675Sdyson			 * If read side wants to go away, we just issue a signal
96213675Sdyson			 * to ourselves.
96313675Sdyson			 */
96413675Sdyson			if (wpipe->pipe_state & PIPE_EOF) {
96513774Sdyson				error = EPIPE;
96613907Sdyson				break;
96713675Sdyson			}
96813675Sdyson		}
96913675Sdyson	}
97013675Sdyson
97114644Sdyson	--wpipe->pipe_busy;
97213675Sdyson	if ((wpipe->pipe_busy == 0) &&
97313675Sdyson		(wpipe->pipe_state & PIPE_WANT)) {
97413675Sdyson		wpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTR);
97513675Sdyson		wakeup(wpipe);
97613675Sdyson	} else if (wpipe->pipe_buffer.cnt > 0) {
97713675Sdyson		/*
97813675Sdyson		 * If we have put any characters in the buffer, we wake up
97913675Sdyson		 * the reader.
98013675Sdyson		 */
98113675Sdyson		if (wpipe->pipe_state & PIPE_WANTR) {
98213675Sdyson			wpipe->pipe_state &= ~PIPE_WANTR;
98313675Sdyson			wakeup(wpipe);
98413675Sdyson		}
98513675Sdyson	}
98613909Sdyson
98713909Sdyson	/*
98813909Sdyson	 * Don't return EPIPE if I/O was successful
98913909Sdyson	 */
99013907Sdyson	if ((wpipe->pipe_buffer.cnt == 0) &&
99113907Sdyson		(uio->uio_resid == 0) &&
99213907Sdyson		(error == EPIPE))
99313907Sdyson		error = 0;
99413913Sdyson
99524101Sbde	if (error == 0)
99655112Sbde		vfs_timestamp(&wpipe->pipe_mtime);
99724101Sbde
99814037Sdyson	/*
99914037Sdyson	 * We have something to offer,
100029356Speter	 * wake up select/poll.
100114037Sdyson	 */
100214177Sdyson	if (wpipe->pipe_buffer.cnt)
100314037Sdyson		pipeselwakeup(wpipe);
100413907Sdyson
100513675Sdyson	return error;
100613675Sdyson}
100713675Sdyson
100813675Sdyson/*
100913675Sdyson * we implement a very minimal set of ioctls for compatibility with sockets.
101013675Sdyson */
101113675Sdysonint
101213675Sdysonpipe_ioctl(fp, cmd, data, p)
101313675Sdyson	struct file *fp;
101436735Sdfr	u_long cmd;
101576364Salfred	caddr_t data;
101613675Sdyson	struct proc *p;
101713675Sdyson{
101876364Salfred	struct pipe *mpipe = (struct pipe *)fp->f_data;
101913675Sdyson
102013675Sdyson	switch (cmd) {
102113675Sdyson
102213675Sdyson	case FIONBIO:
102313675Sdyson		return (0);
102413675Sdyson
102513675Sdyson	case FIOASYNC:
102613675Sdyson		if (*(int *)data) {
102713675Sdyson			mpipe->pipe_state |= PIPE_ASYNC;
102813675Sdyson		} else {
102913675Sdyson			mpipe->pipe_state &= ~PIPE_ASYNC;
103013675Sdyson		}
103113675Sdyson		return (0);
103213675Sdyson
103313675Sdyson	case FIONREAD:
103414037Sdyson		if (mpipe->pipe_state & PIPE_DIRECTW)
103514037Sdyson			*(int *)data = mpipe->pipe_map.cnt;
103614037Sdyson		else
103714037Sdyson			*(int *)data = mpipe->pipe_buffer.cnt;
103813675Sdyson		return (0);
103913675Sdyson
104041086Struckman	case FIOSETOWN:
104141086Struckman		return (fsetown(*(int *)data, &mpipe->pipe_sigio));
104241086Struckman
104341086Struckman	case FIOGETOWN:
104441086Struckman		*(int *)data = fgetown(mpipe->pipe_sigio);
104513675Sdyson		return (0);
104613675Sdyson
104741086Struckman	/* This is deprecated, FIOSETOWN should be used instead. */
104841086Struckman	case TIOCSPGRP:
104941086Struckman		return (fsetown(-(*(int *)data), &mpipe->pipe_sigio));
105041086Struckman
105141086Struckman	/* This is deprecated, FIOGETOWN should be used instead. */
105218863Sdyson	case TIOCGPGRP:
105341086Struckman		*(int *)data = -fgetown(mpipe->pipe_sigio);
105413675Sdyson		return (0);
105513675Sdyson
105613675Sdyson	}
105717124Sbde	return (ENOTTY);
105813675Sdyson}
105913675Sdyson
106013675Sdysonint
106129356Speterpipe_poll(fp, events, cred, p)
106213675Sdyson	struct file *fp;
106329356Speter	int events;
106429356Speter	struct ucred *cred;
106513675Sdyson	struct proc *p;
106613675Sdyson{
106776364Salfred	struct pipe *rpipe = (struct pipe *)fp->f_data;
106813675Sdyson	struct pipe *wpipe;
106929356Speter	int revents = 0;
107013675Sdyson
107113675Sdyson	wpipe = rpipe->pipe_peer;
107229356Speter	if (events & (POLLIN | POLLRDNORM))
107329356Speter		if ((rpipe->pipe_state & PIPE_DIRECTW) ||
107429356Speter		    (rpipe->pipe_buffer.cnt > 0) ||
107529356Speter		    (rpipe->pipe_state & PIPE_EOF))
107629356Speter			revents |= events & (POLLIN | POLLRDNORM);
107713675Sdyson
107829356Speter	if (events & (POLLOUT | POLLWRNORM))
107929356Speter		if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF) ||
108043311Sdillon		    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
108143311Sdillon		     (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
108229356Speter			revents |= events & (POLLOUT | POLLWRNORM);
108313675Sdyson
108429356Speter	if ((rpipe->pipe_state & PIPE_EOF) ||
108529356Speter	    (wpipe == NULL) ||
108629356Speter	    (wpipe->pipe_state & PIPE_EOF))
108729356Speter		revents |= POLLHUP;
108829356Speter
108929356Speter	if (revents == 0) {
109029356Speter		if (events & (POLLIN | POLLRDNORM)) {
109129356Speter			selrecord(p, &rpipe->pipe_sel);
109229356Speter			rpipe->pipe_state |= PIPE_SEL;
109313675Sdyson		}
109413675Sdyson
109529356Speter		if (events & (POLLOUT | POLLWRNORM)) {
109630164Speter			selrecord(p, &wpipe->pipe_sel);
109730164Speter			wpipe->pipe_state |= PIPE_SEL;
109813907Sdyson		}
109913675Sdyson	}
110029356Speter
110129356Speter	return (revents);
110213675Sdyson}
110313675Sdyson
110452983Speterstatic int
110552983Speterpipe_stat(fp, ub, p)
110652983Speter	struct file *fp;
110752983Speter	struct stat *ub;
110852983Speter	struct proc *p;
110913675Sdyson{
111052983Speter	struct pipe *pipe = (struct pipe *)fp->f_data;
111152983Speter
111213675Sdyson	bzero((caddr_t)ub, sizeof (*ub));
111317124Sbde	ub->st_mode = S_IFIFO;
111413907Sdyson	ub->st_blksize = pipe->pipe_buffer.size;
111513675Sdyson	ub->st_size = pipe->pipe_buffer.cnt;
111613675Sdyson	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
111734901Sphk	ub->st_atimespec = pipe->pipe_atime;
111834901Sphk	ub->st_mtimespec = pipe->pipe_mtime;
111934901Sphk	ub->st_ctimespec = pipe->pipe_ctime;
112060404Schris	ub->st_uid = fp->f_cred->cr_uid;
112160404Schris	ub->st_gid = fp->f_cred->cr_gid;
112217124Sbde	/*
112360404Schris	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
112417124Sbde	 * XXX (st_dev, st_ino) should be unique.
112517124Sbde	 */
112613675Sdyson	return 0;
112713675Sdyson}
112813675Sdyson
112913675Sdyson/* ARGSUSED */
113013675Sdysonstatic int
113113675Sdysonpipe_close(fp, p)
113213675Sdyson	struct file *fp;
113313675Sdyson	struct proc *p;
113413675Sdyson{
113513675Sdyson	struct pipe *cpipe = (struct pipe *)fp->f_data;
113616322Sgpalmer
113749413Sgreen	fp->f_ops = &badfileops;
113849413Sgreen	fp->f_data = NULL;
113941086Struckman	funsetown(cpipe->pipe_sigio);
114013675Sdyson	pipeclose(cpipe);
114113675Sdyson	return 0;
114213675Sdyson}
114313675Sdyson
114476364Salfredstatic void
114576364Salfredpipe_free_kmem(cpipe)
114676364Salfred	struct pipe *cpipe;
114776364Salfred{
114876364Salfred
114976364Salfred	if (cpipe->pipe_buffer.buffer != NULL) {
115076364Salfred		if (cpipe->pipe_buffer.size > PIPE_SIZE)
115176364Salfred			--nbigpipe;
115276364Salfred		amountpipekva -= cpipe->pipe_buffer.size;
115376364Salfred		kmem_free(kernel_map,
115476364Salfred			(vm_offset_t)cpipe->pipe_buffer.buffer,
115576364Salfred			cpipe->pipe_buffer.size);
115676364Salfred		cpipe->pipe_buffer.buffer = NULL;
115776364Salfred	}
115876364Salfred#ifndef PIPE_NODIRECT
115976364Salfred	if (cpipe->pipe_map.kva != NULL) {
116076364Salfred		amountpipekva -= cpipe->pipe_buffer.size + PAGE_SIZE;
116176364Salfred		kmem_free(kernel_map,
116276364Salfred			cpipe->pipe_map.kva,
116376364Salfred			cpipe->pipe_buffer.size + PAGE_SIZE);
116476364Salfred		cpipe->pipe_map.cnt = 0;
116576364Salfred		cpipe->pipe_map.kva = 0;
116676364Salfred		cpipe->pipe_map.pos = 0;
116776364Salfred		cpipe->pipe_map.npages = 0;
116876364Salfred	}
116976364Salfred#endif
117076364Salfred}
117176364Salfred
117213675Sdyson/*
117313675Sdyson * shutdown the pipe
117413675Sdyson */
117513675Sdysonstatic void
117613675Sdysonpipeclose(cpipe)
117713675Sdyson	struct pipe *cpipe;
117813675Sdyson{
117913907Sdyson	struct pipe *ppipe;
118076364Salfred
118113675Sdyson	if (cpipe) {
118213907Sdyson
118314037Sdyson		pipeselwakeup(cpipe);
118413907Sdyson
118513675Sdyson		/*
118613675Sdyson		 * If the other side is blocked, wake it up saying that
118713675Sdyson		 * we want to close it down.
118813675Sdyson		 */
118913675Sdyson		while (cpipe->pipe_busy) {
119013675Sdyson			wakeup(cpipe);
119113675Sdyson			cpipe->pipe_state |= PIPE_WANT|PIPE_EOF;
119213675Sdyson			tsleep(cpipe, PRIBIO, "pipecl", 0);
119313675Sdyson		}
119413675Sdyson
119513675Sdyson		/*
119613675Sdyson		 * Disconnect from peer
119713675Sdyson		 */
119843301Sdillon		if ((ppipe = cpipe->pipe_peer) != NULL) {
119914037Sdyson			pipeselwakeup(ppipe);
120013907Sdyson
120113907Sdyson			ppipe->pipe_state |= PIPE_EOF;
120213907Sdyson			wakeup(ppipe);
120313907Sdyson			ppipe->pipe_peer = NULL;
120413675Sdyson		}
120513675Sdyson
120613675Sdyson		/*
120713675Sdyson		 * free resources
120813675Sdyson		 */
120976364Salfred		pipe_free_kmem(cpipe);
121027899Sdyson		zfree(pipe_zone, cpipe);
121113675Sdyson	}
121213675Sdyson}
121359288Sjlemon
121472521Sjlemon/*ARGSUSED*/
121559288Sjlemonstatic int
121672521Sjlemonpipe_kqfilter(struct file *fp, struct knote *kn)
121759288Sjlemon{
121859288Sjlemon	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
121959288Sjlemon
122072521Sjlemon	switch (kn->kn_filter) {
122172521Sjlemon	case EVFILT_READ:
122272521Sjlemon		kn->kn_fop = &pipe_rfiltops;
122372521Sjlemon		break;
122472521Sjlemon	case EVFILT_WRITE:
122572521Sjlemon		kn->kn_fop = &pipe_wfiltops;
122672521Sjlemon		break;
122772521Sjlemon	default:
122872521Sjlemon		return (1);
122972521Sjlemon	}
123072521Sjlemon
123159288Sjlemon	SLIST_INSERT_HEAD(&rpipe->pipe_sel.si_note, kn, kn_selnext);
123259288Sjlemon	return (0);
123359288Sjlemon}
123459288Sjlemon
123559288Sjlemonstatic void
123659288Sjlemonfilt_pipedetach(struct knote *kn)
123759288Sjlemon{
123859288Sjlemon	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
123959288Sjlemon
124060938Sjake	SLIST_REMOVE(&rpipe->pipe_sel.si_note, kn, knote, kn_selnext);
124159288Sjlemon}
124259288Sjlemon
124359288Sjlemon/*ARGSUSED*/
124459288Sjlemonstatic int
124559288Sjlemonfilt_piperead(struct knote *kn, long hint)
124659288Sjlemon{
124759288Sjlemon	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
124859288Sjlemon	struct pipe *wpipe = rpipe->pipe_peer;
124959288Sjlemon
125059288Sjlemon	kn->kn_data = rpipe->pipe_buffer.cnt;
125159288Sjlemon	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
125259288Sjlemon		kn->kn_data = rpipe->pipe_map.cnt;
125359288Sjlemon
125459288Sjlemon	if ((rpipe->pipe_state & PIPE_EOF) ||
125559288Sjlemon	    (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
125659288Sjlemon		kn->kn_flags |= EV_EOF;
125759288Sjlemon		return (1);
125859288Sjlemon	}
125959288Sjlemon	return (kn->kn_data > 0);
126059288Sjlemon}
126159288Sjlemon
126259288Sjlemon/*ARGSUSED*/
126359288Sjlemonstatic int
126459288Sjlemonfilt_pipewrite(struct knote *kn, long hint)
126559288Sjlemon{
126659288Sjlemon	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
126759288Sjlemon	struct pipe *wpipe = rpipe->pipe_peer;
126859288Sjlemon
126959288Sjlemon	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
127059288Sjlemon		kn->kn_data = 0;
127159288Sjlemon		kn->kn_flags |= EV_EOF;
127259288Sjlemon		return (1);
127359288Sjlemon	}
127459288Sjlemon	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
127565855Sjlemon	if (wpipe->pipe_state & PIPE_DIRECTW)
127659288Sjlemon		kn->kn_data = 0;
127759288Sjlemon
127859288Sjlemon	return (kn->kn_data >= PIPE_BUF);
127959288Sjlemon}
1280