sys_pipe.c revision 60404
113675Sdyson/*
213675Sdyson * Copyright (c) 1996 John S. Dyson
313675Sdyson * All rights reserved.
413675Sdyson *
513675Sdyson * Redistribution and use in source and binary forms, with or without
613675Sdyson * modification, are permitted provided that the following conditions
713675Sdyson * are met:
813675Sdyson * 1. Redistributions of source code must retain the above copyright
913675Sdyson *    notice immediately at the beginning of the file, without modification,
1013675Sdyson *    this list of conditions, and the following disclaimer.
1113675Sdyson * 2. Redistributions in binary form must reproduce the above copyright
1213675Sdyson *    notice, this list of conditions and the following disclaimer in the
1313675Sdyson *    documentation and/or other materials provided with the distribution.
1413675Sdyson * 3. Absolutely no warranty of function or purpose is made by the author
1513675Sdyson *    John S. Dyson.
1614037Sdyson * 4. Modifications may be freely made to this file if the above conditions
1713675Sdyson *    are met.
1813675Sdyson *
1950477Speter * $FreeBSD: head/sys/kern/sys_pipe.c 60404 2000-05-11 22:08:20Z chris $
2013675Sdyson */
2113675Sdyson
2213675Sdyson/*
2313675Sdyson * This file contains a high-performance replacement for the socket-based
2413675Sdyson * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
2513675Sdyson * all features of sockets, but does do everything that pipes normally
2613675Sdyson * do.
2713675Sdyson */
2813675Sdyson
2913907Sdyson/*
3013907Sdyson * This code has two modes of operation, a small write mode and a large
3113907Sdyson * write mode.  The small write mode acts like conventional pipes with
3213907Sdyson * a kernel buffer.  If the buffer is less than PIPE_MINDIRECT, then the
3313907Sdyson * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
3413907Sdyson * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and
3513907Sdyson * the receiving process can copy it directly from the pages in the sending
3613907Sdyson * process.
3713907Sdyson *
3813907Sdyson * If the sending process receives a signal, it is possible that it will
3913913Sdyson * go away, and certainly its address space can change, because control
4013907Sdyson * is returned back to the user-mode side.  In that case, the pipe code
4113907Sdyson * arranges to copy the buffer supplied by the user process, to a pageable
4213907Sdyson * kernel buffer, and the receiving process will grab the data from the
4313907Sdyson * pageable kernel buffer.  Since signals don't happen all that often,
4413907Sdyson * the copy operation is normally eliminated.
4513907Sdyson *
4613907Sdyson * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
4713907Sdyson * happen for small transfers so that the system will not spend all of
4813913Sdyson * its time context switching.  PIPE_SIZE is constrained by the
4913907Sdyson * amount of kernel virtual memory.
5013907Sdyson */
5113907Sdyson
5213675Sdyson#include <sys/param.h>
5313675Sdyson#include <sys/systm.h>
5413675Sdyson#include <sys/proc.h>
5524131Sbde#include <sys/fcntl.h>
5613675Sdyson#include <sys/file.h>
5713675Sdyson#include <sys/filedesc.h>
5824206Sbde#include <sys/filio.h>
5924206Sbde#include <sys/ttycom.h>
6013675Sdyson#include <sys/stat.h>
6129356Speter#include <sys/poll.h>
6243278Sbde#include <sys/select.h>
6313675Sdyson#include <sys/signalvar.h>
6413675Sdyson#include <sys/sysproto.h>
6513675Sdyson#include <sys/pipe.h>
6655112Sbde#include <sys/vnode.h>
6734924Sbde#include <sys/uio.h>
6859288Sjlemon#include <sys/event.h>
6913675Sdyson
7013675Sdyson#include <vm/vm.h>
7113675Sdyson#include <vm/vm_param.h>
7222521Sdyson#include <sys/lock.h>
7313675Sdyson#include <vm/vm_object.h>
7413675Sdyson#include <vm/vm_kern.h>
7513675Sdyson#include <vm/vm_extern.h>
7613675Sdyson#include <vm/pmap.h>
7713675Sdyson#include <vm/vm_map.h>
7813907Sdyson#include <vm/vm_page.h>
7927899Sdyson#include <vm/vm_zone.h>
8013675Sdyson
8114037Sdyson/*
8214037Sdyson * Use this define if you want to disable *fancy* VM things.  Expect an
8314037Sdyson * approx 30% decrease in transfer rate.  This could be useful for
8414037Sdyson * NetBSD or OpenBSD.
8514037Sdyson */
8614037Sdyson/* #define PIPE_NODIRECT */
8714037Sdyson
8814037Sdyson/*
8914037Sdyson * interfaces to the outside world
9014037Sdyson */
9113675Sdysonstatic int pipe_read __P((struct file *fp, struct uio *uio,
9251418Sgreen		struct ucred *cred, int flags, struct proc *p));
9313675Sdysonstatic int pipe_write __P((struct file *fp, struct uio *uio,
9451418Sgreen		struct ucred *cred, int flags, struct proc *p));
9513675Sdysonstatic int pipe_close __P((struct file *fp, struct proc *p));
9629356Speterstatic int pipe_poll __P((struct file *fp, int events, struct ucred *cred,
9729356Speter		struct proc *p));
9852983Speterstatic int pipe_stat __P((struct file *fp, struct stat *sb, struct proc *p));
9936735Sdfrstatic int pipe_ioctl __P((struct file *fp, u_long cmd, caddr_t data, struct proc *p));
10013675Sdyson
10113675Sdysonstatic struct fileops pipeops =
10252983Speter    { pipe_read, pipe_write, pipe_ioctl, pipe_poll, pipe_stat, pipe_close };
10313675Sdyson
10459288Sjlemonstatic int	filt_pipeattach(struct knote *kn);
10559288Sjlemonstatic void	filt_pipedetach(struct knote *kn);
10659288Sjlemonstatic int	filt_piperead(struct knote *kn, long hint);
10759288Sjlemonstatic int	filt_pipewrite(struct knote *kn, long hint);
10859288Sjlemon
10959288Sjlemonstruct filterops pipe_rwfiltops[] = {
11059288Sjlemon	{ 1, filt_pipeattach, filt_pipedetach, filt_piperead },
11159288Sjlemon	{ 1, filt_pipeattach, filt_pipedetach, filt_pipewrite },
11259288Sjlemon};
11359288Sjlemon
/*
 * Buffer occupancy thresholds, expressed as fractions of the default
 * pipe buffer size.  The buffer can be fairly large because pipe space
 * is pageable; the pipe code tries to keep locality of reference so that
 * a small amount of outstanding I/O does not wipe the cache.
 */
#define	MINPIPESIZE	(PIPE_SIZE / 3)
#define	MAXPIPESIZE	(2 * PIPE_SIZE / 3)

/*
 * Soft cap on the kernel virtual address space used by pipes, so that
 * large systems do not exhaust kva.
 */
#define	MAXPIPEKVA	(8 * 1024 * 1024)

/*
 * Cap on kva consumed before new direct (page-mapped) transfers stop
 * being set up.  Ordinary buffered pipe kva cannot be bounded this way.
 */
#define	LIMITPIPEKVA	(16 * 1024 * 1024)

/*
 * Upper bound on the number of "big" pipes, and the current count.
 */
#define	LIMITBIGPIPES	32
static int nbigpipe;

/* Running total of kva currently accounted to pipes, in bytes. */
static int amountpipekva;
14213907Sdyson
14313675Sdysonstatic void pipeclose __P((struct pipe *cpipe));
14413675Sdysonstatic void pipeinit __P((struct pipe *cpipe));
14513907Sdysonstatic __inline int pipelock __P((struct pipe *cpipe, int catch));
14613675Sdysonstatic __inline void pipeunlock __P((struct pipe *cpipe));
14714122Speterstatic __inline void pipeselwakeup __P((struct pipe *cpipe));
14814037Sdyson#ifndef PIPE_NODIRECT
14913907Sdysonstatic int pipe_build_write_buffer __P((struct pipe *wpipe, struct uio *uio));
15013907Sdysonstatic void pipe_destroy_write_buffer __P((struct pipe *wpipe));
15113907Sdysonstatic int pipe_direct_write __P((struct pipe *wpipe, struct uio *uio));
15213907Sdysonstatic void pipe_clone_write_buffer __P((struct pipe *wpipe));
15314037Sdyson#endif
15413907Sdysonstatic void pipespace __P((struct pipe *cpipe));
15513675Sdyson
15633181Seivindstatic vm_zone_t pipe_zone;
15727899Sdyson
15813675Sdyson/*
15913675Sdyson * The pipe system call for the DTYPE_PIPE type of pipes
16013675Sdyson */
16113675Sdyson
16213675Sdyson/* ARGSUSED */
16313675Sdysonint
16430994Sphkpipe(p, uap)
16513675Sdyson	struct proc *p;
16613675Sdyson	struct pipe_args /* {
16713675Sdyson		int	dummy;
16813675Sdyson	} */ *uap;
16913675Sdyson{
17013675Sdyson	register struct filedesc *fdp = p->p_fd;
17113675Sdyson	struct file *rf, *wf;
17213675Sdyson	struct pipe *rpipe, *wpipe;
17313675Sdyson	int fd, error;
17413675Sdyson
17527899Sdyson	if (pipe_zone == NULL)
17627923Sdyson		pipe_zone = zinit("PIPE", sizeof (struct pipe), 0, 0, 4);
17727899Sdyson
17827899Sdyson	rpipe = zalloc( pipe_zone);
17913675Sdyson	pipeinit(rpipe);
18013907Sdyson	rpipe->pipe_state |= PIPE_DIRECTOK;
18127899Sdyson	wpipe = zalloc( pipe_zone);
18213675Sdyson	pipeinit(wpipe);
18313907Sdyson	wpipe->pipe_state |= PIPE_DIRECTOK;
18413675Sdyson
18513675Sdyson	error = falloc(p, &rf, &fd);
18613675Sdyson	if (error)
18713675Sdyson		goto free2;
18830994Sphk	p->p_retval[0] = fd;
18913675Sdyson	rf->f_flag = FREAD | FWRITE;
19013675Sdyson	rf->f_type = DTYPE_PIPE;
19149413Sgreen	rf->f_data = (caddr_t)rpipe;
19213675Sdyson	rf->f_ops = &pipeops;
19313675Sdyson	error = falloc(p, &wf, &fd);
19413675Sdyson	if (error)
19513675Sdyson		goto free3;
19613675Sdyson	wf->f_flag = FREAD | FWRITE;
19713675Sdyson	wf->f_type = DTYPE_PIPE;
19849413Sgreen	wf->f_data = (caddr_t)wpipe;
19913675Sdyson	wf->f_ops = &pipeops;
20030994Sphk	p->p_retval[1] = fd;
20113675Sdyson
20213675Sdyson	rpipe->pipe_peer = wpipe;
20313675Sdyson	wpipe->pipe_peer = rpipe;
20413675Sdyson
20513675Sdyson	return (0);
20613675Sdysonfree3:
20749413Sgreen	fdp->fd_ofiles[p->p_retval[0]] = 0;
20813675Sdyson	ffree(rf);
20913675Sdysonfree2:
21013675Sdyson	(void)pipeclose(wpipe);
21113675Sdyson	(void)pipeclose(rpipe);
21213675Sdyson	return (error);
21313675Sdyson}
21413675Sdyson
21513909Sdyson/*
21613909Sdyson * Allocate kva for pipe circular buffer, the space is pageable
21713909Sdyson */
21813675Sdysonstatic void
21913907Sdysonpipespace(cpipe)
22013675Sdyson	struct pipe *cpipe;
22113675Sdyson{
22213688Sdyson	int npages, error;
22313675Sdyson
22413907Sdyson	npages = round_page(cpipe->pipe_buffer.size)/PAGE_SIZE;
22513675Sdyson	/*
22613675Sdyson	 * Create an object, I don't like the idea of paging to/from
22713675Sdyson	 * kernel_object.
22814037Sdyson	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
22913675Sdyson	 */
23013675Sdyson	cpipe->pipe_buffer.object = vm_object_allocate(OBJT_DEFAULT, npages);
23113688Sdyson	cpipe->pipe_buffer.buffer = (caddr_t) vm_map_min(kernel_map);
23213675Sdyson
23313675Sdyson	/*
23413675Sdyson	 * Insert the object into the kernel map, and allocate kva for it.
23513675Sdyson	 * The map entry is, by default, pageable.
23614037Sdyson	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
23713675Sdyson	 */
23813688Sdyson	error = vm_map_find(kernel_map, cpipe->pipe_buffer.object, 0,
23913907Sdyson		(vm_offset_t *) &cpipe->pipe_buffer.buffer,
24013907Sdyson		cpipe->pipe_buffer.size, 1,
24113688Sdyson		VM_PROT_ALL, VM_PROT_ALL, 0);
24213675Sdyson
24313688Sdyson	if (error != KERN_SUCCESS)
24413688Sdyson		panic("pipeinit: cannot allocate pipe -- out of kvm -- code = %d", error);
24513907Sdyson	amountpipekva += cpipe->pipe_buffer.size;
24613907Sdyson}
24713688Sdyson
24813907Sdyson/*
24913907Sdyson * initialize and allocate VM and memory for pipe
25013907Sdyson */
25113907Sdysonstatic void
25213907Sdysonpipeinit(cpipe)
25313907Sdyson	struct pipe *cpipe;
25413907Sdyson{
25513907Sdyson
25613675Sdyson	cpipe->pipe_buffer.in = 0;
25713675Sdyson	cpipe->pipe_buffer.out = 0;
25813675Sdyson	cpipe->pipe_buffer.cnt = 0;
25913907Sdyson	cpipe->pipe_buffer.size = PIPE_SIZE;
26017163Sdyson
26113907Sdyson	/* Buffer kva gets dynamically allocated */
26213907Sdyson	cpipe->pipe_buffer.buffer = NULL;
26317124Sbde	/* cpipe->pipe_buffer.object = invalid */
26413675Sdyson
26513675Sdyson	cpipe->pipe_state = 0;
26613675Sdyson	cpipe->pipe_peer = NULL;
26713675Sdyson	cpipe->pipe_busy = 0;
26855112Sbde	vfs_timestamp(&cpipe->pipe_ctime);
26924101Sbde	cpipe->pipe_atime = cpipe->pipe_ctime;
27024101Sbde	cpipe->pipe_mtime = cpipe->pipe_ctime;
27113675Sdyson	bzero(&cpipe->pipe_sel, sizeof cpipe->pipe_sel);
27213907Sdyson
27314037Sdyson#ifndef PIPE_NODIRECT
27413907Sdyson	/*
27513907Sdyson	 * pipe data structure initializations to support direct pipe I/O
27613907Sdyson	 */
27713907Sdyson	cpipe->pipe_map.cnt = 0;
27813907Sdyson	cpipe->pipe_map.kva = 0;
27913907Sdyson	cpipe->pipe_map.pos = 0;
28013907Sdyson	cpipe->pipe_map.npages = 0;
28117124Sbde	/* cpipe->pipe_map.ms[] = invalid */
28214037Sdyson#endif
28313675Sdyson}
28413675Sdyson
28513675Sdyson
28613675Sdyson/*
28713675Sdyson * lock a pipe for I/O, blocking other access
28813675Sdyson */
28913675Sdysonstatic __inline int
29013907Sdysonpipelock(cpipe, catch)
29113675Sdyson	struct pipe *cpipe;
29213907Sdyson	int catch;
29313675Sdyson{
29413776Sdyson	int error;
29513675Sdyson	while (cpipe->pipe_state & PIPE_LOCK) {
29613675Sdyson		cpipe->pipe_state |= PIPE_LWANT;
29743301Sdillon		if ((error = tsleep( cpipe,
29843301Sdillon			catch?(PRIBIO|PCATCH):PRIBIO, "pipelk", 0)) != 0) {
29913776Sdyson			return error;
30013675Sdyson		}
30113675Sdyson	}
30213675Sdyson	cpipe->pipe_state |= PIPE_LOCK;
30313675Sdyson	return 0;
30413675Sdyson}
30513675Sdyson
30613675Sdyson/*
30713675Sdyson * unlock a pipe I/O lock
30813675Sdyson */
30913675Sdysonstatic __inline void
31013675Sdysonpipeunlock(cpipe)
31113675Sdyson	struct pipe *cpipe;
31213675Sdyson{
31313675Sdyson	cpipe->pipe_state &= ~PIPE_LOCK;
31413675Sdyson	if (cpipe->pipe_state & PIPE_LWANT) {
31513675Sdyson		cpipe->pipe_state &= ~PIPE_LWANT;
31614177Sdyson		wakeup(cpipe);
31713675Sdyson	}
31813675Sdyson}
31913675Sdyson
32014037Sdysonstatic __inline void
32114037Sdysonpipeselwakeup(cpipe)
32214037Sdyson	struct pipe *cpipe;
32314037Sdyson{
32414037Sdyson	if (cpipe->pipe_state & PIPE_SEL) {
32514037Sdyson		cpipe->pipe_state &= ~PIPE_SEL;
32614037Sdyson		selwakeup(&cpipe->pipe_sel);
32714037Sdyson	}
32841086Struckman	if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
32941086Struckman		pgsigio(cpipe->pipe_sigio, SIGIO, 0);
33059288Sjlemon	KNOTE(&cpipe->pipe_sel.si_note, 0);
33114037Sdyson}
33214037Sdyson
33313675Sdyson/* ARGSUSED */
33413675Sdysonstatic int
33551418Sgreenpipe_read(fp, uio, cred, flags, p)
33613675Sdyson	struct file *fp;
33713675Sdyson	struct uio *uio;
33813675Sdyson	struct ucred *cred;
33951418Sgreen	struct proc *p;
34045311Sdt	int flags;
34113675Sdyson{
34213675Sdyson
34313675Sdyson	struct pipe *rpipe = (struct pipe *) fp->f_data;
34447748Salc	int error;
34513675Sdyson	int nread = 0;
34618863Sdyson	u_int size;
34713675Sdyson
34813675Sdyson	++rpipe->pipe_busy;
34947748Salc	error = pipelock(rpipe, 1);
35047748Salc	if (error)
35147748Salc		goto unlocked_error;
35247748Salc
35313675Sdyson	while (uio->uio_resid) {
35413907Sdyson		/*
35513907Sdyson		 * normal pipe buffer receive
35613907Sdyson		 */
35713675Sdyson		if (rpipe->pipe_buffer.cnt > 0) {
35818863Sdyson			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
35913675Sdyson			if (size > rpipe->pipe_buffer.cnt)
36013675Sdyson				size = rpipe->pipe_buffer.cnt;
36118863Sdyson			if (size > (u_int) uio->uio_resid)
36218863Sdyson				size = (u_int) uio->uio_resid;
36347748Salc
36447748Salc			error = uiomove(&rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
36513675Sdyson					size, uio);
36613675Sdyson			if (error) {
36713675Sdyson				break;
36813675Sdyson			}
36913675Sdyson			rpipe->pipe_buffer.out += size;
37013675Sdyson			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
37113675Sdyson				rpipe->pipe_buffer.out = 0;
37213675Sdyson
37313675Sdyson			rpipe->pipe_buffer.cnt -= size;
37447748Salc
37547748Salc			/*
37647748Salc			 * If there is no more to read in the pipe, reset
37747748Salc			 * its pointers to the beginning.  This improves
37847748Salc			 * cache hit stats.
37947748Salc			 */
38047748Salc			if (rpipe->pipe_buffer.cnt == 0) {
38147748Salc				rpipe->pipe_buffer.in = 0;
38247748Salc				rpipe->pipe_buffer.out = 0;
38347748Salc			}
38413675Sdyson			nread += size;
38514037Sdyson#ifndef PIPE_NODIRECT
38613907Sdyson		/*
38713907Sdyson		 * Direct copy, bypassing a kernel buffer.
38813907Sdyson		 */
38913907Sdyson		} else if ((size = rpipe->pipe_map.cnt) &&
39047748Salc			   (rpipe->pipe_state & PIPE_DIRECTW)) {
39147748Salc			caddr_t	va;
39218863Sdyson			if (size > (u_int) uio->uio_resid)
39318863Sdyson				size = (u_int) uio->uio_resid;
39447748Salc
39547748Salc			va = (caddr_t) rpipe->pipe_map.kva + rpipe->pipe_map.pos;
39647748Salc			error = uiomove(va, size, uio);
39713907Sdyson			if (error)
39813907Sdyson				break;
39913907Sdyson			nread += size;
40013907Sdyson			rpipe->pipe_map.pos += size;
40113907Sdyson			rpipe->pipe_map.cnt -= size;
40213907Sdyson			if (rpipe->pipe_map.cnt == 0) {
40313907Sdyson				rpipe->pipe_state &= ~PIPE_DIRECTW;
40413907Sdyson				wakeup(rpipe);
40513907Sdyson			}
40614037Sdyson#endif
40713675Sdyson		} else {
40813675Sdyson			/*
40913675Sdyson			 * detect EOF condition
41013675Sdyson			 */
41113675Sdyson			if (rpipe->pipe_state & PIPE_EOF) {
41214802Sdyson				/* XXX error = ? */
41313675Sdyson				break;
41413675Sdyson			}
41543623Sdillon
41613675Sdyson			/*
41713675Sdyson			 * If the "write-side" has been blocked, wake it up now.
41813675Sdyson			 */
41913675Sdyson			if (rpipe->pipe_state & PIPE_WANTW) {
42013675Sdyson				rpipe->pipe_state &= ~PIPE_WANTW;
42113675Sdyson				wakeup(rpipe);
42213675Sdyson			}
42343623Sdillon
42443623Sdillon			/*
42547748Salc			 * Break if some data was read.
42643623Sdillon			 */
42747748Salc			if (nread > 0)
42813675Sdyson				break;
42916960Sdyson
43043623Sdillon			/*
43147748Salc			 * Unlock the pipe buffer for our remaining processing.  We
43247748Salc			 * will either break out with an error or we will sleep and
43347748Salc			 * relock to loop.
43443623Sdillon			 */
43547748Salc			pipeunlock(rpipe);
43643623Sdillon
43713675Sdyson			/*
43847748Salc			 * Handle non-blocking mode operation or
43947748Salc			 * wait for more data.
44013675Sdyson			 */
44147748Salc			if (fp->f_flag & FNONBLOCK)
44247748Salc				error = EAGAIN;
44347748Salc			else {
44447748Salc				rpipe->pipe_state |= PIPE_WANTR;
44547748Salc				if ((error = tsleep(rpipe, PRIBIO|PCATCH, "piperd", 0)) == 0)
44647748Salc					error = pipelock(rpipe, 1);
44713675Sdyson			}
44847748Salc			if (error)
44947748Salc				goto unlocked_error;
45013675Sdyson		}
45113675Sdyson	}
45247748Salc	pipeunlock(rpipe);
45313675Sdyson
45424101Sbde	if (error == 0)
45555112Sbde		vfs_timestamp(&rpipe->pipe_atime);
45647748Salcunlocked_error:
45747748Salc	--rpipe->pipe_busy;
45813913Sdyson
45947748Salc	/*
46047748Salc	 * PIPE_WANT processing only makes sense if pipe_busy is 0.
46147748Salc	 */
46213675Sdyson	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
46313675Sdyson		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
46413675Sdyson		wakeup(rpipe);
46513675Sdyson	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
46613675Sdyson		/*
46747748Salc		 * Handle write blocking hysteresis.
46813675Sdyson		 */
46913675Sdyson		if (rpipe->pipe_state & PIPE_WANTW) {
47013675Sdyson			rpipe->pipe_state &= ~PIPE_WANTW;
47113675Sdyson			wakeup(rpipe);
47213675Sdyson		}
47313675Sdyson	}
47414037Sdyson
47514802Sdyson	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
47614037Sdyson		pipeselwakeup(rpipe);
47714037Sdyson
47813675Sdyson	return error;
47913675Sdyson}
48013675Sdyson
48114037Sdyson#ifndef PIPE_NODIRECT
48213907Sdyson/*
48313907Sdyson * Map the sending processes' buffer into kernel space and wire it.
48413907Sdyson * This is similar to a physical write operation.
48513907Sdyson */
48613675Sdysonstatic int
48713907Sdysonpipe_build_write_buffer(wpipe, uio)
48813907Sdyson	struct pipe *wpipe;
48913675Sdyson	struct uio *uio;
49013675Sdyson{
49118863Sdyson	u_int size;
49213907Sdyson	int i;
49313907Sdyson	vm_offset_t addr, endaddr, paddr;
49413907Sdyson
49518863Sdyson	size = (u_int) uio->uio_iov->iov_len;
49613907Sdyson	if (size > wpipe->pipe_buffer.size)
49713907Sdyson		size = wpipe->pipe_buffer.size;
49813907Sdyson
49940286Sdg	endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
50040286Sdg	for(i = 0, addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
50113907Sdyson		addr < endaddr;
50213907Sdyson		addr += PAGE_SIZE, i+=1) {
50313907Sdyson
50413907Sdyson		vm_page_t m;
50513907Sdyson
50651474Sdillon		if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0 ||
50751474Sdillon		    (paddr = pmap_kextract(addr)) == 0) {
50813907Sdyson			int j;
50913907Sdyson			for(j=0;j<i;j++)
51040700Sdg				vm_page_unwire(wpipe->pipe_map.ms[j], 1);
51113907Sdyson			return EFAULT;
51213907Sdyson		}
51313907Sdyson
51413907Sdyson		m = PHYS_TO_VM_PAGE(paddr);
51513907Sdyson		vm_page_wire(m);
51613907Sdyson		wpipe->pipe_map.ms[i] = m;
51713907Sdyson	}
51813907Sdyson
51913907Sdyson/*
52013907Sdyson * set up the control block
52113907Sdyson */
52213907Sdyson	wpipe->pipe_map.npages = i;
52313907Sdyson	wpipe->pipe_map.pos = ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
52413907Sdyson	wpipe->pipe_map.cnt = size;
52513907Sdyson
52613907Sdyson/*
52713907Sdyson * and map the buffer
52813907Sdyson */
52913907Sdyson	if (wpipe->pipe_map.kva == 0) {
53013912Sdyson		/*
53113912Sdyson		 * We need to allocate space for an extra page because the
53213912Sdyson		 * address range might (will) span pages at times.
53313912Sdyson		 */
53413907Sdyson		wpipe->pipe_map.kva = kmem_alloc_pageable(kernel_map,
53513912Sdyson			wpipe->pipe_buffer.size + PAGE_SIZE);
53613912Sdyson		amountpipekva += wpipe->pipe_buffer.size + PAGE_SIZE;
53713907Sdyson	}
53813907Sdyson	pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms,
53913907Sdyson		wpipe->pipe_map.npages);
54013907Sdyson
54113907Sdyson/*
54213907Sdyson * and update the uio data
54313907Sdyson */
54413907Sdyson
54513907Sdyson	uio->uio_iov->iov_len -= size;
54613907Sdyson	uio->uio_iov->iov_base += size;
54713907Sdyson	if (uio->uio_iov->iov_len == 0)
54813907Sdyson		uio->uio_iov++;
54913907Sdyson	uio->uio_resid -= size;
55013907Sdyson	uio->uio_offset += size;
55113907Sdyson	return 0;
55213907Sdyson}
55313907Sdyson
55413907Sdyson/*
55513907Sdyson * unmap and unwire the process buffer
55613907Sdyson */
55713907Sdysonstatic void
55813907Sdysonpipe_destroy_write_buffer(wpipe)
55913907Sdysonstruct pipe *wpipe;
56013907Sdyson{
56113907Sdyson	int i;
56217163Sdyson	if (wpipe->pipe_map.kva) {
56317163Sdyson		pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages);
56413907Sdyson
56513907Sdyson		if (amountpipekva > MAXPIPEKVA) {
56613907Sdyson			vm_offset_t kva = wpipe->pipe_map.kva;
56713907Sdyson			wpipe->pipe_map.kva = 0;
56813907Sdyson			kmem_free(kernel_map, kva,
56913912Sdyson				wpipe->pipe_buffer.size + PAGE_SIZE);
57013912Sdyson			amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE;
57113907Sdyson		}
57213907Sdyson	}
57313907Sdyson	for (i=0;i<wpipe->pipe_map.npages;i++)
57440700Sdg		vm_page_unwire(wpipe->pipe_map.ms[i], 1);
57513907Sdyson}
57613907Sdyson
57713907Sdyson/*
57813907Sdyson * In the case of a signal, the writing process might go away.  This
57913907Sdyson * code copies the data into the circular buffer so that the source
58013907Sdyson * pages can be freed without loss of data.
58113907Sdyson */
58213907Sdysonstatic void
58313907Sdysonpipe_clone_write_buffer(wpipe)
58413907Sdysonstruct pipe *wpipe;
58513907Sdyson{
58613907Sdyson	int size;
58713907Sdyson	int pos;
58813907Sdyson
58913907Sdyson	size = wpipe->pipe_map.cnt;
59013907Sdyson	pos = wpipe->pipe_map.pos;
59113907Sdyson	bcopy((caddr_t) wpipe->pipe_map.kva+pos,
59213907Sdyson			(caddr_t) wpipe->pipe_buffer.buffer,
59313907Sdyson			size);
59413907Sdyson
59513907Sdyson	wpipe->pipe_buffer.in = size;
59613907Sdyson	wpipe->pipe_buffer.out = 0;
59713907Sdyson	wpipe->pipe_buffer.cnt = size;
59813907Sdyson	wpipe->pipe_state &= ~PIPE_DIRECTW;
59913907Sdyson
60013907Sdyson	pipe_destroy_write_buffer(wpipe);
60113907Sdyson}
60213907Sdyson
60313907Sdyson/*
60413907Sdyson * This implements the pipe buffer write mechanism.  Note that only
60513907Sdyson * a direct write OR a normal pipe write can be pending at any given time.
60613907Sdyson * If there are any characters in the pipe buffer, the direct write will
60713907Sdyson * be deferred until the receiving process grabs all of the bytes from
60813907Sdyson * the pipe buffer.  Then the direct mapping write is set-up.
60913907Sdyson */
61013907Sdysonstatic int
61113907Sdysonpipe_direct_write(wpipe, uio)
61213907Sdyson	struct pipe *wpipe;
61313907Sdyson	struct uio *uio;
61413907Sdyson{
61513907Sdyson	int error;
61613951Sdysonretry:
61713907Sdyson	while (wpipe->pipe_state & PIPE_DIRECTW) {
61813951Sdyson		if ( wpipe->pipe_state & PIPE_WANTR) {
61913951Sdyson			wpipe->pipe_state &= ~PIPE_WANTR;
62013951Sdyson			wakeup(wpipe);
62113951Sdyson		}
62213992Sdyson		wpipe->pipe_state |= PIPE_WANTW;
62313907Sdyson		error = tsleep(wpipe,
62413907Sdyson				PRIBIO|PCATCH, "pipdww", 0);
62514802Sdyson		if (error)
62613907Sdyson			goto error1;
62714802Sdyson		if (wpipe->pipe_state & PIPE_EOF) {
62814802Sdyson			error = EPIPE;
62914802Sdyson			goto error1;
63014802Sdyson		}
63113907Sdyson	}
63213907Sdyson	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
63313951Sdyson	if (wpipe->pipe_buffer.cnt > 0) {
63413951Sdyson		if ( wpipe->pipe_state & PIPE_WANTR) {
63513951Sdyson			wpipe->pipe_state &= ~PIPE_WANTR;
63613951Sdyson			wakeup(wpipe);
63713951Sdyson		}
63813951Sdyson
63913992Sdyson		wpipe->pipe_state |= PIPE_WANTW;
64013907Sdyson		error = tsleep(wpipe,
64113907Sdyson				PRIBIO|PCATCH, "pipdwc", 0);
64214802Sdyson		if (error)
64313907Sdyson			goto error1;
64414802Sdyson		if (wpipe->pipe_state & PIPE_EOF) {
64514802Sdyson			error = EPIPE;
64614802Sdyson			goto error1;
64713907Sdyson		}
64813951Sdyson		goto retry;
64913907Sdyson	}
65013907Sdyson
65113951Sdyson	wpipe->pipe_state |= PIPE_DIRECTW;
65213951Sdyson
65313907Sdyson	error = pipe_build_write_buffer(wpipe, uio);
65413907Sdyson	if (error) {
65513907Sdyson		wpipe->pipe_state &= ~PIPE_DIRECTW;
65613907Sdyson		goto error1;
65713907Sdyson	}
65813907Sdyson
65913907Sdyson	error = 0;
66013907Sdyson	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
66113907Sdyson		if (wpipe->pipe_state & PIPE_EOF) {
66213907Sdyson			pipelock(wpipe, 0);
66313907Sdyson			pipe_destroy_write_buffer(wpipe);
66413907Sdyson			pipeunlock(wpipe);
66514037Sdyson			pipeselwakeup(wpipe);
66614802Sdyson			error = EPIPE;
66714802Sdyson			goto error1;
66813907Sdyson		}
66913992Sdyson		if (wpipe->pipe_state & PIPE_WANTR) {
67013992Sdyson			wpipe->pipe_state &= ~PIPE_WANTR;
67113992Sdyson			wakeup(wpipe);
67213992Sdyson		}
67314037Sdyson		pipeselwakeup(wpipe);
67413907Sdyson		error = tsleep(wpipe, PRIBIO|PCATCH, "pipdwt", 0);
67513907Sdyson	}
67613907Sdyson
67713907Sdyson	pipelock(wpipe,0);
67813907Sdyson	if (wpipe->pipe_state & PIPE_DIRECTW) {
67913907Sdyson		/*
68013907Sdyson		 * this bit of trickery substitutes a kernel buffer for
68113907Sdyson		 * the process that might be going away.
68213907Sdyson		 */
68313907Sdyson		pipe_clone_write_buffer(wpipe);
68413907Sdyson	} else {
68513907Sdyson		pipe_destroy_write_buffer(wpipe);
68613907Sdyson	}
68713907Sdyson	pipeunlock(wpipe);
68813907Sdyson	return error;
68913907Sdyson
69013907Sdysonerror1:
69113907Sdyson	wakeup(wpipe);
69213907Sdyson	return error;
69313907Sdyson}
69414037Sdyson#endif
69513907Sdyson
69616960Sdysonstatic int
69751418Sgreenpipe_write(fp, uio, cred, flags, p)
69816960Sdyson	struct file *fp;
69913907Sdyson	struct uio *uio;
70016960Sdyson	struct ucred *cred;
70151418Sgreen	struct proc *p;
70245311Sdt	int flags;
70313907Sdyson{
70413675Sdyson	int error = 0;
70513913Sdyson	int orig_resid;
70613675Sdyson
70716960Sdyson	struct pipe *wpipe, *rpipe;
70816960Sdyson
70916960Sdyson	rpipe = (struct pipe *) fp->f_data;
71016960Sdyson	wpipe = rpipe->pipe_peer;
71116960Sdyson
71213675Sdyson	/*
71313675Sdyson	 * detect loss of pipe read side, issue SIGPIPE if lost.
71413675Sdyson	 */
71516960Sdyson	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
71613774Sdyson		return EPIPE;
71713675Sdyson	}
71813675Sdyson
71917163Sdyson	/*
72017163Sdyson	 * If it is advantageous to resize the pipe buffer, do
72117163Sdyson	 * so.
72217163Sdyson	 */
72317163Sdyson	if ((uio->uio_resid > PIPE_SIZE) &&
72417163Sdyson		(nbigpipe < LIMITBIGPIPES) &&
72517163Sdyson		(wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
72617163Sdyson		(wpipe->pipe_buffer.size <= PIPE_SIZE) &&
72717163Sdyson		(wpipe->pipe_buffer.cnt == 0)) {
72817163Sdyson
72917163Sdyson		if (wpipe->pipe_buffer.buffer) {
73017163Sdyson			amountpipekva -= wpipe->pipe_buffer.size;
73117163Sdyson			kmem_free(kernel_map,
73217163Sdyson				(vm_offset_t)wpipe->pipe_buffer.buffer,
73317163Sdyson				wpipe->pipe_buffer.size);
73417163Sdyson		}
73517163Sdyson
73617163Sdyson#ifndef PIPE_NODIRECT
73717163Sdyson		if (wpipe->pipe_map.kva) {
73817163Sdyson			amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE;
73917163Sdyson			kmem_free(kernel_map,
74017163Sdyson				wpipe->pipe_map.kva,
74117163Sdyson				wpipe->pipe_buffer.size + PAGE_SIZE);
74217163Sdyson		}
74317163Sdyson#endif
74417163Sdyson
74517163Sdyson		wpipe->pipe_buffer.in = 0;
74617163Sdyson		wpipe->pipe_buffer.out = 0;
74717163Sdyson		wpipe->pipe_buffer.cnt = 0;
74817163Sdyson		wpipe->pipe_buffer.size = BIG_PIPE_SIZE;
74917163Sdyson		wpipe->pipe_buffer.buffer = NULL;
75017163Sdyson		++nbigpipe;
75117163Sdyson
75217163Sdyson#ifndef PIPE_NODIRECT
75317163Sdyson		wpipe->pipe_map.cnt = 0;
75417163Sdyson		wpipe->pipe_map.kva = 0;
75517163Sdyson		wpipe->pipe_map.pos = 0;
75617163Sdyson		wpipe->pipe_map.npages = 0;
75717163Sdyson#endif
75817163Sdyson
75917163Sdyson	}
76017163Sdyson
76117163Sdyson
76213907Sdyson	if( wpipe->pipe_buffer.buffer == NULL) {
76313907Sdyson		if ((error = pipelock(wpipe,1)) == 0) {
76413907Sdyson			pipespace(wpipe);
76513907Sdyson			pipeunlock(wpipe);
76613907Sdyson		} else {
76713907Sdyson			return error;
76813907Sdyson		}
76913907Sdyson	}
77013907Sdyson
77113675Sdyson	++wpipe->pipe_busy;
77213913Sdyson	orig_resid = uio->uio_resid;
77313675Sdyson	while (uio->uio_resid) {
77413907Sdyson		int space;
77514037Sdyson#ifndef PIPE_NODIRECT
77613907Sdyson		/*
77713907Sdyson		 * If the transfer is large, we can gain performance if
77813907Sdyson		 * we do process-to-process copies directly.
77916416Sdyson		 * If the write is non-blocking, we don't use the
78016416Sdyson		 * direct write mechanism.
78158505Sdillon		 *
78258505Sdillon		 * The direct write mechanism will detect the reader going
78358505Sdillon		 * away on us.
78413907Sdyson		 */
78517163Sdyson		if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
78617163Sdyson		    (fp->f_flag & FNONBLOCK) == 0 &&
78717163Sdyson			(wpipe->pipe_map.kva || (amountpipekva < LIMITPIPEKVA)) &&
78813907Sdyson			(uio->uio_iov->iov_len >= PIPE_MINDIRECT)) {
78913907Sdyson			error = pipe_direct_write( wpipe, uio);
79013907Sdyson			if (error) {
79113907Sdyson				break;
79213907Sdyson			}
79313907Sdyson			continue;
79413907Sdyson		}
79514037Sdyson#endif
79613907Sdyson
79713907Sdyson		/*
79813907Sdyson		 * Pipe buffered writes cannot be coincidental with
79913907Sdyson		 * direct writes.  We wait until the currently executing
80013907Sdyson		 * direct write is completed before we start filling the
80158505Sdillon		 * pipe buffer.  We break out if a signal occurs or the
80258505Sdillon		 * reader goes away.
80313907Sdyson		 */
80413907Sdyson	retrywrite:
80513907Sdyson		while (wpipe->pipe_state & PIPE_DIRECTW) {
80613992Sdyson			if (wpipe->pipe_state & PIPE_WANTR) {
80713992Sdyson				wpipe->pipe_state &= ~PIPE_WANTR;
80813992Sdyson				wakeup(wpipe);
80913992Sdyson			}
81058505Sdillon			error = tsleep(wpipe, PRIBIO|PCATCH, "pipbww", 0);
81158505Sdillon			if (wpipe->pipe_state & PIPE_EOF)
81258505Sdillon				break;
81313907Sdyson			if (error)
81413907Sdyson				break;
81513907Sdyson		}
81658505Sdillon		if (wpipe->pipe_state & PIPE_EOF) {
81758505Sdillon			error = EPIPE;
81858505Sdillon			break;
81958505Sdillon		}
82013907Sdyson
82113907Sdyson		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
82214644Sdyson
82314644Sdyson		/* Writes of size <= PIPE_BUF must be atomic. */
82413913Sdyson		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
82513913Sdyson			space = 0;
82613907Sdyson
82717163Sdyson		if (space > 0 && (wpipe->pipe_buffer.cnt < PIPE_SIZE)) {
82813907Sdyson			if ((error = pipelock(wpipe,1)) == 0) {
82954534Stegge				int size;	/* Transfer size */
83054534Stegge				int segsize;	/* first segment to transfer */
83113907Sdyson				/*
83213907Sdyson				 * It is possible for a direct write to
83313907Sdyson				 * slip in on us... handle it here...
83413907Sdyson				 */
83513907Sdyson				if (wpipe->pipe_state & PIPE_DIRECTW) {
83613907Sdyson					pipeunlock(wpipe);
83713907Sdyson					goto retrywrite;
83813907Sdyson				}
83954534Stegge				/*
84054534Stegge				 * If a process blocked in uiomove, our
84154534Stegge				 * value for space might be bad.
84258505Sdillon				 *
84358505Sdillon				 * XXX will we be ok if the reader has gone
84458505Sdillon				 * away here?
84554534Stegge				 */
84654534Stegge				if (space > wpipe->pipe_buffer.size -
84754534Stegge				    wpipe->pipe_buffer.cnt) {
84854534Stegge					pipeunlock(wpipe);
84954534Stegge					goto retrywrite;
85054534Stegge				}
85154534Stegge
85254534Stegge				/*
85354534Stegge				 * Transfer size is minimum of uio transfer
85454534Stegge				 * and free space in pipe buffer.
85554534Stegge				 */
85654534Stegge				if (space > uio->uio_resid)
85754534Stegge					size = uio->uio_resid;
85854534Stegge				else
85954534Stegge					size = space;
86054534Stegge				/*
86154534Stegge				 * First segment to transfer is minimum of
86254534Stegge				 * transfer size and contiguous space in
86354534Stegge				 * pipe buffer.  If first segment to transfer
86454534Stegge				 * is less than the transfer size, we've got
86554534Stegge				 * a wraparound in the buffer.
86654534Stegge				 */
86754534Stegge				segsize = wpipe->pipe_buffer.size -
86854534Stegge					wpipe->pipe_buffer.in;
86954534Stegge				if (segsize > size)
87054534Stegge					segsize = size;
87154534Stegge
87254534Stegge				/* Transfer first segment */
87354534Stegge
87454534Stegge				error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
87554534Stegge						segsize, uio);
87654534Stegge
87754534Stegge				if (error == 0 && segsize < size) {
87854534Stegge					/*
87954534Stegge					 * Transfer remaining part now, to
88054534Stegge					 * support atomic writes.  Wraparound
88154534Stegge					 * happened.
88254534Stegge					 */
88354534Stegge					if (wpipe->pipe_buffer.in + segsize !=
88454534Stegge					    wpipe->pipe_buffer.size)
88554534Stegge						panic("Expected pipe buffer wraparound disappeared");
88654534Stegge
88754534Stegge					error = uiomove(&wpipe->pipe_buffer.buffer[0],
88854534Stegge							size - segsize, uio);
88954534Stegge				}
89054534Stegge				if (error == 0) {
89154534Stegge					wpipe->pipe_buffer.in += size;
89254534Stegge					if (wpipe->pipe_buffer.in >=
89354534Stegge					    wpipe->pipe_buffer.size) {
89454534Stegge						if (wpipe->pipe_buffer.in != size - segsize + wpipe->pipe_buffer.size)
89554534Stegge							panic("Expected wraparound bad");
89654534Stegge						wpipe->pipe_buffer.in = size - segsize;
89754534Stegge					}
89854534Stegge
89954534Stegge					wpipe->pipe_buffer.cnt += size;
90054534Stegge					if (wpipe->pipe_buffer.cnt > wpipe->pipe_buffer.size)
90154534Stegge						panic("Pipe buffer overflow");
90254534Stegge
90354534Stegge				}
90413675Sdyson				pipeunlock(wpipe);
90513675Sdyson			}
90613675Sdyson			if (error)
90713675Sdyson				break;
90813675Sdyson
90913675Sdyson		} else {
91013675Sdyson			/*
91113675Sdyson			 * If the "read-side" has been blocked, wake it up now.
91213675Sdyson			 */
91313675Sdyson			if (wpipe->pipe_state & PIPE_WANTR) {
91413675Sdyson				wpipe->pipe_state &= ~PIPE_WANTR;
91513675Sdyson				wakeup(wpipe);
91613675Sdyson			}
91714037Sdyson
91813675Sdyson			/*
91913675Sdyson			 * don't block on non-blocking I/O
92013675Sdyson			 */
92116960Sdyson			if (fp->f_flag & FNONBLOCK) {
92213907Sdyson				error = EAGAIN;
92313675Sdyson				break;
92413675Sdyson			}
92513907Sdyson
92614037Sdyson			/*
92714037Sdyson			 * We have no more space and have something to offer,
92829356Speter			 * wake up select/poll.
92914037Sdyson			 */
93014037Sdyson			pipeselwakeup(wpipe);
93114037Sdyson
93213675Sdyson			wpipe->pipe_state |= PIPE_WANTW;
93343301Sdillon			if ((error = tsleep(wpipe, (PRIBIO+1)|PCATCH, "pipewr", 0)) != 0) {
93413675Sdyson				break;
93513675Sdyson			}
93613675Sdyson			/*
93713675Sdyson			 * If read side wants to go away, we just issue a signal
93813675Sdyson			 * to ourselves.
93913675Sdyson			 */
94013675Sdyson			if (wpipe->pipe_state & PIPE_EOF) {
94113774Sdyson				error = EPIPE;
94213907Sdyson				break;
94313675Sdyson			}
94413675Sdyson		}
94513675Sdyson	}
94613675Sdyson
94714644Sdyson	--wpipe->pipe_busy;
94813675Sdyson	if ((wpipe->pipe_busy == 0) &&
94913675Sdyson		(wpipe->pipe_state & PIPE_WANT)) {
95013675Sdyson		wpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTR);
95113675Sdyson		wakeup(wpipe);
95213675Sdyson	} else if (wpipe->pipe_buffer.cnt > 0) {
95313675Sdyson		/*
95413675Sdyson		 * If we have put any characters in the buffer, we wake up
95513675Sdyson		 * the reader.
95613675Sdyson		 */
95713675Sdyson		if (wpipe->pipe_state & PIPE_WANTR) {
95813675Sdyson			wpipe->pipe_state &= ~PIPE_WANTR;
95913675Sdyson			wakeup(wpipe);
96013675Sdyson		}
96113675Sdyson	}
96213909Sdyson
96313909Sdyson	/*
96413909Sdyson	 * Don't return EPIPE if I/O was successful
96513909Sdyson	 */
96613907Sdyson	if ((wpipe->pipe_buffer.cnt == 0) &&
96713907Sdyson		(uio->uio_resid == 0) &&
96813907Sdyson		(error == EPIPE))
96913907Sdyson		error = 0;
97013913Sdyson
97124101Sbde	if (error == 0)
97255112Sbde		vfs_timestamp(&wpipe->pipe_mtime);
97324101Sbde
97414037Sdyson	/*
97514037Sdyson	 * We have something to offer,
97629356Speter	 * wake up select/poll.
97714037Sdyson	 */
97814177Sdyson	if (wpipe->pipe_buffer.cnt)
97914037Sdyson		pipeselwakeup(wpipe);
98013907Sdyson
98113675Sdyson	return error;
98213675Sdyson}
98313675Sdyson
98413675Sdyson/*
98513675Sdyson * we implement a very minimal set of ioctls for compatibility with sockets.
98613675Sdyson */
98713675Sdysonint
98813675Sdysonpipe_ioctl(fp, cmd, data, p)
98913675Sdyson	struct file *fp;
99036735Sdfr	u_long cmd;
99113675Sdyson	register caddr_t data;
99213675Sdyson	struct proc *p;
99313675Sdyson{
99413675Sdyson	register struct pipe *mpipe = (struct pipe *)fp->f_data;
99513675Sdyson
99613675Sdyson	switch (cmd) {
99713675Sdyson
99813675Sdyson	case FIONBIO:
99913675Sdyson		return (0);
100013675Sdyson
100113675Sdyson	case FIOASYNC:
100213675Sdyson		if (*(int *)data) {
100313675Sdyson			mpipe->pipe_state |= PIPE_ASYNC;
100413675Sdyson		} else {
100513675Sdyson			mpipe->pipe_state &= ~PIPE_ASYNC;
100613675Sdyson		}
100713675Sdyson		return (0);
100813675Sdyson
100913675Sdyson	case FIONREAD:
101014037Sdyson		if (mpipe->pipe_state & PIPE_DIRECTW)
101114037Sdyson			*(int *)data = mpipe->pipe_map.cnt;
101214037Sdyson		else
101314037Sdyson			*(int *)data = mpipe->pipe_buffer.cnt;
101413675Sdyson		return (0);
101513675Sdyson
101641086Struckman	case FIOSETOWN:
101741086Struckman		return (fsetown(*(int *)data, &mpipe->pipe_sigio));
101841086Struckman
101941086Struckman	case FIOGETOWN:
102041086Struckman		*(int *)data = fgetown(mpipe->pipe_sigio);
102113675Sdyson		return (0);
102213675Sdyson
102341086Struckman	/* This is deprecated, FIOSETOWN should be used instead. */
102441086Struckman	case TIOCSPGRP:
102541086Struckman		return (fsetown(-(*(int *)data), &mpipe->pipe_sigio));
102641086Struckman
102741086Struckman	/* This is deprecated, FIOGETOWN should be used instead. */
102818863Sdyson	case TIOCGPGRP:
102941086Struckman		*(int *)data = -fgetown(mpipe->pipe_sigio);
103013675Sdyson		return (0);
103113675Sdyson
103213675Sdyson	}
103317124Sbde	return (ENOTTY);
103413675Sdyson}
103513675Sdyson
103613675Sdysonint
103729356Speterpipe_poll(fp, events, cred, p)
103813675Sdyson	struct file *fp;
103929356Speter	int events;
104029356Speter	struct ucred *cred;
104113675Sdyson	struct proc *p;
104213675Sdyson{
104313675Sdyson	register struct pipe *rpipe = (struct pipe *)fp->f_data;
104413675Sdyson	struct pipe *wpipe;
104529356Speter	int revents = 0;
104613675Sdyson
104713675Sdyson	wpipe = rpipe->pipe_peer;
104829356Speter	if (events & (POLLIN | POLLRDNORM))
104929356Speter		if ((rpipe->pipe_state & PIPE_DIRECTW) ||
105029356Speter		    (rpipe->pipe_buffer.cnt > 0) ||
105129356Speter		    (rpipe->pipe_state & PIPE_EOF))
105229356Speter			revents |= events & (POLLIN | POLLRDNORM);
105313675Sdyson
105429356Speter	if (events & (POLLOUT | POLLWRNORM))
105529356Speter		if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF) ||
105643311Sdillon		    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
105743311Sdillon		     (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
105829356Speter			revents |= events & (POLLOUT | POLLWRNORM);
105913675Sdyson
106029356Speter	if ((rpipe->pipe_state & PIPE_EOF) ||
106129356Speter	    (wpipe == NULL) ||
106229356Speter	    (wpipe->pipe_state & PIPE_EOF))
106329356Speter		revents |= POLLHUP;
106429356Speter
106529356Speter	if (revents == 0) {
106629356Speter		if (events & (POLLIN | POLLRDNORM)) {
106729356Speter			selrecord(p, &rpipe->pipe_sel);
106829356Speter			rpipe->pipe_state |= PIPE_SEL;
106913675Sdyson		}
107013675Sdyson
107129356Speter		if (events & (POLLOUT | POLLWRNORM)) {
107230164Speter			selrecord(p, &wpipe->pipe_sel);
107330164Speter			wpipe->pipe_state |= PIPE_SEL;
107413907Sdyson		}
107513675Sdyson	}
107629356Speter
107729356Speter	return (revents);
107813675Sdyson}
107913675Sdyson
108052983Speterstatic int
108152983Speterpipe_stat(fp, ub, p)
108252983Speter	struct file *fp;
108352983Speter	struct stat *ub;
108452983Speter	struct proc *p;
108513675Sdyson{
108652983Speter	struct pipe *pipe = (struct pipe *)fp->f_data;
108752983Speter
108813675Sdyson	bzero((caddr_t)ub, sizeof (*ub));
108917124Sbde	ub->st_mode = S_IFIFO;
109013907Sdyson	ub->st_blksize = pipe->pipe_buffer.size;
109113675Sdyson	ub->st_size = pipe->pipe_buffer.cnt;
109213675Sdyson	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
109334901Sphk	ub->st_atimespec = pipe->pipe_atime;
109434901Sphk	ub->st_mtimespec = pipe->pipe_mtime;
109534901Sphk	ub->st_ctimespec = pipe->pipe_ctime;
109660404Schris	ub->st_uid = fp->f_cred->cr_uid;
109760404Schris	ub->st_gid = fp->f_cred->cr_gid;
109817124Sbde	/*
109960404Schris	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
110017124Sbde	 * XXX (st_dev, st_ino) should be unique.
110117124Sbde	 */
110213675Sdyson	return 0;
110313675Sdyson}
110413675Sdyson
110513675Sdyson/* ARGSUSED */
110613675Sdysonstatic int
110713675Sdysonpipe_close(fp, p)
110813675Sdyson	struct file *fp;
110913675Sdyson	struct proc *p;
111013675Sdyson{
111113675Sdyson	struct pipe *cpipe = (struct pipe *)fp->f_data;
111216322Sgpalmer
111349413Sgreen	fp->f_ops = &badfileops;
111449413Sgreen	fp->f_data = NULL;
111541086Struckman	funsetown(cpipe->pipe_sigio);
111613675Sdyson	pipeclose(cpipe);
111713675Sdyson	return 0;
111813675Sdyson}
111913675Sdyson
112013675Sdyson/*
112113675Sdyson * shutdown the pipe
112213675Sdyson */
112313675Sdysonstatic void
112413675Sdysonpipeclose(cpipe)
112513675Sdyson	struct pipe *cpipe;
112613675Sdyson{
112713907Sdyson	struct pipe *ppipe;
112813675Sdyson	if (cpipe) {
112913907Sdyson
113014037Sdyson		pipeselwakeup(cpipe);
113113907Sdyson
113213675Sdyson		/*
113313675Sdyson		 * If the other side is blocked, wake it up saying that
113413675Sdyson		 * we want to close it down.
113513675Sdyson		 */
113613675Sdyson		while (cpipe->pipe_busy) {
113713675Sdyson			wakeup(cpipe);
113813675Sdyson			cpipe->pipe_state |= PIPE_WANT|PIPE_EOF;
113913675Sdyson			tsleep(cpipe, PRIBIO, "pipecl", 0);
114013675Sdyson		}
114113675Sdyson
114213675Sdyson		/*
114313675Sdyson		 * Disconnect from peer
114413675Sdyson		 */
114543301Sdillon		if ((ppipe = cpipe->pipe_peer) != NULL) {
114614037Sdyson			pipeselwakeup(ppipe);
114713907Sdyson
114813907Sdyson			ppipe->pipe_state |= PIPE_EOF;
114913907Sdyson			wakeup(ppipe);
115013907Sdyson			ppipe->pipe_peer = NULL;
115113675Sdyson		}
115213675Sdyson
115313675Sdyson		/*
115413675Sdyson		 * free resources
115513675Sdyson		 */
115613907Sdyson		if (cpipe->pipe_buffer.buffer) {
115717163Sdyson			if (cpipe->pipe_buffer.size > PIPE_SIZE)
115817163Sdyson				--nbigpipe;
115913907Sdyson			amountpipekva -= cpipe->pipe_buffer.size;
116013907Sdyson			kmem_free(kernel_map,
116113907Sdyson				(vm_offset_t)cpipe->pipe_buffer.buffer,
116213907Sdyson				cpipe->pipe_buffer.size);
116313907Sdyson		}
116414037Sdyson#ifndef PIPE_NODIRECT
116513907Sdyson		if (cpipe->pipe_map.kva) {
116613912Sdyson			amountpipekva -= cpipe->pipe_buffer.size + PAGE_SIZE;
116713907Sdyson			kmem_free(kernel_map,
116813907Sdyson				cpipe->pipe_map.kva,
116913912Sdyson				cpipe->pipe_buffer.size + PAGE_SIZE);
117013907Sdyson		}
117114037Sdyson#endif
117227899Sdyson		zfree(pipe_zone, cpipe);
117313675Sdyson	}
117413675Sdyson}
117559288Sjlemon
117659288Sjlemonstatic int
117759288Sjlemonfilt_pipeattach(struct knote *kn)
117859288Sjlemon{
117959288Sjlemon	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
118059288Sjlemon
118159288Sjlemon	SLIST_INSERT_HEAD(&rpipe->pipe_sel.si_note, kn, kn_selnext);
118259288Sjlemon	return (0);
118359288Sjlemon}
118459288Sjlemon
118559288Sjlemonstatic void
118659288Sjlemonfilt_pipedetach(struct knote *kn)
118759288Sjlemon{
118859288Sjlemon	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
118959288Sjlemon
119059288Sjlemon	SLIST_REMOVE(&rpipe->pipe_sel.si_note, kn, knote, kn_selnext);
119159288Sjlemon}
119259288Sjlemon
119359288Sjlemon/*ARGSUSED*/
119459288Sjlemonstatic int
119559288Sjlemonfilt_piperead(struct knote *kn, long hint)
119659288Sjlemon{
119759288Sjlemon	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
119859288Sjlemon	struct pipe *wpipe = rpipe->pipe_peer;
119959288Sjlemon
120059288Sjlemon	kn->kn_data = rpipe->pipe_buffer.cnt;
120159288Sjlemon	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
120259288Sjlemon		kn->kn_data = rpipe->pipe_map.cnt;
120359288Sjlemon
120459288Sjlemon	if ((rpipe->pipe_state & PIPE_EOF) ||
120559288Sjlemon	    (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
120659288Sjlemon		kn->kn_flags |= EV_EOF;
120759288Sjlemon		return (1);
120859288Sjlemon	}
120959288Sjlemon	return (kn->kn_data > 0);
121059288Sjlemon}
121159288Sjlemon
121259288Sjlemon/*ARGSUSED*/
121359288Sjlemonstatic int
121459288Sjlemonfilt_pipewrite(struct knote *kn, long hint)
121559288Sjlemon{
121659288Sjlemon	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
121759288Sjlemon	struct pipe *wpipe = rpipe->pipe_peer;
121859288Sjlemon
121959288Sjlemon	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
122059288Sjlemon		kn->kn_data = 0;
122159288Sjlemon		kn->kn_flags |= EV_EOF;
122259288Sjlemon		return (1);
122359288Sjlemon	}
122459288Sjlemon	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
122559288Sjlemon	if ((wpipe->pipe_state & PIPE_DIRECTW) == 0)
122659288Sjlemon		kn->kn_data = 0;
122759288Sjlemon
122859288Sjlemon	return (kn->kn_data >= PIPE_BUF);
122959288Sjlemon}
1230