sys_pipe.c revision 91412
113675Sdyson/*
213675Sdyson * Copyright (c) 1996 John S. Dyson
313675Sdyson * All rights reserved.
413675Sdyson *
513675Sdyson * Redistribution and use in source and binary forms, with or without
613675Sdyson * modification, are permitted provided that the following conditions
713675Sdyson * are met:
813675Sdyson * 1. Redistributions of source code must retain the above copyright
913675Sdyson *    notice immediately at the beginning of the file, without modification,
1013675Sdyson *    this list of conditions, and the following disclaimer.
1113675Sdyson * 2. Redistributions in binary form must reproduce the above copyright
1213675Sdyson *    notice, this list of conditions and the following disclaimer in the
1313675Sdyson *    documentation and/or other materials provided with the distribution.
1413675Sdyson * 3. Absolutely no warranty of function or purpose is made by the author
1513675Sdyson *    John S. Dyson.
1614037Sdyson * 4. Modifications may be freely made to this file if the above conditions
1713675Sdyson *    are met.
1813675Sdyson *
1950477Speter * $FreeBSD: head/sys/kern/sys_pipe.c 91412 2002-02-27 18:49:58Z alfred $
2013675Sdyson */
2113675Sdyson
2213675Sdyson/*
2313675Sdyson * This file contains a high-performance replacement for the socket-based
2413675Sdyson * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
2513675Sdyson * all features of sockets, but does do everything that pipes normally
2613675Sdyson * do.
2713675Sdyson */
2813675Sdyson
2913907Sdyson/*
3013907Sdyson * This code has two modes of operation, a small write mode and a large
3113907Sdyson * write mode.  The small write mode acts like conventional pipes with
3213907Sdyson * a kernel buffer.  If the buffer is less than PIPE_MINDIRECT, then the
3313907Sdyson * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
3413907Sdyson * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and
3513907Sdyson * the receiving process can copy it directly from the pages in the sending
3613907Sdyson * process.
3713907Sdyson *
3813907Sdyson * If the sending process receives a signal, it is possible that it will
3913913Sdyson * go away, and certainly its address space can change, because control
4013907Sdyson * is returned back to the user-mode side.  In that case, the pipe code
4113907Sdyson * arranges to copy the buffer supplied by the user process, to a pageable
4213907Sdyson * kernel buffer, and the receiving process will grab the data from the
4313907Sdyson * pageable kernel buffer.  Since signals don't happen all that often,
4413907Sdyson * the copy operation is normally eliminated.
4513907Sdyson *
4613907Sdyson * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
4713907Sdyson * happen for small transfers so that the system will not spend all of
4813913Sdyson * its time context switching.  PIPE_SIZE is constrained by the
4913907Sdyson * amount of kernel virtual memory.
5013907Sdyson */
5113907Sdyson
5213675Sdyson#include <sys/param.h>
5313675Sdyson#include <sys/systm.h>
5424131Sbde#include <sys/fcntl.h>
5513675Sdyson#include <sys/file.h>
5613675Sdyson#include <sys/filedesc.h>
5724206Sbde#include <sys/filio.h>
5891372Salfred#include <sys/kernel.h>
5976166Smarkm#include <sys/lock.h>
6076827Salfred#include <sys/mutex.h>
6124206Sbde#include <sys/ttycom.h>
6213675Sdyson#include <sys/stat.h>
6329356Speter#include <sys/poll.h>
6470834Swollman#include <sys/selinfo.h>
6513675Sdyson#include <sys/signalvar.h>
6613675Sdyson#include <sys/sysproto.h>
6713675Sdyson#include <sys/pipe.h>
6876166Smarkm#include <sys/proc.h>
6955112Sbde#include <sys/vnode.h>
7034924Sbde#include <sys/uio.h>
7159288Sjlemon#include <sys/event.h>
7213675Sdyson
7313675Sdyson#include <vm/vm.h>
7413675Sdyson#include <vm/vm_param.h>
7513675Sdyson#include <vm/vm_object.h>
7613675Sdyson#include <vm/vm_kern.h>
7713675Sdyson#include <vm/vm_extern.h>
7813675Sdyson#include <vm/pmap.h>
7913675Sdyson#include <vm/vm_map.h>
8013907Sdyson#include <vm/vm_page.h>
8127899Sdyson#include <vm/vm_zone.h>
8213675Sdyson
8314037Sdyson/*
8414037Sdyson * Use this define if you want to disable *fancy* VM things.  Expect an
8514037Sdyson * approx 30% decrease in transfer rate.  This could be useful for
8614037Sdyson * NetBSD or OpenBSD.
8714037Sdyson */
8814037Sdyson/* #define PIPE_NODIRECT */
8914037Sdyson
9014037Sdyson/*
9114037Sdyson * interfaces to the outside world
9214037Sdyson */
9313675Sdysonstatic int pipe_read __P((struct file *fp, struct uio *uio,
9483366Sjulian		struct ucred *cred, int flags, struct thread *td));
9513675Sdysonstatic int pipe_write __P((struct file *fp, struct uio *uio,
9683366Sjulian		struct ucred *cred, int flags, struct thread *td));
9783366Sjulianstatic int pipe_close __P((struct file *fp, struct thread *td));
9829356Speterstatic int pipe_poll __P((struct file *fp, int events, struct ucred *cred,
9983366Sjulian		struct thread *td));
10072521Sjlemonstatic int pipe_kqfilter __P((struct file *fp, struct knote *kn));
10183366Sjulianstatic int pipe_stat __P((struct file *fp, struct stat *sb, struct thread *td));
10283366Sjulianstatic int pipe_ioctl __P((struct file *fp, u_long cmd, caddr_t data, struct thread *td));
10313675Sdyson
10472521Sjlemonstatic struct fileops pipeops = {
10572521Sjlemon	pipe_read, pipe_write, pipe_ioctl, pipe_poll, pipe_kqfilter,
10672521Sjlemon	pipe_stat, pipe_close
10772521Sjlemon};
10813675Sdyson
10959288Sjlemonstatic void	filt_pipedetach(struct knote *kn);
11059288Sjlemonstatic int	filt_piperead(struct knote *kn, long hint);
11159288Sjlemonstatic int	filt_pipewrite(struct knote *kn, long hint);
11259288Sjlemon
11372521Sjlemonstatic struct filterops pipe_rfiltops =
11472521Sjlemon	{ 1, NULL, filt_pipedetach, filt_piperead };
11572521Sjlemonstatic struct filterops pipe_wfiltops =
11672521Sjlemon	{ 1, NULL, filt_pipedetach, filt_pipewrite };
11759288Sjlemon
/*
 * Trade the pipe mutex for Giant, and back again.
 *
 * PIPE_GET_GIANT(pipe) releases the mutex of `pipe' and then acquires
 * Giant; PIPE_DROP_GIANT(pipe) releases Giant and then re-acquires the
 * mutex of `pipe'.  They are meant to be used as a bracketing pair.  The
 * pipe mutex is not held between the two, so callers must not assume that
 * pipe state is unchanged across the bracket.
 *
 * The expansion operates on the macro argument itself rather than on a
 * variable that happens to be called `wpipe' at the call site, so the
 * macros can be used from any function and act on the pipe that was
 * actually passed in.
 */
#define PIPE_GET_GIANT(pipe)						\
	do {								\
		PIPE_UNLOCK(pipe);					\
		mtx_lock(&Giant);					\
	} while (0)

#define PIPE_DROP_GIANT(pipe)						\
	do {								\
		mtx_unlock(&Giant);					\
		PIPE_LOCK(pipe);					\
	} while (0)
12991362Salfred
13013675Sdyson/*
13113675Sdyson * Default pipe buffer size(s), this can be kind-of large now because pipe
13213675Sdyson * space is pageable.  The pipe code will try to maintain locality of
13313675Sdyson * reference for performance reasons, so small amounts of outstanding I/O
13413675Sdyson * will not wipe the cache.
13513675Sdyson */
13613907Sdyson#define MINPIPESIZE (PIPE_SIZE/3)
13713907Sdyson#define MAXPIPESIZE (2*PIPE_SIZE/3)
13813675Sdyson
13913907Sdyson/*
14013907Sdyson * Maximum amount of kva for pipes -- this is kind-of a soft limit, but
14113907Sdyson * is there so that on large systems, we don't exhaust it.
14213907Sdyson */
14313907Sdyson#define MAXPIPEKVA (8*1024*1024)
14413907Sdyson
14513907Sdyson/*
14613907Sdyson * Limit for direct transfers, we cannot, of course limit
14713907Sdyson * the amount of kva for pipes in general though.
14813907Sdyson */
14913907Sdyson#define LIMITPIPEKVA (16*1024*1024)
15017163Sdyson
15117163Sdyson/*
15217163Sdyson * Limit the number of "big" pipes
15317163Sdyson */
15417163Sdyson#define LIMITBIGPIPES	32
15533181Seivindstatic int nbigpipe;
15617163Sdyson
15717124Sbdestatic int amountpipekva;
15813907Sdyson
15991372Salfredstatic void pipeinit __P((void *dummy __unused));
16013675Sdysonstatic void pipeclose __P((struct pipe *cpipe));
16176364Salfredstatic void pipe_free_kmem __P((struct pipe *cpipe));
16276364Salfredstatic int pipe_create __P((struct pipe **cpipep));
16313907Sdysonstatic __inline int pipelock __P((struct pipe *cpipe, int catch));
16413675Sdysonstatic __inline void pipeunlock __P((struct pipe *cpipe));
16514122Speterstatic __inline void pipeselwakeup __P((struct pipe *cpipe));
16614037Sdyson#ifndef PIPE_NODIRECT
16713907Sdysonstatic int pipe_build_write_buffer __P((struct pipe *wpipe, struct uio *uio));
16813907Sdysonstatic void pipe_destroy_write_buffer __P((struct pipe *wpipe));
16913907Sdysonstatic int pipe_direct_write __P((struct pipe *wpipe, struct uio *uio));
17013907Sdysonstatic void pipe_clone_write_buffer __P((struct pipe *wpipe));
17114037Sdyson#endif
17276364Salfredstatic int pipespace __P((struct pipe *cpipe, int size));
17313675Sdyson
17433181Seivindstatic vm_zone_t pipe_zone;
17527899Sdyson
17691372SalfredSYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);
17791372Salfred
17891372Salfredstatic void
17991372Salfredpipeinit(void *dummy __unused)
18091372Salfred{
18191372Salfred
18291372Salfred	pipe_zone = zinit("PIPE", sizeof(struct pipe), 0, 0, 4);
18391372Salfred}
18491372Salfred
18513675Sdyson/*
18613675Sdyson * The pipe system call for the DTYPE_PIPE type of pipes
18713675Sdyson */
18813675Sdyson
18913675Sdyson/* ARGSUSED */
19013675Sdysonint
19183366Sjulianpipe(td, uap)
19283366Sjulian	struct thread *td;
19313675Sdyson	struct pipe_args /* {
19413675Sdyson		int	dummy;
19513675Sdyson	} */ *uap;
19613675Sdyson{
19783366Sjulian	struct filedesc *fdp = td->td_proc->p_fd;
19813675Sdyson	struct file *rf, *wf;
19913675Sdyson	struct pipe *rpipe, *wpipe;
20013675Sdyson	int fd, error;
20191362Salfred
20291372Salfred	KASSERT(pipe_zone != NULL, ("pipe_zone not initialized"));
20327899Sdyson
20476756Salfred	rpipe = wpipe = NULL;
20576364Salfred	if (pipe_create(&rpipe) || pipe_create(&wpipe)) {
20676364Salfred		pipeclose(rpipe);
20776364Salfred		pipeclose(wpipe);
20876364Salfred		return (ENFILE);
20976364Salfred	}
21076364Salfred
21113907Sdyson	rpipe->pipe_state |= PIPE_DIRECTOK;
21213907Sdyson	wpipe->pipe_state |= PIPE_DIRECTOK;
21313675Sdyson
21483366Sjulian	error = falloc(td, &rf, &fd);
21570915Sdwmalone	if (error) {
21670915Sdwmalone		pipeclose(rpipe);
21770915Sdwmalone		pipeclose(wpipe);
21870915Sdwmalone		return (error);
21970915Sdwmalone	}
22070915Sdwmalone	fhold(rf);
22183366Sjulian	td->td_retval[0] = fd;
22270915Sdwmalone
22370803Sdwmalone	/*
22470803Sdwmalone	 * Warning: once we've gotten past allocation of the fd for the
22570803Sdwmalone	 * read-side, we can only drop the read side via fdrop() in order
22670803Sdwmalone	 * to avoid races against processes which manage to dup() the read
22770803Sdwmalone	 * side while we are blocked trying to allocate the write side.
22870803Sdwmalone	 */
22989306Salfred	FILE_LOCK(rf);
23013675Sdyson	rf->f_flag = FREAD | FWRITE;
23113675Sdyson	rf->f_type = DTYPE_PIPE;
23249413Sgreen	rf->f_data = (caddr_t)rpipe;
23313675Sdyson	rf->f_ops = &pipeops;
23489306Salfred	FILE_UNLOCK(rf);
23583366Sjulian	error = falloc(td, &wf, &fd);
23670915Sdwmalone	if (error) {
23789306Salfred		FILEDESC_LOCK(fdp);
23883366Sjulian		if (fdp->fd_ofiles[td->td_retval[0]] == rf) {
23983366Sjulian			fdp->fd_ofiles[td->td_retval[0]] = NULL;
24089306Salfred			FILEDESC_UNLOCK(fdp);
24183366Sjulian			fdrop(rf, td);
24289306Salfred		} else
24389306Salfred			FILEDESC_UNLOCK(fdp);
24483366Sjulian		fdrop(rf, td);
24570915Sdwmalone		/* rpipe has been closed by fdrop(). */
24670915Sdwmalone		pipeclose(wpipe);
24770915Sdwmalone		return (error);
24870915Sdwmalone	}
24989306Salfred	FILE_LOCK(wf);
25013675Sdyson	wf->f_flag = FREAD | FWRITE;
25113675Sdyson	wf->f_type = DTYPE_PIPE;
25249413Sgreen	wf->f_data = (caddr_t)wpipe;
25313675Sdyson	wf->f_ops = &pipeops;
25489306Salfred	FILE_UNLOCK(wf);
25583366Sjulian	td->td_retval[1] = fd;
25613675Sdyson	rpipe->pipe_peer = wpipe;
25713675Sdyson	wpipe->pipe_peer = rpipe;
25891362Salfred	rpipe->pipe_mtxp = wpipe->pipe_mtxp = mtx_pool_alloc();
25983366Sjulian	fdrop(rf, td);
26013675Sdyson
26113675Sdyson	return (0);
26213675Sdyson}
26313675Sdyson
26413909Sdyson/*
26513909Sdyson * Allocate kva for pipe circular buffer, the space is pageable
26676364Salfred * This routine will 'realloc' the size of a pipe safely, if it fails
26776364Salfred * it will retain the old buffer.
26876364Salfred * If it fails it will return ENOMEM.
26913909Sdyson */
27076364Salfredstatic int
27176364Salfredpipespace(cpipe, size)
27213675Sdyson	struct pipe *cpipe;
27376364Salfred	int size;
27413675Sdyson{
27576364Salfred	struct vm_object *object;
27676364Salfred	caddr_t buffer;
27713688Sdyson	int npages, error;
27813675Sdyson
27979224Sdillon	GIANT_REQUIRED;
28091412Salfred	KASSERT(cpipe->pipe_mtxp == NULL || !mtx_owned(PIPE_MTX(cpipe)),
28191412Salfred	       ("pipespace: pipe mutex locked"));
28279224Sdillon
28376364Salfred	npages = round_page(size)/PAGE_SIZE;
28413675Sdyson	/*
28513675Sdyson	 * Create an object, I don't like the idea of paging to/from
28613675Sdyson	 * kernel_object.
28714037Sdyson	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
28813675Sdyson	 */
28976364Salfred	object = vm_object_allocate(OBJT_DEFAULT, npages);
29076364Salfred	buffer = (caddr_t) vm_map_min(kernel_map);
29113675Sdyson
29213675Sdyson	/*
29313675Sdyson	 * Insert the object into the kernel map, and allocate kva for it.
29413675Sdyson	 * The map entry is, by default, pageable.
29514037Sdyson	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
29613675Sdyson	 */
29776364Salfred	error = vm_map_find(kernel_map, object, 0,
29876364Salfred		(vm_offset_t *) &buffer, size, 1,
29913688Sdyson		VM_PROT_ALL, VM_PROT_ALL, 0);
30013675Sdyson
30176364Salfred	if (error != KERN_SUCCESS) {
30276364Salfred		vm_object_deallocate(object);
30376364Salfred		return (ENOMEM);
30476364Salfred	}
30576364Salfred
30676364Salfred	/* free old resources if we're resizing */
30776364Salfred	pipe_free_kmem(cpipe);
30876364Salfred	cpipe->pipe_buffer.object = object;
30976364Salfred	cpipe->pipe_buffer.buffer = buffer;
31076364Salfred	cpipe->pipe_buffer.size = size;
31176364Salfred	cpipe->pipe_buffer.in = 0;
31276364Salfred	cpipe->pipe_buffer.out = 0;
31376364Salfred	cpipe->pipe_buffer.cnt = 0;
31413907Sdyson	amountpipekva += cpipe->pipe_buffer.size;
31576364Salfred	return (0);
31613907Sdyson}
31713688Sdyson
31813907Sdyson/*
31913907Sdyson * initialize and allocate VM and memory for pipe
32013907Sdyson */
32176364Salfredstatic int
32276364Salfredpipe_create(cpipep)
32376364Salfred	struct pipe **cpipep;
32476364Salfred{
32513907Sdyson	struct pipe *cpipe;
32676364Salfred	int error;
32713907Sdyson
32876364Salfred	*cpipep = zalloc(pipe_zone);
32976364Salfred	if (*cpipep == NULL)
33076364Salfred		return (ENOMEM);
33117163Sdyson
33276364Salfred	cpipe = *cpipep;
33376364Salfred
33476364Salfred	/* so pipespace()->pipe_free_kmem() doesn't follow junk pointer */
33576364Salfred	cpipe->pipe_buffer.object = NULL;
33676364Salfred#ifndef PIPE_NODIRECT
33776364Salfred	cpipe->pipe_map.kva = NULL;
33876364Salfred#endif
33976364Salfred	/*
34076364Salfred	 * protect so pipeclose() doesn't follow a junk pointer
34176364Salfred	 * if pipespace() fails.
34276364Salfred	 */
34376754Salfred	bzero(&cpipe->pipe_sel, sizeof(cpipe->pipe_sel));
34413675Sdyson	cpipe->pipe_state = 0;
34513675Sdyson	cpipe->pipe_peer = NULL;
34613675Sdyson	cpipe->pipe_busy = 0;
34713907Sdyson
34814037Sdyson#ifndef PIPE_NODIRECT
34913907Sdyson	/*
35013907Sdyson	 * pipe data structure initializations to support direct pipe I/O
35113907Sdyson	 */
35213907Sdyson	cpipe->pipe_map.cnt = 0;
35313907Sdyson	cpipe->pipe_map.kva = 0;
35413907Sdyson	cpipe->pipe_map.pos = 0;
35513907Sdyson	cpipe->pipe_map.npages = 0;
35617124Sbde	/* cpipe->pipe_map.ms[] = invalid */
35714037Sdyson#endif
35876364Salfred
35991412Salfred	cpipe->pipe_mtxp = NULL;	/* avoid pipespace assertion */
36076364Salfred	error = pipespace(cpipe, PIPE_SIZE);
36176760Salfred	if (error)
36276364Salfred		return (error);
36376364Salfred
36476364Salfred	vfs_timestamp(&cpipe->pipe_ctime);
36576364Salfred	cpipe->pipe_atime = cpipe->pipe_ctime;
36676364Salfred	cpipe->pipe_mtime = cpipe->pipe_ctime;
36776364Salfred
36876364Salfred	return (0);
36913675Sdyson}
37013675Sdyson
37113675Sdyson
37213675Sdyson/*
37313675Sdyson * lock a pipe for I/O, blocking other access
37413675Sdyson */
37513675Sdysonstatic __inline int
37613907Sdysonpipelock(cpipe, catch)
37713675Sdyson	struct pipe *cpipe;
37813907Sdyson	int catch;
37913675Sdyson{
38013776Sdyson	int error;
38176364Salfred
38291362Salfred	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
38391362Salfred	while (cpipe->pipe_state & PIPE_LOCKFL) {
38413675Sdyson		cpipe->pipe_state |= PIPE_LWANT;
38591362Salfred		error = msleep(cpipe, PIPE_MTX(cpipe),
38691362Salfred		    catch ? (PRIBIO | PCATCH) : PRIBIO,
38776760Salfred		    "pipelk", 0);
38876760Salfred		if (error != 0)
38976760Salfred			return (error);
39013675Sdyson	}
39191362Salfred	cpipe->pipe_state |= PIPE_LOCKFL;
39276760Salfred	return (0);
39313675Sdyson}
39413675Sdyson
39513675Sdyson/*
39613675Sdyson * unlock a pipe I/O lock
39713675Sdyson */
39813675Sdysonstatic __inline void
39913675Sdysonpipeunlock(cpipe)
40013675Sdyson	struct pipe *cpipe;
40113675Sdyson{
40276364Salfred
40391362Salfred	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
40491362Salfred	cpipe->pipe_state &= ~PIPE_LOCKFL;
40513675Sdyson	if (cpipe->pipe_state & PIPE_LWANT) {
40613675Sdyson		cpipe->pipe_state &= ~PIPE_LWANT;
40714177Sdyson		wakeup(cpipe);
40813675Sdyson	}
40913675Sdyson}
41013675Sdyson
41114037Sdysonstatic __inline void
41214037Sdysonpipeselwakeup(cpipe)
41314037Sdyson	struct pipe *cpipe;
41414037Sdyson{
41576364Salfred
41614037Sdyson	if (cpipe->pipe_state & PIPE_SEL) {
41714037Sdyson		cpipe->pipe_state &= ~PIPE_SEL;
41814037Sdyson		selwakeup(&cpipe->pipe_sel);
41914037Sdyson	}
42041086Struckman	if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
42141086Struckman		pgsigio(cpipe->pipe_sigio, SIGIO, 0);
42259288Sjlemon	KNOTE(&cpipe->pipe_sel.si_note, 0);
42314037Sdyson}
42414037Sdyson
42513675Sdyson/* ARGSUSED */
42613675Sdysonstatic int
42783366Sjulianpipe_read(fp, uio, cred, flags, td)
42813675Sdyson	struct file *fp;
42913675Sdyson	struct uio *uio;
43013675Sdyson	struct ucred *cred;
43183366Sjulian	struct thread *td;
43245311Sdt	int flags;
43313675Sdyson{
43413675Sdyson	struct pipe *rpipe = (struct pipe *) fp->f_data;
43547748Salc	int error;
43613675Sdyson	int nread = 0;
43718863Sdyson	u_int size;
43813675Sdyson
43991362Salfred	PIPE_LOCK(rpipe);
44013675Sdyson	++rpipe->pipe_busy;
44147748Salc	error = pipelock(rpipe, 1);
44247748Salc	if (error)
44347748Salc		goto unlocked_error;
44447748Salc
44513675Sdyson	while (uio->uio_resid) {
44613907Sdyson		/*
44713907Sdyson		 * normal pipe buffer receive
44813907Sdyson		 */
44913675Sdyson		if (rpipe->pipe_buffer.cnt > 0) {
45018863Sdyson			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
45113675Sdyson			if (size > rpipe->pipe_buffer.cnt)
45213675Sdyson				size = rpipe->pipe_buffer.cnt;
45318863Sdyson			if (size > (u_int) uio->uio_resid)
45418863Sdyson				size = (u_int) uio->uio_resid;
45547748Salc
45691362Salfred			PIPE_UNLOCK(rpipe);
45747748Salc			error = uiomove(&rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
45813675Sdyson					size, uio);
45991362Salfred			PIPE_LOCK(rpipe);
46076760Salfred			if (error)
46113675Sdyson				break;
46276760Salfred
46313675Sdyson			rpipe->pipe_buffer.out += size;
46413675Sdyson			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
46513675Sdyson				rpipe->pipe_buffer.out = 0;
46613675Sdyson
46713675Sdyson			rpipe->pipe_buffer.cnt -= size;
46847748Salc
46947748Salc			/*
47047748Salc			 * If there is no more to read in the pipe, reset
47147748Salc			 * its pointers to the beginning.  This improves
47247748Salc			 * cache hit stats.
47347748Salc			 */
47447748Salc			if (rpipe->pipe_buffer.cnt == 0) {
47547748Salc				rpipe->pipe_buffer.in = 0;
47647748Salc				rpipe->pipe_buffer.out = 0;
47747748Salc			}
47813675Sdyson			nread += size;
47914037Sdyson#ifndef PIPE_NODIRECT
48013907Sdyson		/*
48113907Sdyson		 * Direct copy, bypassing a kernel buffer.
48213907Sdyson		 */
48313907Sdyson		} else if ((size = rpipe->pipe_map.cnt) &&
48447748Salc			   (rpipe->pipe_state & PIPE_DIRECTW)) {
48547748Salc			caddr_t	va;
48618863Sdyson			if (size > (u_int) uio->uio_resid)
48718863Sdyson				size = (u_int) uio->uio_resid;
48847748Salc
48976760Salfred			va = (caddr_t) rpipe->pipe_map.kva +
49076760Salfred			    rpipe->pipe_map.pos;
49191362Salfred			PIPE_UNLOCK(rpipe);
49247748Salc			error = uiomove(va, size, uio);
49391362Salfred			PIPE_LOCK(rpipe);
49413907Sdyson			if (error)
49513907Sdyson				break;
49613907Sdyson			nread += size;
49713907Sdyson			rpipe->pipe_map.pos += size;
49813907Sdyson			rpipe->pipe_map.cnt -= size;
49913907Sdyson			if (rpipe->pipe_map.cnt == 0) {
50013907Sdyson				rpipe->pipe_state &= ~PIPE_DIRECTW;
50113907Sdyson				wakeup(rpipe);
50213907Sdyson			}
50314037Sdyson#endif
50413675Sdyson		} else {
50513675Sdyson			/*
50613675Sdyson			 * detect EOF condition
50776760Salfred			 * read returns 0 on EOF, no need to set error
50813675Sdyson			 */
50976760Salfred			if (rpipe->pipe_state & PIPE_EOF)
51013675Sdyson				break;
51143623Sdillon
51213675Sdyson			/*
51313675Sdyson			 * If the "write-side" has been blocked, wake it up now.
51413675Sdyson			 */
51513675Sdyson			if (rpipe->pipe_state & PIPE_WANTW) {
51613675Sdyson				rpipe->pipe_state &= ~PIPE_WANTW;
51713675Sdyson				wakeup(rpipe);
51813675Sdyson			}
51943623Sdillon
52043623Sdillon			/*
52147748Salc			 * Break if some data was read.
52243623Sdillon			 */
52347748Salc			if (nread > 0)
52413675Sdyson				break;
52516960Sdyson
52643623Sdillon			/*
52747748Salc			 * Unlock the pipe buffer for our remaining processing.  We
52847748Salc			 * will either break out with an error or we will sleep and
52947748Salc			 * relock to loop.
53043623Sdillon			 */
53147748Salc			pipeunlock(rpipe);
53243623Sdillon
53313675Sdyson			/*
53447748Salc			 * Handle non-blocking mode operation or
53547748Salc			 * wait for more data.
53613675Sdyson			 */
53776760Salfred			if (fp->f_flag & FNONBLOCK) {
53847748Salc				error = EAGAIN;
53976760Salfred			} else {
54047748Salc				rpipe->pipe_state |= PIPE_WANTR;
54191362Salfred				if ((error = msleep(rpipe, PIPE_MTX(rpipe),
54291362Salfred				    PRIBIO | PCATCH,
54377140Salfred				    "piperd", 0)) == 0)
54447748Salc					error = pipelock(rpipe, 1);
54513675Sdyson			}
54647748Salc			if (error)
54747748Salc				goto unlocked_error;
54813675Sdyson		}
54913675Sdyson	}
55047748Salc	pipeunlock(rpipe);
55113675Sdyson
55291362Salfred	/* XXX: should probably do this before getting any locks. */
55324101Sbde	if (error == 0)
55455112Sbde		vfs_timestamp(&rpipe->pipe_atime);
55547748Salcunlocked_error:
55647748Salc	--rpipe->pipe_busy;
55713913Sdyson
55847748Salc	/*
55947748Salc	 * PIPE_WANT processing only makes sense if pipe_busy is 0.
56047748Salc	 */
56113675Sdyson	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
56213675Sdyson		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
56313675Sdyson		wakeup(rpipe);
56413675Sdyson	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
56513675Sdyson		/*
56647748Salc		 * Handle write blocking hysteresis.
56713675Sdyson		 */
56813675Sdyson		if (rpipe->pipe_state & PIPE_WANTW) {
56913675Sdyson			rpipe->pipe_state &= ~PIPE_WANTW;
57013675Sdyson			wakeup(rpipe);
57113675Sdyson		}
57213675Sdyson	}
57314037Sdyson
57414802Sdyson	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
57514037Sdyson		pipeselwakeup(rpipe);
57614037Sdyson
57791362Salfred	PIPE_UNLOCK(rpipe);
57876760Salfred	return (error);
57913675Sdyson}
58013675Sdyson
58114037Sdyson#ifndef PIPE_NODIRECT
58213907Sdyson/*
58313907Sdyson * Map the sending processes' buffer into kernel space and wire it.
58413907Sdyson * This is similar to a physical write operation.
58513907Sdyson */
58613675Sdysonstatic int
58713907Sdysonpipe_build_write_buffer(wpipe, uio)
58813907Sdyson	struct pipe *wpipe;
58913675Sdyson	struct uio *uio;
59013675Sdyson{
59118863Sdyson	u_int size;
59213907Sdyson	int i;
59313907Sdyson	vm_offset_t addr, endaddr, paddr;
59413907Sdyson
59579224Sdillon	GIANT_REQUIRED;
59691412Salfred	PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);
59779224Sdillon
59818863Sdyson	size = (u_int) uio->uio_iov->iov_len;
59913907Sdyson	if (size > wpipe->pipe_buffer.size)
60013907Sdyson		size = wpipe->pipe_buffer.size;
60113907Sdyson
60240286Sdg	endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
60376760Salfred	addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
60476760Salfred	for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) {
60513907Sdyson		vm_page_t m;
60613907Sdyson
60751474Sdillon		if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0 ||
60851474Sdillon		    (paddr = pmap_kextract(addr)) == 0) {
60913907Sdyson			int j;
61076760Salfred
61176760Salfred			for (j = 0; j < i; j++)
61240700Sdg				vm_page_unwire(wpipe->pipe_map.ms[j], 1);
61376760Salfred			return (EFAULT);
61413907Sdyson		}
61513907Sdyson
61613907Sdyson		m = PHYS_TO_VM_PAGE(paddr);
61713907Sdyson		vm_page_wire(m);
61813907Sdyson		wpipe->pipe_map.ms[i] = m;
61913907Sdyson	}
62013907Sdyson
62113907Sdyson/*
62213907Sdyson * set up the control block
62313907Sdyson */
62413907Sdyson	wpipe->pipe_map.npages = i;
62576760Salfred	wpipe->pipe_map.pos =
62676760Salfred	    ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
62713907Sdyson	wpipe->pipe_map.cnt = size;
62813907Sdyson
62913907Sdyson/*
63013907Sdyson * and map the buffer
63113907Sdyson */
63213907Sdyson	if (wpipe->pipe_map.kva == 0) {
63313912Sdyson		/*
63413912Sdyson		 * We need to allocate space for an extra page because the
63513912Sdyson		 * address range might (will) span pages at times.
63613912Sdyson		 */
63713907Sdyson		wpipe->pipe_map.kva = kmem_alloc_pageable(kernel_map,
63813912Sdyson			wpipe->pipe_buffer.size + PAGE_SIZE);
63913912Sdyson		amountpipekva += wpipe->pipe_buffer.size + PAGE_SIZE;
64013907Sdyson	}
64113907Sdyson	pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms,
64213907Sdyson		wpipe->pipe_map.npages);
64313907Sdyson
64413907Sdyson/*
64513907Sdyson * and update the uio data
64613907Sdyson */
64713907Sdyson
64813907Sdyson	uio->uio_iov->iov_len -= size;
64913907Sdyson	uio->uio_iov->iov_base += size;
65013907Sdyson	if (uio->uio_iov->iov_len == 0)
65113907Sdyson		uio->uio_iov++;
65213907Sdyson	uio->uio_resid -= size;
65313907Sdyson	uio->uio_offset += size;
65476760Salfred	return (0);
65513907Sdyson}
65613907Sdyson
65713907Sdyson/*
65813907Sdyson * unmap and unwire the process buffer
65913907Sdyson */
66013907Sdysonstatic void
66113907Sdysonpipe_destroy_write_buffer(wpipe)
66276760Salfred	struct pipe *wpipe;
66313907Sdyson{
66413907Sdyson	int i;
66576364Salfred
66679224Sdillon	GIANT_REQUIRED;
66791412Salfred	PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);
66879224Sdillon
66917163Sdyson	if (wpipe->pipe_map.kva) {
67017163Sdyson		pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages);
67113907Sdyson
67213907Sdyson		if (amountpipekva > MAXPIPEKVA) {
67313907Sdyson			vm_offset_t kva = wpipe->pipe_map.kva;
67413907Sdyson			wpipe->pipe_map.kva = 0;
67513907Sdyson			kmem_free(kernel_map, kva,
67613912Sdyson				wpipe->pipe_buffer.size + PAGE_SIZE);
67713912Sdyson			amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE;
67813907Sdyson		}
67913907Sdyson	}
68076760Salfred	for (i = 0; i < wpipe->pipe_map.npages; i++)
68140700Sdg		vm_page_unwire(wpipe->pipe_map.ms[i], 1);
68213907Sdyson}
68313907Sdyson
68413907Sdyson/*
68513907Sdyson * In the case of a signal, the writing process might go away.  This
68613907Sdyson * code copies the data into the circular buffer so that the source
68713907Sdyson * pages can be freed without loss of data.
68813907Sdyson */
68913907Sdysonstatic void
69013907Sdysonpipe_clone_write_buffer(wpipe)
69176364Salfred	struct pipe *wpipe;
69213907Sdyson{
69313907Sdyson	int size;
69413907Sdyson	int pos;
69513907Sdyson
69691362Salfred	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
69713907Sdyson	size = wpipe->pipe_map.cnt;
69813907Sdyson	pos = wpipe->pipe_map.pos;
69976760Salfred	bcopy((caddr_t) wpipe->pipe_map.kva + pos,
70076760Salfred	    (caddr_t) wpipe->pipe_buffer.buffer, size);
70113907Sdyson
70213907Sdyson	wpipe->pipe_buffer.in = size;
70313907Sdyson	wpipe->pipe_buffer.out = 0;
70413907Sdyson	wpipe->pipe_buffer.cnt = size;
70513907Sdyson	wpipe->pipe_state &= ~PIPE_DIRECTW;
70613907Sdyson
70791412Salfred	PIPE_GET_GIANT(wpipe);
70813907Sdyson	pipe_destroy_write_buffer(wpipe);
70991412Salfred	PIPE_DROP_GIANT(wpipe);
71013907Sdyson}
71113907Sdyson
71213907Sdyson/*
71313907Sdyson * This implements the pipe buffer write mechanism.  Note that only
71413907Sdyson * a direct write OR a normal pipe write can be pending at any given time.
71513907Sdyson * If there are any characters in the pipe buffer, the direct write will
71613907Sdyson * be deferred until the receiving process grabs all of the bytes from
71713907Sdyson * the pipe buffer.  Then the direct mapping write is set-up.
71813907Sdyson */
71913907Sdysonstatic int
72013907Sdysonpipe_direct_write(wpipe, uio)
72113907Sdyson	struct pipe *wpipe;
72213907Sdyson	struct uio *uio;
72313907Sdyson{
72413907Sdyson	int error;
72576364Salfred
72613951Sdysonretry:
72791362Salfred	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
72813907Sdyson	while (wpipe->pipe_state & PIPE_DIRECTW) {
72976760Salfred		if (wpipe->pipe_state & PIPE_WANTR) {
73013951Sdyson			wpipe->pipe_state &= ~PIPE_WANTR;
73113951Sdyson			wakeup(wpipe);
73213951Sdyson		}
73313992Sdyson		wpipe->pipe_state |= PIPE_WANTW;
73491362Salfred		error = msleep(wpipe, PIPE_MTX(wpipe),
73591362Salfred		    PRIBIO | PCATCH, "pipdww", 0);
73614802Sdyson		if (error)
73713907Sdyson			goto error1;
73814802Sdyson		if (wpipe->pipe_state & PIPE_EOF) {
73914802Sdyson			error = EPIPE;
74014802Sdyson			goto error1;
74114802Sdyson		}
74213907Sdyson	}
74313907Sdyson	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
74413951Sdyson	if (wpipe->pipe_buffer.cnt > 0) {
74576760Salfred		if (wpipe->pipe_state & PIPE_WANTR) {
74613951Sdyson			wpipe->pipe_state &= ~PIPE_WANTR;
74713951Sdyson			wakeup(wpipe);
74813951Sdyson		}
74913951Sdyson
75013992Sdyson		wpipe->pipe_state |= PIPE_WANTW;
75191362Salfred		error = msleep(wpipe, PIPE_MTX(wpipe),
75291362Salfred		    PRIBIO | PCATCH, "pipdwc", 0);
75314802Sdyson		if (error)
75413907Sdyson			goto error1;
75514802Sdyson		if (wpipe->pipe_state & PIPE_EOF) {
75614802Sdyson			error = EPIPE;
75714802Sdyson			goto error1;
75813907Sdyson		}
75913951Sdyson		goto retry;
76013907Sdyson	}
76113907Sdyson
76213951Sdyson	wpipe->pipe_state |= PIPE_DIRECTW;
76313951Sdyson
76491362Salfred	PIPE_GET_GIANT(wpipe);
76513907Sdyson	error = pipe_build_write_buffer(wpipe, uio);
76691362Salfred	PIPE_DROP_GIANT(wpipe);
76713907Sdyson	if (error) {
76813907Sdyson		wpipe->pipe_state &= ~PIPE_DIRECTW;
76913907Sdyson		goto error1;
77013907Sdyson	}
77113907Sdyson
77213907Sdyson	error = 0;
77313907Sdyson	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
77413907Sdyson		if (wpipe->pipe_state & PIPE_EOF) {
77513907Sdyson			pipelock(wpipe, 0);
77691362Salfred			PIPE_GET_GIANT(wpipe);
77713907Sdyson			pipe_destroy_write_buffer(wpipe);
77891362Salfred			PIPE_DROP_GIANT(wpipe);
77913907Sdyson			pipeunlock(wpipe);
78014037Sdyson			pipeselwakeup(wpipe);
78114802Sdyson			error = EPIPE;
78214802Sdyson			goto error1;
78313907Sdyson		}
78413992Sdyson		if (wpipe->pipe_state & PIPE_WANTR) {
78513992Sdyson			wpipe->pipe_state &= ~PIPE_WANTR;
78613992Sdyson			wakeup(wpipe);
78713992Sdyson		}
78814037Sdyson		pipeselwakeup(wpipe);
78991362Salfred		error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH,
79091362Salfred		    "pipdwt", 0);
79113907Sdyson	}
79213907Sdyson
79313907Sdyson	pipelock(wpipe,0);
79413907Sdyson	if (wpipe->pipe_state & PIPE_DIRECTW) {
79513907Sdyson		/*
79613907Sdyson		 * this bit of trickery substitutes a kernel buffer for
79713907Sdyson		 * the process that might be going away.
79813907Sdyson		 */
79913907Sdyson		pipe_clone_write_buffer(wpipe);
80013907Sdyson	} else {
80191412Salfred		PIPE_GET_GIANT(wpipe);
80213907Sdyson		pipe_destroy_write_buffer(wpipe);
80391412Salfred		PIPE_DROP_GIANT(wpipe);
80413907Sdyson	}
80513907Sdyson	pipeunlock(wpipe);
80676760Salfred	return (error);
80713907Sdyson
80813907Sdysonerror1:
80913907Sdyson	wakeup(wpipe);
81076760Salfred	return (error);
81113907Sdyson}
81214037Sdyson#endif
81313907Sdyson
81416960Sdysonstatic int
81583366Sjulianpipe_write(fp, uio, cred, flags, td)
81616960Sdyson	struct file *fp;
81713907Sdyson	struct uio *uio;
81816960Sdyson	struct ucred *cred;
81983366Sjulian	struct thread *td;
82045311Sdt	int flags;
82113907Sdyson{
82213675Sdyson	int error = 0;
82313913Sdyson	int orig_resid;
82416960Sdyson	struct pipe *wpipe, *rpipe;
82516960Sdyson
82616960Sdyson	rpipe = (struct pipe *) fp->f_data;
82716960Sdyson	wpipe = rpipe->pipe_peer;
82816960Sdyson
82991395Salfred	PIPE_LOCK(rpipe);
83013675Sdyson	/*
83113675Sdyson	 * detect loss of pipe read side, issue SIGPIPE if lost.
83213675Sdyson	 */
83316960Sdyson	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
83491395Salfred		PIPE_UNLOCK(rpipe);
83576760Salfred		return (EPIPE);
83613675Sdyson	}
83777676Sdillon	++wpipe->pipe_busy;
83813675Sdyson
83917163Sdyson	/*
84017163Sdyson	 * If it is advantageous to resize the pipe buffer, do
84117163Sdyson	 * so.
84217163Sdyson	 */
84317163Sdyson	if ((uio->uio_resid > PIPE_SIZE) &&
84417163Sdyson		(nbigpipe < LIMITBIGPIPES) &&
84517163Sdyson		(wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
84617163Sdyson		(wpipe->pipe_buffer.size <= PIPE_SIZE) &&
84717163Sdyson		(wpipe->pipe_buffer.cnt == 0)) {
84817163Sdyson
84913907Sdyson		if ((error = pipelock(wpipe,1)) == 0) {
85091395Salfred			PIPE_GET_GIANT(rpipe);
85176364Salfred			if (pipespace(wpipe, BIG_PIPE_SIZE) == 0)
85276364Salfred				nbigpipe++;
85391395Salfred			PIPE_DROP_GIANT(rpipe);
85413907Sdyson			pipeunlock(wpipe);
85513907Sdyson		}
85613907Sdyson	}
85777676Sdillon
85877676Sdillon	/*
85977676Sdillon	 * If an early error occured unbusy and return, waking up any pending
86077676Sdillon	 * readers.
86177676Sdillon	 */
86277676Sdillon	if (error) {
86377676Sdillon		--wpipe->pipe_busy;
86477676Sdillon		if ((wpipe->pipe_busy == 0) &&
86577676Sdillon		    (wpipe->pipe_state & PIPE_WANT)) {
86677676Sdillon			wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
86777676Sdillon			wakeup(wpipe);
86877676Sdillon		}
86991395Salfred		PIPE_UNLOCK(rpipe);
87077676Sdillon		return(error);
87177676Sdillon	}
87276364Salfred
87376364Salfred	KASSERT(wpipe->pipe_buffer.buffer != NULL, ("pipe buffer gone"));
87413907Sdyson
87513913Sdyson	orig_resid = uio->uio_resid;
87677676Sdillon
87713675Sdyson	while (uio->uio_resid) {
87813907Sdyson		int space;
87976760Salfred
88014037Sdyson#ifndef PIPE_NODIRECT
88113907Sdyson		/*
88213907Sdyson		 * If the transfer is large, we can gain performance if
88313907Sdyson		 * we do process-to-process copies directly.
88416416Sdyson		 * If the write is non-blocking, we don't use the
88516416Sdyson		 * direct write mechanism.
88658505Sdillon		 *
88758505Sdillon		 * The direct write mechanism will detect the reader going
88858505Sdillon		 * away on us.
88913907Sdyson		 */
89017163Sdyson		if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
89117163Sdyson		    (fp->f_flag & FNONBLOCK) == 0 &&
89217163Sdyson			(wpipe->pipe_map.kva || (amountpipekva < LIMITPIPEKVA)) &&
89313907Sdyson			(uio->uio_iov->iov_len >= PIPE_MINDIRECT)) {
89413907Sdyson			error = pipe_direct_write( wpipe, uio);
89576760Salfred			if (error)
89613907Sdyson				break;
89713907Sdyson			continue;
89891362Salfred		}
89914037Sdyson#endif
90013907Sdyson
90113907Sdyson		/*
90213907Sdyson		 * Pipe buffered writes cannot be coincidental with
90313907Sdyson		 * direct writes.  We wait until the currently executing
90413907Sdyson		 * direct write is completed before we start filling the
90558505Sdillon		 * pipe buffer.  We break out if a signal occurs or the
90658505Sdillon		 * reader goes away.
90713907Sdyson		 */
90813907Sdyson	retrywrite:
90913907Sdyson		while (wpipe->pipe_state & PIPE_DIRECTW) {
91013992Sdyson			if (wpipe->pipe_state & PIPE_WANTR) {
91113992Sdyson				wpipe->pipe_state &= ~PIPE_WANTR;
91213992Sdyson				wakeup(wpipe);
91313992Sdyson			}
91491395Salfred			error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH,
91591362Salfred			    "pipbww", 0);
91658505Sdillon			if (wpipe->pipe_state & PIPE_EOF)
91758505Sdillon				break;
91813907Sdyson			if (error)
91913907Sdyson				break;
92013907Sdyson		}
92158505Sdillon		if (wpipe->pipe_state & PIPE_EOF) {
92258505Sdillon			error = EPIPE;
92358505Sdillon			break;
92458505Sdillon		}
92513907Sdyson
92613907Sdyson		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
92714644Sdyson
92814644Sdyson		/* Writes of size <= PIPE_BUF must be atomic. */
92913913Sdyson		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
93013913Sdyson			space = 0;
93113907Sdyson
93217163Sdyson		if (space > 0 && (wpipe->pipe_buffer.cnt < PIPE_SIZE)) {
93313907Sdyson			if ((error = pipelock(wpipe,1)) == 0) {
93454534Stegge				int size;	/* Transfer size */
93554534Stegge				int segsize;	/* first segment to transfer */
93676760Salfred
93713907Sdyson				/*
93813907Sdyson				 * It is possible for a direct write to
93913907Sdyson				 * slip in on us... handle it here...
94013907Sdyson				 */
94113907Sdyson				if (wpipe->pipe_state & PIPE_DIRECTW) {
94213907Sdyson					pipeunlock(wpipe);
94313907Sdyson					goto retrywrite;
94413907Sdyson				}
94554534Stegge				/*
94654534Stegge				 * If a process blocked in uiomove, our
94754534Stegge				 * value for space might be bad.
94858505Sdillon				 *
94958505Sdillon				 * XXX will we be ok if the reader has gone
95058505Sdillon				 * away here?
95154534Stegge				 */
95254534Stegge				if (space > wpipe->pipe_buffer.size -
95354534Stegge				    wpipe->pipe_buffer.cnt) {
95454534Stegge					pipeunlock(wpipe);
95554534Stegge					goto retrywrite;
95654534Stegge				}
95754534Stegge
95854534Stegge				/*
95954534Stegge				 * Transfer size is minimum of uio transfer
96054534Stegge				 * and free space in pipe buffer.
96154534Stegge				 */
96254534Stegge				if (space > uio->uio_resid)
96354534Stegge					size = uio->uio_resid;
96454534Stegge				else
96554534Stegge					size = space;
96654534Stegge				/*
96754534Stegge				 * First segment to transfer is minimum of
96854534Stegge				 * transfer size and contiguous space in
96954534Stegge				 * pipe buffer.  If first segment to transfer
97054534Stegge				 * is less than the transfer size, we've got
97154534Stegge				 * a wraparound in the buffer.
97254534Stegge				 */
97354534Stegge				segsize = wpipe->pipe_buffer.size -
97454534Stegge					wpipe->pipe_buffer.in;
97554534Stegge				if (segsize > size)
97654534Stegge					segsize = size;
97754534Stegge
97854534Stegge				/* Transfer first segment */
97954534Stegge
98091395Salfred				PIPE_UNLOCK(rpipe);
98154534Stegge				error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
98254534Stegge						segsize, uio);
98391395Salfred				PIPE_LOCK(rpipe);
98454534Stegge
98554534Stegge				if (error == 0 && segsize < size) {
98654534Stegge					/*
98754534Stegge					 * Transfer remaining part now, to
98854534Stegge					 * support atomic writes.  Wraparound
98954534Stegge					 * happened.
99054534Stegge					 */
99154534Stegge					if (wpipe->pipe_buffer.in + segsize !=
99254534Stegge					    wpipe->pipe_buffer.size)
99354534Stegge						panic("Expected pipe buffer wraparound disappeared");
99454534Stegge
99591395Salfred					PIPE_UNLOCK(rpipe);
99654534Stegge					error = uiomove(&wpipe->pipe_buffer.buffer[0],
99754534Stegge							size - segsize, uio);
99891395Salfred					PIPE_LOCK(rpipe);
99954534Stegge				}
100054534Stegge				if (error == 0) {
100154534Stegge					wpipe->pipe_buffer.in += size;
100254534Stegge					if (wpipe->pipe_buffer.in >=
100354534Stegge					    wpipe->pipe_buffer.size) {
100454534Stegge						if (wpipe->pipe_buffer.in != size - segsize + wpipe->pipe_buffer.size)
100554534Stegge							panic("Expected wraparound bad");
100654534Stegge						wpipe->pipe_buffer.in = size - segsize;
100754534Stegge					}
100854534Stegge
100954534Stegge					wpipe->pipe_buffer.cnt += size;
101054534Stegge					if (wpipe->pipe_buffer.cnt > wpipe->pipe_buffer.size)
101154534Stegge						panic("Pipe buffer overflow");
101254534Stegge
101354534Stegge				}
101413675Sdyson				pipeunlock(wpipe);
101513675Sdyson			}
101613675Sdyson			if (error)
101713675Sdyson				break;
101813675Sdyson
101913675Sdyson		} else {
102013675Sdyson			/*
102113675Sdyson			 * If the "read-side" has been blocked, wake it up now.
102213675Sdyson			 */
102313675Sdyson			if (wpipe->pipe_state & PIPE_WANTR) {
102413675Sdyson				wpipe->pipe_state &= ~PIPE_WANTR;
102513675Sdyson				wakeup(wpipe);
102613675Sdyson			}
102714037Sdyson
102813675Sdyson			/*
102913675Sdyson			 * don't block on non-blocking I/O
103013675Sdyson			 */
103116960Sdyson			if (fp->f_flag & FNONBLOCK) {
103213907Sdyson				error = EAGAIN;
103313675Sdyson				break;
103413675Sdyson			}
103513907Sdyson
103614037Sdyson			/*
103714037Sdyson			 * We have no more space and have something to offer,
103829356Speter			 * wake up select/poll.
103914037Sdyson			 */
104014037Sdyson			pipeselwakeup(wpipe);
104114037Sdyson
104213675Sdyson			wpipe->pipe_state |= PIPE_WANTW;
104391395Salfred			error = msleep(wpipe, PIPE_MTX(rpipe),
104491362Salfred			    PRIBIO | PCATCH, "pipewr", 0);
104576760Salfred			if (error != 0)
104613675Sdyson				break;
104713675Sdyson			/*
104813675Sdyson			 * If read side wants to go away, we just issue a signal
104913675Sdyson			 * to ourselves.
105013675Sdyson			 */
105113675Sdyson			if (wpipe->pipe_state & PIPE_EOF) {
105213774Sdyson				error = EPIPE;
105313907Sdyson				break;
105413675Sdyson			}
105513675Sdyson		}
105613675Sdyson	}
105713675Sdyson
105814644Sdyson	--wpipe->pipe_busy;
105977676Sdillon
106076760Salfred	if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
106176760Salfred		wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
106213675Sdyson		wakeup(wpipe);
106313675Sdyson	} else if (wpipe->pipe_buffer.cnt > 0) {
106413675Sdyson		/*
106513675Sdyson		 * If we have put any characters in the buffer, we wake up
106613675Sdyson		 * the reader.
106713675Sdyson		 */
106813675Sdyson		if (wpipe->pipe_state & PIPE_WANTR) {
106913675Sdyson			wpipe->pipe_state &= ~PIPE_WANTR;
107013675Sdyson			wakeup(wpipe);
107113675Sdyson		}
107213675Sdyson	}
107313909Sdyson
107413909Sdyson	/*
107513909Sdyson	 * Don't return EPIPE if I/O was successful
107613909Sdyson	 */
107713907Sdyson	if ((wpipe->pipe_buffer.cnt == 0) &&
107877676Sdillon	    (uio->uio_resid == 0) &&
107977676Sdillon	    (error == EPIPE)) {
108013907Sdyson		error = 0;
108177676Sdillon	}
108213913Sdyson
108324101Sbde	if (error == 0)
108455112Sbde		vfs_timestamp(&wpipe->pipe_mtime);
108524101Sbde
108614037Sdyson	/*
108714037Sdyson	 * We have something to offer,
108829356Speter	 * wake up select/poll.
108914037Sdyson	 */
109014177Sdyson	if (wpipe->pipe_buffer.cnt)
109114037Sdyson		pipeselwakeup(wpipe);
109213907Sdyson
109391395Salfred	PIPE_UNLOCK(rpipe);
109476760Salfred	return (error);
109513675Sdyson}
109613675Sdyson
109713675Sdyson/*
109813675Sdyson * we implement a very minimal set of ioctls for compatibility with sockets.
109913675Sdyson */
110013675Sdysonint
110183366Sjulianpipe_ioctl(fp, cmd, data, td)
110213675Sdyson	struct file *fp;
110336735Sdfr	u_long cmd;
110476364Salfred	caddr_t data;
110583366Sjulian	struct thread *td;
110613675Sdyson{
110776364Salfred	struct pipe *mpipe = (struct pipe *)fp->f_data;
110813675Sdyson
110913675Sdyson	switch (cmd) {
111013675Sdyson
111113675Sdyson	case FIONBIO:
111213675Sdyson		return (0);
111313675Sdyson
111413675Sdyson	case FIOASYNC:
111591362Salfred		PIPE_LOCK(mpipe);
111613675Sdyson		if (*(int *)data) {
111713675Sdyson			mpipe->pipe_state |= PIPE_ASYNC;
111813675Sdyson		} else {
111913675Sdyson			mpipe->pipe_state &= ~PIPE_ASYNC;
112013675Sdyson		}
112191362Salfred		PIPE_UNLOCK(mpipe);
112213675Sdyson		return (0);
112313675Sdyson
112413675Sdyson	case FIONREAD:
112591362Salfred		PIPE_LOCK(mpipe);
112614037Sdyson		if (mpipe->pipe_state & PIPE_DIRECTW)
112714037Sdyson			*(int *)data = mpipe->pipe_map.cnt;
112814037Sdyson		else
112914037Sdyson			*(int *)data = mpipe->pipe_buffer.cnt;
113091362Salfred		PIPE_UNLOCK(mpipe);
113113675Sdyson		return (0);
113213675Sdyson
113341086Struckman	case FIOSETOWN:
113441086Struckman		return (fsetown(*(int *)data, &mpipe->pipe_sigio));
113541086Struckman
113641086Struckman	case FIOGETOWN:
113741086Struckman		*(int *)data = fgetown(mpipe->pipe_sigio);
113813675Sdyson		return (0);
113913675Sdyson
114041086Struckman	/* This is deprecated, FIOSETOWN should be used instead. */
114141086Struckman	case TIOCSPGRP:
114241086Struckman		return (fsetown(-(*(int *)data), &mpipe->pipe_sigio));
114341086Struckman
114441086Struckman	/* This is deprecated, FIOGETOWN should be used instead. */
114518863Sdyson	case TIOCGPGRP:
114641086Struckman		*(int *)data = -fgetown(mpipe->pipe_sigio);
114713675Sdyson		return (0);
114813675Sdyson
114913675Sdyson	}
115017124Sbde	return (ENOTTY);
115113675Sdyson}
115213675Sdyson
115313675Sdysonint
115483366Sjulianpipe_poll(fp, events, cred, td)
115513675Sdyson	struct file *fp;
115629356Speter	int events;
115729356Speter	struct ucred *cred;
115883366Sjulian	struct thread *td;
115913675Sdyson{
116076364Salfred	struct pipe *rpipe = (struct pipe *)fp->f_data;
116113675Sdyson	struct pipe *wpipe;
116229356Speter	int revents = 0;
116313675Sdyson
116413675Sdyson	wpipe = rpipe->pipe_peer;
116591362Salfred	PIPE_LOCK(rpipe);
116629356Speter	if (events & (POLLIN | POLLRDNORM))
116729356Speter		if ((rpipe->pipe_state & PIPE_DIRECTW) ||
116829356Speter		    (rpipe->pipe_buffer.cnt > 0) ||
116929356Speter		    (rpipe->pipe_state & PIPE_EOF))
117029356Speter			revents |= events & (POLLIN | POLLRDNORM);
117113675Sdyson
117229356Speter	if (events & (POLLOUT | POLLWRNORM))
117329356Speter		if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF) ||
117443311Sdillon		    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
117543311Sdillon		     (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
117629356Speter			revents |= events & (POLLOUT | POLLWRNORM);
117713675Sdyson
117829356Speter	if ((rpipe->pipe_state & PIPE_EOF) ||
117929356Speter	    (wpipe == NULL) ||
118029356Speter	    (wpipe->pipe_state & PIPE_EOF))
118129356Speter		revents |= POLLHUP;
118229356Speter
118329356Speter	if (revents == 0) {
118429356Speter		if (events & (POLLIN | POLLRDNORM)) {
118583805Sjhb			selrecord(td, &rpipe->pipe_sel);
118629356Speter			rpipe->pipe_state |= PIPE_SEL;
118713675Sdyson		}
118813675Sdyson
118929356Speter		if (events & (POLLOUT | POLLWRNORM)) {
119083805Sjhb			selrecord(td, &wpipe->pipe_sel);
119130164Speter			wpipe->pipe_state |= PIPE_SEL;
119213907Sdyson		}
119313675Sdyson	}
119491362Salfred	PIPE_UNLOCK(rpipe);
119529356Speter
119629356Speter	return (revents);
119713675Sdyson}
119813675Sdyson
119952983Speterstatic int
120083366Sjulianpipe_stat(fp, ub, td)
120152983Speter	struct file *fp;
120252983Speter	struct stat *ub;
120383366Sjulian	struct thread *td;
120413675Sdyson{
120552983Speter	struct pipe *pipe = (struct pipe *)fp->f_data;
120652983Speter
120776760Salfred	bzero((caddr_t)ub, sizeof(*ub));
120817124Sbde	ub->st_mode = S_IFIFO;
120913907Sdyson	ub->st_blksize = pipe->pipe_buffer.size;
121013675Sdyson	ub->st_size = pipe->pipe_buffer.cnt;
121113675Sdyson	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
121234901Sphk	ub->st_atimespec = pipe->pipe_atime;
121334901Sphk	ub->st_mtimespec = pipe->pipe_mtime;
121434901Sphk	ub->st_ctimespec = pipe->pipe_ctime;
121560404Schris	ub->st_uid = fp->f_cred->cr_uid;
121660404Schris	ub->st_gid = fp->f_cred->cr_gid;
121717124Sbde	/*
121860404Schris	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
121917124Sbde	 * XXX (st_dev, st_ino) should be unique.
122017124Sbde	 */
122176760Salfred	return (0);
122213675Sdyson}
122313675Sdyson
122413675Sdyson/* ARGSUSED */
122513675Sdysonstatic int
122683366Sjulianpipe_close(fp, td)
122713675Sdyson	struct file *fp;
122883366Sjulian	struct thread *td;
122913675Sdyson{
123013675Sdyson	struct pipe *cpipe = (struct pipe *)fp->f_data;
123116322Sgpalmer
123249413Sgreen	fp->f_ops = &badfileops;
123349413Sgreen	fp->f_data = NULL;
123441086Struckman	funsetown(cpipe->pipe_sigio);
123513675Sdyson	pipeclose(cpipe);
123676760Salfred	return (0);
123713675Sdyson}
123813675Sdyson
123976364Salfredstatic void
124076364Salfredpipe_free_kmem(cpipe)
124176364Salfred	struct pipe *cpipe;
124276364Salfred{
124391412Salfred
124479224Sdillon	GIANT_REQUIRED;
124591412Salfred	KASSERT(cpipe->pipe_mtxp == NULL || !mtx_owned(PIPE_MTX(cpipe)),
124691412Salfred	       ("pipespace: pipe mutex locked"));
124776364Salfred
124876364Salfred	if (cpipe->pipe_buffer.buffer != NULL) {
124976364Salfred		if (cpipe->pipe_buffer.size > PIPE_SIZE)
125076364Salfred			--nbigpipe;
125176364Salfred		amountpipekva -= cpipe->pipe_buffer.size;
125276364Salfred		kmem_free(kernel_map,
125376364Salfred			(vm_offset_t)cpipe->pipe_buffer.buffer,
125476364Salfred			cpipe->pipe_buffer.size);
125576364Salfred		cpipe->pipe_buffer.buffer = NULL;
125676364Salfred	}
125776364Salfred#ifndef PIPE_NODIRECT
125876364Salfred	if (cpipe->pipe_map.kva != NULL) {
125976364Salfred		amountpipekva -= cpipe->pipe_buffer.size + PAGE_SIZE;
126076364Salfred		kmem_free(kernel_map,
126176364Salfred			cpipe->pipe_map.kva,
126276364Salfred			cpipe->pipe_buffer.size + PAGE_SIZE);
126376364Salfred		cpipe->pipe_map.cnt = 0;
126476364Salfred		cpipe->pipe_map.kva = 0;
126576364Salfred		cpipe->pipe_map.pos = 0;
126676364Salfred		cpipe->pipe_map.npages = 0;
126776364Salfred	}
126876364Salfred#endif
126976364Salfred}
127076364Salfred
127113675Sdyson/*
127213675Sdyson * shutdown the pipe
127313675Sdyson */
127413675Sdysonstatic void
127513675Sdysonpipeclose(cpipe)
127613675Sdyson	struct pipe *cpipe;
127713675Sdyson{
127813907Sdyson	struct pipe *ppipe;
127976364Salfred
128013675Sdyson	if (cpipe) {
128191362Salfred		PIPE_LOCK(cpipe);
128213907Sdyson
128314037Sdyson		pipeselwakeup(cpipe);
128413907Sdyson
128513675Sdyson		/*
128613675Sdyson		 * If the other side is blocked, wake it up saying that
128713675Sdyson		 * we want to close it down.
128813675Sdyson		 */
128913675Sdyson		while (cpipe->pipe_busy) {
129013675Sdyson			wakeup(cpipe);
129176760Salfred			cpipe->pipe_state |= PIPE_WANT | PIPE_EOF;
129291362Salfred			msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0);
129313675Sdyson		}
129413675Sdyson
129513675Sdyson		/*
129613675Sdyson		 * Disconnect from peer
129713675Sdyson		 */
129843301Sdillon		if ((ppipe = cpipe->pipe_peer) != NULL) {
129914037Sdyson			pipeselwakeup(ppipe);
130013907Sdyson
130113907Sdyson			ppipe->pipe_state |= PIPE_EOF;
130213907Sdyson			wakeup(ppipe);
130386598Ssobomax			KNOTE(&ppipe->pipe_sel.si_note, 0);
130413907Sdyson			ppipe->pipe_peer = NULL;
130513675Sdyson		}
130613675Sdyson		/*
130713675Sdyson		 * free resources
130813675Sdyson		 */
130991362Salfred		PIPE_UNLOCK(cpipe);
131091362Salfred		mtx_lock(&Giant);
131176364Salfred		pipe_free_kmem(cpipe);
131227899Sdyson		zfree(pipe_zone, cpipe);
131391362Salfred		mtx_unlock(&Giant);
131413675Sdyson	}
131513675Sdyson}
131659288Sjlemon
131772521Sjlemon/*ARGSUSED*/
131859288Sjlemonstatic int
131972521Sjlemonpipe_kqfilter(struct file *fp, struct knote *kn)
132059288Sjlemon{
132189306Salfred	struct pipe *cpipe;
132259288Sjlemon
132389306Salfred	cpipe = (struct pipe *)kn->kn_fp->f_data;
132472521Sjlemon	switch (kn->kn_filter) {
132572521Sjlemon	case EVFILT_READ:
132672521Sjlemon		kn->kn_fop = &pipe_rfiltops;
132772521Sjlemon		break;
132872521Sjlemon	case EVFILT_WRITE:
132972521Sjlemon		kn->kn_fop = &pipe_wfiltops;
133078292Sjlemon		cpipe = cpipe->pipe_peer;
133172521Sjlemon		break;
133272521Sjlemon	default:
133372521Sjlemon		return (1);
133472521Sjlemon	}
133578292Sjlemon	kn->kn_hook = (caddr_t)cpipe;
133678292Sjlemon
133791372Salfred	PIPE_LOCK(cpipe);
133878292Sjlemon	SLIST_INSERT_HEAD(&cpipe->pipe_sel.si_note, kn, kn_selnext);
133991372Salfred	PIPE_UNLOCK(cpipe);
134059288Sjlemon	return (0);
134159288Sjlemon}
134259288Sjlemon
134359288Sjlemonstatic void
134459288Sjlemonfilt_pipedetach(struct knote *kn)
134559288Sjlemon{
134678292Sjlemon	struct pipe *cpipe = (struct pipe *)kn->kn_hook;
134759288Sjlemon
134891372Salfred	PIPE_LOCK(cpipe);
134978292Sjlemon	SLIST_REMOVE(&cpipe->pipe_sel.si_note, kn, knote, kn_selnext);
135091372Salfred	PIPE_UNLOCK(cpipe);
135159288Sjlemon}
135259288Sjlemon
135359288Sjlemon/*ARGSUSED*/
135459288Sjlemonstatic int
135559288Sjlemonfilt_piperead(struct knote *kn, long hint)
135659288Sjlemon{
135759288Sjlemon	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
135859288Sjlemon	struct pipe *wpipe = rpipe->pipe_peer;
135959288Sjlemon
136091372Salfred	PIPE_LOCK(rpipe);
136159288Sjlemon	kn->kn_data = rpipe->pipe_buffer.cnt;
136259288Sjlemon	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
136359288Sjlemon		kn->kn_data = rpipe->pipe_map.cnt;
136459288Sjlemon
136559288Sjlemon	if ((rpipe->pipe_state & PIPE_EOF) ||
136659288Sjlemon	    (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
136791372Salfred		kn->kn_flags |= EV_EOF;
136891372Salfred		PIPE_UNLOCK(rpipe);
136959288Sjlemon		return (1);
137059288Sjlemon	}
137191372Salfred	PIPE_UNLOCK(rpipe);
137259288Sjlemon	return (kn->kn_data > 0);
137359288Sjlemon}
137459288Sjlemon
137559288Sjlemon/*ARGSUSED*/
137659288Sjlemonstatic int
137759288Sjlemonfilt_pipewrite(struct knote *kn, long hint)
137859288Sjlemon{
137959288Sjlemon	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
138059288Sjlemon	struct pipe *wpipe = rpipe->pipe_peer;
138159288Sjlemon
138291372Salfred	PIPE_LOCK(rpipe);
138359288Sjlemon	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
138459288Sjlemon		kn->kn_data = 0;
138559288Sjlemon		kn->kn_flags |= EV_EOF;
138691372Salfred		PIPE_UNLOCK(rpipe);
138759288Sjlemon		return (1);
138859288Sjlemon	}
138959288Sjlemon	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
139065855Sjlemon	if (wpipe->pipe_state & PIPE_DIRECTW)
139159288Sjlemon		kn->kn_data = 0;
139259288Sjlemon
139391372Salfred	PIPE_UNLOCK(rpipe);
139459288Sjlemon	return (kn->kn_data >= PIPE_BUF);
139559288Sjlemon}
1396