sys_pipe.c revision 92751
113675Sdyson/*
213675Sdyson * Copyright (c) 1996 John S. Dyson
313675Sdyson * All rights reserved.
413675Sdyson *
513675Sdyson * Redistribution and use in source and binary forms, with or without
613675Sdyson * modification, are permitted provided that the following conditions
713675Sdyson * are met:
813675Sdyson * 1. Redistributions of source code must retain the above copyright
913675Sdyson *    notice immediately at the beginning of the file, without modification,
1013675Sdyson *    this list of conditions, and the following disclaimer.
1113675Sdyson * 2. Redistributions in binary form must reproduce the above copyright
1213675Sdyson *    notice, this list of conditions and the following disclaimer in the
1313675Sdyson *    documentation and/or other materials provided with the distribution.
1413675Sdyson * 3. Absolutely no warranty of function or purpose is made by the author
1513675Sdyson *    John S. Dyson.
1614037Sdyson * 4. Modifications may be freely made to this file if the above conditions
1713675Sdyson *    are met.
1813675Sdyson *
1950477Speter * $FreeBSD: head/sys/kern/sys_pipe.c 92751 2002-03-20 04:09:59Z jeff $
2013675Sdyson */
2113675Sdyson
2213675Sdyson/*
2313675Sdyson * This file contains a high-performance replacement for the socket-based
2413675Sdyson * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
2513675Sdyson * all features of sockets, but does do everything that pipes normally
2613675Sdyson * do.
2713675Sdyson */
2813675Sdyson
2913907Sdyson/*
3013907Sdyson * This code has two modes of operation, a small write mode and a large
3113907Sdyson * write mode.  The small write mode acts like conventional pipes with
3213907Sdyson * a kernel buffer.  If the buffer is less than PIPE_MINDIRECT, then the
3313907Sdyson * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
3413907Sdyson * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and
3513907Sdyson * the receiving process can copy it directly from the pages in the sending
3613907Sdyson * process.
3713907Sdyson *
3813907Sdyson * If the sending process receives a signal, it is possible that it will
3913913Sdyson * go away, and certainly its address space can change, because control
4013907Sdyson * is returned back to the user-mode side.  In that case, the pipe code
4113907Sdyson * arranges to copy the buffer supplied by the user process, to a pageable
4213907Sdyson * kernel buffer, and the receiving process will grab the data from the
4313907Sdyson * pageable kernel buffer.  Since signals don't happen all that often,
4413907Sdyson * the copy operation is normally eliminated.
4513907Sdyson *
4613907Sdyson * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
4713907Sdyson * happen for small transfers so that the system will not spend all of
4813913Sdyson * its time context switching.  PIPE_SIZE is constrained by the
4913907Sdyson * amount of kernel virtual memory.
5013907Sdyson */
5113907Sdyson
5213675Sdyson#include <sys/param.h>
5313675Sdyson#include <sys/systm.h>
5424131Sbde#include <sys/fcntl.h>
5513675Sdyson#include <sys/file.h>
5613675Sdyson#include <sys/filedesc.h>
5724206Sbde#include <sys/filio.h>
5891372Salfred#include <sys/kernel.h>
5976166Smarkm#include <sys/lock.h>
6076827Salfred#include <sys/mutex.h>
6124206Sbde#include <sys/ttycom.h>
6213675Sdyson#include <sys/stat.h>
6391968Salfred#include <sys/malloc.h>
6429356Speter#include <sys/poll.h>
6570834Swollman#include <sys/selinfo.h>
6613675Sdyson#include <sys/signalvar.h>
6713675Sdyson#include <sys/sysproto.h>
6813675Sdyson#include <sys/pipe.h>
6976166Smarkm#include <sys/proc.h>
7055112Sbde#include <sys/vnode.h>
7134924Sbde#include <sys/uio.h>
7259288Sjlemon#include <sys/event.h>
7313675Sdyson
7413675Sdyson#include <vm/vm.h>
7513675Sdyson#include <vm/vm_param.h>
7613675Sdyson#include <vm/vm_object.h>
7713675Sdyson#include <vm/vm_kern.h>
7813675Sdyson#include <vm/vm_extern.h>
7913675Sdyson#include <vm/pmap.h>
8013675Sdyson#include <vm/vm_map.h>
8113907Sdyson#include <vm/vm_page.h>
8292751Sjeff#include <vm/uma.h>
8313675Sdyson
8414037Sdyson/*
8514037Sdyson * Use this define if you want to disable *fancy* VM things.  Expect an
8614037Sdyson * approx 30% decrease in transfer rate.  This could be useful for
8714037Sdyson * NetBSD or OpenBSD.
8814037Sdyson */
8914037Sdyson/* #define PIPE_NODIRECT */
9014037Sdyson
9114037Sdyson/*
9214037Sdyson * interfaces to the outside world
9314037Sdyson */
9491413Salfredstatic int pipe_read(struct file *fp, struct uio *uio,
9591413Salfred		struct ucred *cred, int flags, struct thread *td);
9691413Salfredstatic int pipe_write(struct file *fp, struct uio *uio,
9791413Salfred		struct ucred *cred, int flags, struct thread *td);
9891413Salfredstatic int pipe_close(struct file *fp, struct thread *td);
9991413Salfredstatic int pipe_poll(struct file *fp, int events, struct ucred *cred,
10091413Salfred		struct thread *td);
10191413Salfredstatic int pipe_kqfilter(struct file *fp, struct knote *kn);
10291413Salfredstatic int pipe_stat(struct file *fp, struct stat *sb, struct thread *td);
10391413Salfredstatic int pipe_ioctl(struct file *fp, u_long cmd, caddr_t data, struct thread *td);
10413675Sdyson
10572521Sjlemonstatic struct fileops pipeops = {
10672521Sjlemon	pipe_read, pipe_write, pipe_ioctl, pipe_poll, pipe_kqfilter,
10772521Sjlemon	pipe_stat, pipe_close
10872521Sjlemon};
10913675Sdyson
11059288Sjlemonstatic void	filt_pipedetach(struct knote *kn);
11159288Sjlemonstatic int	filt_piperead(struct knote *kn, long hint);
11259288Sjlemonstatic int	filt_pipewrite(struct knote *kn, long hint);
11359288Sjlemon
11472521Sjlemonstatic struct filterops pipe_rfiltops =
11572521Sjlemon	{ 1, NULL, filt_pipedetach, filt_piperead };
11672521Sjlemonstatic struct filterops pipe_wfiltops =
11772521Sjlemon	{ 1, NULL, filt_pipedetach, filt_pipewrite };
11859288Sjlemon
/*
 * PIPE_GET_GIANT: release the pipe mutex and then acquire Giant.  The
 * pipe must already be marked busy with PIPE_LOCKFL (see pipelock());
 * that precondition is asserted.
 */
#define	PIPE_GET_GIANT(pipe)						\
	do {								\
		KASSERT(((pipe)->pipe_state & PIPE_LOCKFL) != 0,	\
		    ("%s:%d PIPE_GET_GIANT: line pipe not locked",	\
		     __FILE__, __LINE__));				\
		PIPE_UNLOCK(pipe);					\
		mtx_lock(&Giant);					\
	} while (0)

/*
 * PIPE_DROP_GIANT: the inverse of PIPE_GET_GIANT -- release Giant and
 * then take the pipe mutex again.
 */
#define	PIPE_DROP_GIANT(pipe)						\
	do {								\
		mtx_unlock(&Giant);					\
		PIPE_LOCK(pipe);					\
	} while (0)
13391362Salfred
/*
 * Default pipe buffer size(s).  These can be fairly large now because
 * pipe space is pageable.  The pipe code tries to maintain locality of
 * reference for performance reasons, so small amounts of outstanding I/O
 * will not wipe the cache.
 */
#define	MINPIPESIZE	(PIPE_SIZE / 3)
#define	MAXPIPESIZE	(2 * PIPE_SIZE / 3)

/*
 * Maximum amount of kva for pipes.  This is a soft limit; it exists so
 * that large systems do not exhaust kernel virtual address space.
 */
#define	MAXPIPEKVA	(8 * 1024 * 1024)

/*
 * Limit for direct transfers.  The amount of kva used by pipes in
 * general cannot, of course, be limited this way.
 */
#define	LIMITPIPEKVA	(16 * 1024 * 1024)

/*
 * Limit the number of "big" pipes.  pipe_write() compares nbigpipe
 * against this before enlarging a pipe buffer.
 */
#define	LIMITBIGPIPES	32
static int nbigpipe;

/*
 * Running total of kva charged to pipes.  It is adjusted by pipespace()
 * and by the direct-write buffer setup and teardown routines.
 */
static int amountpipekva;
16213907Sdyson
16391413Salfredstatic void pipeinit(void *dummy __unused);
16491413Salfredstatic void pipeclose(struct pipe *cpipe);
16591413Salfredstatic void pipe_free_kmem(struct pipe *cpipe);
16691413Salfredstatic int pipe_create(struct pipe **cpipep);
16791413Salfredstatic __inline int pipelock(struct pipe *cpipe, int catch);
16891413Salfredstatic __inline void pipeunlock(struct pipe *cpipe);
16991413Salfredstatic __inline void pipeselwakeup(struct pipe *cpipe);
17014037Sdyson#ifndef PIPE_NODIRECT
17191413Salfredstatic int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio);
17291413Salfredstatic void pipe_destroy_write_buffer(struct pipe *wpipe);
17391413Salfredstatic int pipe_direct_write(struct pipe *wpipe, struct uio *uio);
17491413Salfredstatic void pipe_clone_write_buffer(struct pipe *wpipe);
17514037Sdyson#endif
17691413Salfredstatic int pipespace(struct pipe *cpipe, int size);
17713675Sdyson
17892751Sjeffstatic uma_zone_t pipe_zone;
17927899Sdyson
18091372SalfredSYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);
18191372Salfred
18291372Salfredstatic void
18391372Salfredpipeinit(void *dummy __unused)
18491372Salfred{
18592654Sjeff	pipe_zone = uma_zcreate("PIPE", sizeof(struct pipe), NULL,
18692654Sjeff	    NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
18791372Salfred}
18891372Salfred
18913675Sdyson/*
19013675Sdyson * The pipe system call for the DTYPE_PIPE type of pipes
19113675Sdyson */
19213675Sdyson
19313675Sdyson/* ARGSUSED */
19413675Sdysonint
19583366Sjulianpipe(td, uap)
19683366Sjulian	struct thread *td;
19713675Sdyson	struct pipe_args /* {
19813675Sdyson		int	dummy;
19913675Sdyson	} */ *uap;
20013675Sdyson{
20183366Sjulian	struct filedesc *fdp = td->td_proc->p_fd;
20213675Sdyson	struct file *rf, *wf;
20313675Sdyson	struct pipe *rpipe, *wpipe;
20491968Salfred	struct mtx *pmtx;
20513675Sdyson	int fd, error;
20691362Salfred
20791372Salfred	KASSERT(pipe_zone != NULL, ("pipe_zone not initialized"));
20827899Sdyson
20991968Salfred	pmtx = malloc(sizeof(*pmtx), M_TEMP, M_WAITOK | M_ZERO);
21091968Salfred
21176756Salfred	rpipe = wpipe = NULL;
21276364Salfred	if (pipe_create(&rpipe) || pipe_create(&wpipe)) {
21376364Salfred		pipeclose(rpipe);
21476364Salfred		pipeclose(wpipe);
21591968Salfred		free(pmtx, M_TEMP);
21676364Salfred		return (ENFILE);
21776364Salfred	}
21876364Salfred
21913907Sdyson	rpipe->pipe_state |= PIPE_DIRECTOK;
22013907Sdyson	wpipe->pipe_state |= PIPE_DIRECTOK;
22113675Sdyson
22283366Sjulian	error = falloc(td, &rf, &fd);
22370915Sdwmalone	if (error) {
22470915Sdwmalone		pipeclose(rpipe);
22570915Sdwmalone		pipeclose(wpipe);
22691968Salfred		free(pmtx, M_TEMP);
22770915Sdwmalone		return (error);
22870915Sdwmalone	}
22970915Sdwmalone	fhold(rf);
23083366Sjulian	td->td_retval[0] = fd;
23170915Sdwmalone
23270803Sdwmalone	/*
23370803Sdwmalone	 * Warning: once we've gotten past allocation of the fd for the
23470803Sdwmalone	 * read-side, we can only drop the read side via fdrop() in order
23570803Sdwmalone	 * to avoid races against processes which manage to dup() the read
23670803Sdwmalone	 * side while we are blocked trying to allocate the write side.
23770803Sdwmalone	 */
23889306Salfred	FILE_LOCK(rf);
23913675Sdyson	rf->f_flag = FREAD | FWRITE;
24013675Sdyson	rf->f_type = DTYPE_PIPE;
24149413Sgreen	rf->f_data = (caddr_t)rpipe;
24213675Sdyson	rf->f_ops = &pipeops;
24389306Salfred	FILE_UNLOCK(rf);
24483366Sjulian	error = falloc(td, &wf, &fd);
24570915Sdwmalone	if (error) {
24689306Salfred		FILEDESC_LOCK(fdp);
24783366Sjulian		if (fdp->fd_ofiles[td->td_retval[0]] == rf) {
24883366Sjulian			fdp->fd_ofiles[td->td_retval[0]] = NULL;
24989306Salfred			FILEDESC_UNLOCK(fdp);
25083366Sjulian			fdrop(rf, td);
25189306Salfred		} else
25289306Salfred			FILEDESC_UNLOCK(fdp);
25383366Sjulian		fdrop(rf, td);
25470915Sdwmalone		/* rpipe has been closed by fdrop(). */
25570915Sdwmalone		pipeclose(wpipe);
25691968Salfred		free(pmtx, M_TEMP);
25770915Sdwmalone		return (error);
25870915Sdwmalone	}
25989306Salfred	FILE_LOCK(wf);
26013675Sdyson	wf->f_flag = FREAD | FWRITE;
26113675Sdyson	wf->f_type = DTYPE_PIPE;
26249413Sgreen	wf->f_data = (caddr_t)wpipe;
26313675Sdyson	wf->f_ops = &pipeops;
26489306Salfred	FILE_UNLOCK(wf);
26583366Sjulian	td->td_retval[1] = fd;
26613675Sdyson	rpipe->pipe_peer = wpipe;
26713675Sdyson	wpipe->pipe_peer = rpipe;
26891968Salfred	mtx_init(pmtx, "pipe mutex", MTX_DEF);
26991968Salfred	rpipe->pipe_mtxp = wpipe->pipe_mtxp = pmtx;
27083366Sjulian	fdrop(rf, td);
27113675Sdyson
27213675Sdyson	return (0);
27313675Sdyson}
27413675Sdyson
27513909Sdyson/*
27613909Sdyson * Allocate kva for pipe circular buffer, the space is pageable
27776364Salfred * This routine will 'realloc' the size of a pipe safely, if it fails
27876364Salfred * it will retain the old buffer.
27976364Salfred * If it fails it will return ENOMEM.
28013909Sdyson */
28176364Salfredstatic int
28276364Salfredpipespace(cpipe, size)
28313675Sdyson	struct pipe *cpipe;
28476364Salfred	int size;
28513675Sdyson{
28676364Salfred	struct vm_object *object;
28776364Salfred	caddr_t buffer;
28813688Sdyson	int npages, error;
28913675Sdyson
29079224Sdillon	GIANT_REQUIRED;
29191412Salfred	KASSERT(cpipe->pipe_mtxp == NULL || !mtx_owned(PIPE_MTX(cpipe)),
29291412Salfred	       ("pipespace: pipe mutex locked"));
29379224Sdillon
29476364Salfred	npages = round_page(size)/PAGE_SIZE;
29513675Sdyson	/*
29613675Sdyson	 * Create an object, I don't like the idea of paging to/from
29713675Sdyson	 * kernel_object.
29814037Sdyson	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
29913675Sdyson	 */
30076364Salfred	object = vm_object_allocate(OBJT_DEFAULT, npages);
30176364Salfred	buffer = (caddr_t) vm_map_min(kernel_map);
30213675Sdyson
30313675Sdyson	/*
30413675Sdyson	 * Insert the object into the kernel map, and allocate kva for it.
30513675Sdyson	 * The map entry is, by default, pageable.
30614037Sdyson	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
30713675Sdyson	 */
30876364Salfred	error = vm_map_find(kernel_map, object, 0,
30976364Salfred		(vm_offset_t *) &buffer, size, 1,
31013688Sdyson		VM_PROT_ALL, VM_PROT_ALL, 0);
31113675Sdyson
31276364Salfred	if (error != KERN_SUCCESS) {
31376364Salfred		vm_object_deallocate(object);
31476364Salfred		return (ENOMEM);
31576364Salfred	}
31676364Salfred
31776364Salfred	/* free old resources if we're resizing */
31876364Salfred	pipe_free_kmem(cpipe);
31976364Salfred	cpipe->pipe_buffer.object = object;
32076364Salfred	cpipe->pipe_buffer.buffer = buffer;
32176364Salfred	cpipe->pipe_buffer.size = size;
32276364Salfred	cpipe->pipe_buffer.in = 0;
32376364Salfred	cpipe->pipe_buffer.out = 0;
32476364Salfred	cpipe->pipe_buffer.cnt = 0;
32513907Sdyson	amountpipekva += cpipe->pipe_buffer.size;
32676364Salfred	return (0);
32713907Sdyson}
32813688Sdyson
32913907Sdyson/*
33013907Sdyson * initialize and allocate VM and memory for pipe
33113907Sdyson */
33276364Salfredstatic int
33376364Salfredpipe_create(cpipep)
33476364Salfred	struct pipe **cpipep;
33576364Salfred{
33613907Sdyson	struct pipe *cpipe;
33776364Salfred	int error;
33813907Sdyson
33992751Sjeff	*cpipep = uma_zalloc(pipe_zone, M_WAITOK);
34076364Salfred	if (*cpipep == NULL)
34176364Salfred		return (ENOMEM);
34217163Sdyson
34376364Salfred	cpipe = *cpipep;
34476364Salfred
34576364Salfred	/* so pipespace()->pipe_free_kmem() doesn't follow junk pointer */
34676364Salfred	cpipe->pipe_buffer.object = NULL;
34776364Salfred#ifndef PIPE_NODIRECT
34876364Salfred	cpipe->pipe_map.kva = NULL;
34976364Salfred#endif
35076364Salfred	/*
35176364Salfred	 * protect so pipeclose() doesn't follow a junk pointer
35276364Salfred	 * if pipespace() fails.
35376364Salfred	 */
35476754Salfred	bzero(&cpipe->pipe_sel, sizeof(cpipe->pipe_sel));
35513675Sdyson	cpipe->pipe_state = 0;
35613675Sdyson	cpipe->pipe_peer = NULL;
35713675Sdyson	cpipe->pipe_busy = 0;
35813907Sdyson
35914037Sdyson#ifndef PIPE_NODIRECT
36013907Sdyson	/*
36113907Sdyson	 * pipe data structure initializations to support direct pipe I/O
36213907Sdyson	 */
36313907Sdyson	cpipe->pipe_map.cnt = 0;
36413907Sdyson	cpipe->pipe_map.kva = 0;
36513907Sdyson	cpipe->pipe_map.pos = 0;
36613907Sdyson	cpipe->pipe_map.npages = 0;
36717124Sbde	/* cpipe->pipe_map.ms[] = invalid */
36814037Sdyson#endif
36976364Salfred
37091412Salfred	cpipe->pipe_mtxp = NULL;	/* avoid pipespace assertion */
37176364Salfred	error = pipespace(cpipe, PIPE_SIZE);
37276760Salfred	if (error)
37376364Salfred		return (error);
37476364Salfred
37576364Salfred	vfs_timestamp(&cpipe->pipe_ctime);
37676364Salfred	cpipe->pipe_atime = cpipe->pipe_ctime;
37776364Salfred	cpipe->pipe_mtime = cpipe->pipe_ctime;
37876364Salfred
37976364Salfred	return (0);
38013675Sdyson}
38113675Sdyson
38213675Sdyson
38313675Sdyson/*
38413675Sdyson * lock a pipe for I/O, blocking other access
38513675Sdyson */
38613675Sdysonstatic __inline int
38713907Sdysonpipelock(cpipe, catch)
38813675Sdyson	struct pipe *cpipe;
38913907Sdyson	int catch;
39013675Sdyson{
39113776Sdyson	int error;
39276364Salfred
39391362Salfred	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
39491362Salfred	while (cpipe->pipe_state & PIPE_LOCKFL) {
39513675Sdyson		cpipe->pipe_state |= PIPE_LWANT;
39691362Salfred		error = msleep(cpipe, PIPE_MTX(cpipe),
39791362Salfred		    catch ? (PRIBIO | PCATCH) : PRIBIO,
39876760Salfred		    "pipelk", 0);
39976760Salfred		if (error != 0)
40076760Salfred			return (error);
40113675Sdyson	}
40291362Salfred	cpipe->pipe_state |= PIPE_LOCKFL;
40376760Salfred	return (0);
40413675Sdyson}
40513675Sdyson
40613675Sdyson/*
40713675Sdyson * unlock a pipe I/O lock
40813675Sdyson */
40913675Sdysonstatic __inline void
41013675Sdysonpipeunlock(cpipe)
41113675Sdyson	struct pipe *cpipe;
41213675Sdyson{
41376364Salfred
41491362Salfred	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
41591362Salfred	cpipe->pipe_state &= ~PIPE_LOCKFL;
41613675Sdyson	if (cpipe->pipe_state & PIPE_LWANT) {
41713675Sdyson		cpipe->pipe_state &= ~PIPE_LWANT;
41814177Sdyson		wakeup(cpipe);
41913675Sdyson	}
42013675Sdyson}
42113675Sdyson
42214037Sdysonstatic __inline void
42314037Sdysonpipeselwakeup(cpipe)
42414037Sdyson	struct pipe *cpipe;
42514037Sdyson{
42676364Salfred
42714037Sdyson	if (cpipe->pipe_state & PIPE_SEL) {
42814037Sdyson		cpipe->pipe_state &= ~PIPE_SEL;
42914037Sdyson		selwakeup(&cpipe->pipe_sel);
43014037Sdyson	}
43141086Struckman	if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
43241086Struckman		pgsigio(cpipe->pipe_sigio, SIGIO, 0);
43359288Sjlemon	KNOTE(&cpipe->pipe_sel.si_note, 0);
43414037Sdyson}
43514037Sdyson
43613675Sdyson/* ARGSUSED */
43713675Sdysonstatic int
43883366Sjulianpipe_read(fp, uio, cred, flags, td)
43913675Sdyson	struct file *fp;
44013675Sdyson	struct uio *uio;
44113675Sdyson	struct ucred *cred;
44283366Sjulian	struct thread *td;
44345311Sdt	int flags;
44413675Sdyson{
44513675Sdyson	struct pipe *rpipe = (struct pipe *) fp->f_data;
44647748Salc	int error;
44713675Sdyson	int nread = 0;
44818863Sdyson	u_int size;
44913675Sdyson
45091362Salfred	PIPE_LOCK(rpipe);
45113675Sdyson	++rpipe->pipe_busy;
45247748Salc	error = pipelock(rpipe, 1);
45347748Salc	if (error)
45447748Salc		goto unlocked_error;
45547748Salc
45613675Sdyson	while (uio->uio_resid) {
45713907Sdyson		/*
45813907Sdyson		 * normal pipe buffer receive
45913907Sdyson		 */
46013675Sdyson		if (rpipe->pipe_buffer.cnt > 0) {
46118863Sdyson			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
46213675Sdyson			if (size > rpipe->pipe_buffer.cnt)
46313675Sdyson				size = rpipe->pipe_buffer.cnt;
46418863Sdyson			if (size > (u_int) uio->uio_resid)
46518863Sdyson				size = (u_int) uio->uio_resid;
46647748Salc
46791362Salfred			PIPE_UNLOCK(rpipe);
46847748Salc			error = uiomove(&rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
46913675Sdyson					size, uio);
47091362Salfred			PIPE_LOCK(rpipe);
47176760Salfred			if (error)
47213675Sdyson				break;
47376760Salfred
47413675Sdyson			rpipe->pipe_buffer.out += size;
47513675Sdyson			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
47613675Sdyson				rpipe->pipe_buffer.out = 0;
47713675Sdyson
47813675Sdyson			rpipe->pipe_buffer.cnt -= size;
47947748Salc
48047748Salc			/*
48147748Salc			 * If there is no more to read in the pipe, reset
48247748Salc			 * its pointers to the beginning.  This improves
48347748Salc			 * cache hit stats.
48447748Salc			 */
48547748Salc			if (rpipe->pipe_buffer.cnt == 0) {
48647748Salc				rpipe->pipe_buffer.in = 0;
48747748Salc				rpipe->pipe_buffer.out = 0;
48847748Salc			}
48913675Sdyson			nread += size;
49014037Sdyson#ifndef PIPE_NODIRECT
49113907Sdyson		/*
49213907Sdyson		 * Direct copy, bypassing a kernel buffer.
49313907Sdyson		 */
49413907Sdyson		} else if ((size = rpipe->pipe_map.cnt) &&
49547748Salc			   (rpipe->pipe_state & PIPE_DIRECTW)) {
49647748Salc			caddr_t	va;
49718863Sdyson			if (size > (u_int) uio->uio_resid)
49818863Sdyson				size = (u_int) uio->uio_resid;
49947748Salc
50076760Salfred			va = (caddr_t) rpipe->pipe_map.kva +
50176760Salfred			    rpipe->pipe_map.pos;
50291362Salfred			PIPE_UNLOCK(rpipe);
50347748Salc			error = uiomove(va, size, uio);
50491362Salfred			PIPE_LOCK(rpipe);
50513907Sdyson			if (error)
50613907Sdyson				break;
50713907Sdyson			nread += size;
50813907Sdyson			rpipe->pipe_map.pos += size;
50913907Sdyson			rpipe->pipe_map.cnt -= size;
51013907Sdyson			if (rpipe->pipe_map.cnt == 0) {
51113907Sdyson				rpipe->pipe_state &= ~PIPE_DIRECTW;
51213907Sdyson				wakeup(rpipe);
51313907Sdyson			}
51414037Sdyson#endif
51513675Sdyson		} else {
51613675Sdyson			/*
51713675Sdyson			 * detect EOF condition
51876760Salfred			 * read returns 0 on EOF, no need to set error
51913675Sdyson			 */
52076760Salfred			if (rpipe->pipe_state & PIPE_EOF)
52113675Sdyson				break;
52243623Sdillon
52313675Sdyson			/*
52413675Sdyson			 * If the "write-side" has been blocked, wake it up now.
52513675Sdyson			 */
52613675Sdyson			if (rpipe->pipe_state & PIPE_WANTW) {
52713675Sdyson				rpipe->pipe_state &= ~PIPE_WANTW;
52813675Sdyson				wakeup(rpipe);
52913675Sdyson			}
53043623Sdillon
53143623Sdillon			/*
53247748Salc			 * Break if some data was read.
53343623Sdillon			 */
53447748Salc			if (nread > 0)
53513675Sdyson				break;
53616960Sdyson
53743623Sdillon			/*
53847748Salc			 * Unlock the pipe buffer for our remaining processing.  We
53947748Salc			 * will either break out with an error or we will sleep and
54047748Salc			 * relock to loop.
54143623Sdillon			 */
54247748Salc			pipeunlock(rpipe);
54343623Sdillon
54413675Sdyson			/*
54547748Salc			 * Handle non-blocking mode operation or
54647748Salc			 * wait for more data.
54713675Sdyson			 */
54876760Salfred			if (fp->f_flag & FNONBLOCK) {
54947748Salc				error = EAGAIN;
55076760Salfred			} else {
55147748Salc				rpipe->pipe_state |= PIPE_WANTR;
55291362Salfred				if ((error = msleep(rpipe, PIPE_MTX(rpipe),
55391362Salfred				    PRIBIO | PCATCH,
55477140Salfred				    "piperd", 0)) == 0)
55547748Salc					error = pipelock(rpipe, 1);
55613675Sdyson			}
55747748Salc			if (error)
55847748Salc				goto unlocked_error;
55913675Sdyson		}
56013675Sdyson	}
56147748Salc	pipeunlock(rpipe);
56213675Sdyson
56391362Salfred	/* XXX: should probably do this before getting any locks. */
56424101Sbde	if (error == 0)
56555112Sbde		vfs_timestamp(&rpipe->pipe_atime);
56647748Salcunlocked_error:
56747748Salc	--rpipe->pipe_busy;
56813913Sdyson
56947748Salc	/*
57047748Salc	 * PIPE_WANT processing only makes sense if pipe_busy is 0.
57147748Salc	 */
57213675Sdyson	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
57313675Sdyson		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
57413675Sdyson		wakeup(rpipe);
57513675Sdyson	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
57613675Sdyson		/*
57747748Salc		 * Handle write blocking hysteresis.
57813675Sdyson		 */
57913675Sdyson		if (rpipe->pipe_state & PIPE_WANTW) {
58013675Sdyson			rpipe->pipe_state &= ~PIPE_WANTW;
58113675Sdyson			wakeup(rpipe);
58213675Sdyson		}
58313675Sdyson	}
58414037Sdyson
58514802Sdyson	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
58614037Sdyson		pipeselwakeup(rpipe);
58714037Sdyson
58891362Salfred	PIPE_UNLOCK(rpipe);
58976760Salfred	return (error);
59013675Sdyson}
59113675Sdyson
59214037Sdyson#ifndef PIPE_NODIRECT
59313907Sdyson/*
59413907Sdyson * Map the sending processes' buffer into kernel space and wire it.
59513907Sdyson * This is similar to a physical write operation.
59613907Sdyson */
59713675Sdysonstatic int
59813907Sdysonpipe_build_write_buffer(wpipe, uio)
59913907Sdyson	struct pipe *wpipe;
60013675Sdyson	struct uio *uio;
60113675Sdyson{
60218863Sdyson	u_int size;
60313907Sdyson	int i;
60413907Sdyson	vm_offset_t addr, endaddr, paddr;
60513907Sdyson
60679224Sdillon	GIANT_REQUIRED;
60791412Salfred	PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);
60879224Sdillon
60918863Sdyson	size = (u_int) uio->uio_iov->iov_len;
61013907Sdyson	if (size > wpipe->pipe_buffer.size)
61113907Sdyson		size = wpipe->pipe_buffer.size;
61213907Sdyson
61340286Sdg	endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
61476760Salfred	addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
61576760Salfred	for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) {
61613907Sdyson		vm_page_t m;
61713907Sdyson
61851474Sdillon		if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0 ||
61951474Sdillon		    (paddr = pmap_kextract(addr)) == 0) {
62013907Sdyson			int j;
62176760Salfred
62276760Salfred			for (j = 0; j < i; j++)
62340700Sdg				vm_page_unwire(wpipe->pipe_map.ms[j], 1);
62476760Salfred			return (EFAULT);
62513907Sdyson		}
62613907Sdyson
62713907Sdyson		m = PHYS_TO_VM_PAGE(paddr);
62813907Sdyson		vm_page_wire(m);
62913907Sdyson		wpipe->pipe_map.ms[i] = m;
63013907Sdyson	}
63113907Sdyson
63213907Sdyson/*
63313907Sdyson * set up the control block
63413907Sdyson */
63513907Sdyson	wpipe->pipe_map.npages = i;
63676760Salfred	wpipe->pipe_map.pos =
63776760Salfred	    ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
63813907Sdyson	wpipe->pipe_map.cnt = size;
63913907Sdyson
64013907Sdyson/*
64113907Sdyson * and map the buffer
64213907Sdyson */
64313907Sdyson	if (wpipe->pipe_map.kva == 0) {
64413912Sdyson		/*
64513912Sdyson		 * We need to allocate space for an extra page because the
64613912Sdyson		 * address range might (will) span pages at times.
64713912Sdyson		 */
64813907Sdyson		wpipe->pipe_map.kva = kmem_alloc_pageable(kernel_map,
64913912Sdyson			wpipe->pipe_buffer.size + PAGE_SIZE);
65013912Sdyson		amountpipekva += wpipe->pipe_buffer.size + PAGE_SIZE;
65113907Sdyson	}
65213907Sdyson	pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms,
65313907Sdyson		wpipe->pipe_map.npages);
65413907Sdyson
65513907Sdyson/*
65613907Sdyson * and update the uio data
65713907Sdyson */
65813907Sdyson
65913907Sdyson	uio->uio_iov->iov_len -= size;
66013907Sdyson	uio->uio_iov->iov_base += size;
66113907Sdyson	if (uio->uio_iov->iov_len == 0)
66213907Sdyson		uio->uio_iov++;
66313907Sdyson	uio->uio_resid -= size;
66413907Sdyson	uio->uio_offset += size;
66576760Salfred	return (0);
66613907Sdyson}
66713907Sdyson
66813907Sdyson/*
66913907Sdyson * unmap and unwire the process buffer
67013907Sdyson */
67113907Sdysonstatic void
67213907Sdysonpipe_destroy_write_buffer(wpipe)
67376760Salfred	struct pipe *wpipe;
67413907Sdyson{
67513907Sdyson	int i;
67676364Salfred
67779224Sdillon	GIANT_REQUIRED;
67891412Salfred	PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);
67979224Sdillon
68017163Sdyson	if (wpipe->pipe_map.kva) {
68117163Sdyson		pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages);
68213907Sdyson
68313907Sdyson		if (amountpipekva > MAXPIPEKVA) {
68413907Sdyson			vm_offset_t kva = wpipe->pipe_map.kva;
68513907Sdyson			wpipe->pipe_map.kva = 0;
68613907Sdyson			kmem_free(kernel_map, kva,
68713912Sdyson				wpipe->pipe_buffer.size + PAGE_SIZE);
68813912Sdyson			amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE;
68913907Sdyson		}
69013907Sdyson	}
69176760Salfred	for (i = 0; i < wpipe->pipe_map.npages; i++)
69240700Sdg		vm_page_unwire(wpipe->pipe_map.ms[i], 1);
69391653Stanimura	wpipe->pipe_map.npages = 0;
69413907Sdyson}
69513907Sdyson
69613907Sdyson/*
69713907Sdyson * In the case of a signal, the writing process might go away.  This
69813907Sdyson * code copies the data into the circular buffer so that the source
69913907Sdyson * pages can be freed without loss of data.
70013907Sdyson */
70113907Sdysonstatic void
70213907Sdysonpipe_clone_write_buffer(wpipe)
70376364Salfred	struct pipe *wpipe;
70413907Sdyson{
70513907Sdyson	int size;
70613907Sdyson	int pos;
70713907Sdyson
70891362Salfred	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
70913907Sdyson	size = wpipe->pipe_map.cnt;
71013907Sdyson	pos = wpipe->pipe_map.pos;
71176760Salfred	bcopy((caddr_t) wpipe->pipe_map.kva + pos,
71276760Salfred	    (caddr_t) wpipe->pipe_buffer.buffer, size);
71313907Sdyson
71413907Sdyson	wpipe->pipe_buffer.in = size;
71513907Sdyson	wpipe->pipe_buffer.out = 0;
71613907Sdyson	wpipe->pipe_buffer.cnt = size;
71713907Sdyson	wpipe->pipe_state &= ~PIPE_DIRECTW;
71813907Sdyson
71991412Salfred	PIPE_GET_GIANT(wpipe);
72013907Sdyson	pipe_destroy_write_buffer(wpipe);
72191412Salfred	PIPE_DROP_GIANT(wpipe);
72213907Sdyson}
72313907Sdyson
72413907Sdyson/*
72513907Sdyson * This implements the pipe buffer write mechanism.  Note that only
72613907Sdyson * a direct write OR a normal pipe write can be pending at any given time.
72713907Sdyson * If there are any characters in the pipe buffer, the direct write will
72813907Sdyson * be deferred until the receiving process grabs all of the bytes from
72913907Sdyson * the pipe buffer.  Then the direct mapping write is set-up.
73013907Sdyson */
73113907Sdysonstatic int
73213907Sdysonpipe_direct_write(wpipe, uio)
73313907Sdyson	struct pipe *wpipe;
73413907Sdyson	struct uio *uio;
73513907Sdyson{
73613907Sdyson	int error;
73776364Salfred
73813951Sdysonretry:
73991362Salfred	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
74013907Sdyson	while (wpipe->pipe_state & PIPE_DIRECTW) {
74176760Salfred		if (wpipe->pipe_state & PIPE_WANTR) {
74213951Sdyson			wpipe->pipe_state &= ~PIPE_WANTR;
74313951Sdyson			wakeup(wpipe);
74413951Sdyson		}
74513992Sdyson		wpipe->pipe_state |= PIPE_WANTW;
74691362Salfred		error = msleep(wpipe, PIPE_MTX(wpipe),
74791362Salfred		    PRIBIO | PCATCH, "pipdww", 0);
74814802Sdyson		if (error)
74913907Sdyson			goto error1;
75014802Sdyson		if (wpipe->pipe_state & PIPE_EOF) {
75114802Sdyson			error = EPIPE;
75214802Sdyson			goto error1;
75314802Sdyson		}
75413907Sdyson	}
75513907Sdyson	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
75613951Sdyson	if (wpipe->pipe_buffer.cnt > 0) {
75776760Salfred		if (wpipe->pipe_state & PIPE_WANTR) {
75813951Sdyson			wpipe->pipe_state &= ~PIPE_WANTR;
75913951Sdyson			wakeup(wpipe);
76013951Sdyson		}
76113951Sdyson
76213992Sdyson		wpipe->pipe_state |= PIPE_WANTW;
76391362Salfred		error = msleep(wpipe, PIPE_MTX(wpipe),
76491362Salfred		    PRIBIO | PCATCH, "pipdwc", 0);
76514802Sdyson		if (error)
76613907Sdyson			goto error1;
76714802Sdyson		if (wpipe->pipe_state & PIPE_EOF) {
76814802Sdyson			error = EPIPE;
76914802Sdyson			goto error1;
77013907Sdyson		}
77113951Sdyson		goto retry;
77213907Sdyson	}
77313907Sdyson
77413951Sdyson	wpipe->pipe_state |= PIPE_DIRECTW;
77513951Sdyson
77692305Salfred	pipelock(wpipe, 0);
77791362Salfred	PIPE_GET_GIANT(wpipe);
77813907Sdyson	error = pipe_build_write_buffer(wpipe, uio);
77991362Salfred	PIPE_DROP_GIANT(wpipe);
78092305Salfred	pipeunlock(wpipe);
78113907Sdyson	if (error) {
78213907Sdyson		wpipe->pipe_state &= ~PIPE_DIRECTW;
78313907Sdyson		goto error1;
78413907Sdyson	}
78513907Sdyson
78613907Sdyson	error = 0;
78713907Sdyson	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
78813907Sdyson		if (wpipe->pipe_state & PIPE_EOF) {
78913907Sdyson			pipelock(wpipe, 0);
79091362Salfred			PIPE_GET_GIANT(wpipe);
79113907Sdyson			pipe_destroy_write_buffer(wpipe);
79291362Salfred			PIPE_DROP_GIANT(wpipe);
79313907Sdyson			pipeunlock(wpipe);
79414037Sdyson			pipeselwakeup(wpipe);
79514802Sdyson			error = EPIPE;
79614802Sdyson			goto error1;
79713907Sdyson		}
79813992Sdyson		if (wpipe->pipe_state & PIPE_WANTR) {
79913992Sdyson			wpipe->pipe_state &= ~PIPE_WANTR;
80013992Sdyson			wakeup(wpipe);
80113992Sdyson		}
80214037Sdyson		pipeselwakeup(wpipe);
80391362Salfred		error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH,
80491362Salfred		    "pipdwt", 0);
80513907Sdyson	}
80613907Sdyson
80713907Sdyson	pipelock(wpipe,0);
80813907Sdyson	if (wpipe->pipe_state & PIPE_DIRECTW) {
80913907Sdyson		/*
81013907Sdyson		 * this bit of trickery substitutes a kernel buffer for
81113907Sdyson		 * the process that might be going away.
81213907Sdyson		 */
81313907Sdyson		pipe_clone_write_buffer(wpipe);
81413907Sdyson	} else {
81591412Salfred		PIPE_GET_GIANT(wpipe);
81613907Sdyson		pipe_destroy_write_buffer(wpipe);
81791412Salfred		PIPE_DROP_GIANT(wpipe);
81813907Sdyson	}
81913907Sdyson	pipeunlock(wpipe);
82076760Salfred	return (error);
82113907Sdyson
82213907Sdysonerror1:
82313907Sdyson	wakeup(wpipe);
82476760Salfred	return (error);
82513907Sdyson}
82614037Sdyson#endif
82713907Sdyson
82816960Sdysonstatic int
82983366Sjulianpipe_write(fp, uio, cred, flags, td)
83016960Sdyson	struct file *fp;
83113907Sdyson	struct uio *uio;
83216960Sdyson	struct ucred *cred;
83383366Sjulian	struct thread *td;
83445311Sdt	int flags;
83513907Sdyson{
83613675Sdyson	int error = 0;
83713913Sdyson	int orig_resid;
83816960Sdyson	struct pipe *wpipe, *rpipe;
83916960Sdyson
84016960Sdyson	rpipe = (struct pipe *) fp->f_data;
84116960Sdyson	wpipe = rpipe->pipe_peer;
84216960Sdyson
84391395Salfred	PIPE_LOCK(rpipe);
84413675Sdyson	/*
84513675Sdyson	 * detect loss of pipe read side, issue SIGPIPE if lost.
84613675Sdyson	 */
84716960Sdyson	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
84891395Salfred		PIPE_UNLOCK(rpipe);
84976760Salfred		return (EPIPE);
85013675Sdyson	}
85177676Sdillon	++wpipe->pipe_busy;
85213675Sdyson
85317163Sdyson	/*
85417163Sdyson	 * If it is advantageous to resize the pipe buffer, do
85517163Sdyson	 * so.
85617163Sdyson	 */
85717163Sdyson	if ((uio->uio_resid > PIPE_SIZE) &&
85817163Sdyson		(nbigpipe < LIMITBIGPIPES) &&
85917163Sdyson		(wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
86017163Sdyson		(wpipe->pipe_buffer.size <= PIPE_SIZE) &&
86117163Sdyson		(wpipe->pipe_buffer.cnt == 0)) {
86217163Sdyson
86313907Sdyson		if ((error = pipelock(wpipe,1)) == 0) {
86492305Salfred			PIPE_GET_GIANT(wpipe);
86576364Salfred			if (pipespace(wpipe, BIG_PIPE_SIZE) == 0)
86676364Salfred				nbigpipe++;
86792305Salfred			PIPE_DROP_GIANT(wpipe);
86813907Sdyson			pipeunlock(wpipe);
86913907Sdyson		}
87013907Sdyson	}
87177676Sdillon
87277676Sdillon	/*
87377676Sdillon	 * If an early error occured unbusy and return, waking up any pending
87477676Sdillon	 * readers.
87577676Sdillon	 */
87677676Sdillon	if (error) {
87777676Sdillon		--wpipe->pipe_busy;
87877676Sdillon		if ((wpipe->pipe_busy == 0) &&
87977676Sdillon		    (wpipe->pipe_state & PIPE_WANT)) {
88077676Sdillon			wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
88177676Sdillon			wakeup(wpipe);
88277676Sdillon		}
88391395Salfred		PIPE_UNLOCK(rpipe);
88477676Sdillon		return(error);
88577676Sdillon	}
88676364Salfred
88776364Salfred	KASSERT(wpipe->pipe_buffer.buffer != NULL, ("pipe buffer gone"));
88813907Sdyson
88913913Sdyson	orig_resid = uio->uio_resid;
89077676Sdillon
89113675Sdyson	while (uio->uio_resid) {
89213907Sdyson		int space;
89376760Salfred
89414037Sdyson#ifndef PIPE_NODIRECT
89513907Sdyson		/*
89613907Sdyson		 * If the transfer is large, we can gain performance if
89713907Sdyson		 * we do process-to-process copies directly.
89816416Sdyson		 * If the write is non-blocking, we don't use the
89916416Sdyson		 * direct write mechanism.
90058505Sdillon		 *
90158505Sdillon		 * The direct write mechanism will detect the reader going
90258505Sdillon		 * away on us.
90313907Sdyson		 */
90417163Sdyson		if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
90517163Sdyson		    (fp->f_flag & FNONBLOCK) == 0 &&
90617163Sdyson			(wpipe->pipe_map.kva || (amountpipekva < LIMITPIPEKVA)) &&
90713907Sdyson			(uio->uio_iov->iov_len >= PIPE_MINDIRECT)) {
90813907Sdyson			error = pipe_direct_write( wpipe, uio);
90976760Salfred			if (error)
91013907Sdyson				break;
91113907Sdyson			continue;
91291362Salfred		}
91314037Sdyson#endif
91413907Sdyson
91513907Sdyson		/*
91613907Sdyson		 * Pipe buffered writes cannot be coincidental with
91713907Sdyson		 * direct writes.  We wait until the currently executing
91813907Sdyson		 * direct write is completed before we start filling the
91958505Sdillon		 * pipe buffer.  We break out if a signal occurs or the
92058505Sdillon		 * reader goes away.
92113907Sdyson		 */
92213907Sdyson	retrywrite:
92313907Sdyson		while (wpipe->pipe_state & PIPE_DIRECTW) {
92413992Sdyson			if (wpipe->pipe_state & PIPE_WANTR) {
92513992Sdyson				wpipe->pipe_state &= ~PIPE_WANTR;
92613992Sdyson				wakeup(wpipe);
92713992Sdyson			}
92891395Salfred			error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH,
92991362Salfred			    "pipbww", 0);
93058505Sdillon			if (wpipe->pipe_state & PIPE_EOF)
93158505Sdillon				break;
93213907Sdyson			if (error)
93313907Sdyson				break;
93413907Sdyson		}
93558505Sdillon		if (wpipe->pipe_state & PIPE_EOF) {
93658505Sdillon			error = EPIPE;
93758505Sdillon			break;
93858505Sdillon		}
93913907Sdyson
94013907Sdyson		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
94114644Sdyson
94214644Sdyson		/* Writes of size <= PIPE_BUF must be atomic. */
94313913Sdyson		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
94413913Sdyson			space = 0;
94513907Sdyson
94617163Sdyson		if (space > 0 && (wpipe->pipe_buffer.cnt < PIPE_SIZE)) {
94713907Sdyson			if ((error = pipelock(wpipe,1)) == 0) {
94854534Stegge				int size;	/* Transfer size */
94954534Stegge				int segsize;	/* first segment to transfer */
95076760Salfred
95113907Sdyson				/*
95213907Sdyson				 * It is possible for a direct write to
95313907Sdyson				 * slip in on us... handle it here...
95413907Sdyson				 */
95513907Sdyson				if (wpipe->pipe_state & PIPE_DIRECTW) {
95613907Sdyson					pipeunlock(wpipe);
95713907Sdyson					goto retrywrite;
95813907Sdyson				}
95954534Stegge				/*
96054534Stegge				 * If a process blocked in uiomove, our
96154534Stegge				 * value for space might be bad.
96258505Sdillon				 *
96358505Sdillon				 * XXX will we be ok if the reader has gone
96458505Sdillon				 * away here?
96554534Stegge				 */
96654534Stegge				if (space > wpipe->pipe_buffer.size -
96754534Stegge				    wpipe->pipe_buffer.cnt) {
96854534Stegge					pipeunlock(wpipe);
96954534Stegge					goto retrywrite;
97054534Stegge				}
97154534Stegge
97254534Stegge				/*
97354534Stegge				 * Transfer size is minimum of uio transfer
97454534Stegge				 * and free space in pipe buffer.
97554534Stegge				 */
97654534Stegge				if (space > uio->uio_resid)
97754534Stegge					size = uio->uio_resid;
97854534Stegge				else
97954534Stegge					size = space;
98054534Stegge				/*
98154534Stegge				 * First segment to transfer is minimum of
98254534Stegge				 * transfer size and contiguous space in
98354534Stegge				 * pipe buffer.  If first segment to transfer
98454534Stegge				 * is less than the transfer size, we've got
98554534Stegge				 * a wraparound in the buffer.
98654534Stegge				 */
98754534Stegge				segsize = wpipe->pipe_buffer.size -
98854534Stegge					wpipe->pipe_buffer.in;
98954534Stegge				if (segsize > size)
99054534Stegge					segsize = size;
99154534Stegge
99254534Stegge				/* Transfer first segment */
99354534Stegge
99491395Salfred				PIPE_UNLOCK(rpipe);
99554534Stegge				error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
99654534Stegge						segsize, uio);
99791395Salfred				PIPE_LOCK(rpipe);
99854534Stegge
99954534Stegge				if (error == 0 && segsize < size) {
100054534Stegge					/*
100154534Stegge					 * Transfer remaining part now, to
100254534Stegge					 * support atomic writes.  Wraparound
100354534Stegge					 * happened.
100454534Stegge					 */
100554534Stegge					if (wpipe->pipe_buffer.in + segsize !=
100654534Stegge					    wpipe->pipe_buffer.size)
100754534Stegge						panic("Expected pipe buffer wraparound disappeared");
100854534Stegge
100991395Salfred					PIPE_UNLOCK(rpipe);
101054534Stegge					error = uiomove(&wpipe->pipe_buffer.buffer[0],
101154534Stegge							size - segsize, uio);
101291395Salfred					PIPE_LOCK(rpipe);
101354534Stegge				}
101454534Stegge				if (error == 0) {
101554534Stegge					wpipe->pipe_buffer.in += size;
101654534Stegge					if (wpipe->pipe_buffer.in >=
101754534Stegge					    wpipe->pipe_buffer.size) {
101854534Stegge						if (wpipe->pipe_buffer.in != size - segsize + wpipe->pipe_buffer.size)
101954534Stegge							panic("Expected wraparound bad");
102054534Stegge						wpipe->pipe_buffer.in = size - segsize;
102154534Stegge					}
102254534Stegge
102354534Stegge					wpipe->pipe_buffer.cnt += size;
102454534Stegge					if (wpipe->pipe_buffer.cnt > wpipe->pipe_buffer.size)
102554534Stegge						panic("Pipe buffer overflow");
102654534Stegge
102754534Stegge				}
102813675Sdyson				pipeunlock(wpipe);
102913675Sdyson			}
103013675Sdyson			if (error)
103113675Sdyson				break;
103213675Sdyson
103313675Sdyson		} else {
103413675Sdyson			/*
103513675Sdyson			 * If the "read-side" has been blocked, wake it up now.
103613675Sdyson			 */
103713675Sdyson			if (wpipe->pipe_state & PIPE_WANTR) {
103813675Sdyson				wpipe->pipe_state &= ~PIPE_WANTR;
103913675Sdyson				wakeup(wpipe);
104013675Sdyson			}
104114037Sdyson
104213675Sdyson			/*
104313675Sdyson			 * don't block on non-blocking I/O
104413675Sdyson			 */
104516960Sdyson			if (fp->f_flag & FNONBLOCK) {
104613907Sdyson				error = EAGAIN;
104713675Sdyson				break;
104813675Sdyson			}
104913907Sdyson
105014037Sdyson			/*
105114037Sdyson			 * We have no more space and have something to offer,
105229356Speter			 * wake up select/poll.
105314037Sdyson			 */
105414037Sdyson			pipeselwakeup(wpipe);
105514037Sdyson
105613675Sdyson			wpipe->pipe_state |= PIPE_WANTW;
105791395Salfred			error = msleep(wpipe, PIPE_MTX(rpipe),
105891362Salfred			    PRIBIO | PCATCH, "pipewr", 0);
105976760Salfred			if (error != 0)
106013675Sdyson				break;
106113675Sdyson			/*
106213675Sdyson			 * If read side wants to go away, we just issue a signal
106313675Sdyson			 * to ourselves.
106413675Sdyson			 */
106513675Sdyson			if (wpipe->pipe_state & PIPE_EOF) {
106613774Sdyson				error = EPIPE;
106713907Sdyson				break;
106813675Sdyson			}
106913675Sdyson		}
107013675Sdyson	}
107113675Sdyson
107214644Sdyson	--wpipe->pipe_busy;
107377676Sdillon
107476760Salfred	if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
107576760Salfred		wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
107613675Sdyson		wakeup(wpipe);
107713675Sdyson	} else if (wpipe->pipe_buffer.cnt > 0) {
107813675Sdyson		/*
107913675Sdyson		 * If we have put any characters in the buffer, we wake up
108013675Sdyson		 * the reader.
108113675Sdyson		 */
108213675Sdyson		if (wpipe->pipe_state & PIPE_WANTR) {
108313675Sdyson			wpipe->pipe_state &= ~PIPE_WANTR;
108413675Sdyson			wakeup(wpipe);
108513675Sdyson		}
108613675Sdyson	}
108713909Sdyson
108813909Sdyson	/*
108913909Sdyson	 * Don't return EPIPE if I/O was successful
109013909Sdyson	 */
109113907Sdyson	if ((wpipe->pipe_buffer.cnt == 0) &&
109277676Sdillon	    (uio->uio_resid == 0) &&
109377676Sdillon	    (error == EPIPE)) {
109413907Sdyson		error = 0;
109577676Sdillon	}
109613913Sdyson
109724101Sbde	if (error == 0)
109855112Sbde		vfs_timestamp(&wpipe->pipe_mtime);
109924101Sbde
110014037Sdyson	/*
110114037Sdyson	 * We have something to offer,
110229356Speter	 * wake up select/poll.
110314037Sdyson	 */
110414177Sdyson	if (wpipe->pipe_buffer.cnt)
110514037Sdyson		pipeselwakeup(wpipe);
110613907Sdyson
110791395Salfred	PIPE_UNLOCK(rpipe);
110876760Salfred	return (error);
110913675Sdyson}
111013675Sdyson
111113675Sdyson/*
111213675Sdyson * we implement a very minimal set of ioctls for compatibility with sockets.
111313675Sdyson */
111413675Sdysonint
111583366Sjulianpipe_ioctl(fp, cmd, data, td)
111613675Sdyson	struct file *fp;
111736735Sdfr	u_long cmd;
111876364Salfred	caddr_t data;
111983366Sjulian	struct thread *td;
112013675Sdyson{
112176364Salfred	struct pipe *mpipe = (struct pipe *)fp->f_data;
112213675Sdyson
112313675Sdyson	switch (cmd) {
112413675Sdyson
112513675Sdyson	case FIONBIO:
112613675Sdyson		return (0);
112713675Sdyson
112813675Sdyson	case FIOASYNC:
112991362Salfred		PIPE_LOCK(mpipe);
113013675Sdyson		if (*(int *)data) {
113113675Sdyson			mpipe->pipe_state |= PIPE_ASYNC;
113213675Sdyson		} else {
113313675Sdyson			mpipe->pipe_state &= ~PIPE_ASYNC;
113413675Sdyson		}
113591362Salfred		PIPE_UNLOCK(mpipe);
113613675Sdyson		return (0);
113713675Sdyson
113813675Sdyson	case FIONREAD:
113991362Salfred		PIPE_LOCK(mpipe);
114014037Sdyson		if (mpipe->pipe_state & PIPE_DIRECTW)
114114037Sdyson			*(int *)data = mpipe->pipe_map.cnt;
114214037Sdyson		else
114314037Sdyson			*(int *)data = mpipe->pipe_buffer.cnt;
114491362Salfred		PIPE_UNLOCK(mpipe);
114513675Sdyson		return (0);
114613675Sdyson
114741086Struckman	case FIOSETOWN:
114841086Struckman		return (fsetown(*(int *)data, &mpipe->pipe_sigio));
114941086Struckman
115041086Struckman	case FIOGETOWN:
115141086Struckman		*(int *)data = fgetown(mpipe->pipe_sigio);
115213675Sdyson		return (0);
115313675Sdyson
115441086Struckman	/* This is deprecated, FIOSETOWN should be used instead. */
115541086Struckman	case TIOCSPGRP:
115641086Struckman		return (fsetown(-(*(int *)data), &mpipe->pipe_sigio));
115741086Struckman
115841086Struckman	/* This is deprecated, FIOGETOWN should be used instead. */
115918863Sdyson	case TIOCGPGRP:
116041086Struckman		*(int *)data = -fgetown(mpipe->pipe_sigio);
116113675Sdyson		return (0);
116213675Sdyson
116313675Sdyson	}
116417124Sbde	return (ENOTTY);
116513675Sdyson}
116613675Sdyson
116713675Sdysonint
116883366Sjulianpipe_poll(fp, events, cred, td)
116913675Sdyson	struct file *fp;
117029356Speter	int events;
117129356Speter	struct ucred *cred;
117283366Sjulian	struct thread *td;
117313675Sdyson{
117476364Salfred	struct pipe *rpipe = (struct pipe *)fp->f_data;
117513675Sdyson	struct pipe *wpipe;
117629356Speter	int revents = 0;
117713675Sdyson
117813675Sdyson	wpipe = rpipe->pipe_peer;
117991362Salfred	PIPE_LOCK(rpipe);
118029356Speter	if (events & (POLLIN | POLLRDNORM))
118129356Speter		if ((rpipe->pipe_state & PIPE_DIRECTW) ||
118229356Speter		    (rpipe->pipe_buffer.cnt > 0) ||
118329356Speter		    (rpipe->pipe_state & PIPE_EOF))
118429356Speter			revents |= events & (POLLIN | POLLRDNORM);
118513675Sdyson
118629356Speter	if (events & (POLLOUT | POLLWRNORM))
118729356Speter		if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF) ||
118843311Sdillon		    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
118943311Sdillon		     (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
119029356Speter			revents |= events & (POLLOUT | POLLWRNORM);
119113675Sdyson
119229356Speter	if ((rpipe->pipe_state & PIPE_EOF) ||
119329356Speter	    (wpipe == NULL) ||
119429356Speter	    (wpipe->pipe_state & PIPE_EOF))
119529356Speter		revents |= POLLHUP;
119629356Speter
119729356Speter	if (revents == 0) {
119829356Speter		if (events & (POLLIN | POLLRDNORM)) {
119983805Sjhb			selrecord(td, &rpipe->pipe_sel);
120029356Speter			rpipe->pipe_state |= PIPE_SEL;
120113675Sdyson		}
120213675Sdyson
120329356Speter		if (events & (POLLOUT | POLLWRNORM)) {
120483805Sjhb			selrecord(td, &wpipe->pipe_sel);
120530164Speter			wpipe->pipe_state |= PIPE_SEL;
120613907Sdyson		}
120713675Sdyson	}
120891362Salfred	PIPE_UNLOCK(rpipe);
120929356Speter
121029356Speter	return (revents);
121113675Sdyson}
121213675Sdyson
121352983Speterstatic int
121483366Sjulianpipe_stat(fp, ub, td)
121552983Speter	struct file *fp;
121652983Speter	struct stat *ub;
121783366Sjulian	struct thread *td;
121813675Sdyson{
121952983Speter	struct pipe *pipe = (struct pipe *)fp->f_data;
122052983Speter
122176760Salfred	bzero((caddr_t)ub, sizeof(*ub));
122217124Sbde	ub->st_mode = S_IFIFO;
122313907Sdyson	ub->st_blksize = pipe->pipe_buffer.size;
122413675Sdyson	ub->st_size = pipe->pipe_buffer.cnt;
122513675Sdyson	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
122634901Sphk	ub->st_atimespec = pipe->pipe_atime;
122734901Sphk	ub->st_mtimespec = pipe->pipe_mtime;
122834901Sphk	ub->st_ctimespec = pipe->pipe_ctime;
122960404Schris	ub->st_uid = fp->f_cred->cr_uid;
123060404Schris	ub->st_gid = fp->f_cred->cr_gid;
123117124Sbde	/*
123260404Schris	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
123317124Sbde	 * XXX (st_dev, st_ino) should be unique.
123417124Sbde	 */
123576760Salfred	return (0);
123613675Sdyson}
123713675Sdyson
123813675Sdyson/* ARGSUSED */
123913675Sdysonstatic int
124083366Sjulianpipe_close(fp, td)
124113675Sdyson	struct file *fp;
124283366Sjulian	struct thread *td;
124313675Sdyson{
124413675Sdyson	struct pipe *cpipe = (struct pipe *)fp->f_data;
124516322Sgpalmer
124649413Sgreen	fp->f_ops = &badfileops;
124749413Sgreen	fp->f_data = NULL;
124841086Struckman	funsetown(cpipe->pipe_sigio);
124913675Sdyson	pipeclose(cpipe);
125076760Salfred	return (0);
125113675Sdyson}
125213675Sdyson
125376364Salfredstatic void
125476364Salfredpipe_free_kmem(cpipe)
125576364Salfred	struct pipe *cpipe;
125676364Salfred{
125791412Salfred
125879224Sdillon	GIANT_REQUIRED;
125991412Salfred	KASSERT(cpipe->pipe_mtxp == NULL || !mtx_owned(PIPE_MTX(cpipe)),
126091412Salfred	       ("pipespace: pipe mutex locked"));
126176364Salfred
126276364Salfred	if (cpipe->pipe_buffer.buffer != NULL) {
126376364Salfred		if (cpipe->pipe_buffer.size > PIPE_SIZE)
126476364Salfred			--nbigpipe;
126576364Salfred		amountpipekva -= cpipe->pipe_buffer.size;
126676364Salfred		kmem_free(kernel_map,
126776364Salfred			(vm_offset_t)cpipe->pipe_buffer.buffer,
126876364Salfred			cpipe->pipe_buffer.size);
126976364Salfred		cpipe->pipe_buffer.buffer = NULL;
127076364Salfred	}
127176364Salfred#ifndef PIPE_NODIRECT
127276364Salfred	if (cpipe->pipe_map.kva != NULL) {
127376364Salfred		amountpipekva -= cpipe->pipe_buffer.size + PAGE_SIZE;
127476364Salfred		kmem_free(kernel_map,
127576364Salfred			cpipe->pipe_map.kva,
127676364Salfred			cpipe->pipe_buffer.size + PAGE_SIZE);
127776364Salfred		cpipe->pipe_map.cnt = 0;
127876364Salfred		cpipe->pipe_map.kva = 0;
127976364Salfred		cpipe->pipe_map.pos = 0;
128076364Salfred		cpipe->pipe_map.npages = 0;
128176364Salfred	}
128276364Salfred#endif
128376364Salfred}
128476364Salfred
128513675Sdyson/*
128613675Sdyson * shutdown the pipe
128713675Sdyson */
128813675Sdysonstatic void
128913675Sdysonpipeclose(cpipe)
129013675Sdyson	struct pipe *cpipe;
129113675Sdyson{
129213907Sdyson	struct pipe *ppipe;
129391968Salfred	int hadpeer;
129476364Salfred
129591968Salfred	if (cpipe == NULL)
129691968Salfred		return;
129791968Salfred
129891968Salfred	hadpeer = 0;
129991968Salfred
130091968Salfred	/* partially created pipes won't have a valid mutex. */
130191968Salfred	if (PIPE_MTX(cpipe) != NULL)
130291362Salfred		PIPE_LOCK(cpipe);
130313907Sdyson
130491968Salfred	pipeselwakeup(cpipe);
130513907Sdyson
130691968Salfred	/*
130791968Salfred	 * If the other side is blocked, wake it up saying that
130891968Salfred	 * we want to close it down.
130991968Salfred	 */
131091968Salfred	while (cpipe->pipe_busy) {
131191968Salfred		wakeup(cpipe);
131291968Salfred		cpipe->pipe_state |= PIPE_WANT | PIPE_EOF;
131391968Salfred		msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0);
131491968Salfred	}
131513675Sdyson
131691968Salfred	/*
131791968Salfred	 * Disconnect from peer
131891968Salfred	 */
131991968Salfred	if ((ppipe = cpipe->pipe_peer) != NULL) {
132091968Salfred		hadpeer++;
132191968Salfred		pipeselwakeup(ppipe);
132213907Sdyson
132391968Salfred		ppipe->pipe_state |= PIPE_EOF;
132491968Salfred		wakeup(ppipe);
132591968Salfred		KNOTE(&ppipe->pipe_sel.si_note, 0);
132691968Salfred		ppipe->pipe_peer = NULL;
132791968Salfred	}
132891968Salfred	/*
132991968Salfred	 * free resources
133091968Salfred	 */
133191968Salfred	if (PIPE_MTX(cpipe) != NULL) {
133291968Salfred		PIPE_UNLOCK(cpipe);
133391968Salfred		if (!hadpeer) {
133491968Salfred			mtx_destroy(PIPE_MTX(cpipe));
133591968Salfred			free(PIPE_MTX(cpipe), M_TEMP);
133613675Sdyson		}
133713675Sdyson	}
133891968Salfred	mtx_lock(&Giant);
133991968Salfred	pipe_free_kmem(cpipe);
134092751Sjeff	uma_zfree(pipe_zone, cpipe);
134191968Salfred	mtx_unlock(&Giant);
134213675Sdyson}
134359288Sjlemon
134472521Sjlemon/*ARGSUSED*/
134559288Sjlemonstatic int
134672521Sjlemonpipe_kqfilter(struct file *fp, struct knote *kn)
134759288Sjlemon{
134889306Salfred	struct pipe *cpipe;
134959288Sjlemon
135089306Salfred	cpipe = (struct pipe *)kn->kn_fp->f_data;
135172521Sjlemon	switch (kn->kn_filter) {
135272521Sjlemon	case EVFILT_READ:
135372521Sjlemon		kn->kn_fop = &pipe_rfiltops;
135472521Sjlemon		break;
135572521Sjlemon	case EVFILT_WRITE:
135672521Sjlemon		kn->kn_fop = &pipe_wfiltops;
135778292Sjlemon		cpipe = cpipe->pipe_peer;
135872521Sjlemon		break;
135972521Sjlemon	default:
136072521Sjlemon		return (1);
136172521Sjlemon	}
136278292Sjlemon	kn->kn_hook = (caddr_t)cpipe;
136378292Sjlemon
136491372Salfred	PIPE_LOCK(cpipe);
136578292Sjlemon	SLIST_INSERT_HEAD(&cpipe->pipe_sel.si_note, kn, kn_selnext);
136691372Salfred	PIPE_UNLOCK(cpipe);
136759288Sjlemon	return (0);
136859288Sjlemon}
136959288Sjlemon
137059288Sjlemonstatic void
137159288Sjlemonfilt_pipedetach(struct knote *kn)
137259288Sjlemon{
137378292Sjlemon	struct pipe *cpipe = (struct pipe *)kn->kn_hook;
137459288Sjlemon
137591372Salfred	PIPE_LOCK(cpipe);
137678292Sjlemon	SLIST_REMOVE(&cpipe->pipe_sel.si_note, kn, knote, kn_selnext);
137791372Salfred	PIPE_UNLOCK(cpipe);
137859288Sjlemon}
137959288Sjlemon
138059288Sjlemon/*ARGSUSED*/
138159288Sjlemonstatic int
138259288Sjlemonfilt_piperead(struct knote *kn, long hint)
138359288Sjlemon{
138459288Sjlemon	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
138559288Sjlemon	struct pipe *wpipe = rpipe->pipe_peer;
138659288Sjlemon
138791372Salfred	PIPE_LOCK(rpipe);
138859288Sjlemon	kn->kn_data = rpipe->pipe_buffer.cnt;
138959288Sjlemon	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
139059288Sjlemon		kn->kn_data = rpipe->pipe_map.cnt;
139159288Sjlemon
139259288Sjlemon	if ((rpipe->pipe_state & PIPE_EOF) ||
139359288Sjlemon	    (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
139491372Salfred		kn->kn_flags |= EV_EOF;
139591372Salfred		PIPE_UNLOCK(rpipe);
139659288Sjlemon		return (1);
139759288Sjlemon	}
139891372Salfred	PIPE_UNLOCK(rpipe);
139959288Sjlemon	return (kn->kn_data > 0);
140059288Sjlemon}
140159288Sjlemon
140259288Sjlemon/*ARGSUSED*/
140359288Sjlemonstatic int
140459288Sjlemonfilt_pipewrite(struct knote *kn, long hint)
140559288Sjlemon{
140659288Sjlemon	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
140759288Sjlemon	struct pipe *wpipe = rpipe->pipe_peer;
140859288Sjlemon
140991372Salfred	PIPE_LOCK(rpipe);
141059288Sjlemon	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
141159288Sjlemon		kn->kn_data = 0;
141259288Sjlemon		kn->kn_flags |= EV_EOF;
141391372Salfred		PIPE_UNLOCK(rpipe);
141459288Sjlemon		return (1);
141559288Sjlemon	}
141659288Sjlemon	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
141765855Sjlemon	if (wpipe->pipe_state & PIPE_DIRECTW)
141859288Sjlemon		kn->kn_data = 0;
141959288Sjlemon
142091372Salfred	PIPE_UNLOCK(rpipe);
142159288Sjlemon	return (kn->kn_data >= PIPE_BUF);
142259288Sjlemon}
1423