sys_pipe.c revision 77140
11541Srgrimes/*
21541Srgrimes * Copyright (c) 1996 John S. Dyson
31541Srgrimes * All rights reserved.
41541Srgrimes *
51541Srgrimes * Redistribution and use in source and binary forms, with or without
61541Srgrimes * modification, are permitted provided that the following conditions
71541Srgrimes * are met:
81541Srgrimes * 1. Redistributions of source code must retain the above copyright
91541Srgrimes *    notice immediately at the beginning of the file, without modification,
101541Srgrimes *    this list of conditions, and the following disclaimer.
111541Srgrimes * 2. Redistributions in binary form must reproduce the above copyright
121541Srgrimes *    notice, this list of conditions and the following disclaimer in the
131541Srgrimes *    documentation and/or other materials provided with the distribution.
141541Srgrimes * 3. Absolutely no warranty of function or purpose is made by the author
151541Srgrimes *    John S. Dyson.
161541Srgrimes * 4. Modifications may be freely made to this file if the above conditions
171541Srgrimes *    are met.
181541Srgrimes *
191541Srgrimes * $FreeBSD: head/sys/kern/sys_pipe.c 77140 2001-05-24 18:06:22Z alfred $
201541Srgrimes */
211541Srgrimes
221541Srgrimes/*
231541Srgrimes * This file contains a high-performance replacement for the socket-based
241541Srgrimes * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
251541Srgrimes * all features of sockets, but does do everything that pipes normally
261541Srgrimes * do.
271541Srgrimes */
281541Srgrimes
291541Srgrimes/*
301541Srgrimes * This code has two modes of operation, a small write mode and a large
311541Srgrimes * write mode.  The small write mode acts like conventional pipes with
321541Srgrimes * a kernel buffer.  If the buffer is less than PIPE_MINDIRECT, then the
331541Srgrimes * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
341541Srgrimes * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and
351541Srgrimes * the receiving process can copy it directly from the pages in the sending
361541Srgrimes * process.
371541Srgrimes *
381541Srgrimes * If the sending process receives a signal, it is possible that it will
3950477Speter * go away, and certainly its address space can change, because control
401541Srgrimes * is returned back to the user-mode side.  In that case, the pipe code
411541Srgrimes * arranges to copy the buffer supplied by the user process, to a pageable
421541Srgrimes * kernel buffer, and the receiving process will grab the data from the
431541Srgrimes * pageable kernel buffer.  Since signals don't happen all that often,
441541Srgrimes * the copy operation is normally eliminated.
451541Srgrimes *
4631778Seivind * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
4731778Seivind * happen for small transfers so that the system will not spend all of
481541Srgrimes * its time context switching.  PIPE_SIZE is constrained by the
491541Srgrimes * amount of kernel virtual memory.
501541Srgrimes */
5112221Sbde
5241059Speter#include <sys/param.h>
531541Srgrimes#include <sys/systm.h>
541541Srgrimes#include <sys/fcntl.h>
5531891Ssef#include <sys/file.h>
5661287Srwatson#include <sys/filedesc.h>
571541Srgrimes#include <sys/filio.h>
5830354Sphk#include <sys/lock.h>
5930354Sphk#include <sys/mutex.h>
6012221Sbde#include <sys/ttycom.h>
6111332Sswallace#include <sys/stat.h>
621541Srgrimes#include <sys/poll.h>
631541Srgrimes#include <sys/selinfo.h>
6412221Sbde#include <sys/signalvar.h>
651541Srgrimes#include <sys/sysproto.h>
6658717Sdillon#include <sys/pipe.h>
6758717Sdillon#include <sys/proc.h>
6858717Sdillon#include <sys/vnode.h>
691541Srgrimes#include <sys/uio.h>
701549Srgrimes#include <sys/event.h>
7130994Sphk
721541Srgrimes#include <vm/vm.h>
7311332Sswallace#include <vm/vm_param.h>
741541Srgrimes#include <vm/vm_object.h>
751541Srgrimes#include <vm/vm_kern.h>
7630994Sphk#include <vm/vm_extern.h>
771541Srgrimes#include <vm/pmap.h>
7830994Sphk#include <vm/vm_map.h>
791541Srgrimes#include <vm/vm_page.h>
801541Srgrimes#include <vm/vm_zone.h>
811541Srgrimes
821541Srgrimes/*
8312221Sbde * Use this define if you want to disable *fancy* VM things.  Expect an
8411332Sswallace * approx 30% decrease in transfer rate.  This could be useful for
8511332Sswallace * NetBSD or OpenBSD.
8611332Sswallace */
8712221Sbde/* #define PIPE_NODIRECT */
881541Srgrimes
891549Srgrimes/*
9030994Sphk * interfaces to the outside world
911541Srgrimes */
9211332Sswallacestatic int pipe_read __P((struct file *fp, struct uio *uio,
931541Srgrimes		struct ucred *cred, int flags, struct proc *p));
941541Srgrimesstatic int pipe_write __P((struct file *fp, struct uio *uio,
9530994Sphk		struct ucred *cred, int flags, struct proc *p));
961541Srgrimesstatic int pipe_close __P((struct file *fp, struct proc *p));
971541Srgrimesstatic int pipe_poll __P((struct file *fp, int events, struct ucred *cred,
981541Srgrimes		struct proc *p));
9958717Sdillonstatic int pipe_kqfilter __P((struct file *fp, struct knote *kn));
10058717Sdillonstatic int pipe_stat __P((struct file *fp, struct stat *sb, struct proc *p));
10158717Sdillonstatic int pipe_ioctl __P((struct file *fp, u_long cmd, caddr_t data, struct proc *p));
10258717Sdillon
10358717Sdillonstatic struct fileops pipeops = {
10412221Sbde	pipe_read, pipe_write, pipe_ioctl, pipe_poll, pipe_kqfilter,
10511332Sswallace	pipe_stat, pipe_close
10611332Sswallace};
10711332Sswallace
10812221Sbdestatic void	filt_pipedetach(struct knote *kn);
10911332Sswallacestatic int	filt_piperead(struct knote *kn, long hint);
1101549Srgrimesstatic int	filt_pipewrite(struct knote *kn, long hint);
11130994Sphk
1121541Srgrimesstatic struct filterops pipe_rfiltops =
11311332Sswallace	{ 1, NULL, filt_pipedetach, filt_piperead };
1141541Srgrimesstatic struct filterops pipe_wfiltops =
1151541Srgrimes	{ 1, NULL, filt_pipedetach, filt_pipewrite };
11630994Sphk
1171541Srgrimes
/*
 * Default pipe buffer size(s).  These can be fairly large because pipe
 * space is pageable.  The pipe code tries to maintain locality of
 * reference for performance reasons, so small amounts of outstanding
 * I/O will not wipe the cache.
 *
 * MINPIPESIZE is the fill level below which pipe_read() wakes a writer
 * that is waiting for space (write-blocking hysteresis).
 */
#define	MINPIPESIZE	(PIPE_SIZE/3)
#define	MAXPIPESIZE	(2*PIPE_SIZE/3)

/*
 * Soft ceiling on the kva consumed by pipes, so that large systems do
 * not exhaust it.  Once the running total exceeds this value, direct
 * write mappings are released as soon as a transfer finishes instead of
 * being cached for the next one.
 */
#define	MAXPIPEKVA	(8*1024*1024)

/*
 * Ceiling for starting new direct transfers.  The kva used by pipes in
 * general cannot, of course, be limited this way.
 */
#define	LIMITPIPEKVA	(16*1024*1024)

/*
 * Upper bound on the number of "big" pipes, i.e. pipes whose buffer was
 * grown to BIG_PIPE_SIZE.
 */
#define	LIMITBIGPIPES	32

/* Number of pipes currently using a big buffer. */
static int nbigpipe;

/* Running total of kva (in bytes) currently accounted to pipes. */
static int amountpipekva;
14628401Speter
14728401Speterstatic void pipeclose __P((struct pipe *cpipe));
14828401Speterstatic void pipe_free_kmem __P((struct pipe *cpipe));
14928401Speterstatic int pipe_create __P((struct pipe **cpipep));
15028401Speterstatic __inline int pipelock __P((struct pipe *cpipe, int catch));
15128401Speterstatic __inline void pipeunlock __P((struct pipe *cpipe));
15228401Speterstatic __inline void pipeselwakeup __P((struct pipe *cpipe));
15328401Speter#ifndef PIPE_NODIRECT
15428401Speterstatic int pipe_build_write_buffer __P((struct pipe *wpipe, struct uio *uio));
15530994Sphkstatic void pipe_destroy_write_buffer __P((struct pipe *wpipe));
15628401Speterstatic int pipe_direct_write __P((struct pipe *wpipe, struct uio *uio));
15728401Speterstatic void pipe_clone_write_buffer __P((struct pipe *wpipe));
15828401Speter#endif
15941726Struckmanstatic int pipespace __P((struct pipe *cpipe, int size));
16041726Struckman
16141726Struckmanstatic vm_zone_t pipe_zone;
16228401Speter
16328401Speter/*
16428401Speter * The pipe system call for the DTYPE_PIPE type of pipes
16541726Struckman */
16628401Speter
16728401Speter/* ARGSUSED */
16841726Struckmanint
16928401Speterpipe(p, uap)
17028401Speter	struct proc *p;
17128401Speter	struct pipe_args /* {
17228401Speter		int	dummy;
17358941Sdillon	} */ *uap;
17458941Sdillon{
17558941Sdillon	struct filedesc *fdp = p->p_fd;
17628401Speter	struct file *rf, *wf;
17711332Sswallace	struct pipe *rpipe, *wpipe;
17811332Sswallace	int fd, error;
17911332Sswallace
18012221Sbde	if (pipe_zone == NULL)
18111332Sswallace		pipe_zone = zinit("PIPE", sizeof(struct pipe), 0, 0, 4);
1821541Srgrimes
1831549Srgrimes	rpipe = wpipe = NULL;
18430994Sphk	if (pipe_create(&rpipe) || pipe_create(&wpipe)) {
1851541Srgrimes		pipeclose(rpipe);
18611332Sswallace		pipeclose(wpipe);
1871541Srgrimes		return (ENFILE);
1881541Srgrimes	}
18930994Sphk
1901541Srgrimes	rpipe->pipe_state |= PIPE_DIRECTOK;
19130994Sphk	wpipe->pipe_state |= PIPE_DIRECTOK;
1921541Srgrimes
1931541Srgrimes	error = falloc(p, &rf, &fd);
1941541Srgrimes	if (error) {
1951541Srgrimes		pipeclose(rpipe);
19658941Sdillon		pipeclose(wpipe);
19758941Sdillon		return (error);
19858941Sdillon	}
19912221Sbde	fhold(rf);
20011332Sswallace	p->p_retval[0] = fd;
20111332Sswallace
20211332Sswallace	/*
20312221Sbde	 * Warning: once we've gotten past allocation of the fd for the
20411332Sswallace	 * read-side, we can only drop the read side via fdrop() in order
2051541Srgrimes	 * to avoid races against processes which manage to dup() the read
2061549Srgrimes	 * side while we are blocked trying to allocate the write side.
20730994Sphk	 */
2081541Srgrimes	rf->f_flag = FREAD | FWRITE;
20911332Sswallace	rf->f_type = DTYPE_PIPE;
2101541Srgrimes	rf->f_data = (caddr_t)rpipe;
2111541Srgrimes	rf->f_ops = &pipeops;
21230994Sphk	error = falloc(p, &wf, &fd);
2131541Srgrimes	if (error) {
2141541Srgrimes		if (fdp->fd_ofiles[p->p_retval[0]] == rf) {
2151541Srgrimes			fdp->fd_ofiles[p->p_retval[0]] = NULL;
21658941Sdillon			fdrop(rf, p);
21758941Sdillon		}
21858941Sdillon		fdrop(rf, p);
21912221Sbde		/* rpipe has been closed by fdrop(). */
22011332Sswallace		pipeclose(wpipe);
22111332Sswallace		return (error);
22211332Sswallace	}
22312221Sbde	wf->f_flag = FREAD | FWRITE;
22411332Sswallace	wf->f_type = DTYPE_PIPE;
2251541Srgrimes	wf->f_data = (caddr_t)wpipe;
2261549Srgrimes	wf->f_ops = &pipeops;
22730994Sphk	p->p_retval[1] = fd;
2281541Srgrimes
22911332Sswallace	rpipe->pipe_peer = wpipe;
2301541Srgrimes	wpipe->pipe_peer = rpipe;
2311541Srgrimes	fdrop(rf, p);
23230994Sphk
2331541Srgrimes	return (0);
23430994Sphk}
2351541Srgrimes
2361541Srgrimes/*
2371541Srgrimes * Allocate kva for pipe circular buffer, the space is pageable
2381541Srgrimes * This routine will 'realloc' the size of a pipe safely, if it fails
2391541Srgrimes * it will retain the old buffer.
2401541Srgrimes * If it fails it will return ENOMEM.
2411541Srgrimes */
2421541Srgrimesstatic int
2431541Srgrimespipespace(cpipe, size)
24412221Sbde	struct pipe *cpipe;
24511332Sswallace	int size;
24611332Sswallace{
24711332Sswallace	struct vm_object *object;
24812221Sbde	caddr_t buffer;
24911332Sswallace	int npages, error;
2501541Srgrimes
2511549Srgrimes	npages = round_page(size)/PAGE_SIZE;
25230994Sphk	/*
2531541Srgrimes	 * Create an object, I don't like the idea of paging to/from
25411332Sswallace	 * kernel_object.
2551541Srgrimes	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
2561541Srgrimes	 */
25730994Sphk	mtx_lock(&vm_mtx);
2581541Srgrimes	object = vm_object_allocate(OBJT_DEFAULT, npages);
2591541Srgrimes	buffer = (caddr_t) vm_map_min(kernel_map);
2601541Srgrimes
26112221Sbde	/*
2621541Srgrimes	 * Insert the object into the kernel map, and allocate kva for it.
2631541Srgrimes	 * The map entry is, by default, pageable.
2641541Srgrimes	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
2651541Srgrimes	 */
26612221Sbde	error = vm_map_find(kernel_map, object, 0,
2671549Srgrimes		(vm_offset_t *) &buffer, size, 1,
26830994Sphk		VM_PROT_ALL, VM_PROT_ALL, 0);
2691541Srgrimes
2701541Srgrimes	if (error != KERN_SUCCESS) {
2711541Srgrimes		vm_object_deallocate(object);
2721541Srgrimes		mtx_unlock(&vm_mtx);
2731541Srgrimes		return (ENOMEM);
2741541Srgrimes	}
2751541Srgrimes
2761541Srgrimes	/* free old resources if we're resizing */
27730994Sphk	pipe_free_kmem(cpipe);
2781541Srgrimes	mtx_unlock(&vm_mtx);
2791541Srgrimes	cpipe->pipe_buffer.object = object;
2801541Srgrimes	cpipe->pipe_buffer.buffer = buffer;
2811541Srgrimes	cpipe->pipe_buffer.size = size;
2821541Srgrimes	cpipe->pipe_buffer.in = 0;
2833098Sphk	cpipe->pipe_buffer.out = 0;
2843098Sphk	cpipe->pipe_buffer.cnt = 0;
2851541Srgrimes	amountpipekva += cpipe->pipe_buffer.size;
28630994Sphk	return (0);
2871541Srgrimes}
2881541Srgrimes
2891541Srgrimes/*
29012221Sbde * initialize and allocate VM and memory for pipe
29112207Sbde */
29211332Sswallacestatic int
29311332Sswallacepipe_create(cpipep)
29412221Sbde	struct pipe **cpipep;
29511332Sswallace{
2961541Srgrimes	struct pipe *cpipe;
2971549Srgrimes	int error;
29830994Sphk
2991541Srgrimes	*cpipep = zalloc(pipe_zone);
30012207Sbde	if (*cpipep == NULL)
3011541Srgrimes		return (ENOMEM);
3021541Srgrimes
3031541Srgrimes	cpipe = *cpipep;
3041541Srgrimes
3051541Srgrimes	/* so pipespace()->pipe_free_kmem() doesn't follow junk pointer */
3061541Srgrimes	cpipe->pipe_buffer.object = NULL;
30730994Sphk#ifndef PIPE_NODIRECT
3081541Srgrimes	cpipe->pipe_map.kva = NULL;
3091541Srgrimes#endif
3101541Srgrimes	/*
3111541Srgrimes	 * protect so pipeclose() doesn't follow a junk pointer
3121541Srgrimes	 * if pipespace() fails.
3131541Srgrimes	 */
3141541Srgrimes	bzero(&cpipe->pipe_sel, sizeof(cpipe->pipe_sel));
3151541Srgrimes	cpipe->pipe_state = 0;
3161541Srgrimes	cpipe->pipe_peer = NULL;
3171541Srgrimes	cpipe->pipe_busy = 0;
3181541Srgrimes
3191541Srgrimes#ifndef PIPE_NODIRECT
3201541Srgrimes	/*
3211541Srgrimes	 * pipe data structure initializations to support direct pipe I/O
3221541Srgrimes	 */
3231541Srgrimes	cpipe->pipe_map.cnt = 0;
3241541Srgrimes	cpipe->pipe_map.kva = 0;
32512221Sbde	cpipe->pipe_map.pos = 0;
3261541Srgrimes	cpipe->pipe_map.npages = 0;
3271541Srgrimes	/* cpipe->pipe_map.ms[] = invalid */
3281541Srgrimes#endif
3291541Srgrimes
33012221Sbde	error = pipespace(cpipe, PIPE_SIZE);
3311541Srgrimes	if (error)
3321549Srgrimes		return (error);
33330994Sphk
3341541Srgrimes	vfs_timestamp(&cpipe->pipe_ctime);
3351541Srgrimes	cpipe->pipe_atime = cpipe->pipe_ctime;
3361541Srgrimes	cpipe->pipe_mtime = cpipe->pipe_ctime;
3371541Srgrimes
3381541Srgrimes	return (0);
3391541Srgrimes}
34020677Sbde
34120677Sbde
3421541Srgrimes/*
3431541Srgrimes * lock a pipe for I/O, blocking other access
3441541Srgrimes */
34515985Sdgstatic __inline int
3461541Srgrimespipelock(cpipe, catch)
3471541Srgrimes	struct pipe *cpipe;
3481541Srgrimes	int catch;
3491541Srgrimes{
3501541Srgrimes	int error;
3511541Srgrimes
3521541Srgrimes	while (cpipe->pipe_state & PIPE_LOCK) {
3531541Srgrimes		cpipe->pipe_state |= PIPE_LWANT;
3541541Srgrimes		error = tsleep(cpipe, catch ? (PRIBIO | PCATCH) : PRIBIO,
3551541Srgrimes		    "pipelk", 0);
3561541Srgrimes		if (error != 0)
3571541Srgrimes			return (error);
3581541Srgrimes	}
3591541Srgrimes	cpipe->pipe_state |= PIPE_LOCK;
3601541Srgrimes	return (0);
3611541Srgrimes}
36224448Speter
36324448Speter/*
36424448Speter * unlock a pipe I/O lock
36524448Speter */
36624448Speterstatic __inline void
36724448Speterpipeunlock(cpipe)
36824448Speter	struct pipe *cpipe;
36924448Speter{
37024448Speter
37124448Speter	cpipe->pipe_state &= ~PIPE_LOCK;
37224448Speter	if (cpipe->pipe_state & PIPE_LWANT) {
37324448Speter		cpipe->pipe_state &= ~PIPE_LWANT;
37412221Sbde		wakeup(cpipe);
3751541Srgrimes	}
3761541Srgrimes}
3771541Srgrimes
37812221Sbdestatic __inline void
3791541Srgrimespipeselwakeup(cpipe)
3801549Srgrimes	struct pipe *cpipe;
38130994Sphk{
3821541Srgrimes
3831541Srgrimes	if (cpipe->pipe_state & PIPE_SEL) {
3841541Srgrimes		cpipe->pipe_state &= ~PIPE_SEL;
3851541Srgrimes		selwakeup(&cpipe->pipe_sel);
3861541Srgrimes	}
3871541Srgrimes	if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
3881541Srgrimes		pgsigio(cpipe->pipe_sigio, SIGIO, 0);
38924448Speter	KNOTE(&cpipe->pipe_sel.si_note, 0);
39024448Speter}
39124448Speter
39224448Speter/* ARGSUSED */
39324448Speterstatic int
39424448Speterpipe_read(fp, uio, cred, flags, p)
39524448Speter	struct file *fp;
39624448Speter	struct uio *uio;
39724448Speter	struct ucred *cred;
39824448Speter	struct proc *p;
39924448Speter	int flags;
40024448Speter{
40124448Speter	struct pipe *rpipe = (struct pipe *) fp->f_data;
40224448Speter	int error;
40324448Speter	int nread = 0;
40424448Speter	u_int size;
40524448Speter
4061541Srgrimes	++rpipe->pipe_busy;
40724448Speter	error = pipelock(rpipe, 1);
40817994Sache	if (error)
40924448Speter		goto unlocked_error;
41017994Sache
41124448Speter	while (uio->uio_resid) {
41224448Speter		/*
41324448Speter		 * normal pipe buffer receive
41446155Sphk		 */
4151541Srgrimes		if (rpipe->pipe_buffer.cnt > 0) {
41624448Speter			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
41724448Speter			if (size > rpipe->pipe_buffer.cnt)
4181541Srgrimes				size = rpipe->pipe_buffer.cnt;
41924448Speter			if (size > (u_int) uio->uio_resid)
42024448Speter				size = (u_int) uio->uio_resid;
4211541Srgrimes
42217994Sache			error = uiomove(&rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
42324448Speter					size, uio);
42424448Speter			if (error)
42517994Sache				break;
42646155Sphk
42717994Sache			rpipe->pipe_buffer.out += size;
42824448Speter			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
42924448Speter				rpipe->pipe_buffer.out = 0;
43024448Speter
43124448Speter			rpipe->pipe_buffer.cnt -= size;
43224448Speter
43324448Speter			/*
43424448Speter			 * If there is no more to read in the pipe, reset
43524448Speter			 * its pointers to the beginning.  This improves
43624448Speter			 * cache hit stats.
43724448Speter			 */
43824448Speter			if (rpipe->pipe_buffer.cnt == 0) {
43924448Speter				rpipe->pipe_buffer.in = 0;
44024448Speter				rpipe->pipe_buffer.out = 0;
44131891Ssef			}
44224448Speter			nread += size;
44324448Speter#ifndef PIPE_NODIRECT
44424448Speter		/*
44524448Speter		 * Direct copy, bypassing a kernel buffer.
44624448Speter		 */
44724448Speter		} else if ((size = rpipe->pipe_map.cnt) &&
44824448Speter			   (rpipe->pipe_state & PIPE_DIRECTW)) {
44924448Speter			caddr_t	va;
45024448Speter			if (size > (u_int) uio->uio_resid)
45124448Speter				size = (u_int) uio->uio_resid;
45231891Ssef
45324448Speter			va = (caddr_t) rpipe->pipe_map.kva +
4548141Sache			    rpipe->pipe_map.pos;
45524448Speter			error = uiomove(va, size, uio);
45624448Speter			if (error)
45724448Speter				break;
45824448Speter			nread += size;
45924448Speter			rpipe->pipe_map.pos += size;
46024448Speter			rpipe->pipe_map.cnt -= size;
46124448Speter			if (rpipe->pipe_map.cnt == 0) {
46224448Speter				rpipe->pipe_state &= ~PIPE_DIRECTW;
46331891Ssef				wakeup(rpipe);
46424448Speter			}
4651541Srgrimes#endif
4661541Srgrimes		} else {
4671541Srgrimes			/*
46812221Sbde			 * detect EOF condition
4691541Srgrimes			 * read returns 0 on EOF, no need to set error
4701541Srgrimes			 */
4711541Srgrimes			if (rpipe->pipe_state & PIPE_EOF)
47212221Sbde				break;
4731541Srgrimes
4741549Srgrimes			/*
47530994Sphk			 * If the "write-side" has been blocked, wake it up now.
4761541Srgrimes			 */
4771541Srgrimes			if (rpipe->pipe_state & PIPE_WANTW) {
4781541Srgrimes				rpipe->pipe_state &= ~PIPE_WANTW;
4791541Srgrimes				wakeup(rpipe);
4801541Srgrimes			}
4811541Srgrimes
4821541Srgrimes			/*
4831541Srgrimes			 * Break if some data was read.
48424449Speter			 */
48524449Speter			if (nread > 0)
48646155Sphk				break;
4871541Srgrimes
4881541Srgrimes			/*
4891541Srgrimes			 * Unlock the pipe buffer for our remaining processing.  We
4901541Srgrimes			 * will either break out with an error or we will sleep and
4911541Srgrimes			 * relock to loop.
49224449Speter			 */
49324449Speter			pipeunlock(rpipe);
49424449Speter
49531891Ssef			/*
49624449Speter			 * Handle non-blocking mode operation or
4971541Srgrimes			 * wait for more data.
4981541Srgrimes			 */
4991541Srgrimes			if (fp->f_flag & FNONBLOCK) {
50012221Sbde				error = EAGAIN;
5011541Srgrimes			} else {
5021541Srgrimes				rpipe->pipe_state |= PIPE_WANTR;
5031541Srgrimes				if ((error = tsleep(rpipe, PRIBIO | PCATCH,
50412221Sbde				    "piperd", 0)) == 0)
5051541Srgrimes					error = pipelock(rpipe, 1);
5061549Srgrimes			}
50730994Sphk			if (error)
5081541Srgrimes				goto unlocked_error;
5091541Srgrimes		}
5101541Srgrimes	}
5111541Srgrimes	pipeunlock(rpipe);
5121541Srgrimes
5131541Srgrimes	if (error == 0)
5141541Srgrimes		vfs_timestamp(&rpipe->pipe_atime);
51524448Speterunlocked_error:
51624448Speter	--rpipe->pipe_busy;
51724448Speter
51824448Speter	/*
51924448Speter	 * PIPE_WANT processing only makes sense if pipe_busy is 0.
52024448Speter	 */
52124448Speter	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
52224448Speter		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
52324448Speter		wakeup(rpipe);
52424448Speter	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
52524448Speter		/*
5261541Srgrimes		 * Handle write blocking hysteresis.
52724448Speter		 */
52817994Sache		if (rpipe->pipe_state & PIPE_WANTW) {
52924448Speter			rpipe->pipe_state &= ~PIPE_WANTW;
53017994Sache			wakeup(rpipe);
53124448Speter		}
53224448Speter	}
53324448Speter
53446155Sphk	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
5351541Srgrimes		pipeselwakeup(rpipe);
53624448Speter
53717994Sache	return (error);
53824448Speter}
53924448Speter
54024448Speter#ifndef PIPE_NODIRECT
54124448Speter/*
54224448Speter * Map the sending processes' buffer into kernel space and wire it.
54324448Speter * This is similar to a physical write operation.
54424448Speter */
54517994Sachestatic int
54646155Sphkpipe_build_write_buffer(wpipe, uio)
54724448Speter	struct pipe *wpipe;
54824448Speter	struct uio *uio;
54924448Speter{
55024448Speter	u_int size;
55124448Speter	int i;
55224448Speter	vm_offset_t addr, endaddr, paddr;
55324448Speter
55431891Ssef	size = (u_int) uio->uio_iov->iov_len;
55524448Speter	if (size > wpipe->pipe_buffer.size)
55624448Speter		size = wpipe->pipe_buffer.size;
55724448Speter
55824448Speter	endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
55924448Speter	mtx_lock(&vm_mtx);
56024448Speter	addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
56124448Speter	for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) {
56224448Speter		vm_page_t m;
56324448Speter
56424448Speter		if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0 ||
56531891Ssef		    (paddr = pmap_kextract(addr)) == 0) {
56624448Speter			int j;
5678141Sache
56824448Speter			for (j = 0; j < i; j++)
56924448Speter				vm_page_unwire(wpipe->pipe_map.ms[j], 1);
57024448Speter			mtx_unlock(&vm_mtx);
57124448Speter			return (EFAULT);
57224448Speter		}
57324448Speter
57424448Speter		m = PHYS_TO_VM_PAGE(paddr);
57531891Ssef		vm_page_wire(m);
57624448Speter		wpipe->pipe_map.ms[i] = m;
5771541Srgrimes	}
5781541Srgrimes
5791541Srgrimes/*
58012221Sbde * set up the control block
5811541Srgrimes */
5821541Srgrimes	wpipe->pipe_map.npages = i;
5831541Srgrimes	wpipe->pipe_map.pos =
58412221Sbde	    ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
5851541Srgrimes	wpipe->pipe_map.cnt = size;
5861549Srgrimes
58730994Sphk/*
5881541Srgrimes * and map the buffer
5891541Srgrimes */
5901541Srgrimes	if (wpipe->pipe_map.kva == 0) {
5911541Srgrimes		/*
5921541Srgrimes		 * We need to allocate space for an extra page because the
5931541Srgrimes		 * address range might (will) span pages at times.
5941541Srgrimes		 */
5951541Srgrimes		wpipe->pipe_map.kva = kmem_alloc_pageable(kernel_map,
59624449Speter			wpipe->pipe_buffer.size + PAGE_SIZE);
59724449Speter		amountpipekva += wpipe->pipe_buffer.size + PAGE_SIZE;
59846155Sphk	}
5991541Srgrimes	pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms,
60024449Speter		wpipe->pipe_map.npages);
60124449Speter
60224449Speter	mtx_unlock(&vm_mtx);
60331891Ssef/*
60424449Speter * and update the uio data
6051541Srgrimes */
6061541Srgrimes
6071541Srgrimes	uio->uio_iov->iov_len -= size;
60812221Sbde	uio->uio_iov->iov_base += size;
6091541Srgrimes	if (uio->uio_iov->iov_len == 0)
6101541Srgrimes		uio->uio_iov++;
6111541Srgrimes	uio->uio_resid -= size;
6121541Srgrimes	uio->uio_offset += size;
61312221Sbde	return (0);
6141541Srgrimes}
6151549Srgrimes
61630994Sphk/*
6171541Srgrimes * unmap and unwire the process buffer
6181541Srgrimes */
6191541Srgrimesstatic void
6201541Srgrimespipe_destroy_write_buffer(wpipe)
6211541Srgrimes	struct pipe *wpipe;
6221541Srgrimes{
6231541Srgrimes	int i;
62446155Sphk
6251541Srgrimes	mtx_lock(&vm_mtx);
62612063Sdg	if (wpipe->pipe_map.kva) {
62724447Speter		pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages);
6281541Srgrimes
62924447Speter		if (amountpipekva > MAXPIPEKVA) {
63024447Speter			vm_offset_t kva = wpipe->pipe_map.kva;
63124447Speter			wpipe->pipe_map.kva = 0;
63224447Speter			kmem_free(kernel_map, kva,
6331541Srgrimes				wpipe->pipe_buffer.size + PAGE_SIZE);
63424447Speter			amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE;
63524447Speter		}
63624447Speter	}
63724447Speter	for (i = 0; i < wpipe->pipe_map.npages; i++)
63824447Speter		vm_page_unwire(wpipe->pipe_map.ms[i], 1);
63924447Speter	mtx_unlock(&vm_mtx);
64024447Speter}
64124447Speter
64224447Speter/*
64324447Speter * In the case of a signal, the writing process might go away.  This
64424447Speter * code copies the data into the circular buffer so that the source
64524447Speter * pages can be freed without loss of data.
64624447Speter */
64724447Speterstatic void
64831891Ssefpipe_clone_write_buffer(wpipe)
6491541Srgrimes	struct pipe *wpipe;
6501541Srgrimes{
6511541Srgrimes	int size;
65212221Sbde	int pos;
6531541Srgrimes
6549238Sache	size = wpipe->pipe_map.cnt;
6559238Sache	pos = wpipe->pipe_map.pos;
6561541Srgrimes	bcopy((caddr_t) wpipe->pipe_map.kva + pos,
65712221Sbde	    (caddr_t) wpipe->pipe_buffer.buffer, size);
6581541Srgrimes
6591549Srgrimes	wpipe->pipe_buffer.in = size;
66030994Sphk	wpipe->pipe_buffer.out = 0;
6611541Srgrimes	wpipe->pipe_buffer.cnt = size;
6621541Srgrimes	wpipe->pipe_state &= ~PIPE_DIRECTW;
6631541Srgrimes
6641541Srgrimes	pipe_destroy_write_buffer(wpipe);
6659238Sache}
6668135Sache
6671541Srgrimes/*
6689238Sache * This implements the pipe buffer write mechanism.  Note that only
6699238Sache * a direct write OR a normal pipe write can be pending at any given time.
67043311Sdillon * If there are any characters in the pipe buffer, the direct write will
67143311Sdillon * be deferred until the receiving process grabs all of the bytes from
67243311Sdillon * the pipe buffer.  Then the direct mapping write is set-up.
67346155Sphk */
6748135Sachestatic int
6759238Sachepipe_direct_write(wpipe, uio)
67624450Speter	struct pipe *wpipe;
67724450Speter	struct uio *uio;
6789238Sache{
67931891Ssef	int error;
68024450Speter
68124450Speterretry:
6829238Sache	while (wpipe->pipe_state & PIPE_DIRECTW) {
6839238Sache		if (wpipe->pipe_state & PIPE_WANTR) {
6849238Sache			wpipe->pipe_state &= ~PIPE_WANTR;
68531891Ssef			wakeup(wpipe);
6868135Sache		}
68724559Speter		wpipe->pipe_state |= PIPE_WANTW;
68824559Speter		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdww", 0);
6898111Sache		if (error)
69031891Ssef			goto error1;
69124450Speter		if (wpipe->pipe_state & PIPE_EOF) {
6928135Sache			error = EPIPE;
6931541Srgrimes			goto error1;
6941541Srgrimes		}
69512221Sbde	}
6961541Srgrimes	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
6979238Sache	if (wpipe->pipe_buffer.cnt > 0) {
6989238Sache		if (wpipe->pipe_state & PIPE_WANTR) {
6991541Srgrimes			wpipe->pipe_state &= ~PIPE_WANTR;
70012221Sbde			wakeup(wpipe);
7011541Srgrimes		}
7021549Srgrimes
70330994Sphk		wpipe->pipe_state |= PIPE_WANTW;
7041541Srgrimes		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwc", 0);
7051541Srgrimes		if (error)
7061541Srgrimes			goto error1;
7071541Srgrimes		if (wpipe->pipe_state & PIPE_EOF) {
7089238Sache			error = EPIPE;
7098135Sache			goto error1;
7101541Srgrimes		}
7119238Sache		goto retry;
7129238Sache	}
71343311Sdillon
71443311Sdillon	wpipe->pipe_state |= PIPE_DIRECTW;
71543311Sdillon
71646155Sphk	error = pipe_build_write_buffer(wpipe, uio);
7178135Sache	if (error) {
7189238Sache		wpipe->pipe_state &= ~PIPE_DIRECTW;
71924450Speter		goto error1;
72024450Speter	}
7219238Sache
72231891Ssef	error = 0;
72324450Speter	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
72424450Speter		if (wpipe->pipe_state & PIPE_EOF) {
7259238Sache			pipelock(wpipe, 0);
72631891Ssef			pipe_destroy_write_buffer(wpipe);
72724450Speter			pipeunlock(wpipe);
72824559Speter			pipeselwakeup(wpipe);
72924559Speter			error = EPIPE;
7308111Sache			goto error1;
73131891Ssef		}
73224450Speter		if (wpipe->pipe_state & PIPE_WANTR) {
7338135Sache			wpipe->pipe_state &= ~PIPE_WANTR;
7341541Srgrimes			wakeup(wpipe);
7351541Srgrimes		}
73656115Speter		pipeselwakeup(wpipe);
73756115Speter		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwt", 0);
73856115Speter	}
73956115Speter
74056115Speter	pipelock(wpipe,0);
74124453Speter	if (wpipe->pipe_state & PIPE_DIRECTW) {
74256115Speter		/*
74356115Speter		 * this bit of trickery substitutes a kernel buffer for
74456115Speter		 * the process that might be going away.
74556115Speter		 */
74656115Speter		pipe_clone_write_buffer(wpipe);
74756115Speter	} else {
74856115Speter		pipe_destroy_write_buffer(wpipe);
74956115Speter	}
75056115Speter	pipeunlock(wpipe);
75156115Speter	return (error);
75256115Speter
75356115Spetererror1:
75456115Speter	wakeup(wpipe);
75556115Speter	return (error);
75656115Speter}
75756115Speter#endif
75856115Speter
75956115Speterstatic int
76056115Speterpipe_write(fp, uio, cred, flags, p)
76156115Speter	struct file *fp;
76256115Speter	struct uio *uio;
76356115Speter	struct ucred *cred;
76456115Speter	struct proc *p;
76556115Speter	int flags;
76656115Speter{
76756115Speter	int error = 0;
76856115Speter	int orig_resid;
76956115Speter	struct pipe *wpipe, *rpipe;
77056115Speter
77156115Speter	rpipe = (struct pipe *) fp->f_data;
77256115Speter	wpipe = rpipe->pipe_peer;
77356115Speter
77456115Speter	/*
77556115Speter	 * detect loss of pipe read side, issue SIGPIPE if lost.
77656115Speter	 */
77756115Speter	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
77856115Speter		return (EPIPE);
77956115Speter	}
78056115Speter
78156115Speter	/*
78256115Speter	 * If it is advantageous to resize the pipe buffer, do
78356115Speter	 * so.
78456115Speter	 */
78556115Speter	if ((uio->uio_resid > PIPE_SIZE) &&
78656115Speter		(nbigpipe < LIMITBIGPIPES) &&
78756115Speter		(wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
78856115Speter		(wpipe->pipe_buffer.size <= PIPE_SIZE) &&
78956115Speter		(wpipe->pipe_buffer.cnt == 0)) {
79056115Speter
79156115Speter		if ((error = pipelock(wpipe,1)) == 0) {
79256115Speter			if (pipespace(wpipe, BIG_PIPE_SIZE) == 0)
79356115Speter				nbigpipe++;
79456115Speter			pipeunlock(wpipe);
79556115Speter		} else {
79656115Speter			return (error);
79756115Speter		}
79856115Speter	}
79956115Speter
80056115Speter	KASSERT(wpipe->pipe_buffer.buffer != NULL, ("pipe buffer gone"));
80156115Speter
80256115Speter	++wpipe->pipe_busy;
80356115Speter	orig_resid = uio->uio_resid;
80456115Speter	while (uio->uio_resid) {
80556115Speter		int space;
80656115Speter
80756115Speter#ifndef PIPE_NODIRECT
80856115Speter		/*
80956115Speter		 * If the transfer is large, we can gain performance if
81056115Speter		 * we do process-to-process copies directly.
81156115Speter		 * If the write is non-blocking, we don't use the
81256115Speter		 * direct write mechanism.
81356115Speter		 *
81456115Speter		 * The direct write mechanism will detect the reader going
81556115Speter		 * away on us.
81656115Speter		 */
81756115Speter		if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
81856115Speter		    (fp->f_flag & FNONBLOCK) == 0 &&
81956115Speter			(wpipe->pipe_map.kva || (amountpipekva < LIMITPIPEKVA)) &&
82056115Speter			(uio->uio_iov->iov_len >= PIPE_MINDIRECT)) {
82156115Speter			error = pipe_direct_write( wpipe, uio);
82256115Speter			if (error)
82356115Speter				break;
82456115Speter			continue;
82556115Speter		}
82656115Speter#endif
82756115Speter
82856115Speter		/*
82956115Speter		 * Pipe buffered writes cannot be coincidental with
83056115Speter		 * direct writes.  We wait until the currently executing
83156115Speter		 * direct write is completed before we start filling the
83256115Speter		 * pipe buffer.  We break out if a signal occurs or the
83356115Speter		 * reader goes away.
83456115Speter		 */
83556115Speter	retrywrite:
83656115Speter		while (wpipe->pipe_state & PIPE_DIRECTW) {
83756115Speter			if (wpipe->pipe_state & PIPE_WANTR) {
83856115Speter				wpipe->pipe_state &= ~PIPE_WANTR;
83956115Speter				wakeup(wpipe);
84056115Speter			}
84156115Speter			error = tsleep(wpipe, PRIBIO | PCATCH, "pipbww", 0);
84256115Speter			if (wpipe->pipe_state & PIPE_EOF)
84356115Speter				break;
84456115Speter			if (error)
84556115Speter				break;
84656115Speter		}
84756115Speter		if (wpipe->pipe_state & PIPE_EOF) {
84856115Speter			error = EPIPE;
84956115Speter			break;
85056115Speter		}
85156115Speter
85256115Speter		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
85356115Speter
85456115Speter		/* Writes of size <= PIPE_BUF must be atomic. */
85556115Speter		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
85656115Speter			space = 0;
85756115Speter
85856115Speter		if (space > 0 && (wpipe->pipe_buffer.cnt < PIPE_SIZE)) {
85956115Speter			if ((error = pipelock(wpipe,1)) == 0) {
86056115Speter				int size;	/* Transfer size */
86156115Speter				int segsize;	/* first segment to transfer */
86256115Speter
86356115Speter				/*
86456115Speter				 * It is possible for a direct write to
86556115Speter				 * slip in on us... handle it here...
86656115Speter				 */
86756115Speter				if (wpipe->pipe_state & PIPE_DIRECTW) {
86856115Speter					pipeunlock(wpipe);
86956115Speter					goto retrywrite;
87056115Speter				}
87156115Speter				/*
87256115Speter				 * If a process blocked in uiomove, our
87356115Speter				 * value for space might be bad.
87456115Speter				 *
87556115Speter				 * XXX will we be ok if the reader has gone
87656115Speter				 * away here?
87756115Speter				 */
87856115Speter				if (space > wpipe->pipe_buffer.size -
87956115Speter				    wpipe->pipe_buffer.cnt) {
88056115Speter					pipeunlock(wpipe);
88156115Speter					goto retrywrite;
88256115Speter				}
88356115Speter
88456115Speter				/*
88556115Speter				 * Transfer size is minimum of uio transfer
88656115Speter				 * and free space in pipe buffer.
88756115Speter				 */
88856115Speter				if (space > uio->uio_resid)
88956115Speter					size = uio->uio_resid;
89056115Speter				else
89156115Speter					size = space;
89256115Speter				/*
89356115Speter				 * First segment to transfer is minimum of
89456115Speter				 * transfer size and contiguous space in
89524453Speter				 * pipe buffer.  If first segment to transfer
89624453Speter				 * is less than the transfer size, we've got
89724453Speter				 * a wraparound in the buffer.
89824453Speter				 */
89924453Speter				segsize = wpipe->pipe_buffer.size -
90024453Speter					wpipe->pipe_buffer.in;
90130994Sphk				if (segsize > size)
90224453Speter					segsize = size;
90324453Speter
90424453Speter				/* Transfer first segment */
90524453Speter
90624453Speter				error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
90724453Speter						segsize, uio);
90824453Speter
90924453Speter				if (error == 0 && segsize < size) {
91024453Speter					/*
91124453Speter					 * Transfer remaining part now, to
91224453Speter					 * support atomic writes.  Wraparound
91360216Speter					 * happened.
91424453Speter					 */
91524453Speter					if (wpipe->pipe_buffer.in + segsize !=
91624453Speter					    wpipe->pipe_buffer.size)
9171541Srgrimes						panic("Expected pipe buffer wraparound disappeared");
9181541Srgrimes
9191541Srgrimes					error = uiomove(&wpipe->pipe_buffer.buffer[0],
9201549Srgrimes							size - segsize, uio);
9211541Srgrimes				}
9221541Srgrimes				if (error == 0) {
9231541Srgrimes					wpipe->pipe_buffer.in += size;
9241541Srgrimes					if (wpipe->pipe_buffer.in >=
9251541Srgrimes					    wpipe->pipe_buffer.size) {
9261541Srgrimes						if (wpipe->pipe_buffer.in != size - segsize + wpipe->pipe_buffer.size)
9271541Srgrimes							panic("Expected wraparound bad");
9281541Srgrimes						wpipe->pipe_buffer.in = size - segsize;
9291541Srgrimes					}
9301541Srgrimes
9311541Srgrimes					wpipe->pipe_buffer.cnt += size;
9321541Srgrimes					if (wpipe->pipe_buffer.cnt > wpipe->pipe_buffer.size)
9331541Srgrimes						panic("Pipe buffer overflow");
9341541Srgrimes
93561287Srwatson				}
93661287Srwatson				pipeunlock(wpipe);
93761287Srwatson			}
93861287Srwatson			if (error)
93961287Srwatson				break;
9401541Srgrimes
9411541Srgrimes		} else {
9421541Srgrimes			/*
9431541Srgrimes			 * If the "read-side" has been blocked, wake it up now.
9441541Srgrimes			 */
9451541Srgrimes			if (wpipe->pipe_state & PIPE_WANTR) {
9461549Srgrimes				wpipe->pipe_state &= ~PIPE_WANTR;
94746112Sphk				wakeup(wpipe);
94846112Sphk			}
94946112Sphk
95046155Sphk			/*
95146112Sphk			 * don't block on non-blocking I/O
95246112Sphk			 */
95346112Sphk			if (fp->f_flag & FNONBLOCK) {
95446155Sphk				error = EAGAIN;
9551541Srgrimes				break;
95646155Sphk			}
95746155Sphk
9581541Srgrimes			/*
95961282Srwatson			 * We have no more space and have something to offer,
96061282Srwatson			 * wake up select/poll.
96146155Sphk			 */
96246155Sphk			pipeselwakeup(wpipe);
96346155Sphk
9641541Srgrimes			wpipe->pipe_state |= PIPE_WANTW;
96546155Sphk			error = tsleep(wpipe, PRIBIO | PCATCH, "pipewr", 0);
96646155Sphk			if (error != 0)
96746155Sphk				break;
96846155Sphk			/*
96946155Sphk			 * If read side wants to go away, we just issue a signal
97046155Sphk			 * to ourselves.
97146155Sphk			 */
97246155Sphk			if (wpipe->pipe_state & PIPE_EOF) {
97346155Sphk				error = EPIPE;
9741541Srgrimes				break;
9751541Srgrimes			}
9761541Srgrimes		}
97753518Sphk	}
97853518Sphk
97953518Sphk	--wpipe->pipe_busy;
98053518Sphk	if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
98153518Sphk		wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
98253518Sphk		wakeup(wpipe);
98353518Sphk	} else if (wpipe->pipe_buffer.cnt > 0) {
98453518Sphk		/*
98553518Sphk		 * If we have put any characters in the buffer, we wake up
98653518Sphk		 * the reader.
98753518Sphk		 */
98853518Sphk		if (wpipe->pipe_state & PIPE_WANTR) {
98953518Sphk			wpipe->pipe_state &= ~PIPE_WANTR;
99053518Sphk			wakeup(wpipe);
99153518Sphk		}
99253518Sphk	}
99353518Sphk
99453518Sphk	/*
99553518Sphk	 * Don't return EPIPE if I/O was successful
99653518Sphk	 */
99753518Sphk	if ((wpipe->pipe_buffer.cnt == 0) &&
99853518Sphk		(uio->uio_resid == 0) &&
99953518Sphk		(error == EPIPE))
100053518Sphk		error = 0;
100153518Sphk
10021541Srgrimes	if (error == 0)
10031541Srgrimes		vfs_timestamp(&wpipe->pipe_mtime);
10041541Srgrimes
10051541Srgrimes	/*
10061541Srgrimes	 * We have something to offer,
10071541Srgrimes	 * wake up select/poll.
10081541Srgrimes	 */
10091541Srgrimes	if (wpipe->pipe_buffer.cnt)
10101541Srgrimes		pipeselwakeup(wpipe);
10111541Srgrimes
10121541Srgrimes	return (error);
10131541Srgrimes}
10141541Srgrimes
10151541Srgrimes/*
10161541Srgrimes * we implement a very minimal set of ioctls for compatibility with sockets.
10171541Srgrimes */
10181541Srgrimesint
10191549Srgrimespipe_ioctl(fp, cmd, data, p)
10201541Srgrimes	struct file *fp;
10211541Srgrimes	u_long cmd;
10221541Srgrimes	caddr_t data;
10231541Srgrimes	struct proc *p;
10241541Srgrimes{
10251541Srgrimes	struct pipe *mpipe = (struct pipe *)fp->f_data;
10261541Srgrimes
10271541Srgrimes	switch (cmd) {
10281541Srgrimes
10291541Srgrimes	case FIONBIO:
10301541Srgrimes		return (0);
10311541Srgrimes
10321541Srgrimes	case FIOASYNC:
10331541Srgrimes		if (*(int *)data) {
10341541Srgrimes			mpipe->pipe_state |= PIPE_ASYNC;
10351541Srgrimes		} else {
10361541Srgrimes			mpipe->pipe_state &= ~PIPE_ASYNC;
10371541Srgrimes		}
10381541Srgrimes		return (0);
10391541Srgrimes
10401541Srgrimes	case FIONREAD:
10411541Srgrimes		if (mpipe->pipe_state & PIPE_DIRECTW)
10421541Srgrimes			*(int *)data = mpipe->pipe_map.cnt;
10431541Srgrimes		else
10441541Srgrimes			*(int *)data = mpipe->pipe_buffer.cnt;
10451541Srgrimes		return (0);
10461541Srgrimes
10471541Srgrimes	case FIOSETOWN:
10481541Srgrimes		return (fsetown(*(int *)data, &mpipe->pipe_sigio));
10491541Srgrimes
10501541Srgrimes	case FIOGETOWN:
10511541Srgrimes		*(int *)data = fgetown(mpipe->pipe_sigio);
10521541Srgrimes		return (0);
10531541Srgrimes
10541541Srgrimes	/* This is deprecated, FIOSETOWN should be used instead. */
10551541Srgrimes	case TIOCSPGRP:
10561541Srgrimes		return (fsetown(-(*(int *)data), &mpipe->pipe_sigio));
10571541Srgrimes
10581541Srgrimes	/* This is deprecated, FIOGETOWN should be used instead. */
10591541Srgrimes	case TIOCGPGRP:
10601541Srgrimes		*(int *)data = -fgetown(mpipe->pipe_sigio);
10611541Srgrimes		return (0);
10621541Srgrimes
106312221Sbde	}
10641541Srgrimes	return (ENOTTY);
10651541Srgrimes}
10661541Srgrimes
10671541Srgrimesint
106812221Sbdepipe_poll(fp, events, cred, p)
10691541Srgrimes	struct file *fp;
10701549Srgrimes	int events;
107130994Sphk	struct ucred *cred;
10721541Srgrimes	struct proc *p;
10731541Srgrimes{
10741541Srgrimes	struct pipe *rpipe = (struct pipe *)fp->f_data;
10751541Srgrimes	struct pipe *wpipe;
107623358Sache	int revents = 0;
107723359Sache
10781541Srgrimes	wpipe = rpipe->pipe_peer;
10791541Srgrimes	if (events & (POLLIN | POLLRDNORM))
10801541Srgrimes		if ((rpipe->pipe_state & PIPE_DIRECTW) ||
10811541Srgrimes		    (rpipe->pipe_buffer.cnt > 0) ||
10821541Srgrimes		    (rpipe->pipe_state & PIPE_EOF))
10831541Srgrimes			revents |= events & (POLLIN | POLLRDNORM);
10841541Srgrimes
108512221Sbde	if (events & (POLLOUT | POLLWRNORM))
10861541Srgrimes		if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF) ||
10871541Srgrimes		    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
10881541Srgrimes		     (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
108912221Sbde			revents |= events & (POLLOUT | POLLWRNORM);
10901541Srgrimes
10911549Srgrimes	if ((rpipe->pipe_state & PIPE_EOF) ||
109230994Sphk	    (wpipe == NULL) ||
10931541Srgrimes	    (wpipe->pipe_state & PIPE_EOF))
10941541Srgrimes		revents |= POLLHUP;
10951541Srgrimes
10961541Srgrimes	if (revents == 0) {
109723330Sache		if (events & (POLLIN | POLLRDNORM)) {
10981541Srgrimes			selrecord(p, &rpipe->pipe_sel);
109946155Sphk			rpipe->pipe_state |= PIPE_SEL;
11001541Srgrimes		}
110122522Sdavidn
110236845Sdfr		if (events & (POLLOUT | POLLWRNORM)) {
11031541Srgrimes			selrecord(p, &wpipe->pipe_sel);
11041541Srgrimes			wpipe->pipe_state |= PIPE_SEL;
110522522Sdavidn		}
110622522Sdavidn	}
110723330Sache
11081541Srgrimes	return (revents);
11091541Srgrimes}
111031891Ssef
111131891Ssefstatic int
111231891Ssefpipe_stat(fp, ub, p)
111355338Sphk	struct file *fp;
111431891Ssef	struct stat *ub;
111531891Ssef	struct proc *p;
111655707Ssef{
111731891Ssef	struct pipe *pipe = (struct pipe *)fp->f_data;
111831891Ssef
1119	bzero((caddr_t)ub, sizeof(*ub));
1120	ub->st_mode = S_IFIFO;
1121	ub->st_blksize = pipe->pipe_buffer.size;
1122	ub->st_size = pipe->pipe_buffer.cnt;
1123	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
1124	ub->st_atimespec = pipe->pipe_atime;
1125	ub->st_mtimespec = pipe->pipe_mtime;
1126	ub->st_ctimespec = pipe->pipe_ctime;
1127	ub->st_uid = fp->f_cred->cr_uid;
1128	ub->st_gid = fp->f_cred->cr_gid;
1129	/*
1130	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
1131	 * XXX (st_dev, st_ino) should be unique.
1132	 */
1133	return (0);
1134}
1135
1136/* ARGSUSED */
1137static int
1138pipe_close(fp, p)
1139	struct file *fp;
1140	struct proc *p;
1141{
1142	struct pipe *cpipe = (struct pipe *)fp->f_data;
1143
1144	fp->f_ops = &badfileops;
1145	fp->f_data = NULL;
1146	funsetown(cpipe->pipe_sigio);
1147	pipeclose(cpipe);
1148	return (0);
1149}
1150
1151static void
1152pipe_free_kmem(cpipe)
1153	struct pipe *cpipe;
1154{
1155
1156	mtx_assert(&vm_mtx, MA_OWNED);
1157	if (cpipe->pipe_buffer.buffer != NULL) {
1158		if (cpipe->pipe_buffer.size > PIPE_SIZE)
1159			--nbigpipe;
1160		amountpipekva -= cpipe->pipe_buffer.size;
1161		kmem_free(kernel_map,
1162			(vm_offset_t)cpipe->pipe_buffer.buffer,
1163			cpipe->pipe_buffer.size);
1164		cpipe->pipe_buffer.buffer = NULL;
1165	}
1166#ifndef PIPE_NODIRECT
1167	if (cpipe->pipe_map.kva != NULL) {
1168		amountpipekva -= cpipe->pipe_buffer.size + PAGE_SIZE;
1169		kmem_free(kernel_map,
1170			cpipe->pipe_map.kva,
1171			cpipe->pipe_buffer.size + PAGE_SIZE);
1172		cpipe->pipe_map.cnt = 0;
1173		cpipe->pipe_map.kva = 0;
1174		cpipe->pipe_map.pos = 0;
1175		cpipe->pipe_map.npages = 0;
1176	}
1177#endif
1178}
1179
1180/*
1181 * shutdown the pipe
1182 */
1183static void
1184pipeclose(cpipe)
1185	struct pipe *cpipe;
1186{
1187	struct pipe *ppipe;
1188
1189	if (cpipe) {
1190
1191		pipeselwakeup(cpipe);
1192
1193		/*
1194		 * If the other side is blocked, wake it up saying that
1195		 * we want to close it down.
1196		 */
1197		while (cpipe->pipe_busy) {
1198			wakeup(cpipe);
1199			cpipe->pipe_state |= PIPE_WANT | PIPE_EOF;
1200			tsleep(cpipe, PRIBIO, "pipecl", 0);
1201		}
1202
1203		/*
1204		 * Disconnect from peer
1205		 */
1206		if ((ppipe = cpipe->pipe_peer) != NULL) {
1207			pipeselwakeup(ppipe);
1208
1209			ppipe->pipe_state |= PIPE_EOF;
1210			wakeup(ppipe);
1211			ppipe->pipe_peer = NULL;
1212		}
1213		/*
1214		 * free resources
1215		 */
1216		mtx_lock(&vm_mtx);
1217		pipe_free_kmem(cpipe);
1218		/* XXX: erm, doesn't zalloc already have its own locks and
1219		 * not need the giant vm lock?
1220		 */
1221		zfree(pipe_zone, cpipe);
1222		mtx_unlock(&vm_mtx);
1223	}
1224}
1225
1226/*ARGSUSED*/
1227static int
1228pipe_kqfilter(struct file *fp, struct knote *kn)
1229{
1230	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
1231
1232	switch (kn->kn_filter) {
1233	case EVFILT_READ:
1234		kn->kn_fop = &pipe_rfiltops;
1235		break;
1236	case EVFILT_WRITE:
1237		kn->kn_fop = &pipe_wfiltops;
1238		break;
1239	default:
1240		return (1);
1241	}
1242
1243	SLIST_INSERT_HEAD(&rpipe->pipe_sel.si_note, kn, kn_selnext);
1244	return (0);
1245}
1246
1247static void
1248filt_pipedetach(struct knote *kn)
1249{
1250	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
1251
1252	SLIST_REMOVE(&rpipe->pipe_sel.si_note, kn, knote, kn_selnext);
1253}
1254
1255/*ARGSUSED*/
1256static int
1257filt_piperead(struct knote *kn, long hint)
1258{
1259	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
1260	struct pipe *wpipe = rpipe->pipe_peer;
1261
1262	kn->kn_data = rpipe->pipe_buffer.cnt;
1263	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
1264		kn->kn_data = rpipe->pipe_map.cnt;
1265
1266	if ((rpipe->pipe_state & PIPE_EOF) ||
1267	    (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
1268		kn->kn_flags |= EV_EOF;
1269		return (1);
1270	}
1271	return (kn->kn_data > 0);
1272}
1273
1274/*ARGSUSED*/
1275static int
1276filt_pipewrite(struct knote *kn, long hint)
1277{
1278	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
1279	struct pipe *wpipe = rpipe->pipe_peer;
1280
1281	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
1282		kn->kn_data = 0;
1283		kn->kn_flags |= EV_EOF;
1284		return (1);
1285	}
1286	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
1287	if (wpipe->pipe_state & PIPE_DIRECTW)
1288		kn->kn_data = 0;
1289
1290	return (kn->kn_data >= PIPE_BUF);
1291}
1292