sys_pipe.c revision 54534
/*
 * Copyright (c) 1996 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 *
 * $FreeBSD: head/sys/kern/sys_pipe.c 54534 1999-12-13 02:55:47Z tegge $
 */

/*
 * This file contains a high-performance replacement for the socket-based
 * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
 * all features of sockets, but does do everything that pipes normally
 * do.
 */

/*
 * This code has two modes of operation, a small write mode and a large
 * write mode.  The small write mode acts like conventional pipes with
 * a kernel buffer.  If the buffer is less than PIPE_MINDIRECT, then the
 * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
 * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and
 * the receiving process can copy it directly from the pages in the sending
 * process.
 *
 * If the sending process receives a signal, it is possible that it will
 * go away, and certainly its address space can change, because control
 * is returned to the user-mode side.  In that case, the pipe code
 * arranges to copy the buffer supplied by the user process to a pageable
 * kernel buffer, and the receiving process will grab the data from the
 * pageable kernel buffer.  Since signals don't happen all that often,
 * the copy operation is normally eliminated.
 *
 * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
 * happen for small transfers so that the system will not spend all of
 * its time context switching.  PIPE_SIZE is constrained by the
 * amount of kernel virtual memory.
 */
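
/*
 * Illustrative userland sketch (not part of this file's compiled code):
 * on a blocking pipe, a single write() of at least PIPE_MINDIRECT bytes
 * is a candidate for the direct, page-wired path described above, while
 * smaller writes go through the conventional kernel-buffer copy.  The
 * 8192-byte threshold assumed below is the usual PIPE_MINDIRECT value
 * from <sys/pipe.h> of this vintage; the forked child merely drains the
 * pipe so the large write can complete.
 *
 *	#include <sys/types.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	int
 *	pipe_direct_demo(void)
 *	{
 *		int fds[2];
 *		char big[2 * 8192], sink[4096];
 *		ssize_t n;
 *
 *		if (pipe(fds) < 0)
 *			return (-1);
 *		if (fork() == 0) {
 *			close(fds[1]);
 *			while ((n = read(fds[0], sink, sizeof(sink))) > 0)
 *				;
 *			_exit(0);
 *		}
 *		close(fds[0]);
 *		memset(big, 'a', sizeof(big));
 *		(void)write(fds[1], big, sizeof(big));
 *		close(fds[1]);
 *		return (0);
 *	}
 */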

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/ttycom.h>
#include <sys/stat.h>
#include <sys/poll.h>
#include <sys/select.h>
#include <sys/signalvar.h>
#include <sys/sysproto.h>
#include <sys/pipe.h>
#include <sys/uio.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_zone.h>

/*
 * Use this define if you want to disable *fancy* VM things.  Expect an
 * approx 30% decrease in transfer rate.  This could be useful for
 * NetBSD or OpenBSD.
 */
/* #define PIPE_NODIRECT */

/*
 * interfaces to the outside world
 */
static int pipe_read __P((struct file *fp, struct uio *uio,
		struct ucred *cred, int flags, struct proc *p));
static int pipe_write __P((struct file *fp, struct uio *uio,
		struct ucred *cred, int flags, struct proc *p));
static int pipe_close __P((struct file *fp, struct proc *p));
static int pipe_poll __P((struct file *fp, int events, struct ucred *cred,
		struct proc *p));
static int pipe_stat __P((struct file *fp, struct stat *sb, struct proc *p));
static int pipe_ioctl __P((struct file *fp, u_long cmd, caddr_t data, struct proc *p));

static struct fileops pipeops =
    { pipe_read, pipe_write, pipe_ioctl, pipe_poll, pipe_stat, pipe_close };

/*
 * Default pipe buffer size(s); this can be kind-of large now because pipe
 * space is pageable.  The pipe code will try to maintain locality of
 * reference for performance reasons, so small amounts of outstanding I/O
 * will not wipe the cache.
 */
#define MINPIPESIZE (PIPE_SIZE/3)
#define MAXPIPESIZE (2*PIPE_SIZE/3)
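
/*
 * For orientation: with the customary PIPE_SIZE of 16384 bytes (the value
 * in <sys/pipe.h> of this era, assumed here only for illustration) these
 * work out to roughly 5461 and 10922 bytes.  pipe_read() uses MINPIPESIZE
 * as its write-blocking hysteresis point: a sleeping writer is only woken
 * once the buffer has drained below that mark.
 */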

/*
 * Maximum amount of kva for pipes -- this is kind-of a soft limit, but
 * is there so that on large systems, we don't exhaust it.
 */
#define MAXPIPEKVA (8*1024*1024)

/*
 * Limit for direct transfers; we cannot, of course, limit
 * the amount of kva for pipes in general.
 */
#define LIMITPIPEKVA (16*1024*1024)
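
/*
 * Both limits are enforced below: pipe_write() will not set up a new
 * direct-write mapping once amountpipekva has reached LIMITPIPEKVA (it
 * falls back to the buffered path instead), and
 * pipe_destroy_write_buffer() releases a pipe's mapping window when
 * amountpipekva has grown past MAXPIPEKVA.
 */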

/*
 * Limit the number of "big" pipes
 */
#define LIMITBIGPIPES	32
static int nbigpipe;

static int amountpipekva;

static void pipeclose __P((struct pipe *cpipe));
static void pipeinit __P((struct pipe *cpipe));
static __inline int pipelock __P((struct pipe *cpipe, int catch));
static __inline void pipeunlock __P((struct pipe *cpipe));
static __inline void pipeselwakeup __P((struct pipe *cpipe));
#ifndef PIPE_NODIRECT
static int pipe_build_write_buffer __P((struct pipe *wpipe, struct uio *uio));
static void pipe_destroy_write_buffer __P((struct pipe *wpipe));
static int pipe_direct_write __P((struct pipe *wpipe, struct uio *uio));
static void pipe_clone_write_buffer __P((struct pipe *wpipe));
#endif
static void pipespace __P((struct pipe *cpipe));

static vm_zone_t pipe_zone;

/*
 * The pipe system call for the DTYPE_PIPE type of pipes
 */
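
/*
 * A note for orientation (describing the conventional FreeBSD return path
 * rather than code in this file): the two new descriptors are handed back
 * in p->p_retval[0] and p->p_retval[1]; the syscall return machinery and
 * the libc pipe(2) stub are what turn that pair into the two-element
 * array seen by applications.
 */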

/* ARGSUSED */
int
pipe(p, uap)
	struct proc *p;
	struct pipe_args /* {
		int	dummy;
	} */ *uap;
{
	register struct filedesc *fdp = p->p_fd;
	struct file *rf, *wf;
	struct pipe *rpipe, *wpipe;
	int fd, error;

	if (pipe_zone == NULL)
		pipe_zone = zinit("PIPE", sizeof (struct pipe), 0, 0, 4);

	rpipe = zalloc( pipe_zone);
	pipeinit(rpipe);
	rpipe->pipe_state |= PIPE_DIRECTOK;
	wpipe = zalloc( pipe_zone);
	pipeinit(wpipe);
	wpipe->pipe_state |= PIPE_DIRECTOK;

	error = falloc(p, &rf, &fd);
	if (error)
		goto free2;
	p->p_retval[0] = fd;
	rf->f_flag = FREAD | FWRITE;
	rf->f_type = DTYPE_PIPE;
	rf->f_data = (caddr_t)rpipe;
	rf->f_ops = &pipeops;
	error = falloc(p, &wf, &fd);
	if (error)
		goto free3;
	wf->f_flag = FREAD | FWRITE;
	wf->f_type = DTYPE_PIPE;
	wf->f_data = (caddr_t)wpipe;
	wf->f_ops = &pipeops;
	p->p_retval[1] = fd;

	rpipe->pipe_peer = wpipe;
	wpipe->pipe_peer = rpipe;

	return (0);
free3:
	fdp->fd_ofiles[p->p_retval[0]] = 0;
	ffree(rf);
free2:
	(void)pipeclose(wpipe);
	(void)pipeclose(rpipe);
	return (error);
}

/*
 * Allocate kva for pipe circular buffer; the space is pageable.
 */
static void
pipespace(cpipe)
	struct pipe *cpipe;
{
	int npages, error;

	npages = round_page(cpipe->pipe_buffer.size)/PAGE_SIZE;
	/*
	 * Create an object, I don't like the idea of paging to/from
	 * kernel_object.
	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
	 */
	cpipe->pipe_buffer.object = vm_object_allocate(OBJT_DEFAULT, npages);
	cpipe->pipe_buffer.buffer = (caddr_t) vm_map_min(kernel_map);

	/*
	 * Insert the object into the kernel map, and allocate kva for it.
	 * The map entry is, by default, pageable.
	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
	 */
	error = vm_map_find(kernel_map, cpipe->pipe_buffer.object, 0,
		(vm_offset_t *) &cpipe->pipe_buffer.buffer,
		cpipe->pipe_buffer.size, 1,
		VM_PROT_ALL, VM_PROT_ALL, 0);

	if (error != KERN_SUCCESS)
		panic("pipeinit: cannot allocate pipe -- out of kvm -- code = %d", error);
	amountpipekva += cpipe->pipe_buffer.size;
}

/*
 * initialize and allocate VM and memory for pipe
 */
static void
pipeinit(cpipe)
	struct pipe *cpipe;
{

	cpipe->pipe_buffer.in = 0;
	cpipe->pipe_buffer.out = 0;
	cpipe->pipe_buffer.cnt = 0;
	cpipe->pipe_buffer.size = PIPE_SIZE;

	/* Buffer kva gets dynamically allocated */
	cpipe->pipe_buffer.buffer = NULL;
	/* cpipe->pipe_buffer.object = invalid */

	cpipe->pipe_state = 0;
	cpipe->pipe_peer = NULL;
	cpipe->pipe_busy = 0;
	getnanotime(&cpipe->pipe_ctime);
	cpipe->pipe_atime = cpipe->pipe_ctime;
	cpipe->pipe_mtime = cpipe->pipe_ctime;
	bzero(&cpipe->pipe_sel, sizeof cpipe->pipe_sel);

#ifndef PIPE_NODIRECT
	/*
	 * pipe data structure initializations to support direct pipe I/O
	 */
	cpipe->pipe_map.cnt = 0;
	cpipe->pipe_map.kva = 0;
	cpipe->pipe_map.pos = 0;
	cpipe->pipe_map.npages = 0;
	/* cpipe->pipe_map.ms[] = invalid */
#endif
}


/*
 * lock a pipe for I/O, blocking other access
 */
static __inline int
pipelock(cpipe, catch)
	struct pipe *cpipe;
	int catch;
{
	int error;
	while (cpipe->pipe_state & PIPE_LOCK) {
		cpipe->pipe_state |= PIPE_LWANT;
		if ((error = tsleep( cpipe,
			catch?(PRIBIO|PCATCH):PRIBIO, "pipelk", 0)) != 0) {
			return error;
		}
	}
	cpipe->pipe_state |= PIPE_LOCK;
	return 0;
}
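
/*
 * Typical usage within this file (a sketch, mirroring pipe_read() and
 * pipe_write() below): the I/O paths take this lock around any buffer
 * manipulation that may sleep in uiomove(), e.g.
 *
 *	if ((error = pipelock(cpipe, 1)) == 0) {
 *		... uiomove() data in or out of the pipe buffer ...
 *		pipeunlock(cpipe);
 *	}
 */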

/*
 * unlock a pipe I/O lock
 */
static __inline void
pipeunlock(cpipe)
	struct pipe *cpipe;
{
	cpipe->pipe_state &= ~PIPE_LOCK;
	if (cpipe->pipe_state & PIPE_LWANT) {
		cpipe->pipe_state &= ~PIPE_LWANT;
		wakeup(cpipe);
	}
}

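/*
 * Wake up anyone sleeping in select()/poll() on this pipe and, if the
 * descriptor has been put in async mode (FIOASYNC), post SIGIO to the
 * registered owner.
 */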
static __inline void
pipeselwakeup(cpipe)
	struct pipe *cpipe;
{
	if (cpipe->pipe_state & PIPE_SEL) {
		cpipe->pipe_state &= ~PIPE_SEL;
		selwakeup(&cpipe->pipe_sel);
	}
	if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
		pgsigio(cpipe->pipe_sigio, SIGIO, 0);
}

/* ARGSUSED */
static int
pipe_read(fp, uio, cred, flags, p)
	struct file *fp;
	struct uio *uio;
	struct ucred *cred;
	struct proc *p;
	int flags;
{

	struct pipe *rpipe = (struct pipe *) fp->f_data;
	int error;
	int nread = 0;
	u_int size;

	++rpipe->pipe_busy;
	error = pipelock(rpipe, 1);
	if (error)
		goto unlocked_error;

	while (uio->uio_resid) {
		/*
		 * normal pipe buffer receive
		 */
		if (rpipe->pipe_buffer.cnt > 0) {
			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
			if (size > rpipe->pipe_buffer.cnt)
				size = rpipe->pipe_buffer.cnt;
			if (size > (u_int) uio->uio_resid)
				size = (u_int) uio->uio_resid;

			error = uiomove(&rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
					size, uio);
			if (error) {
				break;
			}
			rpipe->pipe_buffer.out += size;
			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
				rpipe->pipe_buffer.out = 0;

			rpipe->pipe_buffer.cnt -= size;

			/*
			 * If there is no more to read in the pipe, reset
			 * its pointers to the beginning.  This improves
			 * cache hit stats.
			 */
			if (rpipe->pipe_buffer.cnt == 0) {
				rpipe->pipe_buffer.in = 0;
				rpipe->pipe_buffer.out = 0;
			}
			nread += size;
#ifndef PIPE_NODIRECT
		/*
		 * Direct copy, bypassing a kernel buffer.
		 */
		} else if ((size = rpipe->pipe_map.cnt) &&
			   (rpipe->pipe_state & PIPE_DIRECTW)) {
			caddr_t	va;
			if (size > (u_int) uio->uio_resid)
				size = (u_int) uio->uio_resid;

			va = (caddr_t) rpipe->pipe_map.kva + rpipe->pipe_map.pos;
			error = uiomove(va, size, uio);
			if (error)
				break;
			nread += size;
			rpipe->pipe_map.pos += size;
			rpipe->pipe_map.cnt -= size;
			if (rpipe->pipe_map.cnt == 0) {
				rpipe->pipe_state &= ~PIPE_DIRECTW;
				wakeup(rpipe);
			}
#endif
		} else {
			/*
			 * detect EOF condition
			 */
			if (rpipe->pipe_state & PIPE_EOF) {
				/* XXX error = ? */
				break;
			}

			/*
			 * If the "write-side" has been blocked, wake it up now.
			 */
			if (rpipe->pipe_state & PIPE_WANTW) {
				rpipe->pipe_state &= ~PIPE_WANTW;
				wakeup(rpipe);
			}

			/*
			 * Break if some data was read.
			 */
			if (nread > 0)
				break;

			/*
			 * Unlock the pipe buffer for our remaining processing.  We
			 * will either break out with an error or we will sleep and
			 * relock to loop.
			 */
			pipeunlock(rpipe);

			/*
			 * Handle non-blocking mode operation or
			 * wait for more data.
			 */
			if (fp->f_flag & FNONBLOCK)
				error = EAGAIN;
			else {
				rpipe->pipe_state |= PIPE_WANTR;
				if ((error = tsleep(rpipe, PRIBIO|PCATCH, "piperd", 0)) == 0)
					error = pipelock(rpipe, 1);
			}
			if (error)
				goto unlocked_error;
		}
	}
	pipeunlock(rpipe);

	if (error == 0)
		getnanotime(&rpipe->pipe_atime);
unlocked_error:
	--rpipe->pipe_busy;

	/*
	 * PIPE_WANT processing only makes sense if pipe_busy is 0.
	 */
	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
		wakeup(rpipe);
	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
		/*
		 * Handle write blocking hysteresis.
		 */
		if (rpipe->pipe_state & PIPE_WANTW) {
			rpipe->pipe_state &= ~PIPE_WANTW;
			wakeup(rpipe);
		}
	}

	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
		pipeselwakeup(rpipe);

	return error;
}

#ifndef PIPE_NODIRECT
/*
 * Map the sending process's buffer into kernel space and wire it.
 * This is similar to a physical write operation.
 */
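/*
 * In outline (summarizing the code below): each user page backing the
 * iovec is made resident with vm_fault_quick(), its physical address is
 * looked up with pmap_kextract(), the page is wired with vm_page_wire(),
 * and the run of pages is then entered into the pipe's kva window with
 * pmap_qenter() so the reader can uiomove() straight out of the sender's
 * pages.
 */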
static int
pipe_build_write_buffer(wpipe, uio)
	struct pipe *wpipe;
	struct uio *uio;
{
	u_int size;
	int i;
	vm_offset_t addr, endaddr, paddr;

	size = (u_int) uio->uio_iov->iov_len;
	if (size > wpipe->pipe_buffer.size)
		size = wpipe->pipe_buffer.size;

	endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
	for(i = 0, addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
		addr < endaddr;
		addr += PAGE_SIZE, i+=1) {

		vm_page_t m;

		if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0 ||
		    (paddr = pmap_kextract(addr)) == 0) {
			int j;
			for(j=0;j<i;j++)
				vm_page_unwire(wpipe->pipe_map.ms[j], 1);
			return EFAULT;
		}

		m = PHYS_TO_VM_PAGE(paddr);
		vm_page_wire(m);
		wpipe->pipe_map.ms[i] = m;
	}

/*
 * set up the control block
 */
	wpipe->pipe_map.npages = i;
	wpipe->pipe_map.pos = ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
	wpipe->pipe_map.cnt = size;

/*
 * and map the buffer
 */
	if (wpipe->pipe_map.kva == 0) {
		/*
		 * We need to allocate space for an extra page because the
		 * address range might (will) span pages at times.
		 */
		wpipe->pipe_map.kva = kmem_alloc_pageable(kernel_map,
			wpipe->pipe_buffer.size + PAGE_SIZE);
		amountpipekva += wpipe->pipe_buffer.size + PAGE_SIZE;
	}
	pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms,
		wpipe->pipe_map.npages);

/*
 * and update the uio data
 */

	uio->uio_iov->iov_len -= size;
	uio->uio_iov->iov_base += size;
	if (uio->uio_iov->iov_len == 0)
		uio->uio_iov++;
	uio->uio_resid -= size;
	uio->uio_offset += size;
	return 0;
}

/*
 * unmap and unwire the process buffer
 */
static void
pipe_destroy_write_buffer(wpipe)
struct pipe *wpipe;
{
	int i;
	if (wpipe->pipe_map.kva) {
		pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages);

		if (amountpipekva > MAXPIPEKVA) {
			vm_offset_t kva = wpipe->pipe_map.kva;
			wpipe->pipe_map.kva = 0;
			kmem_free(kernel_map, kva,
				wpipe->pipe_buffer.size + PAGE_SIZE);
			amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE;
		}
	}
	for (i=0;i<wpipe->pipe_map.npages;i++)
		vm_page_unwire(wpipe->pipe_map.ms[i], 1);
}

/*
 * In the case of a signal, the writing process might go away.  This
 * code copies the data into the circular buffer so that the source
 * pages can be freed without loss of data.
 */
static void
pipe_clone_write_buffer(wpipe)
struct pipe *wpipe;
{
	int size;
	int pos;

	size = wpipe->pipe_map.cnt;
	pos = wpipe->pipe_map.pos;
	bcopy((caddr_t) wpipe->pipe_map.kva+pos,
			(caddr_t) wpipe->pipe_buffer.buffer,
			size);

	wpipe->pipe_buffer.in = size;
	wpipe->pipe_buffer.out = 0;
	wpipe->pipe_buffer.cnt = size;
	wpipe->pipe_state &= ~PIPE_DIRECTW;

	pipe_destroy_write_buffer(wpipe);
}

/*
 * This implements the pipe buffer write mechanism.  Note that only
 * a direct write OR a normal pipe write can be pending at any given time.
 * If there are any characters in the pipe buffer, the direct write will
 * be deferred until the receiving process grabs all of the bytes from
 * the pipe buffer.  Then the direct mapping write is set-up.
 */
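/*
 * Rough sequence, summarizing the code below: the writer first waits out
 * any previous direct write ("pipdww") and any bytes still sitting in the
 * pipe buffer ("pipdwc"), then sets PIPE_DIRECTW, wires its pages with
 * pipe_build_write_buffer(), and sleeps in "pipdwt" until the reader has
 * consumed pipe_map and cleared PIPE_DIRECTW.  If the wait is interrupted
 * (e.g. by a signal) the wired data is copied into the kernel buffer by
 * pipe_clone_write_buffer(); otherwise the mapping is simply torn down by
 * pipe_destroy_write_buffer().
 */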
static int
pipe_direct_write(wpipe, uio)
	struct pipe *wpipe;
	struct uio *uio;
{
	int error;
retry:
	while (wpipe->pipe_state & PIPE_DIRECTW) {
		if ( wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		wpipe->pipe_state |= PIPE_WANTW;
		error = tsleep(wpipe,
				PRIBIO|PCATCH, "pipdww", 0);
		if (error)
			goto error1;
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			goto error1;
		}
	}
	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
	if (wpipe->pipe_buffer.cnt > 0) {
		if ( wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}

		wpipe->pipe_state |= PIPE_WANTW;
		error = tsleep(wpipe,
				PRIBIO|PCATCH, "pipdwc", 0);
		if (error)
			goto error1;
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			goto error1;
		}
		goto retry;
	}

	wpipe->pipe_state |= PIPE_DIRECTW;

	error = pipe_build_write_buffer(wpipe, uio);
	if (error) {
		wpipe->pipe_state &= ~PIPE_DIRECTW;
		goto error1;
	}

	error = 0;
	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
		if (wpipe->pipe_state & PIPE_EOF) {
			pipelock(wpipe, 0);
			pipe_destroy_write_buffer(wpipe);
			pipeunlock(wpipe);
			pipeselwakeup(wpipe);
			error = EPIPE;
			goto error1;
		}
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		pipeselwakeup(wpipe);
		error = tsleep(wpipe, PRIBIO|PCATCH, "pipdwt", 0);
	}

	pipelock(wpipe,0);
	if (wpipe->pipe_state & PIPE_DIRECTW) {
		/*
		 * this bit of trickery substitutes a kernel buffer for
		 * the process that might be going away.
		 */
		pipe_clone_write_buffer(wpipe);
	} else {
		pipe_destroy_write_buffer(wpipe);
	}
	pipeunlock(wpipe);
	return error;

error1:
	wakeup(wpipe);
	return error;
}
#endif

static int
pipe_write(fp, uio, cred, flags, p)
	struct file *fp;
	struct uio *uio;
	struct ucred *cred;
	struct proc *p;
	int flags;
{
	int error = 0;
	int orig_resid;

	struct pipe *wpipe, *rpipe;

	rpipe = (struct pipe *) fp->f_data;
	wpipe = rpipe->pipe_peer;

	/*
	 * detect loss of pipe read side, issue SIGPIPE if lost.
	 */
	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
		return EPIPE;
	}

	/*
	 * If it is advantageous to resize the pipe buffer, do
	 * so.
	 */
	if ((uio->uio_resid > PIPE_SIZE) &&
		(nbigpipe < LIMITBIGPIPES) &&
		(wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
		(wpipe->pipe_buffer.size <= PIPE_SIZE) &&
		(wpipe->pipe_buffer.cnt == 0)) {

		if (wpipe->pipe_buffer.buffer) {
			amountpipekva -= wpipe->pipe_buffer.size;
			kmem_free(kernel_map,
				(vm_offset_t)wpipe->pipe_buffer.buffer,
				wpipe->pipe_buffer.size);
		}

#ifndef PIPE_NODIRECT
		if (wpipe->pipe_map.kva) {
			amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE;
			kmem_free(kernel_map,
				wpipe->pipe_map.kva,
				wpipe->pipe_buffer.size + PAGE_SIZE);
		}
#endif

		wpipe->pipe_buffer.in = 0;
		wpipe->pipe_buffer.out = 0;
		wpipe->pipe_buffer.cnt = 0;
		wpipe->pipe_buffer.size = BIG_PIPE_SIZE;
		wpipe->pipe_buffer.buffer = NULL;
		++nbigpipe;

#ifndef PIPE_NODIRECT
		wpipe->pipe_map.cnt = 0;
		wpipe->pipe_map.kva = 0;
		wpipe->pipe_map.pos = 0;
		wpipe->pipe_map.npages = 0;
#endif

	}


	if( wpipe->pipe_buffer.buffer == NULL) {
		if ((error = pipelock(wpipe,1)) == 0) {
			pipespace(wpipe);
			pipeunlock(wpipe);
		} else {
			return error;
		}
	}

	++wpipe->pipe_busy;
	orig_resid = uio->uio_resid;
	while (uio->uio_resid) {
		int space;
#ifndef PIPE_NODIRECT
		/*
		 * If the transfer is large, we can gain performance if
		 * we do process-to-process copies directly.
		 * If the write is non-blocking, we don't use the
		 * direct write mechanism.
		 */
		if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
		    (fp->f_flag & FNONBLOCK) == 0 &&
			(wpipe->pipe_map.kva || (amountpipekva < LIMITPIPEKVA)) &&
			(uio->uio_iov->iov_len >= PIPE_MINDIRECT)) {
			error = pipe_direct_write( wpipe, uio);
			if (error) {
				break;
			}
			continue;
		}
#endif

		/*
		 * Pipe buffered writes cannot be coincidental with
		 * direct writes.  We wait until the currently executing
		 * direct write is completed before we start filling the
		 * pipe buffer.
		 */
	retrywrite:
		while (wpipe->pipe_state & PIPE_DIRECTW) {
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}
			error = tsleep(wpipe,
					PRIBIO|PCATCH, "pipbww", 0);
			if (error)
				break;
		}

		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;

		/* Writes of size <= PIPE_BUF must be atomic. */
		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
			space = 0;
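		/*
		 * (PIPE_BUF is the POSIX atomic-write guarantee, 512 bytes in
		 * the usual configuration of this era -- a value assumed here
		 * only for illustration; zeroing "space" simply makes a small
		 * writer wait until its whole request fits, rather than
		 * splitting it.)
		 */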

		if (space > 0 && (wpipe->pipe_buffer.cnt < PIPE_SIZE)) {
			if ((error = pipelock(wpipe,1)) == 0) {
				int size;	/* Transfer size */
				int segsize;	/* first segment to transfer */
				/*
				 * It is possible for a direct write to
				 * slip in on us... handle it here...
				 */
				if (wpipe->pipe_state & PIPE_DIRECTW) {
					pipeunlock(wpipe);
					goto retrywrite;
				}
				/*
				 * If a process blocked in uiomove, our
				 * value for space might be bad.
				 */
				if (space > wpipe->pipe_buffer.size -
				    wpipe->pipe_buffer.cnt) {
					pipeunlock(wpipe);
					goto retrywrite;
				}

				/*
				 * Transfer size is minimum of uio transfer
				 * and free space in pipe buffer.
				 */
				if (space > uio->uio_resid)
					size = uio->uio_resid;
				else
					size = space;
				/*
				 * First segment to transfer is minimum of
				 * transfer size and contiguous space in
				 * pipe buffer.  If first segment to transfer
				 * is less than the transfer size, we've got
				 * a wraparound in the buffer.
				 */
				segsize = wpipe->pipe_buffer.size -
					wpipe->pipe_buffer.in;
				if (segsize > size)
					segsize = size;

				/* Transfer first segment */

				error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
						segsize, uio);

				if (error == 0 && segsize < size) {
					/*
					 * Transfer remaining part now, to
					 * support atomic writes.  Wraparound
					 * happened.
					 */
					if (wpipe->pipe_buffer.in + segsize !=
					    wpipe->pipe_buffer.size)
						panic("Expected pipe buffer wraparound disappeared");

					error = uiomove(&wpipe->pipe_buffer.buffer[0],
							size - segsize, uio);
				}
				if (error == 0) {
					wpipe->pipe_buffer.in += size;
					if (wpipe->pipe_buffer.in >=
					    wpipe->pipe_buffer.size) {
						if (wpipe->pipe_buffer.in != size - segsize + wpipe->pipe_buffer.size)
							panic("Expected wraparound bad");
						wpipe->pipe_buffer.in = size - segsize;
					}

					wpipe->pipe_buffer.cnt += size;
					if (wpipe->pipe_buffer.cnt > wpipe->pipe_buffer.size)
						panic("Pipe buffer overflow");

				}
				pipeunlock(wpipe);
			}
			if (error)
				break;

		} else {
			/*
			 * If the "read-side" has been blocked, wake it up now.
			 */
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}

			/*
			 * don't block on non-blocking I/O
			 */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
				break;
			}

			/*
			 * We have no more space and have something to offer,
			 * wake up select/poll.
			 */
			pipeselwakeup(wpipe);

			wpipe->pipe_state |= PIPE_WANTW;
			if ((error = tsleep(wpipe, (PRIBIO+1)|PCATCH, "pipewr", 0)) != 0) {
				break;
			}
			/*
			 * If read side wants to go away, we just issue a signal
			 * to ourselves.
			 */
			if (wpipe->pipe_state & PIPE_EOF) {
				error = EPIPE;
				break;
			}
		}
	}

	--wpipe->pipe_busy;
	if ((wpipe->pipe_busy == 0) &&
		(wpipe->pipe_state & PIPE_WANT)) {
		wpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTR);
		wakeup(wpipe);
	} else if (wpipe->pipe_buffer.cnt > 0) {
		/*
		 * If we have put any characters in the buffer, we wake up
		 * the reader.
		 */
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
	}

	/*
	 * Don't return EPIPE if I/O was successful
	 */
	if ((wpipe->pipe_buffer.cnt == 0) &&
		(uio->uio_resid == 0) &&
		(error == EPIPE))
		error = 0;

	if (error == 0)
		getnanotime(&wpipe->pipe_mtime);

	/*
	 * We have something to offer,
	 * wake up select/poll.
	 */
	if (wpipe->pipe_buffer.cnt)
		pipeselwakeup(wpipe);

	return error;
}

/*
 * we implement a very minimal set of ioctls for compatibility with sockets.
 */
int
pipe_ioctl(fp, cmd, data, p)
	struct file *fp;
	u_long cmd;
	register caddr_t data;
	struct proc *p;
{
	register struct pipe *mpipe = (struct pipe *)fp->f_data;

	switch (cmd) {

	case FIONBIO:
		return (0);

	case FIOASYNC:
		if (*(int *)data) {
			mpipe->pipe_state |= PIPE_ASYNC;
		} else {
			mpipe->pipe_state &= ~PIPE_ASYNC;
		}
		return (0);

	case FIONREAD:
		if (mpipe->pipe_state & PIPE_DIRECTW)
			*(int *)data = mpipe->pipe_map.cnt;
		else
			*(int *)data = mpipe->pipe_buffer.cnt;
		return (0);

	case FIOSETOWN:
		return (fsetown(*(int *)data, &mpipe->pipe_sigio));

	case FIOGETOWN:
		*(int *)data = fgetown(mpipe->pipe_sigio);
		return (0);

	/* This is deprecated, FIOSETOWN should be used instead. */
	case TIOCSPGRP:
		return (fsetown(-(*(int *)data), &mpipe->pipe_sigio));

	/* This is deprecated, FIOGETOWN should be used instead. */
	case TIOCGPGRP:
		*(int *)data = -fgetown(mpipe->pipe_sigio);
		return (0);

	}
	return (ENOTTY);
}
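
/*
 * Illustrative userland sketch (not part of this file's compiled code):
 * FIONREAD reports how many bytes are ready on the read side, whether
 * they sit in the kernel buffer or in a pending direct write, matching
 * the two cases handled above.
 *
 *	#include <sys/types.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/filio.h>
 *
 *	int
 *	bytes_ready(int readfd)
 *	{
 *		int n;
 *
 *		if (ioctl(readfd, FIONREAD, &n) < 0)
 *			return (-1);
 *		return (n);
 *	}
 */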

int
pipe_poll(fp, events, cred, p)
	struct file *fp;
	int events;
	struct ucred *cred;
	struct proc *p;
{
	register struct pipe *rpipe = (struct pipe *)fp->f_data;
	struct pipe *wpipe;
	int revents = 0;

	wpipe = rpipe->pipe_peer;
	if (events & (POLLIN | POLLRDNORM))
		if ((rpipe->pipe_state & PIPE_DIRECTW) ||
		    (rpipe->pipe_buffer.cnt > 0) ||
		    (rpipe->pipe_state & PIPE_EOF))
			revents |= events & (POLLIN | POLLRDNORM);

	if (events & (POLLOUT | POLLWRNORM))
		if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF) ||
		    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
		     (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
			revents |= events & (POLLOUT | POLLWRNORM);

	if ((rpipe->pipe_state & PIPE_EOF) ||
	    (wpipe == NULL) ||
	    (wpipe->pipe_state & PIPE_EOF))
		revents |= POLLHUP;

	if (revents == 0) {
		if (events & (POLLIN | POLLRDNORM)) {
			selrecord(p, &rpipe->pipe_sel);
			rpipe->pipe_state |= PIPE_SEL;
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			selrecord(p, &wpipe->pipe_sel);
			wpipe->pipe_state |= PIPE_SEL;
		}
	}

	return (revents);
}
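
/*
 * Illustrative userland sketch (not part of this file's compiled code):
 * poll() on the read end reports POLLIN once data is buffered (or a
 * direct write is pending) and POLLHUP once the write side has gone
 * away, matching the checks above.
 *
 *	#include <poll.h>
 *
 *	int
 *	wait_readable(int readfd, int timeout_ms)
 *	{
 *		struct pollfd pfd;
 *
 *		pfd.fd = readfd;
 *		pfd.events = POLLIN;
 *		pfd.revents = 0;
 *		return (poll(&pfd, 1, timeout_ms));
 *	}
 */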

static int
pipe_stat(fp, ub, p)
	struct file *fp;
	struct stat *ub;
	struct proc *p;
{
	struct pipe *pipe = (struct pipe *)fp->f_data;

	bzero((caddr_t)ub, sizeof (*ub));
	ub->st_mode = S_IFIFO;
	ub->st_blksize = pipe->pipe_buffer.size;
	ub->st_size = pipe->pipe_buffer.cnt;
	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
	ub->st_atimespec = pipe->pipe_atime;
	ub->st_mtimespec = pipe->pipe_mtime;
	ub->st_ctimespec = pipe->pipe_ctime;
	/*
	 * Left as 0: st_dev, st_ino, st_nlink, st_uid, st_gid, st_rdev,
	 * st_flags, st_gen.
	 * XXX (st_dev, st_ino) should be unique.
	 */
	return 0;
}

/* ARGSUSED */
static int
pipe_close(fp, p)
	struct file *fp;
	struct proc *p;
{
	struct pipe *cpipe = (struct pipe *)fp->f_data;

	fp->f_ops = &badfileops;
	fp->f_data = NULL;
	funsetown(cpipe->pipe_sigio);
	pipeclose(cpipe);
	return 0;
}

/*
 * shutdown the pipe
 */
static void
pipeclose(cpipe)
	struct pipe *cpipe;
{
	struct pipe *ppipe;
	if (cpipe) {

		pipeselwakeup(cpipe);

		/*
		 * If the other side is blocked, wake it up saying that
		 * we want to close it down.
		 */
		while (cpipe->pipe_busy) {
			wakeup(cpipe);
			cpipe->pipe_state |= PIPE_WANT|PIPE_EOF;
			tsleep(cpipe, PRIBIO, "pipecl", 0);
		}

		/*
		 * Disconnect from peer
		 */
		if ((ppipe = cpipe->pipe_peer) != NULL) {
			pipeselwakeup(ppipe);

			ppipe->pipe_state |= PIPE_EOF;
			wakeup(ppipe);
			ppipe->pipe_peer = NULL;
		}

		/*
		 * free resources
		 */
		if (cpipe->pipe_buffer.buffer) {
			if (cpipe->pipe_buffer.size > PIPE_SIZE)
				--nbigpipe;
			amountpipekva -= cpipe->pipe_buffer.size;
			kmem_free(kernel_map,
				(vm_offset_t)cpipe->pipe_buffer.buffer,
				cpipe->pipe_buffer.size);
		}
#ifndef PIPE_NODIRECT
		if (cpipe->pipe_map.kva) {
			amountpipekva -= cpipe->pipe_buffer.size + PAGE_SIZE;
			kmem_free(kernel_map,
				cpipe->pipe_map.kva,
				cpipe->pipe_buffer.size + PAGE_SIZE);
		}
#endif
		zfree(pipe_zone, cpipe);
	}
}
1149163953Srrs