sys_pipe.c revision 126249
1/*
2 * Copyright (c) 1996 John S. Dyson
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice immediately at the beginning of the file, without modification,
10 *    this list of conditions, and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 3. Absolutely no warranty of function or purpose is made by the author
15 *    John S. Dyson.
16 * 4. Modifications may be freely made to this file if the above conditions
17 *    are met.
18 */
19
20/*
21 * This file contains a high-performance replacement for the socket-based
22 * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
23 * all features of sockets, but does do everything that pipes normally
24 * do.
25 */
26
27/*
28 * This code has two modes of operation, a small write mode and a large
29 * write mode.  The small write mode acts like conventional pipes with
30 * a kernel buffer.  If the user's buffer is smaller than PIPE_MINDIRECT, the
31 * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
32 * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and
33 * the receiving process can copy it directly from the pages in the sending
34 * process.
35 *
36 * If the sending process receives a signal, it is possible that it will
37 * go away, and certainly its address space can change, because control
38 * is returned to the user-mode side.  In that case, the pipe code
39 * arranges to copy the buffer supplied by the user process, to a pageable
40 * kernel buffer, and the receiving process will grab the data from the
41 * pageable kernel buffer.  Since signals don't happen all that often,
42 * the copy operation is normally eliminated.
43 *
44 * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
45 * happen for small transfers so that the system will not spend all of
46 * its time context switching.
47 *
48 * In order to limit the resource use of pipes, two sysctls exist:
49 *
50 * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable
51 * address space available to us in pipe_map.  Whenever the amount in use
52 * exceeds half of this value, all new pipes will be created with size
53 * SMALL_PIPE_SIZE, rather than PIPE_SIZE.  Big pipe creation will be limited
54 * as well.  This value is loader tunable only.
55 *
56 * kern.ipc.maxpipekvawired - This value limits the amount of memory that may
57 * be wired in order to facilitate direct copies using page flipping.
58 * Whenever this value is exceeded, pipes will fall back to using regular
59 * copies.  This value is sysctl controllable at all times.
60 *
61 * These values are autotuned in subr_param.c.
62 *
63 * Memory usage may be monitored through the sysctls
64 * kern.ipc.pipes, kern.ipc.pipekva and kern.ipc.pipekvawired.
65 *
66 */
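/*
 * Illustrative sketch (not part of this revision): the usage counters named
 * above can be read from userland with sysctl(3).  A minimal example,
 * assuming a standard FreeBSD libc environment:
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int pipes, pipekva;
	size_t len;

	len = sizeof(pipes);
	if (sysctlbyname("kern.ipc.pipes", &pipes, &len, NULL, 0) == 0)
		printf("pipes in use: %d\n", pipes);
	len = sizeof(pipekva);
	if (sysctlbyname("kern.ipc.pipekva", &pipekva, &len, NULL, 0) == 0)
		printf("pipe KVA in use: %d bytes\n", pipekva);
	return (0);
}
#endif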
67
68#include <sys/cdefs.h>
69__FBSDID("$FreeBSD: head/sys/kern/sys_pipe.c 126249 2004-02-25 23:30:56Z rwatson $");
70
71#include "opt_mac.h"
72
73#include <sys/param.h>
74#include <sys/systm.h>
75#include <sys/fcntl.h>
76#include <sys/file.h>
77#include <sys/filedesc.h>
78#include <sys/filio.h>
79#include <sys/kernel.h>
80#include <sys/lock.h>
81#include <sys/mac.h>
82#include <sys/mutex.h>
83#include <sys/ttycom.h>
84#include <sys/stat.h>
85#include <sys/malloc.h>
86#include <sys/poll.h>
87#include <sys/selinfo.h>
88#include <sys/signalvar.h>
89#include <sys/sysctl.h>
90#include <sys/sysproto.h>
91#include <sys/pipe.h>
92#include <sys/proc.h>
93#include <sys/vnode.h>
94#include <sys/uio.h>
95#include <sys/event.h>
96
97#include <vm/vm.h>
98#include <vm/vm_param.h>
99#include <vm/vm_object.h>
100#include <vm/vm_kern.h>
101#include <vm/vm_extern.h>
102#include <vm/pmap.h>
103#include <vm/vm_map.h>
104#include <vm/vm_page.h>
105#include <vm/uma.h>
106
107/*
108 * Use this define if you want to disable *fancy* VM things.  Expect an
109 * approx 30% decrease in transfer rate.  This could be useful for
110 * NetBSD or OpenBSD.
111 */
112/* #define PIPE_NODIRECT */
113
114/*
115 * interfaces to the outside world
116 */
117static fo_rdwr_t	pipe_read;
118static fo_rdwr_t	pipe_write;
119static fo_ioctl_t	pipe_ioctl;
120static fo_poll_t	pipe_poll;
121static fo_kqfilter_t	pipe_kqfilter;
122static fo_stat_t	pipe_stat;
123static fo_close_t	pipe_close;
124
125static struct fileops pipeops = {
126	.fo_read = pipe_read,
127	.fo_write = pipe_write,
128	.fo_ioctl = pipe_ioctl,
129	.fo_poll = pipe_poll,
130	.fo_kqfilter = pipe_kqfilter,
131	.fo_stat = pipe_stat,
132	.fo_close = pipe_close,
133	.fo_flags = DFLAG_PASSABLE
134};
135
136static void	filt_pipedetach(struct knote *kn);
137static int	filt_piperead(struct knote *kn, long hint);
138static int	filt_pipewrite(struct knote *kn, long hint);
139
140static struct filterops pipe_rfiltops =
141	{ 1, NULL, filt_pipedetach, filt_piperead };
142static struct filterops pipe_wfiltops =
143	{ 1, NULL, filt_pipedetach, filt_pipewrite };
144
145/*
146 * Default pipe buffer size(s); these can be fairly large now because pipe
147 * space is pageable.  The pipe code will try to maintain locality of
148 * reference for performance reasons, so small amounts of outstanding I/O
149 * will not wipe the cache.
150 */
151#define MINPIPESIZE (PIPE_SIZE/3)
152#define MAXPIPESIZE (2*PIPE_SIZE/3)
153
154/*
155 * Limit the number of "big" pipes
156 */
157#define LIMITBIGPIPES	32
158static int nbigpipe;
159
160static int amountpipes;
161static int amountpipekva;
162static int amountpipekvawired;
163
164SYSCTL_DECL(_kern_ipc);
165
166SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN,
167	   &maxpipekva, 0, "Pipe KVA limit");
168SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekvawired, CTLFLAG_RW,
169	   &maxpipekvawired, 0, "Pipe KVA wired limit");
170SYSCTL_INT(_kern_ipc, OID_AUTO, pipes, CTLFLAG_RD,
171	   &amountpipes, 0, "Current # of pipes");
172SYSCTL_INT(_kern_ipc, OID_AUTO, bigpipes, CTLFLAG_RD,
173	   &nbigpipe, 0, "Current # of big pipes");
174SYSCTL_INT(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD,
175	   &amountpipekva, 0, "Pipe KVA usage");
176SYSCTL_INT(_kern_ipc, OID_AUTO, pipekvawired, CTLFLAG_RD,
177	   &amountpipekvawired, 0, "Pipe wired KVA usage");
178
179static void pipeinit(void *dummy __unused);
180static void pipeclose(struct pipe *cpipe);
181static void pipe_free_kmem(struct pipe *cpipe);
182static int pipe_create(struct pipe *pipe);
183static __inline int pipelock(struct pipe *cpipe, int catch);
184static __inline void pipeunlock(struct pipe *cpipe);
185static __inline void pipeselwakeup(struct pipe *cpipe);
186#ifndef PIPE_NODIRECT
187static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio);
188static void pipe_destroy_write_buffer(struct pipe *wpipe);
189static int pipe_direct_write(struct pipe *wpipe, struct uio *uio);
190static void pipe_clone_write_buffer(struct pipe *wpipe);
191#endif
192static int pipespace(struct pipe *cpipe, int size);
193
194static void	pipe_zone_ctor(void *mem, int size, void *arg);
195static void	pipe_zone_dtor(void *mem, int size, void *arg);
196static void	pipe_zone_init(void *mem, int size);
197static void	pipe_zone_fini(void *mem, int size);
198
199static uma_zone_t pipe_zone;
200
201SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);
202
203static void
204pipeinit(void *dummy __unused)
205{
206
207	pipe_zone = uma_zcreate("PIPE", sizeof(struct pipepair),
208	    pipe_zone_ctor, pipe_zone_dtor, pipe_zone_init, pipe_zone_fini,
209	    UMA_ALIGN_PTR, 0);
210	KASSERT(pipe_zone != NULL, ("pipe_zone not initialized"));
211}
212
213static void
214pipe_zone_ctor(void *mem, int size, void *arg)
215{
216	struct pipepair *pp;
217	struct pipe *rpipe, *wpipe;
218
219	KASSERT(size == sizeof(*pp), ("pipe_zone_ctor: wrong size"));
220
221	pp = (struct pipepair *)mem;
222
223	/*
224	 * We zero both pipe endpoints to make sure all the kmem pointers
225	 * are NULL, flag fields are zero'd, etc.  We timestamp both
226	 * endpoints with the same time.
227	 */
228	rpipe = &pp->pp_rpipe;
229	bzero(rpipe, sizeof(*rpipe));
230	vfs_timestamp(&rpipe->pipe_ctime);
231	rpipe->pipe_atime = rpipe->pipe_mtime = rpipe->pipe_ctime;
232
233	wpipe = &pp->pp_wpipe;
234	bzero(wpipe, sizeof(*wpipe));
235	wpipe->pipe_ctime = rpipe->pipe_ctime;
236	wpipe->pipe_atime = wpipe->pipe_mtime = rpipe->pipe_ctime;
237
238	rpipe->pipe_peer = wpipe;
239	rpipe->pipe_pair = pp;
240	wpipe->pipe_peer = rpipe;
241	wpipe->pipe_pair = pp;
242
243	/*
244	 * Mark both endpoints as present; they will later be freed
245	 * one at a time.  When both have been freed, the whole pair
246	 * is released.
247	 */
248	rpipe->pipe_present = 1;
249	wpipe->pipe_present = 1;
250
251	/*
252	 * Eventually, the MAC Framework may initialize the label
253	 * in ctor or init, but for now we do it elsewhere to avoid
254	 * blocking in ctor or init.
255	 */
256	pp->pp_label = NULL;
257
258	atomic_add_int(&amountpipes, 2);
259}
260
261static void
262pipe_zone_dtor(void *mem, int size, void *arg)
263{
264	struct pipepair *pp;
265
266	KASSERT(size == sizeof(*pp), ("pipe_zone_dtor: wrong size"));
267
268	pp = (struct pipepair *)mem;
269
270	atomic_subtract_int(&amountpipes, 2);
271}
272
273static void
274pipe_zone_init(void *mem, int size)
275{
276	struct pipepair *pp;
277
278	KASSERT(size == sizeof(*pp), ("pipe_zone_init: wrong size"));
279
280	pp = (struct pipepair *)mem;
281
282	mtx_init(&pp->pp_mtx, "pipe mutex", NULL, MTX_DEF | MTX_RECURSE);
283}
284
285static void
286pipe_zone_fini(void *mem, int size)
287{
288	struct pipepair *pp;
289
290	KASSERT(size == sizeof(*pp), ("pipe_zone_fini: wrong size"));
291
292	pp = (struct pipepair *)mem;
293
294	mtx_destroy(&pp->pp_mtx);
295}
296
297/*
298 * The pipe system call for the DTYPE_PIPE type of pipes.  If we fail,
299 * let the zone pick up the pieces via pipeclose().
300 */
301
302/* ARGSUSED */
303int
304pipe(td, uap)
305	struct thread *td;
306	struct pipe_args /* {
307		int	dummy;
308	} */ *uap;
309{
310	struct filedesc *fdp = td->td_proc->p_fd;
311	struct file *rf, *wf;
312	struct pipepair *pp;
313	struct pipe *rpipe, *wpipe;
314	int fd, error;
315
316	pp = uma_zalloc(pipe_zone, M_WAITOK);
317#ifdef MAC
318	/*
319	 * The MAC label is shared between the connected endpoints.  As a
320	 * result mac_init_pipe() and mac_create_pipe() are called once
321	 * for the pair, and not on the endpoints.
322	 */
323	mac_init_pipe(pp);
324	mac_create_pipe(td->td_ucred, pp);
325#endif
326	rpipe = &pp->pp_rpipe;
327	wpipe = &pp->pp_wpipe;
328
329	if (pipe_create(rpipe) || pipe_create(wpipe)) {
330		pipeclose(rpipe);
331		pipeclose(wpipe);
332		return (ENFILE);
333	}
334
335	rpipe->pipe_state |= PIPE_DIRECTOK;
336	wpipe->pipe_state |= PIPE_DIRECTOK;
337
338	error = falloc(td, &rf, &fd);
339	if (error) {
340		pipeclose(rpipe);
341		pipeclose(wpipe);
342		return (error);
343	}
344	/* An extra reference on `rf' has been held for us by falloc(). */
345	td->td_retval[0] = fd;
346
347	/*
348	 * Warning: once we've gotten past allocation of the fd for the
349	 * read-side, we can only drop the read side via fdrop() in order
350	 * to avoid races against processes which manage to dup() the read
351	 * side while we are blocked trying to allocate the write side.
352	 */
353	FILE_LOCK(rf);
354	rf->f_flag = FREAD | FWRITE;
355	rf->f_type = DTYPE_PIPE;
356	rf->f_data = rpipe;
357	rf->f_ops = &pipeops;
358	FILE_UNLOCK(rf);
359	error = falloc(td, &wf, &fd);
360	if (error) {
361		FILEDESC_LOCK(fdp);
362		if (fdp->fd_ofiles[td->td_retval[0]] == rf) {
363			fdp->fd_ofiles[td->td_retval[0]] = NULL;
364			fdunused(fdp, td->td_retval[0]);
365			FILEDESC_UNLOCK(fdp);
366			fdrop(rf, td);
367		} else {
368			FILEDESC_UNLOCK(fdp);
369		}
370		fdrop(rf, td);
371		/* rpipe has been closed by fdrop(). */
372		pipeclose(wpipe);
373		return (error);
374	}
375	/* An extra reference on `wf' has been held for us by falloc(). */
376	FILE_LOCK(wf);
377	wf->f_flag = FREAD | FWRITE;
378	wf->f_type = DTYPE_PIPE;
379	wf->f_data = wpipe;
380	wf->f_ops = &pipeops;
381	FILE_UNLOCK(wf);
382	fdrop(wf, td);
383	td->td_retval[1] = fd;
384	fdrop(rf, td);
385
386	return (0);
387}
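/*
 * Illustrative sketch (not part of this revision): the canonical userland
 * use of the pipe(2) interface implemented above.  Error handling is
 * trimmed for brevity.
 */
#if 0
#include <sys/wait.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fds[2];
	char buf[32];
	ssize_t n;

	if (pipe(fds) == -1)
		return (1);
	if (fork() == 0) {
		/* Child: write into fds[1], the write-side descriptor. */
		close(fds[0]);
		write(fds[1], "hello", 5);
		close(fds[1]);
		_exit(0);
	}
	/* Parent: read from fds[0], the read-side descriptor. */
	close(fds[1]);
	n = read(fds[0], buf, sizeof(buf));
	if (n > 0)
		printf("%.*s\n", (int)n, buf);
	close(fds[0]);
	wait(NULL);
	return (0);
}
#endif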
388
389/*
390 * Allocate kva for the pipe circular buffer; the space is pageable.
391 * This routine will 'realloc' the size of a pipe safely: if it
392 * fails, it retains the old buffer and returns ENOMEM, leaving the
393 * pipe usable at its previous size.
394 */
395static int
396pipespace(cpipe, size)
397	struct pipe *cpipe;
398	int size;
399{
400	caddr_t buffer;
401	int error;
402	static int curfail = 0;
403	static struct timeval lastfail;
404
405	KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked"));
406
407	size = round_page(size);
408	/*
409	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
410	 */
411	buffer = (caddr_t) vm_map_min(pipe_map);
412
413	/*
414	 * The map entry is, by default, pageable.
415	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
416	 */
417	error = vm_map_find(pipe_map, NULL, 0,
418		(vm_offset_t *) &buffer, size, 1,
419		VM_PROT_ALL, VM_PROT_ALL, 0);
420	if (error != KERN_SUCCESS) {
421		if (ppsratecheck(&lastfail, &curfail, 1))
422			printf("kern.ipc.maxpipekva exceeded; see tuning(7)\n");
423		return (ENOMEM);
424	}
425
426	/* free old resources if we're resizing */
427	pipe_free_kmem(cpipe);
428	cpipe->pipe_buffer.buffer = buffer;
429	cpipe->pipe_buffer.size = size;
430	cpipe->pipe_buffer.in = 0;
431	cpipe->pipe_buffer.out = 0;
432	cpipe->pipe_buffer.cnt = 0;
433	atomic_add_int(&amountpipekva, cpipe->pipe_buffer.size);
434	return (0);
435}
436
437/*
438 * lock a pipe for I/O, blocking other access
439 */
440static __inline int
441pipelock(cpipe, catch)
442	struct pipe *cpipe;
443	int catch;
444{
445	int error;
446
447	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
448	while (cpipe->pipe_state & PIPE_LOCKFL) {
449		cpipe->pipe_state |= PIPE_LWANT;
450		error = msleep(cpipe, PIPE_MTX(cpipe),
451		    catch ? (PRIBIO | PCATCH) : PRIBIO,
452		    "pipelk", 0);
453		if (error != 0)
454			return (error);
455	}
456	cpipe->pipe_state |= PIPE_LOCKFL;
457	return (0);
458}
459
460/*
461 * unlock a pipe I/O lock
462 */
463static __inline void
464pipeunlock(cpipe)
465	struct pipe *cpipe;
466{
467
468	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
469	cpipe->pipe_state &= ~PIPE_LOCKFL;
470	if (cpipe->pipe_state & PIPE_LWANT) {
471		cpipe->pipe_state &= ~PIPE_LWANT;
472		wakeup(cpipe);
473	}
474}
475
476static __inline void
477pipeselwakeup(cpipe)
478	struct pipe *cpipe;
479{
480
481	if (cpipe->pipe_state & PIPE_SEL) {
482		cpipe->pipe_state &= ~PIPE_SEL;
483		selwakeuppri(&cpipe->pipe_sel, PSOCK);
484	}
485	if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
486		pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
487	KNOTE(&cpipe->pipe_sel.si_note, 0);
488}
489
490/*
491 * Initialize and allocate VM and memory for pipe.  The structure
492 * will start out zero'd from the ctor, so we just manage the kmem.
493 */
494static int
495pipe_create(pipe)
496	struct pipe *pipe;
497{
498	int error;
499
500	PIPE_LOCK(pipe);
501	pipelock(pipe, 0);
502	PIPE_UNLOCK(pipe);
503	/*
504	 * Reduce to 1/4th pipe size if we're over our global max.
505	 */
506	if (amountpipekva > maxpipekva / 2)
507		error = pipespace(pipe, SMALL_PIPE_SIZE);
508	else
509		error = pipespace(pipe, PIPE_SIZE);
510	PIPE_LOCK(pipe);
511	pipeunlock(pipe);
512	PIPE_UNLOCK(pipe);
513	if (error)
514		return (error);
515
516	return (0);
517}
518
519/* ARGSUSED */
520static int
521pipe_read(fp, uio, active_cred, flags, td)
522	struct file *fp;
523	struct uio *uio;
524	struct ucred *active_cred;
525	struct thread *td;
526	int flags;
527{
528	struct pipe *rpipe = fp->f_data;
529	int error;
530	int nread = 0;
531	u_int size;
532
533	PIPE_LOCK(rpipe);
534	++rpipe->pipe_busy;
535	error = pipelock(rpipe, 1);
536	if (error)
537		goto unlocked_error;
538
539#ifdef MAC
540	error = mac_check_pipe_read(active_cred, rpipe->pipe_pair);
541	if (error)
542		goto locked_error;
543#endif
544
545	while (uio->uio_resid) {
546		/*
547		 * normal pipe buffer receive
548		 */
549		if (rpipe->pipe_buffer.cnt > 0) {
550			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
551			if (size > rpipe->pipe_buffer.cnt)
552				size = rpipe->pipe_buffer.cnt;
553			if (size > (u_int) uio->uio_resid)
554				size = (u_int) uio->uio_resid;
555
556			PIPE_UNLOCK(rpipe);
557			error = uiomove(
558			    &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
559			    size, uio);
560			PIPE_LOCK(rpipe);
561			if (error)
562				break;
563
564			rpipe->pipe_buffer.out += size;
565			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
566				rpipe->pipe_buffer.out = 0;
567
568			rpipe->pipe_buffer.cnt -= size;
569
570			/*
571			 * If there is no more to read in the pipe, reset
572			 * its pointers to the beginning.  This improves
573			 * cache hit stats.
574			 */
575			if (rpipe->pipe_buffer.cnt == 0) {
576				rpipe->pipe_buffer.in = 0;
577				rpipe->pipe_buffer.out = 0;
578			}
579			nread += size;
580#ifndef PIPE_NODIRECT
581		/*
582		 * Direct copy, bypassing a kernel buffer.
583		 */
584		} else if ((size = rpipe->pipe_map.cnt) &&
585			   (rpipe->pipe_state & PIPE_DIRECTW)) {
586			caddr_t	va;
587			if (size > (u_int) uio->uio_resid)
588				size = (u_int) uio->uio_resid;
589
590			va = (caddr_t) rpipe->pipe_map.kva +
591			    rpipe->pipe_map.pos;
592			PIPE_UNLOCK(rpipe);
593			error = uiomove(va, size, uio);
594			PIPE_LOCK(rpipe);
595			if (error)
596				break;
597			nread += size;
598			rpipe->pipe_map.pos += size;
599			rpipe->pipe_map.cnt -= size;
600			if (rpipe->pipe_map.cnt == 0) {
601				rpipe->pipe_state &= ~PIPE_DIRECTW;
602				wakeup(rpipe);
603			}
604#endif
605		} else {
606			/*
607			 * detect EOF condition
608			 * read returns 0 on EOF, no need to set error
609			 */
610			if (rpipe->pipe_state & PIPE_EOF)
611				break;
612
613			/*
614			 * If the "write-side" has been blocked, wake it up now.
615			 */
616			if (rpipe->pipe_state & PIPE_WANTW) {
617				rpipe->pipe_state &= ~PIPE_WANTW;
618				wakeup(rpipe);
619			}
620
621			/*
622			 * Break if some data was read.
623			 */
624			if (nread > 0)
625				break;
626
627			/*
628			 * Unlock the pipe buffer for our remaining processing.
629			 * We will either break out with an error or we will
630			 * sleep and relock to loop.
631			 */
632			pipeunlock(rpipe);
633
634			/*
635			 * Handle non-blocking mode operation or
636			 * wait for more data.
637			 */
638			if (fp->f_flag & FNONBLOCK) {
639				error = EAGAIN;
640			} else {
641				rpipe->pipe_state |= PIPE_WANTR;
642				if ((error = msleep(rpipe, PIPE_MTX(rpipe),
643				    PRIBIO | PCATCH,
644				    "piperd", 0)) == 0)
645					error = pipelock(rpipe, 1);
646			}
647			if (error)
648				goto unlocked_error;
649		}
650	}
651#ifdef MAC
652locked_error:
653#endif
654	pipeunlock(rpipe);
655
656	/* XXX: should probably do this before getting any locks. */
657	if (error == 0)
658		vfs_timestamp(&rpipe->pipe_atime);
659unlocked_error:
660	--rpipe->pipe_busy;
661
662	/*
663	 * PIPE_WANT processing only makes sense if pipe_busy is 0.
664	 */
665	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
666		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
667		wakeup(rpipe);
668	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
669		/*
670		 * Handle write blocking hysteresis.
671		 */
672		if (rpipe->pipe_state & PIPE_WANTW) {
673			rpipe->pipe_state &= ~PIPE_WANTW;
674			wakeup(rpipe);
675		}
676	}
677
678	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
679		pipeselwakeup(rpipe);
680
681	PIPE_UNLOCK(rpipe);
682	return (error);
683}
684
685#ifndef PIPE_NODIRECT
686/*
687 * Map the sending process's buffer into kernel space and wire it.
688 * This is similar to a physical write operation.
689 */
690static int
691pipe_build_write_buffer(wpipe, uio)
692	struct pipe *wpipe;
693	struct uio *uio;
694{
695	pmap_t pmap;
696	u_int size;
697	int i, j;
698	vm_offset_t addr, endaddr;
699
700	PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);
701
702	size = (u_int) uio->uio_iov->iov_len;
703	if (size > wpipe->pipe_buffer.size)
704		size = wpipe->pipe_buffer.size;
705
706	pmap = vmspace_pmap(curproc->p_vmspace);
707	endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
708	addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
709	for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) {
710		/*
711		 * vm_fault_quick() can sleep.  Consequently,
712		 * vm_page_lock_queues() and vm_page_unlock_queues()
713		 * should not be performed outside of this loop.
714		 */
715	race:
716		if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0) {
717			vm_page_lock_queues();
718			for (j = 0; j < i; j++)
719				vm_page_unhold(wpipe->pipe_map.ms[j]);
720			vm_page_unlock_queues();
721			return (EFAULT);
722		}
723		wpipe->pipe_map.ms[i] = pmap_extract_and_hold(pmap, addr,
724		    VM_PROT_READ);
725		if (wpipe->pipe_map.ms[i] == NULL)
726			goto race;
727	}
728
729/*
730 * set up the control block
731 */
732	wpipe->pipe_map.npages = i;
733	wpipe->pipe_map.pos =
734	    ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
735	wpipe->pipe_map.cnt = size;
736
737/*
738 * and map the buffer
739 */
740	if (wpipe->pipe_map.kva == 0) {
741		/*
742		 * We need to allocate space for an extra page because the
743		 * address range might (will) span pages at times.
744		 */
745		wpipe->pipe_map.kva = kmem_alloc_nofault(kernel_map,
746			wpipe->pipe_buffer.size + PAGE_SIZE);
747		atomic_add_int(&amountpipekvawired,
748		    wpipe->pipe_buffer.size + PAGE_SIZE);
749	}
750	pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms,
751		wpipe->pipe_map.npages);
752
753/*
754 * and update the uio data
755 */
756
757	uio->uio_iov->iov_len -= size;
758	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size;
759	if (uio->uio_iov->iov_len == 0)
760		uio->uio_iov++;
761	uio->uio_resid -= size;
762	uio->uio_offset += size;
763	return (0);
764}
765
766/*
767 * unmap and unwire the process buffer
768 */
769static void
770pipe_destroy_write_buffer(wpipe)
771	struct pipe *wpipe;
772{
773	int i;
774
775	PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);
776	if (wpipe->pipe_map.kva) {
777		pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages);
778
779		if (amountpipekvawired > maxpipekvawired / 2) {
780			/* Conserve address space */
781			vm_offset_t kva = wpipe->pipe_map.kva;
782			wpipe->pipe_map.kva = 0;
783			kmem_free(kernel_map, kva,
784			    wpipe->pipe_buffer.size + PAGE_SIZE);
785			atomic_subtract_int(&amountpipekvawired,
786			    wpipe->pipe_buffer.size + PAGE_SIZE);
787		}
788	}
789	vm_page_lock_queues();
790	for (i = 0; i < wpipe->pipe_map.npages; i++) {
791		vm_page_unhold(wpipe->pipe_map.ms[i]);
792	}
793	vm_page_unlock_queues();
794	wpipe->pipe_map.npages = 0;
795}
796
797/*
798 * In the case of a signal, the writing process might go away.  This
799 * code copies the data into the circular buffer so that the source
800 * pages can be freed without loss of data.
801 */
802static void
803pipe_clone_write_buffer(wpipe)
804	struct pipe *wpipe;
805{
806	int size;
807	int pos;
808
809	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
810	size = wpipe->pipe_map.cnt;
811	pos = wpipe->pipe_map.pos;
812
813	wpipe->pipe_buffer.in = size;
814	wpipe->pipe_buffer.out = 0;
815	wpipe->pipe_buffer.cnt = size;
816	wpipe->pipe_state &= ~PIPE_DIRECTW;
817
818	PIPE_UNLOCK(wpipe);
819	bcopy((caddr_t) wpipe->pipe_map.kva + pos,
820	    wpipe->pipe_buffer.buffer, size);
821	pipe_destroy_write_buffer(wpipe);
822	PIPE_LOCK(wpipe);
823}
824
825/*
826 * This implements the pipe buffer write mechanism.  Note that only
827 * a direct write OR a normal pipe write can be pending at any given time.
828 * If there are any characters in the pipe buffer, the direct write will
829 * be deferred until the receiving process grabs all of the bytes from
830 * the pipe buffer.  Then the direct mapping write is set-up.
831 */
832static int
833pipe_direct_write(wpipe, uio)
834	struct pipe *wpipe;
835	struct uio *uio;
836{
837	int error;
838
839retry:
840	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
841	while (wpipe->pipe_state & PIPE_DIRECTW) {
842		if (wpipe->pipe_state & PIPE_WANTR) {
843			wpipe->pipe_state &= ~PIPE_WANTR;
844			wakeup(wpipe);
845		}
846		wpipe->pipe_state |= PIPE_WANTW;
847		error = msleep(wpipe, PIPE_MTX(wpipe),
848		    PRIBIO | PCATCH, "pipdww", 0);
849		if (error)
850			goto error1;
851		if (wpipe->pipe_state & PIPE_EOF) {
852			error = EPIPE;
853			goto error1;
854		}
855	}
856	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
857	if (wpipe->pipe_buffer.cnt > 0) {
858		if (wpipe->pipe_state & PIPE_WANTR) {
859			wpipe->pipe_state &= ~PIPE_WANTR;
860			wakeup(wpipe);
861		}
862
863		wpipe->pipe_state |= PIPE_WANTW;
864		error = msleep(wpipe, PIPE_MTX(wpipe),
865		    PRIBIO | PCATCH, "pipdwc", 0);
866		if (error)
867			goto error1;
868		if (wpipe->pipe_state & PIPE_EOF) {
869			error = EPIPE;
870			goto error1;
871		}
872		goto retry;
873	}
874
875	wpipe->pipe_state |= PIPE_DIRECTW;
876
877	pipelock(wpipe, 0);
878	if (wpipe->pipe_state & PIPE_EOF) {
879		error = EPIPE;
880		goto error2;
881	}
882	PIPE_UNLOCK(wpipe);
883	error = pipe_build_write_buffer(wpipe, uio);
884	PIPE_LOCK(wpipe);
885	pipeunlock(wpipe);
886	if (error) {
887		wpipe->pipe_state &= ~PIPE_DIRECTW;
888		goto error1;
889	}
890
891	error = 0;
892	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
893		if (wpipe->pipe_state & PIPE_EOF) {
894			pipelock(wpipe, 0);
895			PIPE_UNLOCK(wpipe);
896			pipe_destroy_write_buffer(wpipe);
897			PIPE_LOCK(wpipe);
898			pipeselwakeup(wpipe);
899			pipeunlock(wpipe);
900			error = EPIPE;
901			goto error1;
902		}
903		if (wpipe->pipe_state & PIPE_WANTR) {
904			wpipe->pipe_state &= ~PIPE_WANTR;
905			wakeup(wpipe);
906		}
907		pipeselwakeup(wpipe);
908		error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH,
909		    "pipdwt", 0);
910	}
911
912	pipelock(wpipe, 0);
913	if (wpipe->pipe_state & PIPE_EOF)
914		error = EPIPE;
915	if (wpipe->pipe_state & PIPE_DIRECTW) {
916		/*
917		 * this bit of trickery substitutes a kernel buffer for
918		 * the process that might be going away.
919		 */
920		pipe_clone_write_buffer(wpipe);
921	} else {
922		PIPE_UNLOCK(wpipe);
923		pipe_destroy_write_buffer(wpipe);
924		PIPE_LOCK(wpipe);
925	}
926error2:
927	pipeunlock(wpipe);
928	return (error);
929
930error1:
931	wakeup(wpipe);
932	return (error);
933}
934#endif
935
936static int
937pipe_write(fp, uio, active_cred, flags, td)
938	struct file *fp;
939	struct uio *uio;
940	struct ucred *active_cred;
941	struct thread *td;
942	int flags;
943{
944	int error = 0;
945	int orig_resid;
946	struct pipe *wpipe, *rpipe;
947
948	rpipe = fp->f_data;
949	wpipe = rpipe->pipe_peer;
950
951	PIPE_LOCK(rpipe);
952	/*
953	 * detect loss of pipe read side, issue SIGPIPE if lost.
954	 */
955	if ((!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) {
956		PIPE_UNLOCK(rpipe);
957		return (EPIPE);
958	}
959#ifdef MAC
960	error = mac_check_pipe_write(active_cred, wpipe->pipe_pair);
961	if (error) {
962		PIPE_UNLOCK(rpipe);
963		return (error);
964	}
965#endif
966	++wpipe->pipe_busy;
967
968	/*
969	 * If it is advantageous to resize the pipe buffer, do
970	 * so.
971	 */
972	if ((uio->uio_resid > PIPE_SIZE) &&
973		(amountpipekva < maxpipekva / 2) &&
974		(nbigpipe < LIMITBIGPIPES) &&
975		(wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
976		(wpipe->pipe_buffer.size <= PIPE_SIZE) &&
977		(wpipe->pipe_buffer.cnt == 0)) {
978
979		if ((error = pipelock(wpipe, 1)) == 0) {
980			if (wpipe->pipe_state & PIPE_EOF)
981				error = EPIPE;
982			else {
983				PIPE_UNLOCK(wpipe);
984				if (pipespace(wpipe, BIG_PIPE_SIZE) == 0)
985					atomic_add_int(&nbigpipe, 1);
986				PIPE_LOCK(wpipe);
987			}
988			pipeunlock(wpipe);
989		}
990	}
991
992	/*
993	 * If an early error occurred, unbusy and return, waking up any pending
994	 * readers.
995	 */
996	if (error) {
997		--wpipe->pipe_busy;
998		if ((wpipe->pipe_busy == 0) &&
999		    (wpipe->pipe_state & PIPE_WANT)) {
1000			wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
1001			wakeup(wpipe);
1002		}
1003		PIPE_UNLOCK(rpipe);
1004		return (error);
1005	}
1006
1007	orig_resid = uio->uio_resid;
1008
1009	while (uio->uio_resid) {
1010		int space;
1011
1012#ifndef PIPE_NODIRECT
1013		/*
1014		 * If the transfer is large, we can gain performance if
1015		 * we do process-to-process copies directly.
1016		 * If the write is non-blocking, we don't use the
1017		 * direct write mechanism.
1018		 *
1019		 * The direct write mechanism will detect the reader going
1020		 * away on us.
1021		 */
1022		if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
1023		    (fp->f_flag & FNONBLOCK) == 0 &&
1024		    amountpipekvawired + uio->uio_resid < maxpipekvawired) {
1025			error = pipe_direct_write(wpipe, uio);
1026			if (error)
1027				break;
1028			continue;
1029		}
1030#endif
1031
1032		/*
1033		 * Pipe buffered writes cannot be coincident with
1034		 * direct writes.  We wait until the currently executing
1035		 * direct write is completed before we start filling the
1036		 * pipe buffer.  We break out if a signal occurs or the
1037		 * reader goes away.
1038		 */
1039	retrywrite:
1040		while (wpipe->pipe_state & PIPE_DIRECTW) {
1041			if (wpipe->pipe_state & PIPE_WANTR) {
1042				wpipe->pipe_state &= ~PIPE_WANTR;
1043				wakeup(wpipe);
1044			}
1045			error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH,
1046			    "pipbww", 0);
1047			if (wpipe->pipe_state & PIPE_EOF) {
1048				error = EPIPE;
1049				break;
1050			}
1051			if (error)
1052				break;
1053		}
1054
1055		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
1056
1057		/* Writes of size <= PIPE_BUF must be atomic. */
1058		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
1059			space = 0;
1060
1061		if (space > 0) {
1062			if ((error = pipelock(wpipe, 1)) == 0) {
1063				int size;	/* Transfer size */
1064				int segsize;	/* first segment to transfer */
1065
1066				/*
1067				 * It is possible for a direct write/EOF to
1068				 * slip in on us... handle them here...
1069				 */
1070				if (wpipe->pipe_state & PIPE_EOF)
1071					goto lost_wpipe;
1072				if (wpipe->pipe_state & PIPE_DIRECTW) {
1073					pipeunlock(wpipe);
1074					goto retrywrite;
1075				}
1076				/*
1077				 * If a process blocked in uiomove, our
1078				 * value for space might be bad.
1079				 *
1080				 * XXX will we be ok if the reader has gone
1081				 * away here?
1082				 */
1083				if (space > wpipe->pipe_buffer.size -
1084				    wpipe->pipe_buffer.cnt) {
1085					pipeunlock(wpipe);
1086					goto retrywrite;
1087				}
1088
1089				/*
1090				 * Transfer size is minimum of uio transfer
1091				 * and free space in pipe buffer.
1092				 */
1093				if (space > uio->uio_resid)
1094					size = uio->uio_resid;
1095				else
1096					size = space;
1097				/*
1098				 * First segment to transfer is minimum of
1099				 * transfer size and contiguous space in
1100				 * pipe buffer.  If first segment to transfer
1101				 * is less than the transfer size, we've got
1102				 * a wraparound in the buffer.
1103				 */
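				/*
				 * Worked example (hypothetical numbers): with
				 * a 1024-byte buffer, in = 900 and size = 700,
				 * segsize becomes 1024 - 900 = 124; the first
				 * uiomove() fills bytes 900..1023 and the
				 * second copies the remaining 576 bytes to
				 * offset 0, leaving in = 576.
				 */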
1104				segsize = wpipe->pipe_buffer.size -
1105					wpipe->pipe_buffer.in;
1106				if (segsize > size)
1107					segsize = size;
1108
1109				/* Transfer first segment */
1110
1111				PIPE_UNLOCK(rpipe);
1112				error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
1113						segsize, uio);
1114				PIPE_LOCK(rpipe);
1115
1116				if (error == 0 && segsize < size) {
1117					/*
1118					 * Transfer remaining part now, to
1119					 * support atomic writes.  Wraparound
1120					 * happened.
1121					 */
1122					if (wpipe->pipe_buffer.in + segsize !=
1123					    wpipe->pipe_buffer.size)
1124						panic("Expected pipe buffer "
1125						    "wraparound disappeared");
1126
1127					PIPE_UNLOCK(rpipe);
1128					error = uiomove(
1129					    &wpipe->pipe_buffer.buffer[0],
1130					    size - segsize, uio);
1131					PIPE_LOCK(rpipe);
1132				}
1133				if (error == 0) {
1134					wpipe->pipe_buffer.in += size;
1135					if (wpipe->pipe_buffer.in >=
1136					    wpipe->pipe_buffer.size) {
1137						if (wpipe->pipe_buffer.in !=
1138						    size - segsize +
1139						    wpipe->pipe_buffer.size)
1140							panic("Expected "
1141							    "wraparound bad");
1142						wpipe->pipe_buffer.in = size -
1143						    segsize;
1144					}
1145
1146					wpipe->pipe_buffer.cnt += size;
1147					if (wpipe->pipe_buffer.cnt >
1148					    wpipe->pipe_buffer.size)
1149						panic("Pipe buffer overflow");
1150
1151				}
1152lost_wpipe:
1153				pipeunlock(wpipe);
1154			}
1155			if (error)
1156				break;
1157
1158		} else {
1159			/*
1160			 * If the "read-side" has been blocked, wake it up now.
1161			 */
1162			if (wpipe->pipe_state & PIPE_WANTR) {
1163				wpipe->pipe_state &= ~PIPE_WANTR;
1164				wakeup(wpipe);
1165			}
1166
1167			/*
1168			 * don't block on non-blocking I/O
1169			 */
1170			if (fp->f_flag & FNONBLOCK) {
1171				error = EAGAIN;
1172				break;
1173			}
1174
1175			/*
1176			 * We have no more space and have something to offer,
1177			 * wake up select/poll.
1178			 */
1179			pipeselwakeup(wpipe);
1180
1181			wpipe->pipe_state |= PIPE_WANTW;
1182			error = msleep(wpipe, PIPE_MTX(rpipe),
1183			    PRIBIO | PCATCH, "pipewr", 0);
1184			if (error != 0)
1185				break;
1186			/*
1187			 * If read side wants to go away, we just issue a signal
1188			 * to ourselves.
1189			 */
1190			if (wpipe->pipe_state & PIPE_EOF) {
1191				error = EPIPE;
1192				break;
1193			}
1194		}
1195	}
1196
1197	--wpipe->pipe_busy;
1198
1199	if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
1200		wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
1201		wakeup(wpipe);
1202	} else if (wpipe->pipe_buffer.cnt > 0) {
1203		/*
1204		 * If we have put any characters in the buffer, we wake up
1205		 * the reader.
1206		 */
1207		if (wpipe->pipe_state & PIPE_WANTR) {
1208			wpipe->pipe_state &= ~PIPE_WANTR;
1209			wakeup(wpipe);
1210		}
1211	}
1212
1213	/*
1214	 * Don't return EPIPE if I/O was successful
1215	 */
1216	if ((wpipe->pipe_buffer.cnt == 0) &&
1217	    (uio->uio_resid == 0) &&
1218	    (error == EPIPE)) {
1219		error = 0;
1220	}
1221
1222	if (error == 0)
1223		vfs_timestamp(&wpipe->pipe_mtime);
1224
1225	/*
1226	 * We have something to offer,
1227	 * wake up select/poll.
1228	 */
1229	if (wpipe->pipe_buffer.cnt)
1230		pipeselwakeup(wpipe);
1231
1232	PIPE_UNLOCK(rpipe);
1233	return (error);
1234}
1235
1236/*
1237 * we implement a very minimal set of ioctls for compatibility with sockets.
1238 */
1239static int
1240pipe_ioctl(fp, cmd, data, active_cred, td)
1241	struct file *fp;
1242	u_long cmd;
1243	void *data;
1244	struct ucred *active_cred;
1245	struct thread *td;
1246{
1247	struct pipe *mpipe = fp->f_data;
1248#ifdef MAC
1249	int error;
1250#endif
1251
1252	PIPE_LOCK(mpipe);
1253
1254#ifdef MAC
1255	error = mac_check_pipe_ioctl(active_cred, mpipe->pipe_pair, cmd, data);
1256	if (error) {
1257		PIPE_UNLOCK(mpipe);
1258		return (error);
1259	}
1260#endif
1261
1262	switch (cmd) {
1263
1264	case FIONBIO:
1265		PIPE_UNLOCK(mpipe);
1266		return (0);
1267
1268	case FIOASYNC:
1269		if (*(int *)data) {
1270			mpipe->pipe_state |= PIPE_ASYNC;
1271		} else {
1272			mpipe->pipe_state &= ~PIPE_ASYNC;
1273		}
1274		PIPE_UNLOCK(mpipe);
1275		return (0);
1276
1277	case FIONREAD:
1278		if (mpipe->pipe_state & PIPE_DIRECTW)
1279			*(int *)data = mpipe->pipe_map.cnt;
1280		else
1281			*(int *)data = mpipe->pipe_buffer.cnt;
1282		PIPE_UNLOCK(mpipe);
1283		return (0);
1284
1285	case FIOSETOWN:
1286		PIPE_UNLOCK(mpipe);
1287		return (fsetown(*(int *)data, &mpipe->pipe_sigio));
1288
1289	case FIOGETOWN:
1290		PIPE_UNLOCK(mpipe);
1291		*(int *)data = fgetown(&mpipe->pipe_sigio);
1292		return (0);
1293
1294	/* This is deprecated, FIOSETOWN should be used instead. */
1295	case TIOCSPGRP:
1296		PIPE_UNLOCK(mpipe);
1297		return (fsetown(-(*(int *)data), &mpipe->pipe_sigio));
1298
1299	/* This is deprecated, FIOGETOWN should be used instead. */
1300	case TIOCGPGRP:
1301		PIPE_UNLOCK(mpipe);
1302		*(int *)data = -fgetown(&mpipe->pipe_sigio);
1303		return (0);
1304
1305	}
1306	PIPE_UNLOCK(mpipe);
1307	return (ENOTTY);
1308}
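/*
 * Illustrative sketch (not part of this revision): querying the amount of
 * readable data with the FIONREAD ioctl handled above.
 */
#if 0
#include <sys/ioctl.h>
#include <sys/filio.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fds[2], nbytes = 0;

	if (pipe(fds) == -1)
		return (1);
	write(fds[1], "abc", 3);
	if (ioctl(fds[0], FIONREAD, &nbytes) == 0)
		printf("%d byte(s) buffered in the pipe\n", nbytes);
	close(fds[0]);
	close(fds[1]);
	return (0);
}
#endif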
1309
1310static int
1311pipe_poll(fp, events, active_cred, td)
1312	struct file *fp;
1313	int events;
1314	struct ucred *active_cred;
1315	struct thread *td;
1316{
1317	struct pipe *rpipe = fp->f_data;
1318	struct pipe *wpipe;
1319	int revents = 0;
1320#ifdef MAC
1321	int error;
1322#endif
1323
1324	wpipe = rpipe->pipe_peer;
1325	PIPE_LOCK(rpipe);
1326#ifdef MAC
1327	error = mac_check_pipe_poll(active_cred, rpipe->pipe_pair);
1328	if (error)
1329		goto locked_error;
1330#endif
1331	if (events & (POLLIN | POLLRDNORM))
1332		if ((rpipe->pipe_state & PIPE_DIRECTW) ||
1333		    (rpipe->pipe_buffer.cnt > 0) ||
1334		    (rpipe->pipe_state & PIPE_EOF))
1335			revents |= events & (POLLIN | POLLRDNORM);
1336
1337	if (events & (POLLOUT | POLLWRNORM))
1338		if (!wpipe->pipe_present || (wpipe->pipe_state & PIPE_EOF) ||
1339		    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
1340		     (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
1341			revents |= events & (POLLOUT | POLLWRNORM);
1342
1343	if ((rpipe->pipe_state & PIPE_EOF) ||
1344	    (!wpipe->pipe_present) ||
1345	    (wpipe->pipe_state & PIPE_EOF))
1346		revents |= POLLHUP;
1347
1348	if (revents == 0) {
1349		if (events & (POLLIN | POLLRDNORM)) {
1350			selrecord(td, &rpipe->pipe_sel);
1351			rpipe->pipe_state |= PIPE_SEL;
1352		}
1353
1354		if (events & (POLLOUT | POLLWRNORM)) {
1355			selrecord(td, &wpipe->pipe_sel);
1356			wpipe->pipe_state |= PIPE_SEL;
1357		}
1358	}
1359#ifdef MAC
1360locked_error:
1361#endif
1362	PIPE_UNLOCK(rpipe);
1363
1364	return (revents);
1365}
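/*
 * Illustrative sketch (not part of this revision): poll(2)ing both ends of
 * a pipe; the revents computed above are what userland observes.
 */
#if 0
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct pollfd pfd[2];
	int fds[2];

	if (pipe(fds) == -1)
		return (1);
	write(fds[1], "x", 1);
	pfd[0].fd = fds[0];
	pfd[0].events = POLLIN;
	pfd[1].fd = fds[1];
	pfd[1].events = POLLOUT;
	if (poll(pfd, 2, 0) > 0) {
		if (pfd[0].revents & POLLIN)
			printf("read side is readable\n");
		if (pfd[1].revents & POLLOUT)
			printf("write side has at least PIPE_BUF of space\n");
	}
	return (0);
}
#endif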
1366
1367/*
1368 * We shouldn't need locks here as we're doing a read and this should
1369 * be a natural race.
1370 */
1371static int
1372pipe_stat(fp, ub, active_cred, td)
1373	struct file *fp;
1374	struct stat *ub;
1375	struct ucred *active_cred;
1376	struct thread *td;
1377{
1378	struct pipe *pipe = fp->f_data;
1379#ifdef MAC
1380	int error;
1381
1382	PIPE_LOCK(pipe);
1383	error = mac_check_pipe_stat(active_cred, pipe->pipe_pair);
1384	PIPE_UNLOCK(pipe);
1385	if (error)
1386		return (error);
1387#endif
1388	bzero(ub, sizeof(*ub));
1389	ub->st_mode = S_IFIFO;
1390	ub->st_blksize = pipe->pipe_buffer.size;
1391	ub->st_size = pipe->pipe_buffer.cnt;
1392	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
1393	ub->st_atimespec = pipe->pipe_atime;
1394	ub->st_mtimespec = pipe->pipe_mtime;
1395	ub->st_ctimespec = pipe->pipe_ctime;
1396	ub->st_uid = fp->f_cred->cr_uid;
1397	ub->st_gid = fp->f_cred->cr_gid;
1398	/*
1399	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
1400	 * XXX (st_dev, st_ino) should be unique.
1401	 */
1402	return (0);
1403}
1404
1405/* ARGSUSED */
1406static int
1407pipe_close(fp, td)
1408	struct file *fp;
1409	struct thread *td;
1410{
1411	struct pipe *cpipe = fp->f_data;
1412
1413	fp->f_ops = &badfileops;
1414	fp->f_data = NULL;
1415	funsetown(&cpipe->pipe_sigio);
1416	pipeclose(cpipe);
1417	return (0);
1418}
1419
1420static void
1421pipe_free_kmem(cpipe)
1422	struct pipe *cpipe;
1423{
1424
1425	KASSERT(!mtx_owned(PIPE_MTX(cpipe)),
1426	    ("pipe_free_kmem: pipe mutex locked"));
1427
1428	if (cpipe->pipe_buffer.buffer != NULL) {
1429		if (cpipe->pipe_buffer.size > PIPE_SIZE)
1430			atomic_subtract_int(&nbigpipe, 1);
1431		atomic_subtract_int(&amountpipekva, cpipe->pipe_buffer.size);
1432		vm_map_remove(pipe_map,
1433		    (vm_offset_t)cpipe->pipe_buffer.buffer,
1434		    (vm_offset_t)cpipe->pipe_buffer.buffer + cpipe->pipe_buffer.size);
1435		cpipe->pipe_buffer.buffer = NULL;
1436	}
1437#ifndef PIPE_NODIRECT
1438	if (cpipe->pipe_map.kva != 0) {
1439		atomic_subtract_int(&amountpipekvawired,
1440		    cpipe->pipe_buffer.size + PAGE_SIZE);
1441		kmem_free(kernel_map,
1442			cpipe->pipe_map.kva,
1443			cpipe->pipe_buffer.size + PAGE_SIZE);
1444		cpipe->pipe_map.cnt = 0;
1445		cpipe->pipe_map.kva = 0;
1446		cpipe->pipe_map.pos = 0;
1447		cpipe->pipe_map.npages = 0;
1448	}
1449#endif
1450}
1451
1452/*
1453 * shutdown the pipe
1454 */
1455static void
1456pipeclose(cpipe)
1457	struct pipe *cpipe;
1458{
1459	struct pipepair *pp;
1460	struct pipe *ppipe;
1461
1462	KASSERT(cpipe != NULL, ("pipeclose: cpipe == NULL"));
1463
1464	PIPE_LOCK(cpipe);
1465	pp = cpipe->pipe_pair;
1466
1467	pipeselwakeup(cpipe);
1468
1469	/*
1470	 * If the other side is blocked, wake it up saying that
1471	 * we want to close it down.
1472	 */
1473	cpipe->pipe_state |= PIPE_EOF;
1474	while (cpipe->pipe_busy) {
1475		wakeup(cpipe);
1476		cpipe->pipe_state |= PIPE_WANT;
1477		msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0);
1478	}
1479
1480
1481	/*
1482	 * Disconnect from peer, if any.
1483	 */
1484	ppipe = cpipe->pipe_peer;
1485	if (ppipe->pipe_present != 0) {
1486		pipeselwakeup(ppipe);
1487
1488		ppipe->pipe_state |= PIPE_EOF;
1489		wakeup(ppipe);
1490		KNOTE(&ppipe->pipe_sel.si_note, 0);
1491	}
1492
1493	/*
1494	 * Mark this endpoint as free.  Release kmem resources.  We
1495	 * don't mark this endpoint as unused until we've finished
1496	 * doing that, or the pipe might disappear out from under
1497	 * us.
1498	 */
1499	pipelock(cpipe, 0);
1500	PIPE_UNLOCK(cpipe);
1501	pipe_free_kmem(cpipe);
1502	PIPE_LOCK(cpipe);
1503	cpipe->pipe_present = 0;
1504	pipeunlock(cpipe);
1505
1506	/*
1507	 * If both endpoints are now closed, release the memory for the
1508	 * pipe pair.  If not, unlock.
1509	 */
1510	if (ppipe->pipe_present == 0) {
1511		PIPE_UNLOCK(cpipe);
1512#ifdef MAC
1513		mac_destroy_pipe(pp);
1514#endif
1515		uma_zfree(pipe_zone, cpipe->pipe_pair);
1516	} else
1517		PIPE_UNLOCK(cpipe);
1518}
1519
1520/*ARGSUSED*/
1521static int
1522pipe_kqfilter(struct file *fp, struct knote *kn)
1523{
1524	struct pipe *cpipe;
1525
1526	cpipe = kn->kn_fp->f_data;
1527	PIPE_LOCK(cpipe);
1528	switch (kn->kn_filter) {
1529	case EVFILT_READ:
1530		kn->kn_fop = &pipe_rfiltops;
1531		break;
1532	case EVFILT_WRITE:
1533		kn->kn_fop = &pipe_wfiltops;
1534		if (!cpipe->pipe_peer->pipe_present) {
1535			/* other end of pipe has been closed */
1536			PIPE_UNLOCK(cpipe);
1537			return (EPIPE);
1538		}
1539		cpipe = cpipe->pipe_peer;
1540		break;
1541	default:
1542		PIPE_UNLOCK(cpipe);
1543		return (1);
1544	}
1545
1546	SLIST_INSERT_HEAD(&cpipe->pipe_sel.si_note, kn, kn_selnext);
1547	PIPE_UNLOCK(cpipe);
1548	return (0);
1549}
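/*
 * Illustrative sketch (not part of this revision): registering a read
 * filter on a pipe with kqueue(2); kev.data is filled from kn_data as
 * computed by filt_piperead() below.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct kevent kev;
	int kq, fds[2];

	if (pipe(fds) == -1 || (kq = kqueue()) == -1)
		return (1);
	EV_SET(&kev, fds[0], EVFILT_READ, EV_ADD, 0, 0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		return (1);
	write(fds[1], "x", 1);
	if (kevent(kq, NULL, 0, &kev, 1, NULL) == 1)
		printf("%ld byte(s) ready to read\n", (long)kev.data);
	return (0);
}
#endif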
1550
1551static void
1552filt_pipedetach(struct knote *kn)
1553{
1554	struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;
1555
1556	PIPE_LOCK(cpipe);
1557	if (kn->kn_filter == EVFILT_WRITE) {
1558		if (!cpipe->pipe_peer->pipe_present) {
1559			PIPE_UNLOCK(cpipe);
1560			return;
1561		}
1562		cpipe = cpipe->pipe_peer;
1563	}
1564	SLIST_REMOVE(&cpipe->pipe_sel.si_note, kn, knote, kn_selnext);
1565	PIPE_UNLOCK(cpipe);
1566}
1567
1568/*ARGSUSED*/
1569static int
1570filt_piperead(struct knote *kn, long hint)
1571{
1572	struct pipe *rpipe = kn->kn_fp->f_data;
1573	struct pipe *wpipe = rpipe->pipe_peer;
1574
1575	PIPE_LOCK(rpipe);
1576	kn->kn_data = rpipe->pipe_buffer.cnt;
1577	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
1578		kn->kn_data = rpipe->pipe_map.cnt;
1579
1580	if ((rpipe->pipe_state & PIPE_EOF) ||
1581	    (!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) {
1582		kn->kn_flags |= EV_EOF;
1583		PIPE_UNLOCK(rpipe);
1584		return (1);
1585	}
1586	PIPE_UNLOCK(rpipe);
1587	return (kn->kn_data > 0);
1588}
1589
1590/*ARGSUSED*/
1591static int
1592filt_pipewrite(struct knote *kn, long hint)
1593{
1594	struct pipe *rpipe = kn->kn_fp->f_data;
1595	struct pipe *wpipe = rpipe->pipe_peer;
1596
1597	PIPE_LOCK(rpipe);
1598	if ((!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) {
1599		kn->kn_data = 0;
1600		kn->kn_flags |= EV_EOF;
1601		PIPE_UNLOCK(rpipe);
1602		return (1);
1603	}
1604	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
1605	if (wpipe->pipe_state & PIPE_DIRECTW)
1606		kn->kn_data = 0;
1607
1608	PIPE_UNLOCK(rpipe);
1609	return (kn->kn_data >= PIPE_BUF);
1610}
1611