sys_pipe.c revision 207410
1/*-
2 * Copyright (c) 1996 John S. Dyson
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice immediately at the beginning of the file, without modification,
10 *    this list of conditions, and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 3. Absolutely no warranty of function or purpose is made by the author
15 *    John S. Dyson.
16 * 4. Modifications may be freely made to this file if the above conditions
17 *    are met.
18 */
19
20/*
21 * This file contains a high-performance replacement for the socket-based
22 * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
23 * all features of sockets, but does do everything that pipes normally
24 * do.
25 */
26
27/*
28 * This code has two modes of operation, a small write mode and a large
29 * write mode.  The small write mode acts like conventional pipes with
30 * a kernel buffer.  If the buffer is less than PIPE_MINDIRECT, then the
31 * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
32 * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and
33 * the receiving process can copy it directly from the pages in the sending
34 * process.
35 *
36 * If the sending process receives a signal, it is possible that it will
37 * go away, and certainly its address space can change, because control
38 * is returned back to the user-mode side.  In that case, the pipe code
39 * arranges to copy the buffer supplied by the user process, to a pageable
40 * kernel buffer, and the receiving process will grab the data from the
41 * pageable kernel buffer.  Since signals don't happen all that often,
42 * the copy operation is normally eliminated.
43 *
44 * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
45 * happen for small transfers so that the system will not spend all of
46 * its time context switching.
47 *
48 * In order to limit the resource use of pipes, two sysctls exist:
49 *
50 * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable
51 * address space available to us in pipe_map. This value is normally
52 * autotuned, but may also be loader tuned.
53 *
54 * kern.ipc.pipekva - This read-only sysctl tracks the current amount of
55 * memory in use by pipes.
56 *
57 * Based on how large pipekva is relative to maxpipekva, the following
58 * will happen:
59 *
60 * 0% - 50%:
61 *     New pipes are given 16K of memory backing, pipes may dynamically
62 *     grow to as large as 64K where needed.
63 * 50% - 75%:
64 *     New pipes are given 4K (or PAGE_SIZE) of memory backing,
65 *     existing pipes may NOT grow.
66 * 75% - 100%:
67 *     New pipes are given 4K (or PAGE_SIZE) of memory backing,
68 *     existing pipes will be shrunk down to 4K whenever possible.
69 *
70 * Resizing may be disabled by setting kern.ipc.piperesizeallowed=0.  If
71 * that is set,  the only resize that will occur is the 0 -> SMALL_PIPE_SIZE
72 * resize which MUST occur for reverse-direction pipes when they are
73 * first used.
74 *
75 * Additional information about the current state of pipes may be obtained
76 * from kern.ipc.pipes, kern.ipc.pipefragretry, kern.ipc.pipeallocfail,
77 * and kern.ipc.piperesizefail.
78 *
79 * Locking rules:  There are two locks present here:  A mutex, used via
80 * PIPE_LOCK, and a flag, used via pipelock().  All locking is done via
81 * the flag, as mutexes can not persist over uiomove.  The mutex
82 * exists only to guard access to the flag, and is not in itself a
83 * locking mechanism.  Also note that there is only a single mutex for
84 * both directions of a pipe.
85 *
86 * As pipelock() may have to sleep before it can acquire the flag, it
87 * is important to reread all data after a call to pipelock(); everything
88 * in the structure may have changed.
89 */
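
/*
 * Illustrative sketch (not compiled): a minimal userland view of the two
 * write modes described above.  From user space the distinction is only
 * a performance one; a large blocking write from user memory is a
 * candidate for the direct, page-wired path, while small writes always
 * go through the pipe's kernel buffer.  The 256 KB size below is an
 * assumed example value, chosen only to exceed PIPE_MINDIRECT.
 */
#if 0
#include <sys/types.h>
#include <sys/wait.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	int fds[2];
	size_t bigsz = 256 * 1024;
	char *buf = malloc(bigsz);
	pid_t pid;

	if (buf == NULL || pipe(fds) == -1)
		return (1);
	memset(buf, 'x', bigsz);
	pid = fork();
	if (pid == 0) {
		/* Child: drain the pipe in small chunks. */
		char chunk[4096];

		close(fds[1]);
		while (read(fds[0], chunk, sizeof(chunk)) > 0)
			;
		_exit(0);
	}
	close(fds[0]);
	/* Small write: copied into the pipe's kernel buffer. */
	(void)write(fds[1], buf, 128);
	/* Large blocking write: eligible for the direct write path. */
	(void)write(fds[1], buf, bigsz);
	close(fds[1]);
	waitpid(pid, NULL, 0);
	free(buf);
	return (0);
}
#endif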
90
91#include <sys/cdefs.h>
92__FBSDID("$FreeBSD: head/sys/kern/sys_pipe.c 207410 2010-04-30 00:46:43Z kmacy $");
93
94#include <sys/param.h>
95#include <sys/systm.h>
96#include <sys/fcntl.h>
97#include <sys/file.h>
98#include <sys/filedesc.h>
99#include <sys/filio.h>
100#include <sys/kernel.h>
101#include <sys/lock.h>
102#include <sys/mutex.h>
103#include <sys/ttycom.h>
104#include <sys/stat.h>
105#include <sys/malloc.h>
106#include <sys/poll.h>
107#include <sys/selinfo.h>
108#include <sys/signalvar.h>
109#include <sys/syscallsubr.h>
110#include <sys/sysctl.h>
111#include <sys/sysproto.h>
112#include <sys/pipe.h>
113#include <sys/proc.h>
114#include <sys/vnode.h>
115#include <sys/uio.h>
116#include <sys/event.h>
117
118#include <security/mac/mac_framework.h>
119
120#include <vm/vm.h>
121#include <vm/vm_param.h>
122#include <vm/vm_object.h>
123#include <vm/vm_kern.h>
124#include <vm/vm_extern.h>
125#include <vm/pmap.h>
126#include <vm/vm_map.h>
127#include <vm/vm_page.h>
128#include <vm/uma.h>
129
130/*
131 * Use this define if you want to disable *fancy* VM things.  Expect an
132 * approx 30% decrease in transfer rate.  This could be useful for
133 * NetBSD or OpenBSD.
134 */
135/* #define PIPE_NODIRECT */
136
137/*
138 * interfaces to the outside world
139 */
140static fo_rdwr_t	pipe_read;
141static fo_rdwr_t	pipe_write;
142static fo_truncate_t	pipe_truncate;
143static fo_ioctl_t	pipe_ioctl;
144static fo_poll_t	pipe_poll;
145static fo_kqfilter_t	pipe_kqfilter;
146static fo_stat_t	pipe_stat;
147static fo_close_t	pipe_close;
148
149static struct fileops pipeops = {
150	.fo_read = pipe_read,
151	.fo_write = pipe_write,
152	.fo_truncate = pipe_truncate,
153	.fo_ioctl = pipe_ioctl,
154	.fo_poll = pipe_poll,
155	.fo_kqfilter = pipe_kqfilter,
156	.fo_stat = pipe_stat,
157	.fo_close = pipe_close,
158	.fo_flags = DFLAG_PASSABLE
159};
160
161static void	filt_pipedetach(struct knote *kn);
162static int	filt_piperead(struct knote *kn, long hint);
163static int	filt_pipewrite(struct knote *kn, long hint);
164
165static struct filterops pipe_rfiltops = {
166	.f_isfd = 1,
167	.f_detach = filt_pipedetach,
168	.f_event = filt_piperead
169};
170static struct filterops pipe_wfiltops = {
171	.f_isfd = 1,
172	.f_detach = filt_pipedetach,
173	.f_event = filt_pipewrite
174};
175
176/*
177 * Default pipe buffer size(s), this can be kind-of large now because pipe
178 * space is pageable.  The pipe code will try to maintain locality of
179 * reference for performance reasons, so small amounts of outstanding I/O
180 * will not wipe the cache.
181 */
182#define MINPIPESIZE (PIPE_SIZE/3)
183#define MAXPIPESIZE (2*PIPE_SIZE/3)
184
185static long amountpipekva;
186static int pipefragretry;
187static int pipeallocfail;
188static int piperesizefail;
189static int piperesizeallowed = 1;
190
191SYSCTL_LONG(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN,
192	   &maxpipekva, 0, "Pipe KVA limit");
193SYSCTL_LONG(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD,
194	   &amountpipekva, 0, "Pipe KVA usage");
195SYSCTL_INT(_kern_ipc, OID_AUTO, pipefragretry, CTLFLAG_RD,
196	  &pipefragretry, 0, "Pipe allocation retries due to fragmentation");
197SYSCTL_INT(_kern_ipc, OID_AUTO, pipeallocfail, CTLFLAG_RD,
198	  &pipeallocfail, 0, "Pipe allocation failures");
199SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizefail, CTLFLAG_RD,
200	  &piperesizefail, 0, "Pipe resize failures");
201SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizeallowed, CTLFLAG_RW,
202	  &piperesizeallowed, 0, "Pipe resizing allowed");
203
204static void pipeinit(void *dummy __unused);
205static void pipeclose(struct pipe *cpipe);
206static void pipe_free_kmem(struct pipe *cpipe);
207static int pipe_create(struct pipe *pipe, int backing);
208static __inline int pipelock(struct pipe *cpipe, int catch);
209static __inline void pipeunlock(struct pipe *cpipe);
210static __inline void pipeselwakeup(struct pipe *cpipe);
211#ifndef PIPE_NODIRECT
212static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio);
213static void pipe_destroy_write_buffer(struct pipe *wpipe);
214static int pipe_direct_write(struct pipe *wpipe, struct uio *uio);
215static void pipe_clone_write_buffer(struct pipe *wpipe);
216#endif
217static int pipespace(struct pipe *cpipe, int size);
218static int pipespace_new(struct pipe *cpipe, int size);
219
220static int	pipe_zone_ctor(void *mem, int size, void *arg, int flags);
221static int	pipe_zone_init(void *mem, int size, int flags);
222static void	pipe_zone_fini(void *mem, int size);
223
224static uma_zone_t pipe_zone;
225
226SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);
227
228static void
229pipeinit(void *dummy __unused)
230{
231
232	pipe_zone = uma_zcreate("pipe", sizeof(struct pipepair),
233	    pipe_zone_ctor, NULL, pipe_zone_init, pipe_zone_fini,
234	    UMA_ALIGN_PTR, 0);
235	KASSERT(pipe_zone != NULL, ("pipe_zone not initialized"));
236}
237
238static int
239pipe_zone_ctor(void *mem, int size, void *arg, int flags)
240{
241	struct pipepair *pp;
242	struct pipe *rpipe, *wpipe;
243
244	KASSERT(size == sizeof(*pp), ("pipe_zone_ctor: wrong size"));
245
246	pp = (struct pipepair *)mem;
247
248	/*
249	 * We zero both pipe endpoints to make sure all the kmem pointers
250	 * are NULL, flag fields are zero'd, etc.  We timestamp both
251	 * endpoints with the same time.
252	 */
253	rpipe = &pp->pp_rpipe;
254	bzero(rpipe, sizeof(*rpipe));
255	vfs_timestamp(&rpipe->pipe_ctime);
256	rpipe->pipe_atime = rpipe->pipe_mtime = rpipe->pipe_ctime;
257
258	wpipe = &pp->pp_wpipe;
259	bzero(wpipe, sizeof(*wpipe));
260	wpipe->pipe_ctime = rpipe->pipe_ctime;
261	wpipe->pipe_atime = wpipe->pipe_mtime = rpipe->pipe_ctime;
262
263	rpipe->pipe_peer = wpipe;
264	rpipe->pipe_pair = pp;
265	wpipe->pipe_peer = rpipe;
266	wpipe->pipe_pair = pp;
267
268	/*
269	 * Mark both endpoints as present; they will later get free'd
270	 * one at a time.  When both are free'd, then the whole pair
271	 * is released.
272	 */
273	rpipe->pipe_present = PIPE_ACTIVE;
274	wpipe->pipe_present = PIPE_ACTIVE;
275
276	/*
277	 * Eventually, the MAC Framework may initialize the label
278	 * in ctor or init, but for now we do it elsewhere to avoid
279	 * blocking in ctor or init.
280	 */
281	pp->pp_label = NULL;
282
283	return (0);
284}
285
286static int
287pipe_zone_init(void *mem, int size, int flags)
288{
289	struct pipepair *pp;
290
291	KASSERT(size == sizeof(*pp), ("pipe_zone_init: wrong size"));
292
293	pp = (struct pipepair *)mem;
294
295	mtx_init(&pp->pp_mtx, "pipe mutex", NULL, MTX_DEF | MTX_RECURSE);
296	return (0);
297}
298
299static void
300pipe_zone_fini(void *mem, int size)
301{
302	struct pipepair *pp;
303
304	KASSERT(size == sizeof(*pp), ("pipe_zone_fini: wrong size"));
305
306	pp = (struct pipepair *)mem;
307
308	mtx_destroy(&pp->pp_mtx);
309}
310
311/*
312 * The pipe system call for the DTYPE_PIPE type of pipes.  If we fail, let
313 * the zone pick up the pieces via pipeclose().
314 */
315int
316kern_pipe(struct thread *td, int fildes[2])
317{
318	struct filedesc *fdp = td->td_proc->p_fd;
319	struct file *rf, *wf;
320	struct pipepair *pp;
321	struct pipe *rpipe, *wpipe;
322	int fd, error;
323
324	pp = uma_zalloc(pipe_zone, M_WAITOK);
325#ifdef MAC
326	/*
327	 * The MAC label is shared between the connected endpoints.  As a
328	 * result mac_pipe_init() and mac_pipe_create() are called once
329	 * for the pair, and not on the endpoints.
330	 */
331	mac_pipe_init(pp);
332	mac_pipe_create(td->td_ucred, pp);
333#endif
334	rpipe = &pp->pp_rpipe;
335	wpipe = &pp->pp_wpipe;
336
337	knlist_init_mtx(&rpipe->pipe_sel.si_note, PIPE_MTX(rpipe));
338	knlist_init_mtx(&wpipe->pipe_sel.si_note, PIPE_MTX(wpipe));
339
340	/* Only the forward direction pipe is backed by default */
341	if ((error = pipe_create(rpipe, 1)) != 0 ||
342	    (error = pipe_create(wpipe, 0)) != 0) {
343		pipeclose(rpipe);
344		pipeclose(wpipe);
345		return (error);
346	}
347
348	rpipe->pipe_state |= PIPE_DIRECTOK;
349	wpipe->pipe_state |= PIPE_DIRECTOK;
350
351	error = falloc(td, &rf, &fd);
352	if (error) {
353		pipeclose(rpipe);
354		pipeclose(wpipe);
355		return (error);
356	}
357	/* An extra reference on `rf' has been held for us by falloc(). */
358	fildes[0] = fd;
359
360	/*
361	 * Warning: once we've gotten past allocation of the fd for the
362	 * read-side, we can only drop the read side via fdrop() in order
363	 * to avoid races against processes which manage to dup() the read
364	 * side while we are blocked trying to allocate the write side.
365	 */
366	finit(rf, FREAD | FWRITE, DTYPE_PIPE, rpipe, &pipeops);
367	error = falloc(td, &wf, &fd);
368	if (error) {
369		fdclose(fdp, rf, fildes[0], td);
370		fdrop(rf, td);
371		/* rpipe has been closed by fdrop(). */
372		pipeclose(wpipe);
373		return (error);
374	}
375	/* An extra reference on `wf' has been held for us by falloc(). */
376	finit(wf, FREAD | FWRITE, DTYPE_PIPE, wpipe, &pipeops);
377	fdrop(wf, td);
378	fildes[1] = fd;
379	fdrop(rf, td);
380
381	return (0);
382}
383
384/* ARGSUSED */
385int
386pipe(struct thread *td, struct pipe_args *uap)
387{
388	int error;
389	int fildes[2];
390
391	error = kern_pipe(td, fildes);
392	if (error)
393		return (error);
394
395	td->td_retval[0] = fildes[0];
396	td->td_retval[1] = fildes[1];
397
398	return (0);
399}
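
/*
 * Illustrative sketch (not compiled): because kern_pipe() opens both file
 * descriptors FREAD | FWRITE, these pipes are bidirectional.  Only the
 * forward pipe is given buffer backing above; the reverse pipe is resized
 * from 0 to SMALL_PIPE_SIZE the first time it is written, as noted in the
 * comment at the top of this file.  The helper name and the strings used
 * here are arbitrary example values.
 */
#if 0
#include <string.h>
#include <unistd.h>

static void
example_bidirectional(void)
{
	int fds[2];
	char buf[16];

	if (pipe(fds) == -1)
		return;

	/* Conventional direction: write fds[1], read fds[0]. */
	(void)write(fds[1], "forward", 7);
	(void)read(fds[0], buf, sizeof(buf));

	/* Reverse direction also works on this implementation. */
	(void)write(fds[0], "reverse", 7);
	(void)read(fds[1], buf, sizeof(buf));

	close(fds[0]);
	close(fds[1]);
}
#endif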
400
401/*
402 * Allocate kva for the pipe circular buffer; the space is pageable.
403 * This routine will 'realloc' the size of a pipe safely: if the
404 * allocation fails, the old buffer is retained and ENOMEM is
405 * returned.
406 */
407static int
408pipespace_new(cpipe, size)
409	struct pipe *cpipe;
410	int size;
411{
412	caddr_t buffer;
413	int error, cnt, firstseg;
414	static int curfail = 0;
415	static struct timeval lastfail;
416
417	KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked"));
418	KASSERT(!(cpipe->pipe_state & PIPE_DIRECTW),
419		("pipespace: resize of direct writes not allowed"));
420retry:
421	cnt = cpipe->pipe_buffer.cnt;
422	if (cnt > size)
423		size = cnt;
424
425	size = round_page(size);
426	buffer = (caddr_t) vm_map_min(pipe_map);
427
428	error = vm_map_find(pipe_map, NULL, 0,
429		(vm_offset_t *) &buffer, size, 1,
430		VM_PROT_ALL, VM_PROT_ALL, 0);
431	if (error != KERN_SUCCESS) {
432		if ((cpipe->pipe_buffer.buffer == NULL) &&
433			(size > SMALL_PIPE_SIZE)) {
434			size = SMALL_PIPE_SIZE;
435			pipefragretry++;
436			goto retry;
437		}
438		if (cpipe->pipe_buffer.buffer == NULL) {
439			pipeallocfail++;
440			if (ppsratecheck(&lastfail, &curfail, 1))
441				printf("kern.ipc.maxpipekva exceeded; see tuning(7)\n");
442		} else {
443			piperesizefail++;
444		}
445		return (ENOMEM);
446	}
447
448	/* copy data, then free old resources if we're resizing */
449	if (cnt > 0) {
450		if (cpipe->pipe_buffer.in <= cpipe->pipe_buffer.out) {
451			firstseg = cpipe->pipe_buffer.size - cpipe->pipe_buffer.out;
452			bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out],
453				buffer, firstseg);
454			if ((cnt - firstseg) > 0)
455				bcopy(cpipe->pipe_buffer.buffer, &buffer[firstseg],
456					cpipe->pipe_buffer.in);
457		} else {
458			bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out],
459				buffer, cnt);
460		}
461	}
462	pipe_free_kmem(cpipe);
463	cpipe->pipe_buffer.buffer = buffer;
464	cpipe->pipe_buffer.size = size;
465	cpipe->pipe_buffer.in = cnt;
466	cpipe->pipe_buffer.out = 0;
467	cpipe->pipe_buffer.cnt = cnt;
468	atomic_add_long(&amountpipekva, cpipe->pipe_buffer.size);
469	return (0);
470}
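
/*
 * Illustrative model (not compiled) of the copy performed above: when the
 * ring wraps (in <= out while cnt > 0), the live data is copied in two
 * segments, first from "out" to the end of the old buffer and then from
 * the start of the old buffer up to "in".  The helper name and the flat
 * destination buffer are assumptions made only for illustration.
 */
#if 0
#include <string.h>

static void
ring_drain(const char *ring, int size, int in, int out, int cnt, char *dst)
{
	int firstseg;

	if (cnt == 0)
		return;
	if (in <= out) {
		/* Wrapped: the invariant here is cnt == (size - out) + in. */
		firstseg = size - out;
		memcpy(dst, ring + out, firstseg);
		if (cnt - firstseg > 0)
			memcpy(dst + firstseg, ring, cnt - firstseg);
	} else {
		/* Contiguous: everything lives in [out, out + cnt). */
		memcpy(dst, ring + out, cnt);
	}
}
#endif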
471
472/*
473 * Wrapper for pipespace_new() that performs locking assertions.
474 */
475static int
476pipespace(cpipe, size)
477	struct pipe *cpipe;
478	int size;
479{
480
481	KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
482		("Unlocked pipe passed to pipespace"));
483	return (pipespace_new(cpipe, size));
484}
485
486/*
487 * lock a pipe for I/O, blocking other access
488 */
489static __inline int
490pipelock(cpipe, catch)
491	struct pipe *cpipe;
492	int catch;
493{
494	int error;
495
496	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
497	while (cpipe->pipe_state & PIPE_LOCKFL) {
498		cpipe->pipe_state |= PIPE_LWANT;
499		error = msleep(cpipe, PIPE_MTX(cpipe),
500		    catch ? (PRIBIO | PCATCH) : PRIBIO,
501		    "pipelk", 0);
502		if (error != 0)
503			return (error);
504	}
505	cpipe->pipe_state |= PIPE_LOCKFL;
506	return (0);
507}
508
509/*
510 * unlock a pipe I/O lock
511 */
512static __inline void
513pipeunlock(cpipe)
514	struct pipe *cpipe;
515{
516
517	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
518	KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
519		("Unlocked pipe passed to pipeunlock"));
520	cpipe->pipe_state &= ~PIPE_LOCKFL;
521	if (cpipe->pipe_state & PIPE_LWANT) {
522		cpipe->pipe_state &= ~PIPE_LWANT;
523		wakeup(cpipe);
524	}
525}
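
/*
 * Sketch of how pipelock() and the pair mutex are meant to be combined
 * (compare the "Locking rules" comment at the top of the file); this is
 * the shape pipe_read() and pipe_write() follow below, shown only as an
 * outline with placeholder buffer arguments.
 */
#if 0
	PIPE_LOCK(cpipe);			/* pair mutex: guards the flag */
	error = pipelock(cpipe, 1);		/* sets PIPE_LOCKFL, may sleep */
	if (error == 0) {
		/* ... inspect pipe state under the mutex ... */
		PIPE_UNLOCK(cpipe);		/* cannot hold a mutex over uiomove */
		error = uiomove(buf, size, uio);
		PIPE_LOCK(cpipe);
		/* ... re-read all pipe state: it may have changed ... */
		pipeunlock(cpipe);		/* clears PIPE_LOCKFL, wakes waiters */
	}
	PIPE_UNLOCK(cpipe);
#endif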
526
527static __inline void
528pipeselwakeup(cpipe)
529	struct pipe *cpipe;
530{
531
532	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
533	if (cpipe->pipe_state & PIPE_SEL) {
534		selwakeuppri(&cpipe->pipe_sel, PSOCK);
535		if (!SEL_WAITING(&cpipe->pipe_sel))
536			cpipe->pipe_state &= ~PIPE_SEL;
537	}
538	if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
539		pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
540	KNOTE_LOCKED(&cpipe->pipe_sel.si_note, 0);
541}
542
543/*
544 * Initialize and allocate VM and memory for pipe.  The structure
545 * will start out zero'd from the ctor, so we just manage the kmem.
546 */
547static int
548pipe_create(pipe, backing)
549	struct pipe *pipe;
550	int backing;
551{
552	int error;
553
554	if (backing) {
555		if (amountpipekva > maxpipekva / 2)
556			error = pipespace_new(pipe, SMALL_PIPE_SIZE);
557		else
558			error = pipespace_new(pipe, PIPE_SIZE);
559	} else {
560		/* If we're not backing this pipe, no need to do anything. */
561		error = 0;
562	}
563	return (error);
564}
565
566/* ARGSUSED */
567static int
568pipe_read(fp, uio, active_cred, flags, td)
569	struct file *fp;
570	struct uio *uio;
571	struct ucred *active_cred;
572	struct thread *td;
573	int flags;
574{
575	struct pipe *rpipe = fp->f_data;
576	int error;
577	int nread = 0;
578	u_int size;
579
580	PIPE_LOCK(rpipe);
581	++rpipe->pipe_busy;
582	error = pipelock(rpipe, 1);
583	if (error)
584		goto unlocked_error;
585
586#ifdef MAC
587	error = mac_pipe_check_read(active_cred, rpipe->pipe_pair);
588	if (error)
589		goto locked_error;
590#endif
591	if (amountpipekva > (3 * maxpipekva) / 4) {
592		if (!(rpipe->pipe_state & PIPE_DIRECTW) &&
593			(rpipe->pipe_buffer.size > SMALL_PIPE_SIZE) &&
594			(rpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) &&
595			(piperesizeallowed == 1)) {
596			PIPE_UNLOCK(rpipe);
597			pipespace(rpipe, SMALL_PIPE_SIZE);
598			PIPE_LOCK(rpipe);
599		}
600	}
601
602	while (uio->uio_resid) {
603		/*
604		 * normal pipe buffer receive
605		 */
606		if (rpipe->pipe_buffer.cnt > 0) {
607			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
608			if (size > rpipe->pipe_buffer.cnt)
609				size = rpipe->pipe_buffer.cnt;
610			if (size > (u_int) uio->uio_resid)
611				size = (u_int) uio->uio_resid;
612
613			PIPE_UNLOCK(rpipe);
614			error = uiomove(
615			    &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
616			    size, uio);
617			PIPE_LOCK(rpipe);
618			if (error)
619				break;
620
621			rpipe->pipe_buffer.out += size;
622			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
623				rpipe->pipe_buffer.out = 0;
624
625			rpipe->pipe_buffer.cnt -= size;
626
627			/*
628			 * If there is no more to read in the pipe, reset
629			 * its pointers to the beginning.  This improves
630			 * cache hit stats.
631			 */
632			if (rpipe->pipe_buffer.cnt == 0) {
633				rpipe->pipe_buffer.in = 0;
634				rpipe->pipe_buffer.out = 0;
635			}
636			nread += size;
637#ifndef PIPE_NODIRECT
638		/*
639		 * Direct copy, bypassing a kernel buffer.
640		 */
641		} else if ((size = rpipe->pipe_map.cnt) &&
642			   (rpipe->pipe_state & PIPE_DIRECTW)) {
643			if (size > (u_int) uio->uio_resid)
644				size = (u_int) uio->uio_resid;
645
646			PIPE_UNLOCK(rpipe);
647			error = uiomove_fromphys(rpipe->pipe_map.ms,
648			    rpipe->pipe_map.pos, size, uio);
649			PIPE_LOCK(rpipe);
650			if (error)
651				break;
652			nread += size;
653			rpipe->pipe_map.pos += size;
654			rpipe->pipe_map.cnt -= size;
655			if (rpipe->pipe_map.cnt == 0) {
656				rpipe->pipe_state &= ~PIPE_DIRECTW;
657				wakeup(rpipe);
658			}
659#endif
660		} else {
661			/*
662			 * detect EOF condition
663			 * read returns 0 on EOF, no need to set error
664			 */
665			if (rpipe->pipe_state & PIPE_EOF)
666				break;
667
668			/*
669			 * If the "write-side" has been blocked, wake it up now.
670			 */
671			if (rpipe->pipe_state & PIPE_WANTW) {
672				rpipe->pipe_state &= ~PIPE_WANTW;
673				wakeup(rpipe);
674			}
675
676			/*
677			 * Break if some data was read.
678			 */
679			if (nread > 0)
680				break;
681
682			/*
683			 * Unlock the pipe buffer for our remaining processing.
684			 * We will either break out with an error or we will
685			 * sleep and relock to loop.
686			 */
687			pipeunlock(rpipe);
688
689			/*
690			 * Handle non-blocking mode operation or
691			 * wait for more data.
692			 */
693			if (fp->f_flag & FNONBLOCK) {
694				error = EAGAIN;
695			} else {
696				rpipe->pipe_state |= PIPE_WANTR;
697				if ((error = msleep(rpipe, PIPE_MTX(rpipe),
698				    PRIBIO | PCATCH,
699				    "piperd", 0)) == 0)
700					error = pipelock(rpipe, 1);
701			}
702			if (error)
703				goto unlocked_error;
704		}
705	}
706#ifdef MAC
707locked_error:
708#endif
709	pipeunlock(rpipe);
710
711	/* XXX: should probably do this before getting any locks. */
712	if (error == 0)
713		vfs_timestamp(&rpipe->pipe_atime);
714unlocked_error:
715	--rpipe->pipe_busy;
716
717	/*
718	 * PIPE_WANT processing only makes sense if pipe_busy is 0.
719	 */
720	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
721		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
722		wakeup(rpipe);
723	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
724		/*
725		 * Handle write blocking hysteresis.
726		 */
727		if (rpipe->pipe_state & PIPE_WANTW) {
728			rpipe->pipe_state &= ~PIPE_WANTW;
729			wakeup(rpipe);
730		}
731	}
732
733	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
734		pipeselwakeup(rpipe);
735
736	PIPE_UNLOCK(rpipe);
737	return (error);
738}
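
/*
 * Illustrative sketch (not compiled): the userland-visible behaviour of
 * the loop above.  With O_NONBLOCK set, an empty pipe yields EAGAIN
 * instead of sleeping in "piperd"; once every writer has closed its end
 * and the buffer is drained, read() returns 0 to signal EOF rather than
 * an error.  The helper name is an assumption for illustration.
 */
#if 0
#include <sys/types.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

static void
example_read_semantics(int rfd)
{
	char buf[512];
	ssize_t n;

	(void)fcntl(rfd, F_SETFL, O_NONBLOCK);
	n = read(rfd, buf, sizeof(buf));
	if (n == -1 && errno == EAGAIN) {
		/* Pipe is empty but writers remain; no data yet. */
	} else if (n == 0) {
		/* PIPE_EOF: every writer closed and the buffer is drained. */
	} else if (n > 0) {
		/* Up to sizeof(buf) bytes were copied out of the pipe. */
	}
}
#endif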
739
740#ifndef PIPE_NODIRECT
741/*
742 * Map the sending process's buffer into kernel space and wire it.
743 * This is similar to a physical write operation.
744 */
745static int
746pipe_build_write_buffer(wpipe, uio)
747	struct pipe *wpipe;
748	struct uio *uio;
749{
750	pmap_t pmap;
751	u_int size;
752	int i, j;
753	vm_offset_t addr, endaddr;
754
755	PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);
756	KASSERT(wpipe->pipe_state & PIPE_DIRECTW,
757		("Clone attempt on non-direct write pipe!"));
758		("pipe_build_write_buffer: direct write not pending"));
759	size = (u_int) uio->uio_iov->iov_len;
760	if (size > wpipe->pipe_buffer.size)
761		size = wpipe->pipe_buffer.size;
762
763	pmap = vmspace_pmap(curproc->p_vmspace);
764	endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
765	addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
766	if (endaddr < addr)
767		return (EFAULT);
768	for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) {
769		/*
770		 * vm_fault_quick() can sleep.  Consequently,
771		 * vm_page_lock_queue() and vm_page_unlock_queue()
772		 * should not be performed outside of this loop.
773		 */
774	race:
775		if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0) {
776
777			for (j = 0; j < i; j++) {
778				vm_page_lock(wpipe->pipe_map.ms[j]);
779				vm_page_unhold(wpipe->pipe_map.ms[j]);
780				vm_page_unlock(wpipe->pipe_map.ms[j]);
781			}
782			return (EFAULT);
783		}
784		wpipe->pipe_map.ms[i] = pmap_extract_and_hold(pmap, addr,
785		    VM_PROT_READ);
786		if (wpipe->pipe_map.ms[i] == NULL)
787			goto race;
788	}
789
790/*
791 * set up the control block
792 */
793	wpipe->pipe_map.npages = i;
794	wpipe->pipe_map.pos =
795	    ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
796	wpipe->pipe_map.cnt = size;
797
798/*
799 * and update the uio data
800 */
801
802	uio->uio_iov->iov_len -= size;
803	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size;
804	if (uio->uio_iov->iov_len == 0)
805		uio->uio_iov++;
806	uio->uio_resid -= size;
807	uio->uio_offset += size;
808	return (0);
809}
810
811/*
812 * unmap and unwire the process buffer
813 */
814static void
815pipe_destroy_write_buffer(wpipe)
816	struct pipe *wpipe;
817{
818	int i;
819
820	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
821	for (i = 0; i < wpipe->pipe_map.npages; i++) {
822		vm_page_lock(wpipe->pipe_map.ms[i]);
823		vm_page_unhold(wpipe->pipe_map.ms[i]);
824		vm_page_unlock(wpipe->pipe_map.ms[i]);
825	}
826	wpipe->pipe_map.npages = 0;
827}
828
829/*
830 * In the case of a signal, the writing process might go away.  This
831 * code copies the data into the circular buffer so that the source
832 * pages can be freed without loss of data.
833 */
834static void
835pipe_clone_write_buffer(wpipe)
836	struct pipe *wpipe;
837{
838	struct uio uio;
839	struct iovec iov;
840	int size;
841	int pos;
842
843	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
844	size = wpipe->pipe_map.cnt;
845	pos = wpipe->pipe_map.pos;
846
847	wpipe->pipe_buffer.in = size;
848	wpipe->pipe_buffer.out = 0;
849	wpipe->pipe_buffer.cnt = size;
850	wpipe->pipe_state &= ~PIPE_DIRECTW;
851
852	PIPE_UNLOCK(wpipe);
853	iov.iov_base = wpipe->pipe_buffer.buffer;
854	iov.iov_len = size;
855	uio.uio_iov = &iov;
856	uio.uio_iovcnt = 1;
857	uio.uio_offset = 0;
858	uio.uio_resid = size;
859	uio.uio_segflg = UIO_SYSSPACE;
860	uio.uio_rw = UIO_READ;
861	uio.uio_td = curthread;
862	uiomove_fromphys(wpipe->pipe_map.ms, pos, size, &uio);
863	PIPE_LOCK(wpipe);
864	pipe_destroy_write_buffer(wpipe);
865}
866
867/*
868 * This implements the pipe buffer write mechanism.  Note that only
869 * a direct write OR a normal pipe write can be pending at any given time.
870 * If there are any characters in the pipe buffer, the direct write will
871 * be deferred until the receiving process grabs all of the bytes from
872 * the pipe buffer.  Then the direct mapping write is set-up.
873 */
874static int
875pipe_direct_write(wpipe, uio)
876	struct pipe *wpipe;
877	struct uio *uio;
878{
879	int error;
880
881retry:
882	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
883	error = pipelock(wpipe, 1);
884	if (wpipe->pipe_state & PIPE_EOF)
885		error = EPIPE;
886	if (error) {
887		pipeunlock(wpipe);
888		goto error1;
889	}
890	while (wpipe->pipe_state & PIPE_DIRECTW) {
891		if (wpipe->pipe_state & PIPE_WANTR) {
892			wpipe->pipe_state &= ~PIPE_WANTR;
893			wakeup(wpipe);
894		}
895		pipeselwakeup(wpipe);
896		wpipe->pipe_state |= PIPE_WANTW;
897		pipeunlock(wpipe);
898		error = msleep(wpipe, PIPE_MTX(wpipe),
899		    PRIBIO | PCATCH, "pipdww", 0);
900		if (error)
901			goto error1;
902		else
903			goto retry;
904	}
905	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
906	if (wpipe->pipe_buffer.cnt > 0) {
907		if (wpipe->pipe_state & PIPE_WANTR) {
908			wpipe->pipe_state &= ~PIPE_WANTR;
909			wakeup(wpipe);
910		}
911		pipeselwakeup(wpipe);
912		wpipe->pipe_state |= PIPE_WANTW;
913		pipeunlock(wpipe);
914		error = msleep(wpipe, PIPE_MTX(wpipe),
915		    PRIBIO | PCATCH, "pipdwc", 0);
916		if (error)
917			goto error1;
918		else
919			goto retry;
920	}
921
922	wpipe->pipe_state |= PIPE_DIRECTW;
923
924	PIPE_UNLOCK(wpipe);
925	error = pipe_build_write_buffer(wpipe, uio);
926	PIPE_LOCK(wpipe);
927	if (error) {
928		wpipe->pipe_state &= ~PIPE_DIRECTW;
929		pipeunlock(wpipe);
930		goto error1;
931	}
932
933	error = 0;
934	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
935		if (wpipe->pipe_state & PIPE_EOF) {
936			pipe_destroy_write_buffer(wpipe);
937			pipeselwakeup(wpipe);
938			pipeunlock(wpipe);
939			error = EPIPE;
940			goto error1;
941		}
942		if (wpipe->pipe_state & PIPE_WANTR) {
943			wpipe->pipe_state &= ~PIPE_WANTR;
944			wakeup(wpipe);
945		}
946		pipeselwakeup(wpipe);
947		pipeunlock(wpipe);
948		error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH,
949		    "pipdwt", 0);
950		pipelock(wpipe, 0);
951	}
952
953	if (wpipe->pipe_state & PIPE_EOF)
954		error = EPIPE;
955	if (wpipe->pipe_state & PIPE_DIRECTW) {
956		/*
957		 * this bit of trickery substitutes a kernel buffer for
958		 * the process that might be going away.
959		 */
960		pipe_clone_write_buffer(wpipe);
961	} else {
962		pipe_destroy_write_buffer(wpipe);
963	}
964	pipeunlock(wpipe);
965	return (error);
966
967error1:
968	wakeup(wpipe);
969	return (error);
970}
971#endif
972
973static int
974pipe_write(fp, uio, active_cred, flags, td)
975	struct file *fp;
976	struct uio *uio;
977	struct ucred *active_cred;
978	struct thread *td;
979	int flags;
980{
981	int error = 0;
982	int desiredsize, orig_resid;
983	struct pipe *wpipe, *rpipe;
984
985	rpipe = fp->f_data;
986	wpipe = rpipe->pipe_peer;
987
988	PIPE_LOCK(rpipe);
989	error = pipelock(wpipe, 1);
990	if (error) {
991		PIPE_UNLOCK(rpipe);
992		return (error);
993	}
994	/*
995	 * detect loss of pipe read side, issue SIGPIPE if lost.
996	 */
997	if (wpipe->pipe_present != PIPE_ACTIVE ||
998	    (wpipe->pipe_state & PIPE_EOF)) {
999		pipeunlock(wpipe);
1000		PIPE_UNLOCK(rpipe);
1001		return (EPIPE);
1002	}
1003#ifdef MAC
1004	error = mac_pipe_check_write(active_cred, wpipe->pipe_pair);
1005	if (error) {
1006		pipeunlock(wpipe);
1007		PIPE_UNLOCK(rpipe);
1008		return (error);
1009	}
1010#endif
1011	++wpipe->pipe_busy;
1012
1013	/* Choose a larger size if it's advantageous */
1014	desiredsize = max(SMALL_PIPE_SIZE, wpipe->pipe_buffer.size);
1015	while (desiredsize < wpipe->pipe_buffer.cnt + uio->uio_resid) {
1016		if (piperesizeallowed != 1)
1017			break;
1018		if (amountpipekva > maxpipekva / 2)
1019			break;
1020		if (desiredsize == BIG_PIPE_SIZE)
1021			break;
1022		desiredsize = desiredsize * 2;
1023	}
1024
1025	/* Choose a smaller size if we're in an OOM situation */
1026	if ((amountpipekva > (3 * maxpipekva) / 4) &&
1027		(wpipe->pipe_buffer.size > SMALL_PIPE_SIZE) &&
1028		(wpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) &&
1029		(piperesizeallowed == 1))
1030		desiredsize = SMALL_PIPE_SIZE;
1031
1032	/* Resize if the above determined that a new size was necessary */
1033	if ((desiredsize != wpipe->pipe_buffer.size) &&
1034		((wpipe->pipe_state & PIPE_DIRECTW) == 0)) {
1035		PIPE_UNLOCK(wpipe);
1036		pipespace(wpipe, desiredsize);
1037		PIPE_LOCK(wpipe);
1038	}
1039	if (wpipe->pipe_buffer.size == 0) {
1040		/*
1041		 * This can only happen for reverse direction use of pipes
1042		 * in a complete OOM situation.
1043		 */
1044		error = ENOMEM;
1045		--wpipe->pipe_busy;
1046		pipeunlock(wpipe);
1047		PIPE_UNLOCK(wpipe);
1048		return (error);
1049	}
1050
1051	pipeunlock(wpipe);
1052
1053	orig_resid = uio->uio_resid;
1054
1055	while (uio->uio_resid) {
1056		int space;
1057
1058		pipelock(wpipe, 0);
1059		if (wpipe->pipe_state & PIPE_EOF) {
1060			pipeunlock(wpipe);
1061			error = EPIPE;
1062			break;
1063		}
1064#ifndef PIPE_NODIRECT
1065		/*
1066		 * If the transfer is large, we can gain performance if
1067		 * we do process-to-process copies directly.
1068		 * If the write is non-blocking, we don't use the
1069		 * direct write mechanism.
1070		 *
1071		 * The direct write mechanism will detect the reader going
1072		 * away on us.
1073		 */
1074		if (uio->uio_segflg == UIO_USERSPACE &&
1075		    uio->uio_iov->iov_len >= PIPE_MINDIRECT &&
1076		    wpipe->pipe_buffer.size >= PIPE_MINDIRECT &&
1077		    (fp->f_flag & FNONBLOCK) == 0) {
1078			pipeunlock(wpipe);
1079			error = pipe_direct_write(wpipe, uio);
1080			if (error)
1081				break;
1082			continue;
1083		}
1084#endif
1085
1086		/*
1087		 * Pipe buffered writes cannot be coincident with
1088		 * direct writes.  We wait until the currently executing
1089		 * direct write is completed before we start filling the
1090		 * pipe buffer.  We break out if a signal occurs or the
1091		 * reader goes away.
1092		 */
1093		if (wpipe->pipe_state & PIPE_DIRECTW) {
1094			if (wpipe->pipe_state & PIPE_WANTR) {
1095				wpipe->pipe_state &= ~PIPE_WANTR;
1096				wakeup(wpipe);
1097			}
1098			pipeselwakeup(wpipe);
1099			wpipe->pipe_state |= PIPE_WANTW;
1100			pipeunlock(wpipe);
1101			error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH,
1102			    "pipbww", 0);
1103			if (error)
1104				break;
1105			else
1106				continue;
1107		}
1108
1109		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
1110
1111		/* Writes of size <= PIPE_BUF must be atomic. */
1112		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
1113			space = 0;
1114
1115		if (space > 0) {
1116			int size;	/* Transfer size */
1117			int segsize;	/* first segment to transfer */
1118
1119			/*
1120			 * Transfer size is minimum of uio transfer
1121			 * and free space in pipe buffer.
1122			 */
1123			if (space > uio->uio_resid)
1124				size = uio->uio_resid;
1125			else
1126				size = space;
1127			/*
1128			 * First segment to transfer is minimum of
1129			 * transfer size and contiguous space in
1130			 * pipe buffer.  If first segment to transfer
1131			 * is less than the transfer size, we've got
1132			 * a wraparound in the buffer.
1133			 */
1134			segsize = wpipe->pipe_buffer.size -
1135				wpipe->pipe_buffer.in;
1136			if (segsize > size)
1137				segsize = size;
1138
1139			/* Transfer first segment */
1140
1141			PIPE_UNLOCK(rpipe);
1142			error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
1143					segsize, uio);
1144			PIPE_LOCK(rpipe);
1145
1146			if (error == 0 && segsize < size) {
1147				KASSERT(wpipe->pipe_buffer.in + segsize ==
1148					wpipe->pipe_buffer.size,
1149					("Pipe buffer wraparound disappeared"));
1150				/*
1151				 * Transfer remaining part now, to
1152				 * support atomic writes.  Wraparound
1153				 * happened.
1154				 */
1155
1156				PIPE_UNLOCK(rpipe);
1157				error = uiomove(
1158				    &wpipe->pipe_buffer.buffer[0],
1159				    size - segsize, uio);
1160				PIPE_LOCK(rpipe);
1161			}
1162			if (error == 0) {
1163				wpipe->pipe_buffer.in += size;
1164				if (wpipe->pipe_buffer.in >=
1165				    wpipe->pipe_buffer.size) {
1166					KASSERT(wpipe->pipe_buffer.in ==
1167						size - segsize +
1168						wpipe->pipe_buffer.size,
1169						("Expected wraparound bad"));
1170					wpipe->pipe_buffer.in = size - segsize;
1171				}
1172
1173				wpipe->pipe_buffer.cnt += size;
1174				KASSERT(wpipe->pipe_buffer.cnt <=
1175					wpipe->pipe_buffer.size,
1176					("Pipe buffer overflow"));
1177			}
1178			pipeunlock(wpipe);
1179			if (error != 0)
1180				break;
1181		} else {
1182			/*
1183			 * If the "read-side" has been blocked, wake it up now.
1184			 */
1185			if (wpipe->pipe_state & PIPE_WANTR) {
1186				wpipe->pipe_state &= ~PIPE_WANTR;
1187				wakeup(wpipe);
1188			}
1189
1190			/*
1191			 * don't block on non-blocking I/O
1192			 */
1193			if (fp->f_flag & FNONBLOCK) {
1194				error = EAGAIN;
1195				pipeunlock(wpipe);
1196				break;
1197			}
1198
1199			/*
1200			 * We have no more space and have something to offer,
1201			 * wake up select/poll.
1202			 */
1203			pipeselwakeup(wpipe);
1204
1205			wpipe->pipe_state |= PIPE_WANTW;
1206			pipeunlock(wpipe);
1207			error = msleep(wpipe, PIPE_MTX(rpipe),
1208			    PRIBIO | PCATCH, "pipewr", 0);
1209			if (error != 0)
1210				break;
1211		}
1212	}
1213
1214	pipelock(wpipe, 0);
1215	--wpipe->pipe_busy;
1216
1217	if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
1218		wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
1219		wakeup(wpipe);
1220	} else if (wpipe->pipe_buffer.cnt > 0) {
1221		/*
1222		 * If we have put any characters in the buffer, we wake up
1223		 * the reader.
1224		 */
1225		if (wpipe->pipe_state & PIPE_WANTR) {
1226			wpipe->pipe_state &= ~PIPE_WANTR;
1227			wakeup(wpipe);
1228		}
1229	}
1230
1231	/*
1232	 * Don't return EPIPE if I/O was successful
1233	 */
1234	if ((wpipe->pipe_buffer.cnt == 0) &&
1235	    (uio->uio_resid == 0) &&
1236	    (error == EPIPE)) {
1237		error = 0;
1238	}
1239
1240	if (error == 0)
1241		vfs_timestamp(&wpipe->pipe_mtime);
1242
1243	/*
1244	 * We have something to offer,
1245	 * wake up select/poll.
1246	 */
1247	if (wpipe->pipe_buffer.cnt)
1248		pipeselwakeup(wpipe);
1249
1250	pipeunlock(wpipe);
1251	PIPE_UNLOCK(rpipe);
1252	return (error);
1253}
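
/*
 * Illustrative sketch (not compiled): the PIPE_BUF rule enforced above.
 * A write of at most PIPE_BUF bytes is either stored in one piece or the
 * writer sleeps until it can be, so several processes can append small
 * records to a shared pipe without their data interleaving; only larger
 * writes may be split.  The helper name is an assumption.
 */
#if 0
#include <sys/types.h>
#include <limits.h>
#include <unistd.h>

static ssize_t
example_write_record(int wfd, const void *record, size_t len)
{
	/* Records no larger than PIPE_BUF are written atomically. */
	if (len > PIPE_BUF)
		return (-1);
	return (write(wfd, record, len));
}
#endif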
1254
1255/* ARGSUSED */
1256static int
1257pipe_truncate(fp, length, active_cred, td)
1258	struct file *fp;
1259	off_t length;
1260	struct ucred *active_cred;
1261	struct thread *td;
1262{
1263
1264	return (EINVAL);
1265}
1266
1267/*
1268 * we implement a very minimal set of ioctls for compatibility with sockets.
1269 */
1270static int
1271pipe_ioctl(fp, cmd, data, active_cred, td)
1272	struct file *fp;
1273	u_long cmd;
1274	void *data;
1275	struct ucred *active_cred;
1276	struct thread *td;
1277{
1278	struct pipe *mpipe = fp->f_data;
1279	int error;
1280
1281	PIPE_LOCK(mpipe);
1282
1283#ifdef MAC
1284	error = mac_pipe_check_ioctl(active_cred, mpipe->pipe_pair, cmd, data);
1285	if (error) {
1286		PIPE_UNLOCK(mpipe);
1287		return (error);
1288	}
1289#endif
1290
1291	error = 0;
1292	switch (cmd) {
1293
1294	case FIONBIO:
1295		break;
1296
1297	case FIOASYNC:
1298		if (*(int *)data) {
1299			mpipe->pipe_state |= PIPE_ASYNC;
1300		} else {
1301			mpipe->pipe_state &= ~PIPE_ASYNC;
1302		}
1303		break;
1304
1305	case FIONREAD:
1306		if (mpipe->pipe_state & PIPE_DIRECTW)
1307			*(int *)data = mpipe->pipe_map.cnt;
1308		else
1309			*(int *)data = mpipe->pipe_buffer.cnt;
1310		break;
1311
1312	case FIOSETOWN:
1313		PIPE_UNLOCK(mpipe);
1314		error = fsetown(*(int *)data, &mpipe->pipe_sigio);
1315		goto out_unlocked;
1316
1317	case FIOGETOWN:
1318		*(int *)data = fgetown(&mpipe->pipe_sigio);
1319		break;
1320
1321	/* This is deprecated, FIOSETOWN should be used instead. */
1322	case TIOCSPGRP:
1323		PIPE_UNLOCK(mpipe);
1324		error = fsetown(-(*(int *)data), &mpipe->pipe_sigio);
1325		goto out_unlocked;
1326
1327	/* This is deprecated, FIOGETOWN should be used instead. */
1328	case TIOCGPGRP:
1329		*(int *)data = -fgetown(&mpipe->pipe_sigio);
1330		break;
1331
1332	default:
1333		error = ENOTTY;
1334		break;
1335	}
1336	PIPE_UNLOCK(mpipe);
1337out_unlocked:
1338	return (error);
1339}
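
/*
 * Illustrative sketch (not compiled): the FIONREAD case above is how
 * userland asks how much is buffered; it reports pipe_map.cnt while a
 * direct write is pending and pipe_buffer.cnt otherwise.  The helper
 * name is an assumption.
 */
#if 0
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/filio.h>

static int
example_bytes_buffered(int fd)
{
	int nbytes;

	if (ioctl(fd, FIONREAD, &nbytes) == -1)
		return (-1);
	return (nbytes);
}
#endif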
1340
1341static int
1342pipe_poll(fp, events, active_cred, td)
1343	struct file *fp;
1344	int events;
1345	struct ucred *active_cred;
1346	struct thread *td;
1347{
1348	struct pipe *rpipe = fp->f_data;
1349	struct pipe *wpipe;
1350	int revents = 0;
1351#ifdef MAC
1352	int error;
1353#endif
1354
1355	wpipe = rpipe->pipe_peer;
1356	PIPE_LOCK(rpipe);
1357#ifdef MAC
1358	error = mac_pipe_check_poll(active_cred, rpipe->pipe_pair);
1359	if (error)
1360		goto locked_error;
1361#endif
1362	if (events & (POLLIN | POLLRDNORM))
1363		if ((rpipe->pipe_state & PIPE_DIRECTW) ||
1364		    (rpipe->pipe_buffer.cnt > 0))
1365			revents |= events & (POLLIN | POLLRDNORM);
1366
1367	if (events & (POLLOUT | POLLWRNORM))
1368		if (wpipe->pipe_present != PIPE_ACTIVE ||
1369		    (wpipe->pipe_state & PIPE_EOF) ||
1370		    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
1371		     (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
1372			revents |= events & (POLLOUT | POLLWRNORM);
1373
1374	if ((events & POLLINIGNEOF) == 0) {
1375		if (rpipe->pipe_state & PIPE_EOF) {
1376			revents |= (events & (POLLIN | POLLRDNORM));
1377			if (wpipe->pipe_present != PIPE_ACTIVE ||
1378			    (wpipe->pipe_state & PIPE_EOF))
1379				revents |= POLLHUP;
1380		}
1381	}
1382
1383	if (revents == 0) {
1384		if (events & (POLLIN | POLLRDNORM)) {
1385			selrecord(td, &rpipe->pipe_sel);
1386			if (SEL_WAITING(&rpipe->pipe_sel))
1387				rpipe->pipe_state |= PIPE_SEL;
1388		}
1389
1390		if (events & (POLLOUT | POLLWRNORM)) {
1391			selrecord(td, &wpipe->pipe_sel);
1392			if (SEL_WAITING(&wpipe->pipe_sel))
1393				wpipe->pipe_state |= PIPE_SEL;
1394		}
1395	}
1396#ifdef MAC
1397locked_error:
1398#endif
1399	PIPE_UNLOCK(rpipe);
1400
1401	return (revents);
1402}
1403
1404/*
1405 * We shouldn't need locks here as we're doing a read and this should
1406 * be a natural race.
1407 */
1408static int
1409pipe_stat(fp, ub, active_cred, td)
1410	struct file *fp;
1411	struct stat *ub;
1412	struct ucred *active_cred;
1413	struct thread *td;
1414{
1415	struct pipe *pipe = fp->f_data;
1416#ifdef MAC
1417	int error;
1418
1419	PIPE_LOCK(pipe);
1420	error = mac_pipe_check_stat(active_cred, pipe->pipe_pair);
1421	PIPE_UNLOCK(pipe);
1422	if (error)
1423		return (error);
1424#endif
1425	bzero(ub, sizeof(*ub));
1426	ub->st_mode = S_IFIFO;
1427	ub->st_blksize = PAGE_SIZE;
1428	if (pipe->pipe_state & PIPE_DIRECTW)
1429		ub->st_size = pipe->pipe_map.cnt;
1430	else
1431		ub->st_size = pipe->pipe_buffer.cnt;
1432	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
1433	ub->st_atim = pipe->pipe_atime;
1434	ub->st_mtim = pipe->pipe_mtime;
1435	ub->st_ctim = pipe->pipe_ctime;
1436	ub->st_uid = fp->f_cred->cr_uid;
1437	ub->st_gid = fp->f_cred->cr_gid;
1438	/*
1439	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
1440	 * XXX (st_dev, st_ino) should be unique.
1441	 */
1442	return (0);
1443}
1444
1445/* ARGSUSED */
1446static int
1447pipe_close(fp, td)
1448	struct file *fp;
1449	struct thread *td;
1450{
1451	struct pipe *cpipe = fp->f_data;
1452
1453	fp->f_ops = &badfileops;
1454	fp->f_data = NULL;
1455	funsetown(&cpipe->pipe_sigio);
1456	pipeclose(cpipe);
1457	return (0);
1458}
1459
1460static void
1461pipe_free_kmem(cpipe)
1462	struct pipe *cpipe;
1463{
1464
1465	KASSERT(!mtx_owned(PIPE_MTX(cpipe)),
1466	    ("pipe_free_kmem: pipe mutex locked"));
1467
1468	if (cpipe->pipe_buffer.buffer != NULL) {
1469		atomic_subtract_long(&amountpipekva, cpipe->pipe_buffer.size);
1470		vm_map_remove(pipe_map,
1471		    (vm_offset_t)cpipe->pipe_buffer.buffer,
1472		    (vm_offset_t)cpipe->pipe_buffer.buffer + cpipe->pipe_buffer.size);
1473		cpipe->pipe_buffer.buffer = NULL;
1474	}
1475#ifndef PIPE_NODIRECT
1476	{
1477		cpipe->pipe_map.cnt = 0;
1478		cpipe->pipe_map.pos = 0;
1479		cpipe->pipe_map.npages = 0;
1480	}
1481#endif
1482}
1483
1484/*
1485 * shutdown the pipe
1486 */
1487static void
1488pipeclose(cpipe)
1489	struct pipe *cpipe;
1490{
1491	struct pipepair *pp;
1492	struct pipe *ppipe;
1493
1494	KASSERT(cpipe != NULL, ("pipeclose: cpipe == NULL"));
1495
1496	PIPE_LOCK(cpipe);
1497	pipelock(cpipe, 0);
1498	pp = cpipe->pipe_pair;
1499
1500	pipeselwakeup(cpipe);
1501
1502	/*
1503	 * If the other side is blocked, wake it up saying that
1504	 * we want to close it down.
1505	 */
1506	cpipe->pipe_state |= PIPE_EOF;
1507	while (cpipe->pipe_busy) {
1508		wakeup(cpipe);
1509		cpipe->pipe_state |= PIPE_WANT;
1510		pipeunlock(cpipe);
1511		msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0);
1512		pipelock(cpipe, 0);
1513	}
1514
1515
1516	/*
1517	 * Disconnect from peer, if any.
1518	 */
1519	ppipe = cpipe->pipe_peer;
1520	if (ppipe->pipe_present == PIPE_ACTIVE) {
1521		pipeselwakeup(ppipe);
1522
1523		ppipe->pipe_state |= PIPE_EOF;
1524		wakeup(ppipe);
1525		KNOTE_LOCKED(&ppipe->pipe_sel.si_note, 0);
1526	}
1527
1528	/*
1529	 * Mark this endpoint as free.  Release kmem resources.  We
1530	 * don't mark this endpoint as unused until we've finished
1531	 * doing that, or the pipe might disappear out from under
1532	 * us.
1533	 */
1534	PIPE_UNLOCK(cpipe);
1535	pipe_free_kmem(cpipe);
1536	PIPE_LOCK(cpipe);
1537	cpipe->pipe_present = PIPE_CLOSING;
1538	pipeunlock(cpipe);
1539
1540	/*
1541	 * knlist_clear() may sleep, dropping the PIPE_MTX.  Set PIPE_FINALIZED
1542	 * only after the knotes are completely dismantled, since that is what
1543	 * allows the other end to free the pipe_pair.
1544	 */
1545	knlist_clear(&cpipe->pipe_sel.si_note, 1);
1546	cpipe->pipe_present = PIPE_FINALIZED;
1547	knlist_destroy(&cpipe->pipe_sel.si_note);
1548
1549	/*
1550	 * If both endpoints are now closed, release the memory for the
1551	 * pipe pair.  If not, unlock.
1552	 */
1553	if (ppipe->pipe_present == PIPE_FINALIZED) {
1554		PIPE_UNLOCK(cpipe);
1555#ifdef MAC
1556		mac_pipe_destroy(pp);
1557#endif
1558		uma_zfree(pipe_zone, cpipe->pipe_pair);
1559	} else
1560		PIPE_UNLOCK(cpipe);
1561}
1562
1563/*ARGSUSED*/
1564static int
1565pipe_kqfilter(struct file *fp, struct knote *kn)
1566{
1567	struct pipe *cpipe;
1568
1569	cpipe = kn->kn_fp->f_data;
1570	PIPE_LOCK(cpipe);
1571	switch (kn->kn_filter) {
1572	case EVFILT_READ:
1573		kn->kn_fop = &pipe_rfiltops;
1574		break;
1575	case EVFILT_WRITE:
1576		kn->kn_fop = &pipe_wfiltops;
1577		if (cpipe->pipe_peer->pipe_present != PIPE_ACTIVE) {
1578			/* other end of pipe has been closed */
1579			PIPE_UNLOCK(cpipe);
1580			return (EPIPE);
1581		}
1582		cpipe = cpipe->pipe_peer;
1583		break;
1584	default:
1585		PIPE_UNLOCK(cpipe);
1586		return (EINVAL);
1587	}
1588
1589	knlist_add(&cpipe->pipe_sel.si_note, kn, 1);
1590	PIPE_UNLOCK(cpipe);
1591	return (0);
1592}
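
/*
 * Illustrative sketch (not compiled): attaching an EVFILT_READ filter to
 * a pipe, which lands in pipe_kqfilter() above; filt_piperead() then
 * reports the buffered byte count in kn_data and sets EV_EOF once the
 * peer endpoint is gone.  The helper name is an assumption.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>

static ssize_t
example_wait_readable(int kq, int rfd)
{
	struct kevent change, ev;

	EV_SET(&change, rfd, EVFILT_READ, EV_ADD, 0, 0, NULL);
	if (kevent(kq, &change, 1, NULL, 0, NULL) == -1)
		return (-1);
	/* Block until data is buffered or the write side goes away. */
	if (kevent(kq, NULL, 0, &ev, 1, NULL) == -1)
		return (-1);
	return ((ev.flags & EV_EOF) ? 0 : (ssize_t)ev.data);
}
#endif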
1593
1594static void
1595filt_pipedetach(struct knote *kn)
1596{
1597	struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;
1598
1599	PIPE_LOCK(cpipe);
1600	if (kn->kn_filter == EVFILT_WRITE)
1601		cpipe = cpipe->pipe_peer;
1602	knlist_remove(&cpipe->pipe_sel.si_note, kn, 1);
1603	PIPE_UNLOCK(cpipe);
1604}
1605
1606/*ARGSUSED*/
1607static int
1608filt_piperead(struct knote *kn, long hint)
1609{
1610	struct pipe *rpipe = kn->kn_fp->f_data;
1611	struct pipe *wpipe = rpipe->pipe_peer;
1612	int ret;
1613
1614	PIPE_LOCK(rpipe);
1615	kn->kn_data = rpipe->pipe_buffer.cnt;
1616	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
1617		kn->kn_data = rpipe->pipe_map.cnt;
1618
1619	if ((rpipe->pipe_state & PIPE_EOF) ||
1620	    wpipe->pipe_present != PIPE_ACTIVE ||
1621	    (wpipe->pipe_state & PIPE_EOF)) {
1622		kn->kn_flags |= EV_EOF;
1623		PIPE_UNLOCK(rpipe);
1624		return (1);
1625	}
1626	ret = kn->kn_data > 0;
1627	PIPE_UNLOCK(rpipe);
1628	return ret;
1629}
1630
1631/*ARGSUSED*/
1632static int
1633filt_pipewrite(struct knote *kn, long hint)
1634{
1635	struct pipe *rpipe = kn->kn_fp->f_data;
1636	struct pipe *wpipe = rpipe->pipe_peer;
1637
1638	PIPE_LOCK(rpipe);
1639	if (wpipe->pipe_present != PIPE_ACTIVE ||
1640	    (wpipe->pipe_state & PIPE_EOF)) {
1641		kn->kn_data = 0;
1642		kn->kn_flags |= EV_EOF;
1643		PIPE_UNLOCK(rpipe);
1644		return (1);
1645	}
1646	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
1647	if (wpipe->pipe_state & PIPE_DIRECTW)
1648		kn->kn_data = 0;
1649
1650	PIPE_UNLOCK(rpipe);
1651	return (kn->kn_data >= PIPE_BUF);
1652}
1653