sys_pipe.c revision 153484
1/*-
2 * Copyright (c) 1996 John S. Dyson
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice immediately at the beginning of the file, without modification,
10 *    this list of conditions, and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 3. Absolutely no warranty of function or purpose is made by the author
15 *    John S. Dyson.
16 * 4. Modifications may be freely made to this file if the above conditions
17 *    are met.
18 */
19
20/*
21 * This file contains a high-performance replacement for the socket-based
22 * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
23 * all features of sockets, but does do everything that pipes normally
24 * do.
25 */
26
27/*
28 * This code has two modes of operation, a small write mode and a large
29 * write mode.  The small write mode acts like conventional pipes with
30 * a kernel buffer.  If the write is smaller than PIPE_MINDIRECT, the
31 * "normal" pipe buffering is done.  If the write is between PIPE_MINDIRECT
32 * and PIPE_SIZE in size, the source buffer is fully mapped and wired into
33 * the kernel, and the receiving process can copy the data directly from
34 * the pages of the sending process.
35 *
36 * If the sending process receives a signal, it is possible that it will
37 * go away, and certainly its address space can change, because control
38 * is returned back to the user-mode side.  In that case, the pipe code
39 * arranges to copy the buffer supplied by the user process, to a pageable
40 * kernel buffer, and the receiving process will grab the data from the
41 * pageable kernel buffer.  Since signals don't happen all that often,
42 * the copy operation is normally eliminated.
43 *
44 * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
45 * happen for small transfers so that the system will not spend all of
46 * its time context switching.
47 *
48 * In order to limit the resource use of pipes, two sysctls exist:
49 *
50 * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable
51 * address space available to us in pipe_map. This value is normally
52 * autotuned, but may also be loader tuned.
53 *
54 * kern.ipc.pipekva - This read-only sysctl tracks the current amount of
55 * memory in use by pipes.
56 *
57 * Based on how large pipekva is relative to maxpipekva, the following
58 * will happen:
59 *
60 * 0% - 50%:
61 *     New pipes are given 16K of memory backing, pipes may dynamically
62 *     grow to as large as 64K where needed.
63 * 50% - 75%:
64 *     New pipes are given 4K (or PAGE_SIZE) of memory backing,
65 *     existing pipes may NOT grow.
66 * 75% - 100%:
67 *     New pipes are given 4K (or PAGE_SIZE) of memory backing,
68 *     existing pipes will be shrunk down to 4K whenever possible.
69 *
70 * Resizing may be disabled by setting kern.ipc.piperesizeallowed=0.  When
71 * resizing is disabled, the only resize that will occur is the
72 * 0 -> SMALL_PIPE_SIZE resize, which MUST occur for reverse-direction
73 * pipes when they are first used.
74 *
75 * Additional information about the current state of pipes may be obtained
76 * from kern.ipc.pipes, kern.ipc.pipefragretry, kern.ipc.pipeallocfail,
77 * and kern.ipc.piperesizefail.
78 *
79 * Locking rules:  There are two locks present here:  A mutex, used via
80 * PIPE_LOCK, and a flag, used via pipelock().  All locking is done via
81 * the flag, as mutexes cannot be held across uiomove.  The mutex
82 * exists only to guard access to the flag, and is not in itself a
83 * locking mechanism.  Also note that there is only a single mutex for
84 * both directions of a pipe.
85 *
86 * As pipelock() may have to sleep before it can acquire the flag, it
87 * is important to reread all data after a call to pipelock(); everything
88 * in the structure may have changed.
89 */
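
/*
 * An illustrative userland sketch of the two write paths described
 * above (kept under #if 0; it is not part of the kernel build).  Writes
 * smaller than PIPE_MINDIRECT always go through the kernel buffer,
 * while large blocking writes are eligible for the direct, page-wired
 * path.  The 64K transfer size below is only an example; kern.ipc.pipekva
 * may be watched with sysctl(8) while such a program runs.
 */
#if 0
#include <sys/types.h>
#include <sys/wait.h>
#include <err.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	int fd[2];
	char small[64];
	char *big;
	size_t bigsz = 64 * 1024;	/* >= PIPE_MINDIRECT: direct-write candidate */
	ssize_t n;
	pid_t pid;

	if (pipe(fd) == -1)
		err(1, "pipe");
	big = malloc(bigsz);
	if (big == NULL)
		err(1, "malloc");
	memset(big, 'x', bigsz);
	memset(small, 'y', sizeof(small));

	pid = fork();
	if (pid == -1)
		err(1, "fork");
	if (pid == 0) {
		/* Child: drain the pipe until EOF. */
		char buf[8192];
		ssize_t total = 0;

		close(fd[1]);
		while ((n = read(fd[0], buf, sizeof(buf))) > 0)
			total += n;
		_exit(n == 0 && total > 0 ? 0 : 1);
	}
	close(fd[0]);
	/* A small write always lands in the pipe's kernel buffer. */
	if (write(fd[1], small, sizeof(small)) != (ssize_t)sizeof(small))
		err(1, "small write");
	/* A large blocking write may use the direct (page-wired) path. */
	if (write(fd[1], big, bigsz) != (ssize_t)bigsz)
		err(1, "big write");
	close(fd[1]);
	(void)waitpid(pid, NULL, 0);
	free(big);
	return (0);
}
#endif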
90
91#include <sys/cdefs.h>
92__FBSDID("$FreeBSD: head/sys/kern/sys_pipe.c 153484 2005-12-16 18:32:39Z delphij $");
93
94#include "opt_mac.h"
95
96#include <sys/param.h>
97#include <sys/systm.h>
98#include <sys/fcntl.h>
99#include <sys/file.h>
100#include <sys/filedesc.h>
101#include <sys/filio.h>
102#include <sys/kernel.h>
103#include <sys/lock.h>
104#include <sys/mac.h>
105#include <sys/mutex.h>
106#include <sys/ttycom.h>
107#include <sys/stat.h>
108#include <sys/malloc.h>
109#include <sys/poll.h>
110#include <sys/selinfo.h>
111#include <sys/signalvar.h>
112#include <sys/sysctl.h>
113#include <sys/sysproto.h>
114#include <sys/pipe.h>
115#include <sys/proc.h>
116#include <sys/vnode.h>
117#include <sys/uio.h>
118#include <sys/event.h>
119
120#include <vm/vm.h>
121#include <vm/vm_param.h>
122#include <vm/vm_object.h>
123#include <vm/vm_kern.h>
124#include <vm/vm_extern.h>
125#include <vm/pmap.h>
126#include <vm/vm_map.h>
127#include <vm/vm_page.h>
128#include <vm/uma.h>
129
130/*
131 * Use this define if you want to disable *fancy* VM things.  Expect an
132 * approx 30% decrease in transfer rate.  This could be useful for
133 * NetBSD or OpenBSD.
134 */
135/* #define PIPE_NODIRECT */
136
137/*
138 * interfaces to the outside world
139 */
140static fo_rdwr_t	pipe_read;
141static fo_rdwr_t	pipe_write;
142static fo_ioctl_t	pipe_ioctl;
143static fo_poll_t	pipe_poll;
144static fo_kqfilter_t	pipe_kqfilter;
145static fo_stat_t	pipe_stat;
146static fo_close_t	pipe_close;
147
148static struct fileops pipeops = {
149	.fo_read = pipe_read,
150	.fo_write = pipe_write,
151	.fo_ioctl = pipe_ioctl,
152	.fo_poll = pipe_poll,
153	.fo_kqfilter = pipe_kqfilter,
154	.fo_stat = pipe_stat,
155	.fo_close = pipe_close,
156	.fo_flags = DFLAG_PASSABLE
157};
158
159static void	filt_pipedetach(struct knote *kn);
160static int	filt_piperead(struct knote *kn, long hint);
161static int	filt_pipewrite(struct knote *kn, long hint);
162
163static struct filterops pipe_rfiltops =
164	{ 1, NULL, filt_pipedetach, filt_piperead };
165static struct filterops pipe_wfiltops =
166	{ 1, NULL, filt_pipedetach, filt_pipewrite };
167
168/*
169 * Default pipe buffer size(s); these can be fairly large now because pipe
170 * space is pageable.  The pipe code will try to maintain locality of
171 * reference for performance reasons, so small amounts of outstanding I/O
172 * will not wipe the cache.
173 */
174#define MINPIPESIZE (PIPE_SIZE/3)
175#define MAXPIPESIZE (2*PIPE_SIZE/3)
176
177static int amountpipes;
178static int amountpipekva;
179static int pipefragretry;
180static int pipeallocfail;
181static int piperesizefail;
182static int piperesizeallowed = 1;
183
184SYSCTL_DECL(_kern_ipc);
185
186SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN,
187	   &maxpipekva, 0, "Pipe KVA limit");
188SYSCTL_INT(_kern_ipc, OID_AUTO, pipes, CTLFLAG_RD,
189	   &amountpipes, 0, "Current # of pipes");
190SYSCTL_INT(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD,
191	   &amountpipekva, 0, "Pipe KVA usage");
192SYSCTL_INT(_kern_ipc, OID_AUTO, pipefragretry, CTLFLAG_RD,
193	  &pipefragretry, 0, "Pipe allocation retries due to fragmentation");
194SYSCTL_INT(_kern_ipc, OID_AUTO, pipeallocfail, CTLFLAG_RD,
195	  &pipeallocfail, 0, "Pipe allocation failures");
196SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizefail, CTLFLAG_RD,
197	  &piperesizefail, 0, "Pipe resize failures");
198SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizeallowed, CTLFLAG_RW,
199	  &piperesizeallowed, 0, "Pipe resizing allowed");
200
201static void pipeinit(void *dummy __unused);
202static void pipeclose(struct pipe *cpipe);
203static void pipe_free_kmem(struct pipe *cpipe);
204static int pipe_create(struct pipe *pipe, int backing);
205static __inline int pipelock(struct pipe *cpipe, int catch);
206static __inline void pipeunlock(struct pipe *cpipe);
207static __inline void pipeselwakeup(struct pipe *cpipe);
208#ifndef PIPE_NODIRECT
209static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio);
210static void pipe_destroy_write_buffer(struct pipe *wpipe);
211static int pipe_direct_write(struct pipe *wpipe, struct uio *uio);
212static void pipe_clone_write_buffer(struct pipe *wpipe);
213#endif
214static int pipespace(struct pipe *cpipe, int size);
215static int pipespace_new(struct pipe *cpipe, int size);
216
217static int	pipe_zone_ctor(void *mem, int size, void *arg, int flags);
218static void	pipe_zone_dtor(void *mem, int size, void *arg);
219static int	pipe_zone_init(void *mem, int size, int flags);
220static void	pipe_zone_fini(void *mem, int size);
221
222static uma_zone_t pipe_zone;
223
224SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);
225
226static void
227pipeinit(void *dummy __unused)
228{
229
230	pipe_zone = uma_zcreate("PIPE", sizeof(struct pipepair),
231	    pipe_zone_ctor, pipe_zone_dtor, pipe_zone_init, pipe_zone_fini,
232	    UMA_ALIGN_PTR, 0);
233	KASSERT(pipe_zone != NULL, ("pipe_zone not initialized"));
234}
235
236static int
237pipe_zone_ctor(void *mem, int size, void *arg, int flags)
238{
239	struct pipepair *pp;
240	struct pipe *rpipe, *wpipe;
241
242	KASSERT(size == sizeof(*pp), ("pipe_zone_ctor: wrong size"));
243
244	pp = (struct pipepair *)mem;
245
246	/*
247	 * We zero both pipe endpoints to make sure all the kmem pointers
248	 * are NULL, flag fields are zero'd, etc.  We timestamp both
249	 * endpoints with the same time.
250	 */
251	rpipe = &pp->pp_rpipe;
252	bzero(rpipe, sizeof(*rpipe));
253	vfs_timestamp(&rpipe->pipe_ctime);
254	rpipe->pipe_atime = rpipe->pipe_mtime = rpipe->pipe_ctime;
255
256	wpipe = &pp->pp_wpipe;
257	bzero(wpipe, sizeof(*wpipe));
258	wpipe->pipe_ctime = rpipe->pipe_ctime;
259	wpipe->pipe_atime = wpipe->pipe_mtime = rpipe->pipe_ctime;
260
261	rpipe->pipe_peer = wpipe;
262	rpipe->pipe_pair = pp;
263	wpipe->pipe_peer = rpipe;
264	wpipe->pipe_pair = pp;
265
266	/*
267	 * Mark both endpoints as present; they will later get free'd
268	 * one at a time.  When both are free'd, then the whole pair
269	 * is released.
270	 */
271	rpipe->pipe_present = 1;
272	wpipe->pipe_present = 1;
273
274	/*
275	 * Eventually, the MAC Framework may initialize the label
276 * in ctor or init, but for now we do it elsewhere to avoid
277	 * blocking in ctor or init.
278	 */
279	pp->pp_label = NULL;
280
281	atomic_add_int(&amountpipes, 2);
282	return (0);
283}
284
285static void
286pipe_zone_dtor(void *mem, int size, void *arg)
287{
288	struct pipepair *pp;
289
290	KASSERT(size == sizeof(*pp), ("pipe_zone_dtor: wrong size"));
291
292	pp = (struct pipepair *)mem;
293
294	atomic_subtract_int(&amountpipes, 2);
295}
296
297static int
298pipe_zone_init(void *mem, int size, int flags)
299{
300	struct pipepair *pp;
301
302	KASSERT(size == sizeof(*pp), ("pipe_zone_init: wrong size"));
303
304	pp = (struct pipepair *)mem;
305
306	mtx_init(&pp->pp_mtx, "pipe mutex", NULL, MTX_DEF | MTX_RECURSE);
307	return (0);
308}
309
310static void
311pipe_zone_fini(void *mem, int size)
312{
313	struct pipepair *pp;
314
315	KASSERT(size == sizeof(*pp), ("pipe_zone_fini: wrong size"));
316
317	pp = (struct pipepair *)mem;
318
319	mtx_destroy(&pp->pp_mtx);
320}
321
322/*
323 * The pipe system call for the DTYPE_PIPE type of pipes.  If we fail,
324 * let the zone pick up the pieces via pipeclose().
325 */
326
327/* ARGSUSED */
328int
329pipe(td, uap)
330	struct thread *td;
331	struct pipe_args /* {
332		int	dummy;
333	} */ *uap;
334{
335	struct filedesc *fdp = td->td_proc->p_fd;
336	struct file *rf, *wf;
337	struct pipepair *pp;
338	struct pipe *rpipe, *wpipe;
339	int fd, error;
340
341	pp = uma_zalloc(pipe_zone, M_WAITOK);
342#ifdef MAC
343	/*
344	 * The MAC label is shared between the connected endpoints.  As a
345	 * result mac_init_pipe() and mac_create_pipe() are called once
346	 * for the pair, and not on the endpoints.
347	 */
348	mac_init_pipe(pp);
349	mac_create_pipe(td->td_ucred, pp);
350#endif
351	rpipe = &pp->pp_rpipe;
352	wpipe = &pp->pp_wpipe;
353
354	knlist_init(&rpipe->pipe_sel.si_note, PIPE_MTX(rpipe), NULL, NULL,
355	    NULL);
356	knlist_init(&wpipe->pipe_sel.si_note, PIPE_MTX(wpipe), NULL, NULL,
357	    NULL);
358
359	/* Only the forward direction pipe is backed by default */
360	if (pipe_create(rpipe, 1) || pipe_create(wpipe, 0)) {
361		pipeclose(rpipe);
362		pipeclose(wpipe);
363		return (ENFILE);
364	}
365
366	rpipe->pipe_state |= PIPE_DIRECTOK;
367	wpipe->pipe_state |= PIPE_DIRECTOK;
368
369	error = falloc(td, &rf, &fd);
370	if (error) {
371		pipeclose(rpipe);
372		pipeclose(wpipe);
373		return (error);
374	}
375	/* An extra reference on `rf' has been held for us by falloc(). */
376	td->td_retval[0] = fd;
377
378	/*
379	 * Warning: once we've gotten past allocation of the fd for the
380	 * read-side, we can only drop the read side via fdrop() in order
381	 * to avoid races against processes which manage to dup() the read
382	 * side while we are blocked trying to allocate the write side.
383	 */
384	FILE_LOCK(rf);
385	rf->f_flag = FREAD | FWRITE;
386	rf->f_type = DTYPE_PIPE;
387	rf->f_data = rpipe;
388	rf->f_ops = &pipeops;
389	FILE_UNLOCK(rf);
390	error = falloc(td, &wf, &fd);
391	if (error) {
392		fdclose(fdp, rf, td->td_retval[0], td);
393		fdrop(rf, td);
394		/* rpipe has been closed by fdrop(). */
395		pipeclose(wpipe);
396		return (error);
397	}
398	/* An extra reference on `wf' has been held for us by falloc(). */
399	FILE_LOCK(wf);
400	wf->f_flag = FREAD | FWRITE;
401	wf->f_type = DTYPE_PIPE;
402	wf->f_data = wpipe;
403	wf->f_ops = &pipeops;
404	FILE_UNLOCK(wf);
405	fdrop(wf, td);
406	td->td_retval[1] = fd;
407	fdrop(rf, td);
408
409	return (0);
410}
411
412/*
413 * Allocate kva for the pipe circular buffer; the space is pageable.
414 * This routine will 'realloc' the size of a pipe safely; if the
415 * allocation fails, the old buffer is retained and
416 * ENOMEM is returned.
417 */
418static int
419pipespace_new(cpipe, size)
420	struct pipe *cpipe;
421	int size;
422{
423	caddr_t buffer;
424	int error, cnt, firstseg;
425	static int curfail = 0;
426	static struct timeval lastfail;
427
428	KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked"));
429	KASSERT(!(cpipe->pipe_state & PIPE_DIRECTW),
430		("pipespace: resize of direct writes not allowed"));
431retry:
432	cnt = cpipe->pipe_buffer.cnt;
433	if (cnt > size)
434		size = cnt;
435
436	size = round_page(size);
437	buffer = (caddr_t) vm_map_min(pipe_map);
438
439	error = vm_map_find(pipe_map, NULL, 0,
440		(vm_offset_t *) &buffer, size, 1,
441		VM_PROT_ALL, VM_PROT_ALL, 0);
442	if (error != KERN_SUCCESS) {
443		if ((cpipe->pipe_buffer.buffer == NULL) &&
444			(size > SMALL_PIPE_SIZE)) {
445			size = SMALL_PIPE_SIZE;
446			pipefragretry++;
447			goto retry;
448		}
449		if (cpipe->pipe_buffer.buffer == NULL) {
450			pipeallocfail++;
451			if (ppsratecheck(&lastfail, &curfail, 1))
452				printf("kern.ipc.maxpipekva exceeded; see tuning(7)\n");
453		} else {
454			piperesizefail++;
455		}
456		return (ENOMEM);
457	}
458
459	/* copy data, then free old resources if we're resizing */
460	if (cnt > 0) {
461		if (cpipe->pipe_buffer.in <= cpipe->pipe_buffer.out) {
462			firstseg = cpipe->pipe_buffer.size - cpipe->pipe_buffer.out;
463			bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out],
464				buffer, firstseg);
465			if ((cnt - firstseg) > 0)
466				bcopy(cpipe->pipe_buffer.buffer, &buffer[firstseg],
467					cpipe->pipe_buffer.in);
468		} else {
469			bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out],
470				buffer, cnt);
471		}
472	}
473	pipe_free_kmem(cpipe);
474	cpipe->pipe_buffer.buffer = buffer;
475	cpipe->pipe_buffer.size = size;
476	cpipe->pipe_buffer.in = cnt;
477	cpipe->pipe_buffer.out = 0;
478	cpipe->pipe_buffer.cnt = cnt;
479	atomic_add_int(&amountpipekva, cpipe->pipe_buffer.size);
480	return (0);
481}
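
/*
 * An illustrative userland model of the in/out/cnt circular-buffer
 * bookkeeping used throughout this file (kept under #if 0; not built).
 * It shows the wrapped case that the two-bcopy path above linearizes
 * when a pipe is resized.  The struct and function names are local to
 * this sketch.
 */
#if 0
#include <assert.h>
#include <string.h>

struct ring {
	char	buf[8];
	int	size;	/* analogous to pipe_buffer.size */
	int	in;	/* next byte to be written */
	int	out;	/* next byte to be read */
	int	cnt;	/* bytes currently stored */
};

/*
 * Copy a possibly wrapped ring into a linear buffer, mirroring the
 * resize path above: when in <= out (and cnt > 0) the data wraps, so
 * the tail segment [out, size) is copied first and the head segment
 * [0, in) second.
 */
static void
ring_linearize(const struct ring *r, char *dst)
{
	int firstseg;

	if (r->cnt == 0)
		return;
	if (r->in <= r->out) {
		firstseg = r->size - r->out;
		memcpy(dst, &r->buf[r->out], firstseg);
		if (r->cnt - firstseg > 0)
			memcpy(&dst[firstseg], r->buf, r->in);
	} else {
		memcpy(dst, &r->buf[r->out], r->cnt);
	}
}

int
main(void)
{
	struct ring r;
	char linear[8];

	memset(&r, 0, sizeof(r));
	r.size = 8;
	/* Store "ABCDE" wrapped: "ABC" at offsets 5..7, "DE" at offsets 0..1. */
	memcpy(&r.buf[5], "ABC", 3);
	memcpy(&r.buf[0], "DE", 2);
	r.out = 5;
	r.in = 2;
	r.cnt = 5;
	ring_linearize(&r, linear);
	assert(memcmp(linear, "ABCDE", 5) == 0);
	return (0);
}
#endif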
482
483/*
484 * Wrapper for pipespace_new() that performs locking assertions.
485 */
486static int
487pipespace(cpipe, size)
488	struct pipe *cpipe;
489	int size;
490{
491
492	KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
493		("Unlocked pipe passed to pipespace"));
494	return (pipespace_new(cpipe, size));
495}
496
497/*
498 * lock a pipe for I/O, blocking other access
499 */
500static __inline int
501pipelock(cpipe, catch)
502	struct pipe *cpipe;
503	int catch;
504{
505	int error;
506
507	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
508	while (cpipe->pipe_state & PIPE_LOCKFL) {
509		cpipe->pipe_state |= PIPE_LWANT;
510		error = msleep(cpipe, PIPE_MTX(cpipe),
511		    catch ? (PRIBIO | PCATCH) : PRIBIO,
512		    "pipelk", 0);
513		if (error != 0)
514			return (error);
515	}
516	cpipe->pipe_state |= PIPE_LOCKFL;
517	return (0);
518}
519
520/*
521 * unlock a pipe I/O lock
522 */
523static __inline void
524pipeunlock(cpipe)
525	struct pipe *cpipe;
526{
527
528	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
529	KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
530		("Unlocked pipe passed to pipeunlock"));
531	cpipe->pipe_state &= ~PIPE_LOCKFL;
532	if (cpipe->pipe_state & PIPE_LWANT) {
533		cpipe->pipe_state &= ~PIPE_LWANT;
534		wakeup(cpipe);
535	}
536}
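
/*
 * A sketch of the canonical use of these primitives, as seen in
 * pipe_read() and pipe_write() below:
 *
 *	PIPE_LOCK(cpipe);		(mutex guards the flag)
 *	error = pipelock(cpipe, 1);	(take the long-term I/O flag)
 *	...
 *	PIPE_UNLOCK(cpipe);		(the mutex cannot be held across
 *	error = uiomove(...);		 a copy that may fault or sleep)
 *	PIPE_LOCK(cpipe);
 *	...
 *	pipeunlock(cpipe);		(drop the flag, wake PIPE_LWANT waiters)
 *	PIPE_UNLOCK(cpipe);
 *
 * State read before pipelock() slept must be re-read afterward; see the
 * locking notes at the top of this file.
 */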
537
538static __inline void
539pipeselwakeup(cpipe)
540	struct pipe *cpipe;
541{
542
543	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
544	if (cpipe->pipe_state & PIPE_SEL) {
545		cpipe->pipe_state &= ~PIPE_SEL;
546		selwakeuppri(&cpipe->pipe_sel, PSOCK);
547	}
548	if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
549		pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
550	KNOTE_LOCKED(&cpipe->pipe_sel.si_note, 0);
551}
552
553/*
554 * Initialize and allocate VM and memory for pipe.  The structure
555 * will start out zero'd from the ctor, so we just manage the kmem.
556 */
557static int
558pipe_create(pipe, backing)
559	struct pipe *pipe;
560	int backing;
561{
562	int error;
563
564	if (backing) {
565		if (amountpipekva > maxpipekva / 2)
566			error = pipespace_new(pipe, SMALL_PIPE_SIZE);
567		else
568			error = pipespace_new(pipe, PIPE_SIZE);
569	} else {
570		/* If we're not backing this pipe, no need to do anything. */
571		error = 0;
572	}
573	return (error);
574}
575
576/* ARGSUSED */
577static int
578pipe_read(fp, uio, active_cred, flags, td)
579	struct file *fp;
580	struct uio *uio;
581	struct ucred *active_cred;
582	struct thread *td;
583	int flags;
584{
585	struct pipe *rpipe = fp->f_data;
586	int error;
587	int nread = 0;
588	u_int size;
589
590	PIPE_LOCK(rpipe);
591	++rpipe->pipe_busy;
592	error = pipelock(rpipe, 1);
593	if (error)
594		goto unlocked_error;
595
596#ifdef MAC
597	error = mac_check_pipe_read(active_cred, rpipe->pipe_pair);
598	if (error)
599		goto locked_error;
600#endif
601	if (amountpipekva > (3 * maxpipekva) / 4) {
602		if (!(rpipe->pipe_state & PIPE_DIRECTW) &&
603			(rpipe->pipe_buffer.size > SMALL_PIPE_SIZE) &&
604			(rpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) &&
605			(piperesizeallowed == 1)) {
606			PIPE_UNLOCK(rpipe);
607			pipespace(rpipe, SMALL_PIPE_SIZE);
608			PIPE_LOCK(rpipe);
609		}
610	}
611
612	while (uio->uio_resid) {
613		/*
614		 * normal pipe buffer receive
615		 */
616		if (rpipe->pipe_buffer.cnt > 0) {
617			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
618			if (size > rpipe->pipe_buffer.cnt)
619				size = rpipe->pipe_buffer.cnt;
620			if (size > (u_int) uio->uio_resid)
621				size = (u_int) uio->uio_resid;
622
623			PIPE_UNLOCK(rpipe);
624			error = uiomove(
625			    &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
626			    size, uio);
627			PIPE_LOCK(rpipe);
628			if (error)
629				break;
630
631			rpipe->pipe_buffer.out += size;
632			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
633				rpipe->pipe_buffer.out = 0;
634
635			rpipe->pipe_buffer.cnt -= size;
636
637			/*
638			 * If there is no more to read in the pipe, reset
639			 * its pointers to the beginning.  This improves
640			 * cache hit stats.
641			 */
642			if (rpipe->pipe_buffer.cnt == 0) {
643				rpipe->pipe_buffer.in = 0;
644				rpipe->pipe_buffer.out = 0;
645			}
646			nread += size;
647#ifndef PIPE_NODIRECT
648		/*
649		 * Direct copy, bypassing a kernel buffer.
650		 */
651		} else if ((size = rpipe->pipe_map.cnt) &&
652			   (rpipe->pipe_state & PIPE_DIRECTW)) {
653			if (size > (u_int) uio->uio_resid)
654				size = (u_int) uio->uio_resid;
655
656			PIPE_UNLOCK(rpipe);
657			error = uiomove_fromphys(rpipe->pipe_map.ms,
658			    rpipe->pipe_map.pos, size, uio);
659			PIPE_LOCK(rpipe);
660			if (error)
661				break;
662			nread += size;
663			rpipe->pipe_map.pos += size;
664			rpipe->pipe_map.cnt -= size;
665			if (rpipe->pipe_map.cnt == 0) {
666				rpipe->pipe_state &= ~PIPE_DIRECTW;
667				wakeup(rpipe);
668			}
669#endif
670		} else {
671			/*
672			 * detect EOF condition
673			 * read returns 0 on EOF, no need to set error
674			 */
675			if (rpipe->pipe_state & PIPE_EOF)
676				break;
677
678			/*
679			 * If the "write-side" has been blocked, wake it up now.
680			 */
681			if (rpipe->pipe_state & PIPE_WANTW) {
682				rpipe->pipe_state &= ~PIPE_WANTW;
683				wakeup(rpipe);
684			}
685
686			/*
687			 * Break if some data was read.
688			 */
689			if (nread > 0)
690				break;
691
692			/*
693			 * Unlock the pipe buffer for our remaining processing.
694			 * We will either break out with an error or we will
695			 * sleep and relock to loop.
696			 */
697			pipeunlock(rpipe);
698
699			/*
700			 * Handle non-blocking mode operation or
701			 * wait for more data.
702			 */
703			if (fp->f_flag & FNONBLOCK) {
704				error = EAGAIN;
705			} else {
706				rpipe->pipe_state |= PIPE_WANTR;
707				if ((error = msleep(rpipe, PIPE_MTX(rpipe),
708				    PRIBIO | PCATCH,
709				    "piperd", 0)) == 0)
710					error = pipelock(rpipe, 1);
711			}
712			if (error)
713				goto unlocked_error;
714		}
715	}
716#ifdef MAC
717locked_error:
718#endif
719	pipeunlock(rpipe);
720
721	/* XXX: should probably do this before getting any locks. */
722	if (error == 0)
723		vfs_timestamp(&rpipe->pipe_atime);
724unlocked_error:
725	--rpipe->pipe_busy;
726
727	/*
728	 * PIPE_WANT processing only makes sense if pipe_busy is 0.
729	 */
730	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
731		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
732		wakeup(rpipe);
733	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
734		/*
735		 * Handle write blocking hysteresis.
736		 */
737		if (rpipe->pipe_state & PIPE_WANTW) {
738			rpipe->pipe_state &= ~PIPE_WANTW;
739			wakeup(rpipe);
740		}
741	}
742
743	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
744		pipeselwakeup(rpipe);
745
746	PIPE_UNLOCK(rpipe);
747	return (error);
748}
749
750#ifndef PIPE_NODIRECT
751/*
752 * Map the sending process's buffer into kernel space and wire it.
753 * This is similar to a physical write operation.
754 */
755static int
756pipe_build_write_buffer(wpipe, uio)
757	struct pipe *wpipe;
758	struct uio *uio;
759{
760	pmap_t pmap;
761	u_int size;
762	int i, j;
763	vm_offset_t addr, endaddr;
764
765	PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);
766	KASSERT(wpipe->pipe_state & PIPE_DIRECTW,
767		("Clone attempt on non-direct write pipe!"));
768
769	size = (u_int) uio->uio_iov->iov_len;
770	if (size > wpipe->pipe_buffer.size)
771		size = wpipe->pipe_buffer.size;
772
773	pmap = vmspace_pmap(curproc->p_vmspace);
774	endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
775	addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
776	for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) {
777		/*
778		 * vm_fault_quick() can sleep.  Consequently,
779		 * vm_page_lock_queues() and vm_page_unlock_queues()
780		 * should not be performed outside of this loop.
781		 */
782	race:
783		if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0) {
784			vm_page_lock_queues();
785			for (j = 0; j < i; j++)
786				vm_page_unhold(wpipe->pipe_map.ms[j]);
787			vm_page_unlock_queues();
788			return (EFAULT);
789		}
790		wpipe->pipe_map.ms[i] = pmap_extract_and_hold(pmap, addr,
791		    VM_PROT_READ);
792		if (wpipe->pipe_map.ms[i] == NULL)
793			goto race;
794	}
795
796/*
797 * set up the control block
798 */
799	wpipe->pipe_map.npages = i;
800	wpipe->pipe_map.pos =
801	    ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
802	wpipe->pipe_map.cnt = size;
803
804/*
805 * and update the uio data
806 */
807
808	uio->uio_iov->iov_len -= size;
809	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size;
810	if (uio->uio_iov->iov_len == 0)
811		uio->uio_iov++;
812	uio->uio_resid -= size;
813	uio->uio_offset += size;
814	return (0);
815}
816
817/*
818 * unmap and unwire the process buffer
819 */
820static void
821pipe_destroy_write_buffer(wpipe)
822	struct pipe *wpipe;
823{
824	int i;
825
826	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
827	vm_page_lock_queues();
828	for (i = 0; i < wpipe->pipe_map.npages; i++) {
829		vm_page_unhold(wpipe->pipe_map.ms[i]);
830	}
831	vm_page_unlock_queues();
832	wpipe->pipe_map.npages = 0;
833}
834
835/*
836 * In the case of a signal, the writing process might go away.  This
837 * code copies the data into the circular buffer so that the source
838 * pages can be freed without loss of data.
839 */
840static void
841pipe_clone_write_buffer(wpipe)
842	struct pipe *wpipe;
843{
844	struct uio uio;
845	struct iovec iov;
846	int size;
847	int pos;
848
849	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
850	size = wpipe->pipe_map.cnt;
851	pos = wpipe->pipe_map.pos;
852
853	wpipe->pipe_buffer.in = size;
854	wpipe->pipe_buffer.out = 0;
855	wpipe->pipe_buffer.cnt = size;
856	wpipe->pipe_state &= ~PIPE_DIRECTW;
857
858	PIPE_UNLOCK(wpipe);
859	iov.iov_base = wpipe->pipe_buffer.buffer;
860	iov.iov_len = size;
861	uio.uio_iov = &iov;
862	uio.uio_iovcnt = 1;
863	uio.uio_offset = 0;
864	uio.uio_resid = size;
865	uio.uio_segflg = UIO_SYSSPACE;
866	uio.uio_rw = UIO_READ;
867	uio.uio_td = curthread;
868	uiomove_fromphys(wpipe->pipe_map.ms, pos, size, &uio);
869	PIPE_LOCK(wpipe);
870	pipe_destroy_write_buffer(wpipe);
871}
872
873/*
874 * This implements the pipe buffer write mechanism.  Note that only
875 * a direct write OR a normal pipe write can be pending at any given time.
876 * If there are any characters in the pipe buffer, the direct write will
877 * be deferred until the receiving process grabs all of the bytes from
878 * the pipe buffer.  Then the direct mapping write is set up.
879 */
880static int
881pipe_direct_write(wpipe, uio)
882	struct pipe *wpipe;
883	struct uio *uio;
884{
885	int error;
886
887retry:
888	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
889	error = pipelock(wpipe, 1);
890	if (wpipe->pipe_state & PIPE_EOF)
891		error = EPIPE;
892	if (error) {
893		pipeunlock(wpipe);
894		goto error1;
895	}
896	while (wpipe->pipe_state & PIPE_DIRECTW) {
897		if (wpipe->pipe_state & PIPE_WANTR) {
898			wpipe->pipe_state &= ~PIPE_WANTR;
899			wakeup(wpipe);
900		}
901		wpipe->pipe_state |= PIPE_WANTW;
902		pipeunlock(wpipe);
903		error = msleep(wpipe, PIPE_MTX(wpipe),
904		    PRIBIO | PCATCH, "pipdww", 0);
905		if (error)
906			goto error1;
907		else
908			goto retry;
909	}
910	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
911	if (wpipe->pipe_buffer.cnt > 0) {
912		if (wpipe->pipe_state & PIPE_WANTR) {
913			wpipe->pipe_state &= ~PIPE_WANTR;
914			wakeup(wpipe);
915		}
916		wpipe->pipe_state |= PIPE_WANTW;
917		pipeunlock(wpipe);
918		error = msleep(wpipe, PIPE_MTX(wpipe),
919		    PRIBIO | PCATCH, "pipdwc", 0);
920		if (error)
921			goto error1;
922		else
923			goto retry;
924	}
925
926	wpipe->pipe_state |= PIPE_DIRECTW;
927
928	PIPE_UNLOCK(wpipe);
929	error = pipe_build_write_buffer(wpipe, uio);
930	PIPE_LOCK(wpipe);
931	if (error) {
932		wpipe->pipe_state &= ~PIPE_DIRECTW;
933		pipeunlock(wpipe);
934		goto error1;
935	}
936
937	error = 0;
938	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
939		if (wpipe->pipe_state & PIPE_EOF) {
940			pipe_destroy_write_buffer(wpipe);
941			pipeselwakeup(wpipe);
942			pipeunlock(wpipe);
943			error = EPIPE;
944			goto error1;
945		}
946		if (wpipe->pipe_state & PIPE_WANTR) {
947			wpipe->pipe_state &= ~PIPE_WANTR;
948			wakeup(wpipe);
949		}
950		pipeselwakeup(wpipe);
951		pipeunlock(wpipe);
952		error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH,
953		    "pipdwt", 0);
954		pipelock(wpipe, 0);
955	}
956
957	if (wpipe->pipe_state & PIPE_EOF)
958		error = EPIPE;
959	if (wpipe->pipe_state & PIPE_DIRECTW) {
960		/*
961		 * this bit of trickery substitutes a kernel buffer for
962		 * the process that might be going away.
963		 */
964		pipe_clone_write_buffer(wpipe);
965	} else {
966		pipe_destroy_write_buffer(wpipe);
967	}
968	pipeunlock(wpipe);
969	return (error);
970
971error1:
972	wakeup(wpipe);
973	return (error);
974}
975#endif
976
977static int
978pipe_write(fp, uio, active_cred, flags, td)
979	struct file *fp;
980	struct uio *uio;
981	struct ucred *active_cred;
982	struct thread *td;
983	int flags;
984{
985	int error = 0;
986	int desiredsize, orig_resid;
987	struct pipe *wpipe, *rpipe;
988
989	rpipe = fp->f_data;
990	wpipe = rpipe->pipe_peer;
991
992	PIPE_LOCK(rpipe);
993	error = pipelock(wpipe, 1);
994	if (error) {
995		PIPE_UNLOCK(rpipe);
996		return (error);
997	}
998	/*
999	 * detect loss of pipe read side, issue SIGPIPE if lost.
1000	 */
1001	if ((!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) {
1002		pipeunlock(wpipe);
1003		PIPE_UNLOCK(rpipe);
1004		return (EPIPE);
1005	}
1006#ifdef MAC
1007	error = mac_check_pipe_write(active_cred, wpipe->pipe_pair);
1008	if (error) {
1009		pipeunlock(wpipe);
1010		PIPE_UNLOCK(rpipe);
1011		return (error);
1012	}
1013#endif
1014	++wpipe->pipe_busy;
1015
1016	/* Choose a larger size if it's advantageous */
1017	desiredsize = max(SMALL_PIPE_SIZE, wpipe->pipe_buffer.size);
1018	while (desiredsize < wpipe->pipe_buffer.cnt + uio->uio_resid) {
1019		if (piperesizeallowed != 1)
1020			break;
1021		if (amountpipekva > maxpipekva / 2)
1022			break;
1023		if (desiredsize == BIG_PIPE_SIZE)
1024			break;
1025		desiredsize = desiredsize * 2;
1026	}
1027
1028	/* Choose a smaller size if we're in an OOM situation */
1029	if ((amountpipekva > (3 * maxpipekva) / 4) &&
1030		(wpipe->pipe_buffer.size > SMALL_PIPE_SIZE) &&
1031		(wpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) &&
1032		(piperesizeallowed == 1))
1033		desiredsize = SMALL_PIPE_SIZE;
1034
1035	/* Resize if the above determined that a new size was necessary */
1036	if ((desiredsize != wpipe->pipe_buffer.size) &&
1037		((wpipe->pipe_state & PIPE_DIRECTW) == 0)) {
1038		PIPE_UNLOCK(wpipe);
1039		pipespace(wpipe, desiredsize);
1040		PIPE_LOCK(wpipe);
1041	}
1042	if (wpipe->pipe_buffer.size == 0) {
1043		/*
1044		 * This can only happen for reverse direction use of pipes
1045		 * in a complete OOM situation.
1046		 */
1047		error = ENOMEM;
1048		--wpipe->pipe_busy;
1049		pipeunlock(wpipe);
1050		PIPE_UNLOCK(wpipe);
1051		return (error);
1052	}
1053
1054	pipeunlock(wpipe);
1055
1056	orig_resid = uio->uio_resid;
1057
1058	while (uio->uio_resid) {
1059		int space;
1060
1061		pipelock(wpipe, 0);
1062		if (wpipe->pipe_state & PIPE_EOF) {
1063			pipeunlock(wpipe);
1064			error = EPIPE;
1065			break;
1066		}
1067#ifndef PIPE_NODIRECT
1068		/*
1069		 * If the transfer is large, we can gain performance if
1070		 * we do process-to-process copies directly.
1071		 * If the write is non-blocking, we don't use the
1072		 * direct write mechanism.
1073		 *
1074		 * The direct write mechanism will detect the reader going
1075		 * away on us.
1076		 */
1077		if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
1078		    (wpipe->pipe_buffer.size >= PIPE_MINDIRECT) &&
1079		    (fp->f_flag & FNONBLOCK) == 0) {
1080			pipeunlock(wpipe);
1081			error = pipe_direct_write(wpipe, uio);
1082			if (error)
1083				break;
1084			continue;
1085		}
1086#endif
1087
1088		/*
1089		 * Pipe buffered writes cannot proceed concurrently with
1090		 * direct writes.  We wait until the currently executing
1091		 * direct write is completed before we start filling the
1092		 * pipe buffer.  We break out if a signal occurs or the
1093		 * reader goes away.
1094		 */
1095		if (wpipe->pipe_state & PIPE_DIRECTW) {
1096			if (wpipe->pipe_state & PIPE_WANTR) {
1097				wpipe->pipe_state &= ~PIPE_WANTR;
1098				wakeup(wpipe);
1099			}
1100			pipeunlock(wpipe);
1101			error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH,
1102			    "pipbww", 0);
1103			if (error)
1104				break;
1105			else
1106				continue;
1107		}
1108
1109		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
1110
1111		/* Writes of size <= PIPE_BUF must be atomic. */
1112		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
1113			space = 0;
1114
1115		if (space > 0) {
1116			int size;	/* Transfer size */
1117			int segsize;	/* first segment to transfer */
1118
1119			/*
1120			 * Transfer size is minimum of uio transfer
1121			 * and free space in pipe buffer.
1122			 */
1123			if (space > uio->uio_resid)
1124				size = uio->uio_resid;
1125			else
1126				size = space;
1127			/*
1128			 * First segment to transfer is minimum of
1129			 * transfer size and contiguous space in
1130			 * pipe buffer.  If first segment to transfer
1131			 * is less than the transfer size, we've got
1132			 * a wraparound in the buffer.
1133			 */
1134			segsize = wpipe->pipe_buffer.size -
1135				wpipe->pipe_buffer.in;
1136			if (segsize > size)
1137				segsize = size;
1138
1139			/* Transfer first segment */
1140
1141			PIPE_UNLOCK(rpipe);
1142			error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
1143					segsize, uio);
1144			PIPE_LOCK(rpipe);
1145
1146			if (error == 0 && segsize < size) {
1147				KASSERT(wpipe->pipe_buffer.in + segsize ==
1148					wpipe->pipe_buffer.size,
1149					("Pipe buffer wraparound disappeared"));
1150				/*
1151				 * Transfer remaining part now, to
1152				 * support atomic writes.  Wraparound
1153				 * happened.
1154				 */
1155
1156				PIPE_UNLOCK(rpipe);
1157				error = uiomove(
1158				    &wpipe->pipe_buffer.buffer[0],
1159				    size - segsize, uio);
1160				PIPE_LOCK(rpipe);
1161			}
1162			if (error == 0) {
1163				wpipe->pipe_buffer.in += size;
1164				if (wpipe->pipe_buffer.in >=
1165				    wpipe->pipe_buffer.size) {
1166					KASSERT(wpipe->pipe_buffer.in ==
1167						size - segsize +
1168						wpipe->pipe_buffer.size,
1169						("Expected wraparound bad"));
1170					wpipe->pipe_buffer.in = size - segsize;
1171				}
1172
1173				wpipe->pipe_buffer.cnt += size;
1174				KASSERT(wpipe->pipe_buffer.cnt <=
1175					wpipe->pipe_buffer.size,
1176					("Pipe buffer overflow"));
1177			}
1178			pipeunlock(wpipe);
1179			if (error != 0)
1180				break;
1181		} else {
1182			/*
1183			 * If the "read-side" has been blocked, wake it up now.
1184			 */
1185			if (wpipe->pipe_state & PIPE_WANTR) {
1186				wpipe->pipe_state &= ~PIPE_WANTR;
1187				wakeup(wpipe);
1188			}
1189
1190			/*
1191			 * don't block on non-blocking I/O
1192			 */
1193			if (fp->f_flag & FNONBLOCK) {
1194				error = EAGAIN;
1195				pipeunlock(wpipe);
1196				break;
1197			}
1198
1199			/*
1200			 * We have no more space and have something to offer,
1201			 * wake up select/poll.
1202			 */
1203			pipeselwakeup(wpipe);
1204
1205			wpipe->pipe_state |= PIPE_WANTW;
1206			pipeunlock(wpipe);
1207			error = msleep(wpipe, PIPE_MTX(rpipe),
1208			    PRIBIO | PCATCH, "pipewr", 0);
1209			if (error != 0)
1210				break;
1211		}
1212	}
1213
1214	pipelock(wpipe, 0);
1215	--wpipe->pipe_busy;
1216
1217	if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
1218		wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
1219		wakeup(wpipe);
1220	} else if (wpipe->pipe_buffer.cnt > 0) {
1221		/*
1222		 * If we have put any characters in the buffer, we wake up
1223		 * the reader.
1224		 */
1225		if (wpipe->pipe_state & PIPE_WANTR) {
1226			wpipe->pipe_state &= ~PIPE_WANTR;
1227			wakeup(wpipe);
1228		}
1229	}
1230
1231	/*
1232	 * Don't return EPIPE if I/O was successful
1233	 */
1234	if ((wpipe->pipe_buffer.cnt == 0) &&
1235	    (uio->uio_resid == 0) &&
1236	    (error == EPIPE)) {
1237		error = 0;
1238	}
1239
1240	if (error == 0)
1241		vfs_timestamp(&wpipe->pipe_mtime);
1242
1243	/*
1244	 * We have something to offer,
1245	 * wake up select/poll.
1246	 */
1247	if (wpipe->pipe_buffer.cnt)
1248		pipeselwakeup(wpipe);
1249
1250	pipeunlock(wpipe);
1251	PIPE_UNLOCK(rpipe);
1252	return (error);
1253}
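
/*
 * An illustrative userland sketch of the PIPE_BUF atomicity rule
 * enforced above (kept under #if 0; not built): with O_NONBLOCK set, a
 * write of at most PIPE_BUF bytes either completes in full or fails
 * with EAGAIN, and is never split.
 */
#if 0
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	int fd[2];
	char chunk[PIPE_BUF];
	ssize_t n;
	int full = 0;

	if (pipe(fd) == -1)
		err(1, "pipe");
	if (fcntl(fd[1], F_SETFL, O_NONBLOCK) == -1)
		err(1, "fcntl");
	memset(chunk, 'z', sizeof(chunk));

	for (;;) {
		n = write(fd[1], chunk, sizeof(chunk));
		if (n == (ssize_t)sizeof(chunk)) {
			full++;		/* chunk transferred atomically */
			continue;
		}
		if (n == -1 && errno == EAGAIN)
			break;		/* pipe full; no partial chunk written */
		errx(1, "unexpected write return %zd", n);
	}
	return (full > 0 ? 0 : 1);
}
#endif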
1254
1255/*
1256 * we implement a very minimal set of ioctls for compatibility with sockets.
1257 */
1258static int
1259pipe_ioctl(fp, cmd, data, active_cred, td)
1260	struct file *fp;
1261	u_long cmd;
1262	void *data;
1263	struct ucred *active_cred;
1264	struct thread *td;
1265{
1266	struct pipe *mpipe = fp->f_data;
1267	int error;
1268
1269	PIPE_LOCK(mpipe);
1270
1271#ifdef MAC
1272	error = mac_check_pipe_ioctl(active_cred, mpipe->pipe_pair, cmd, data);
1273	if (error) {
1274		PIPE_UNLOCK(mpipe);
1275		return (error);
1276	}
1277#endif
1278
1279	error = 0;
1280	switch (cmd) {
1281
1282	case FIONBIO:
1283		break;
1284
1285	case FIOASYNC:
1286		if (*(int *)data) {
1287			mpipe->pipe_state |= PIPE_ASYNC;
1288		} else {
1289			mpipe->pipe_state &= ~PIPE_ASYNC;
1290		}
1291		break;
1292
1293	case FIONREAD:
1294		if (mpipe->pipe_state & PIPE_DIRECTW)
1295			*(int *)data = mpipe->pipe_map.cnt;
1296		else
1297			*(int *)data = mpipe->pipe_buffer.cnt;
1298		break;
1299
1300	case FIOSETOWN:
1301		PIPE_UNLOCK(mpipe);
1302		error = fsetown(*(int *)data, &mpipe->pipe_sigio);
1303		goto out_unlocked;
1304
1305	case FIOGETOWN:
1306		*(int *)data = fgetown(&mpipe->pipe_sigio);
1307		break;
1308
1309	/* This is deprecated, FIOSETOWN should be used instead. */
1310	case TIOCSPGRP:
1311		PIPE_UNLOCK(mpipe);
1312		error = fsetown(-(*(int *)data), &mpipe->pipe_sigio);
1313		goto out_unlocked;
1314
1315	/* This is deprecated, FIOGETOWN should be used instead. */
1316	case TIOCGPGRP:
1317		*(int *)data = -fgetown(&mpipe->pipe_sigio);
1318		break;
1319
1320	default:
1321		error = ENOTTY;
1322		break;
1323	}
1324	PIPE_UNLOCK(mpipe);
1325out_unlocked:
1326	return (error);
1327}
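
/*
 * An illustrative userland sketch of FIONREAD as handled above (kept
 * under #if 0; not built): it reports the number of bytes currently
 * buffered in the pipe.
 */
#if 0
#include <sys/ioctl.h>
#include <assert.h>
#include <unistd.h>

int
main(void)
{
	int fd[2];
	int avail;

	if (pipe(fd) == -1)
		return (1);
	if (write(fd[1], "abcdef", 6) != 6)
		return (1);
	if (ioctl(fd[0], FIONREAD, &avail) == -1)
		return (1);
	assert(avail == 6);
	return (0);
}
#endif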
1328
1329static int
1330pipe_poll(fp, events, active_cred, td)
1331	struct file *fp;
1332	int events;
1333	struct ucred *active_cred;
1334	struct thread *td;
1335{
1336	struct pipe *rpipe = fp->f_data;
1337	struct pipe *wpipe;
1338	int revents = 0;
1339#ifdef MAC
1340	int error;
1341#endif
1342
1343	wpipe = rpipe->pipe_peer;
1344	PIPE_LOCK(rpipe);
1345#ifdef MAC
1346	error = mac_check_pipe_poll(active_cred, rpipe->pipe_pair);
1347	if (error)
1348		goto locked_error;
1349#endif
1350	if (events & (POLLIN | POLLRDNORM))
1351		if ((rpipe->pipe_state & PIPE_DIRECTW) ||
1352		    (rpipe->pipe_buffer.cnt > 0) ||
1353		    (rpipe->pipe_state & PIPE_EOF))
1354			revents |= events & (POLLIN | POLLRDNORM);
1355
1356	if (events & (POLLOUT | POLLWRNORM))
1357		if (!wpipe->pipe_present || (wpipe->pipe_state & PIPE_EOF) ||
1358		    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
1359		     (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
1360			revents |= events & (POLLOUT | POLLWRNORM);
1361
1362	if ((rpipe->pipe_state & PIPE_EOF) ||
1363	    (!wpipe->pipe_present) ||
1364	    (wpipe->pipe_state & PIPE_EOF))
1365		revents |= POLLHUP;
1366
1367	if (revents == 0) {
1368		if (events & (POLLIN | POLLRDNORM)) {
1369			selrecord(td, &rpipe->pipe_sel);
1370			rpipe->pipe_state |= PIPE_SEL;
1371		}
1372
1373		if (events & (POLLOUT | POLLWRNORM)) {
1374			selrecord(td, &wpipe->pipe_sel);
1375			wpipe->pipe_state |= PIPE_SEL;
1376		}
1377	}
1378#ifdef MAC
1379locked_error:
1380#endif
1381	PIPE_UNLOCK(rpipe);
1382
1383	return (revents);
1384}
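
/*
 * An illustrative userland sketch of poll(2) against a pipe (kept under
 * #if 0; not built): buffered data sets POLLIN on the read side, and
 * closing the write side additionally reports POLLHUP, as implemented
 * above.
 */
#if 0
#include <assert.h>
#include <poll.h>
#include <unistd.h>

int
main(void)
{
	int fd[2], n;
	struct pollfd pfd;

	if (pipe(fd) == -1)
		return (1);
	pfd.fd = fd[0];
	pfd.events = POLLIN;

	/* Nothing buffered yet: a zero-timeout poll() reports no events. */
	n = poll(&pfd, 1, 0);
	assert(n == 0);

	/* Buffered data sets POLLIN on the read side. */
	(void)write(fd[1], "x", 1);
	n = poll(&pfd, 1, 0);
	assert(n == 1 && (pfd.revents & POLLIN) != 0);

	/* Closing the write side additionally reports POLLHUP. */
	close(fd[1]);
	n = poll(&pfd, 1, 0);
	assert(n == 1 && (pfd.revents & POLLHUP) != 0);
	return (0);
}
#endif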
1385
1386/*
1387 * We shouldn't need locks here as we're doing a read and this should
1388 * be a natural race.
1389 */
1390static int
1391pipe_stat(fp, ub, active_cred, td)
1392	struct file *fp;
1393	struct stat *ub;
1394	struct ucred *active_cred;
1395	struct thread *td;
1396{
1397	struct pipe *pipe = fp->f_data;
1398#ifdef MAC
1399	int error;
1400
1401	PIPE_LOCK(pipe);
1402	error = mac_check_pipe_stat(active_cred, pipe->pipe_pair);
1403	PIPE_UNLOCK(pipe);
1404	if (error)
1405		return (error);
1406#endif
1407	bzero(ub, sizeof(*ub));
1408	ub->st_mode = S_IFIFO;
1409	ub->st_blksize = PAGE_SIZE;
1410	if (pipe->pipe_state & PIPE_DIRECTW)
1411		ub->st_size = pipe->pipe_map.cnt;
1412	else
1413		ub->st_size = pipe->pipe_buffer.cnt;
1414	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
1415	ub->st_atimespec = pipe->pipe_atime;
1416	ub->st_mtimespec = pipe->pipe_mtime;
1417	ub->st_ctimespec = pipe->pipe_ctime;
1418	ub->st_uid = fp->f_cred->cr_uid;
1419	ub->st_gid = fp->f_cred->cr_gid;
1420	/*
1421	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
1422	 * XXX (st_dev, st_ino) should be unique.
1423	 */
1424	return (0);
1425}
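
/*
 * An illustrative userland sketch of fstat(2) on a pipe (kept under
 * #if 0; not built): the endpoint reports as a FIFO and st_size
 * reflects the amount of buffered data, as filled in above.
 */
#if 0
#include <sys/stat.h>
#include <assert.h>
#include <unistd.h>

int
main(void)
{
	int fd[2];
	struct stat sb;

	if (pipe(fd) == -1)
		return (1);
	if (write(fd[1], "abc", 3) != 3)
		return (1);
	if (fstat(fd[0], &sb) == -1)
		return (1);
	assert(S_ISFIFO(sb.st_mode));
	assert(sb.st_size == 3);
	return (0);
}
#endif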
1426
1427/* ARGSUSED */
1428static int
1429pipe_close(fp, td)
1430	struct file *fp;
1431	struct thread *td;
1432{
1433	struct pipe *cpipe = fp->f_data;
1434
1435	fp->f_ops = &badfileops;
1436	fp->f_data = NULL;
1437	funsetown(&cpipe->pipe_sigio);
1438	pipeclose(cpipe);
1439	return (0);
1440}
1441
1442static void
1443pipe_free_kmem(cpipe)
1444	struct pipe *cpipe;
1445{
1446
1447	KASSERT(!mtx_owned(PIPE_MTX(cpipe)),
1448	    ("pipe_free_kmem: pipe mutex locked"));
1449
1450	if (cpipe->pipe_buffer.buffer != NULL) {
1451		atomic_subtract_int(&amountpipekva, cpipe->pipe_buffer.size);
1452		vm_map_remove(pipe_map,
1453		    (vm_offset_t)cpipe->pipe_buffer.buffer,
1454		    (vm_offset_t)cpipe->pipe_buffer.buffer + cpipe->pipe_buffer.size);
1455		cpipe->pipe_buffer.buffer = NULL;
1456	}
1457#ifndef PIPE_NODIRECT
1458	{
1459		cpipe->pipe_map.cnt = 0;
1460		cpipe->pipe_map.pos = 0;
1461		cpipe->pipe_map.npages = 0;
1462	}
1463#endif
1464}
1465
1466/*
1467 * shutdown the pipe
1468 */
1469static void
1470pipeclose(cpipe)
1471	struct pipe *cpipe;
1472{
1473	struct pipepair *pp;
1474	struct pipe *ppipe;
1475
1476	KASSERT(cpipe != NULL, ("pipeclose: cpipe == NULL"));
1477
1478	PIPE_LOCK(cpipe);
1479	pipelock(cpipe, 0);
1480	pp = cpipe->pipe_pair;
1481
1482	pipeselwakeup(cpipe);
1483
1484	/*
1485	 * If the other side is blocked, wake it up saying that
1486	 * we want to close it down.
1487	 */
1488	cpipe->pipe_state |= PIPE_EOF;
1489	while (cpipe->pipe_busy) {
1490		wakeup(cpipe);
1491		cpipe->pipe_state |= PIPE_WANT;
1492		pipeunlock(cpipe);
1493		msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0);
1494		pipelock(cpipe, 0);
1495	}
1496
1497
1498	/*
1499	 * Disconnect from peer, if any.
1500	 */
1501	ppipe = cpipe->pipe_peer;
1502	if (ppipe->pipe_present != 0) {
1503		pipeselwakeup(ppipe);
1504
1505		ppipe->pipe_state |= PIPE_EOF;
1506		wakeup(ppipe);
1507		KNOTE_LOCKED(&ppipe->pipe_sel.si_note, 0);
1508	}
1509
1510	/*
1511	 * Mark this endpoint as free.  Release kmem resources.  We
1512	 * don't mark this endpoint as unused until we've finished
1513	 * doing that, or the pipe might disappear out from under
1514	 * us.
1515	 */
1516	PIPE_UNLOCK(cpipe);
1517	pipe_free_kmem(cpipe);
1518	PIPE_LOCK(cpipe);
1519	cpipe->pipe_present = 0;
1520	pipeunlock(cpipe);
1521	knlist_clear(&cpipe->pipe_sel.si_note, 1);
1522	knlist_destroy(&cpipe->pipe_sel.si_note);
1523
1524	/*
1525	 * If both endpoints are now closed, release the memory for the
1526	 * pipe pair.  If not, unlock.
1527	 */
1528	if (ppipe->pipe_present == 0) {
1529		PIPE_UNLOCK(cpipe);
1530#ifdef MAC
1531		mac_destroy_pipe(pp);
1532#endif
1533		uma_zfree(pipe_zone, cpipe->pipe_pair);
1534	} else
1535		PIPE_UNLOCK(cpipe);
1536}
1537
1538/*ARGSUSED*/
1539static int
1540pipe_kqfilter(struct file *fp, struct knote *kn)
1541{
1542	struct pipe *cpipe;
1543
1544	cpipe = kn->kn_fp->f_data;
1545	PIPE_LOCK(cpipe);
1546	switch (kn->kn_filter) {
1547	case EVFILT_READ:
1548		kn->kn_fop = &pipe_rfiltops;
1549		break;
1550	case EVFILT_WRITE:
1551		kn->kn_fop = &pipe_wfiltops;
1552		if (!cpipe->pipe_peer->pipe_present) {
1553			/* other end of pipe has been closed */
1554			PIPE_UNLOCK(cpipe);
1555			return (EPIPE);
1556		}
1557		cpipe = cpipe->pipe_peer;
1558		break;
1559	default:
1560		PIPE_UNLOCK(cpipe);
1561		return (EINVAL);
1562	}
1563
1564	knlist_add(&cpipe->pipe_sel.si_note, kn, 1);
1565	PIPE_UNLOCK(cpipe);
1566	return (0);
1567}
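
/*
 * An illustrative userland sketch of kqueue(2) monitoring a pipe (kept
 * under #if 0; not built): EVFILT_READ reports the buffered byte count
 * in kn_data via filt_piperead() below, and EV_EOF once the write side
 * is closed.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <assert.h>
#include <unistd.h>

int
main(void)
{
	int fd[2], kq, n;
	struct kevent kev;
	struct timespec zero = { 0, 0 };

	if (pipe(fd) == -1 || (kq = kqueue()) == -1)
		return (1);

	/* Register for read events on the read side (pipe_rfiltops above). */
	EV_SET(&kev, fd[0], EVFILT_READ, EV_ADD, 0, 0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		return (1);

	(void)write(fd[1], "hello", 5);
	n = kevent(kq, NULL, 0, &kev, 1, &zero);
	/* filt_piperead() reports the buffered byte count in kn_data. */
	assert(n == 1 && kev.data == 5);

	/* Closing the write side sets EV_EOF via filt_piperead(). */
	close(fd[1]);
	n = kevent(kq, NULL, 0, &kev, 1, &zero);
	assert(n == 1 && (kev.flags & EV_EOF) != 0);
	return (0);
}
#endif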
1568
1569static void
1570filt_pipedetach(struct knote *kn)
1571{
1572	struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;
1573
1574	PIPE_LOCK(cpipe);
1575	if (kn->kn_filter == EVFILT_WRITE) {
1576		if (!cpipe->pipe_peer->pipe_present) {
1577			PIPE_UNLOCK(cpipe);
1578			return;
1579		}
1580		cpipe = cpipe->pipe_peer;
1581	}
1582	knlist_remove(&cpipe->pipe_sel.si_note, kn, 1);
1583	PIPE_UNLOCK(cpipe);
1584}
1585
1586/*ARGSUSED*/
1587static int
1588filt_piperead(struct knote *kn, long hint)
1589{
1590	struct pipe *rpipe = kn->kn_fp->f_data;
1591	struct pipe *wpipe = rpipe->pipe_peer;
1592	int ret;
1593
1594	PIPE_LOCK(rpipe);
1595	kn->kn_data = rpipe->pipe_buffer.cnt;
1596	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
1597		kn->kn_data = rpipe->pipe_map.cnt;
1598
1599	if ((rpipe->pipe_state & PIPE_EOF) ||
1600	    (!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) {
1601		kn->kn_flags |= EV_EOF;
1602		PIPE_UNLOCK(rpipe);
1603		return (1);
1604	}
1605	ret = kn->kn_data > 0;
1606	PIPE_UNLOCK(rpipe);
1607	return (ret);
1608}
1609
1610/*ARGSUSED*/
1611static int
1612filt_pipewrite(struct knote *kn, long hint)
1613{
1614	struct pipe *rpipe = kn->kn_fp->f_data;
1615	struct pipe *wpipe = rpipe->pipe_peer;
1616
1617	PIPE_LOCK(rpipe);
1618	if ((!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) {
1619		kn->kn_data = 0;
1620		kn->kn_flags |= EV_EOF;
1621		PIPE_UNLOCK(rpipe);
1622		return (1);
1623	}
1624	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
1625	if (wpipe->pipe_state & PIPE_DIRECTW)
1626		kn->kn_data = 0;
1627
1628	PIPE_UNLOCK(rpipe);
1629	return (kn->kn_data >= PIPE_BUF);
1630}
1631