sys_pipe.c revision 13909
113675Sdyson/* 213675Sdyson * Copyright (c) 1996 John S. Dyson 313675Sdyson * All rights reserved. 413675Sdyson * 513675Sdyson * Redistribution and use in source and binary forms, with or without 613675Sdyson * modification, are permitted provided that the following conditions 713675Sdyson * are met: 813675Sdyson * 1. Redistributions of source code must retain the above copyright 913675Sdyson * notice immediately at the beginning of the file, without modification, 1013675Sdyson * this list of conditions, and the following disclaimer. 1113675Sdyson * 2. Redistributions in binary form must reproduce the above copyright 1213675Sdyson * notice, this list of conditions and the following disclaimer in the 1313675Sdyson * documentation and/or other materials provided with the distribution. 1413675Sdyson * 3. Absolutely no warranty of function or purpose is made by the author 1513675Sdyson * John S. Dyson. 1613675Sdyson * 4. This work was done expressly for inclusion into FreeBSD. Other use 1713675Sdyson * is allowed if this notation is included. 1813675Sdyson * 5. Modifications may be freely made to this file if the above conditions 1913675Sdyson * are met. 2013675Sdyson * 2113907Sdyson * $Id: sys_pipe.c,v 1.2 1996/01/29 02:57:33 dyson Exp $ 2213675Sdyson */ 2313675Sdyson 2413675Sdyson#ifndef OLD_PIPE 2513675Sdyson 2613675Sdyson/* 2713675Sdyson * This file contains a high-performance replacement for the socket-based 2813675Sdyson * pipes scheme originally used in FreeBSD/4.4Lite. It does not support 2913675Sdyson * all features of sockets, but does do everything that pipes normally 3013675Sdyson * do. 3113675Sdyson */ 3213675Sdyson 3313907Sdyson/* 3413907Sdyson * This code has two modes of operation, a small write mode and a large 3513907Sdyson * write mode. The small write mode acts like conventional pipes with 3613907Sdyson * a kernel buffer. If the buffer is less than PIPE_MINDIRECT, then the 3713907Sdyson * "normal" pipe buffering is done. If the buffer is between PIPE_MINDIRECT 3813907Sdyson * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and 3913907Sdyson * the receiving process can copy it directly from the pages in the sending 4013907Sdyson * process. 4113907Sdyson * 4213907Sdyson * If the sending process receives a signal, it is possible that it will 4313907Sdyson * go away, and certainly it's address space can change, because control 4413907Sdyson * is returned back to the user-mode side. In that case, the pipe code 4513907Sdyson * arranges to copy the buffer supplied by the user process, to a pageable 4613907Sdyson * kernel buffer, and the receiving process will grab the data from the 4713907Sdyson * pageable kernel buffer. Since signals don't happen all that often, 4813907Sdyson * the copy operation is normally eliminated. 4913907Sdyson * 5013907Sdyson * The constant PIPE_MINDIRECT is chosen to make sure that buffering will 5113907Sdyson * happen for small transfers so that the system will not spend all of 5213907Sdyson * it's time context switching. PIPE_SIZE is constrained by the 5313907Sdyson * amount of kernel virtual memory. 5413907Sdyson */ 5513907Sdyson 5613675Sdyson#include <sys/param.h> 5713675Sdyson#include <sys/systm.h> 5813675Sdyson#include <sys/proc.h> 5913675Sdyson#include <sys/file.h> 6013675Sdyson#include <sys/protosw.h> 6113675Sdyson#include <sys/stat.h> 6213675Sdyson#include <sys/filedesc.h> 6313675Sdyson#include <sys/malloc.h> 6413675Sdyson#include <sys/ioctl.h> 6513675Sdyson#include <sys/stat.h> 6613675Sdyson#include <sys/select.h> 6713675Sdyson#include <sys/signalvar.h> 6813675Sdyson#include <sys/errno.h> 6913675Sdyson#include <sys/queue.h> 7013675Sdyson#include <sys/vmmeter.h> 7113675Sdyson#include <sys/kernel.h> 7213675Sdyson#include <sys/sysproto.h> 7313675Sdyson#include <sys/pipe.h> 7413675Sdyson 7513675Sdyson#include <vm/vm.h> 7613675Sdyson#include <vm/vm_prot.h> 7713675Sdyson#include <vm/vm_param.h> 7813675Sdyson#include <vm/lock.h> 7913675Sdyson#include <vm/vm_object.h> 8013675Sdyson#include <vm/vm_kern.h> 8113675Sdyson#include <vm/vm_extern.h> 8213675Sdyson#include <vm/pmap.h> 8313675Sdyson#include <vm/vm_map.h> 8413907Sdyson#include <vm/vm_page.h> 8513675Sdyson 8613675Sdysonstatic int pipe_read __P((struct file *fp, struct uio *uio, 8713675Sdyson struct ucred *cred)); 8813675Sdysonstatic int pipe_write __P((struct file *fp, struct uio *uio, 8913675Sdyson struct ucred *cred)); 9013675Sdysonstatic int pipe_close __P((struct file *fp, struct proc *p)); 9113675Sdysonstatic int pipe_select __P((struct file *fp, int which, struct proc *p)); 9213675Sdysonstatic int pipe_ioctl __P((struct file *fp, int cmd, caddr_t data, struct proc *p)); 9313675Sdyson 9413675Sdysonstatic struct fileops pipeops = 9513675Sdyson { pipe_read, pipe_write, pipe_ioctl, pipe_select, pipe_close }; 9613675Sdyson 9713675Sdyson/* 9813675Sdyson * Default pipe buffer size(s), this can be kind-of large now because pipe 9913675Sdyson * space is pageable. The pipe code will try to maintain locality of 10013675Sdyson * reference for performance reasons, so small amounts of outstanding I/O 10113675Sdyson * will not wipe the cache. 10213675Sdyson */ 10313907Sdyson#define MINPIPESIZE (PIPE_SIZE/3) 10413907Sdyson#define MAXPIPESIZE (2*PIPE_SIZE/3) 10513675Sdyson 10613907Sdyson/* 10713907Sdyson * Maximum amount of kva for pipes -- this is kind-of a soft limit, but 10813907Sdyson * is there so that on large systems, we don't exhaust it. 10913907Sdyson */ 11013907Sdyson#define MAXPIPEKVA (8*1024*1024) 11113907Sdyson 11213907Sdyson/* 11313907Sdyson * Limit for direct transfers, we cannot, of course limit 11413907Sdyson * the amount of kva for pipes in general though. 11513907Sdyson */ 11613907Sdyson#define LIMITPIPEKVA (16*1024*1024) 11713907Sdysonint amountpipekva; 11813907Sdyson 11913675Sdysonstatic void pipeclose __P((struct pipe *cpipe)); 12013675Sdysonstatic void pipebufferinit __P((struct pipe *cpipe)); 12113675Sdysonstatic void pipeinit __P((struct pipe *cpipe)); 12213907Sdysonstatic __inline int pipelock __P((struct pipe *cpipe, int catch)); 12313675Sdysonstatic __inline void pipeunlock __P((struct pipe *cpipe)); 12413907Sdysonstatic int pipe_build_write_buffer __P((struct pipe *wpipe, struct uio *uio)); 12513907Sdysonstatic void pipe_destroy_write_buffer __P((struct pipe *wpipe)); 12613907Sdysonstatic int pipe_direct_write __P((struct pipe *wpipe, struct uio *uio)); 12713907Sdysonstatic void pipe_clone_write_buffer __P((struct pipe *wpipe)); 12813907Sdysonstatic void pipe_mark_pages_clean __P((struct pipe *cpipe)); 12913907Sdysonstatic int pipewrite __P((struct pipe *wpipe, struct uio *uio, int nbio)); 13013907Sdysonstatic void pipespace __P((struct pipe *cpipe)); 13113675Sdyson 13213675Sdyson/* 13313675Sdyson * The pipe system call for the DTYPE_PIPE type of pipes 13413675Sdyson */ 13513675Sdyson 13613675Sdyson/* ARGSUSED */ 13713675Sdysonint 13813675Sdysonpipe(p, uap, retval) 13913675Sdyson struct proc *p; 14013675Sdyson struct pipe_args /* { 14113675Sdyson int dummy; 14213675Sdyson } */ *uap; 14313675Sdyson int retval[]; 14413675Sdyson{ 14513675Sdyson register struct filedesc *fdp = p->p_fd; 14613675Sdyson struct file *rf, *wf; 14713675Sdyson struct pipe *rpipe, *wpipe; 14813675Sdyson int fd, error; 14913675Sdyson 15013675Sdyson rpipe = malloc( sizeof (*rpipe), M_TEMP, M_WAITOK); 15113675Sdyson pipeinit(rpipe); 15213907Sdyson rpipe->pipe_state |= PIPE_DIRECTOK; 15313675Sdyson wpipe = malloc( sizeof (*wpipe), M_TEMP, M_WAITOK); 15413675Sdyson pipeinit(wpipe); 15513907Sdyson wpipe->pipe_state |= PIPE_DIRECTOK; 15613675Sdyson 15713675Sdyson error = falloc(p, &rf, &fd); 15813675Sdyson if (error) 15913675Sdyson goto free2; 16013675Sdyson retval[0] = fd; 16113675Sdyson rf->f_flag = FREAD | FWRITE; 16213675Sdyson rf->f_type = DTYPE_PIPE; 16313675Sdyson rf->f_ops = &pipeops; 16413675Sdyson rf->f_data = (caddr_t)rpipe; 16513675Sdyson error = falloc(p, &wf, &fd); 16613675Sdyson if (error) 16713675Sdyson goto free3; 16813675Sdyson wf->f_flag = FREAD | FWRITE; 16913675Sdyson wf->f_type = DTYPE_PIPE; 17013675Sdyson wf->f_ops = &pipeops; 17113675Sdyson wf->f_data = (caddr_t)wpipe; 17213675Sdyson retval[1] = fd; 17313675Sdyson 17413675Sdyson rpipe->pipe_peer = wpipe; 17513675Sdyson wpipe->pipe_peer = rpipe; 17613675Sdyson 17713675Sdyson return (0); 17813675Sdysonfree3: 17913675Sdyson ffree(rf); 18013675Sdyson fdp->fd_ofiles[retval[0]] = 0; 18113675Sdysonfree2: 18213675Sdyson (void)pipeclose(wpipe); 18313675Sdysonfree1: 18413675Sdyson (void)pipeclose(rpipe); 18513675Sdyson return (error); 18613675Sdyson} 18713675Sdyson 18813909Sdyson/* 18913909Sdyson * Allocate kva for pipe circular buffer, the space is pageable 19013909Sdyson */ 19113675Sdysonstatic void 19213907Sdysonpipespace(cpipe) 19313675Sdyson struct pipe *cpipe; 19413675Sdyson{ 19513688Sdyson int npages, error; 19613675Sdyson 19713907Sdyson npages = round_page(cpipe->pipe_buffer.size)/PAGE_SIZE; 19813675Sdyson /* 19913675Sdyson * Create an object, I don't like the idea of paging to/from 20013675Sdyson * kernel_object. 20113675Sdyson */ 20213675Sdyson cpipe->pipe_buffer.object = vm_object_allocate(OBJT_DEFAULT, npages); 20313688Sdyson cpipe->pipe_buffer.buffer = (caddr_t) vm_map_min(kernel_map); 20413675Sdyson 20513675Sdyson /* 20613675Sdyson * Insert the object into the kernel map, and allocate kva for it. 20713675Sdyson * The map entry is, by default, pageable. 20813675Sdyson */ 20913688Sdyson error = vm_map_find(kernel_map, cpipe->pipe_buffer.object, 0, 21013907Sdyson (vm_offset_t *) &cpipe->pipe_buffer.buffer, 21113907Sdyson cpipe->pipe_buffer.size, 1, 21213688Sdyson VM_PROT_ALL, VM_PROT_ALL, 0); 21313675Sdyson 21413688Sdyson if (error != KERN_SUCCESS) 21513688Sdyson panic("pipeinit: cannot allocate pipe -- out of kvm -- code = %d", error); 21613907Sdyson amountpipekva += cpipe->pipe_buffer.size; 21713907Sdyson} 21813688Sdyson 21913907Sdyson/* 22013907Sdyson * initialize and allocate VM and memory for pipe 22113907Sdyson */ 22213907Sdysonstatic void 22313907Sdysonpipeinit(cpipe) 22413907Sdyson struct pipe *cpipe; 22513907Sdyson{ 22613907Sdyson 22713675Sdyson cpipe->pipe_buffer.in = 0; 22813675Sdyson cpipe->pipe_buffer.out = 0; 22913675Sdyson cpipe->pipe_buffer.cnt = 0; 23013907Sdyson cpipe->pipe_buffer.size = PIPE_SIZE; 23113907Sdyson /* Buffer kva gets dynamically allocated */ 23213907Sdyson cpipe->pipe_buffer.buffer = NULL; 23313675Sdyson 23413675Sdyson cpipe->pipe_state = 0; 23513675Sdyson cpipe->pipe_peer = NULL; 23613675Sdyson cpipe->pipe_busy = 0; 23713675Sdyson cpipe->pipe_ctime = time; 23813675Sdyson cpipe->pipe_atime = time; 23913675Sdyson cpipe->pipe_mtime = time; 24013675Sdyson bzero(&cpipe->pipe_sel, sizeof cpipe->pipe_sel); 24113907Sdyson 24213907Sdyson /* 24313907Sdyson * pipe data structure initializations to support direct pipe I/O 24413907Sdyson */ 24513907Sdyson cpipe->pipe_map.cnt = 0; 24613907Sdyson cpipe->pipe_map.kva = 0; 24713907Sdyson cpipe->pipe_map.pos = 0; 24813907Sdyson cpipe->pipe_map.npages = 0; 24913675Sdyson} 25013675Sdyson 25113675Sdyson 25213675Sdyson/* 25313675Sdyson * lock a pipe for I/O, blocking other access 25413675Sdyson */ 25513675Sdysonstatic __inline int 25613907Sdysonpipelock(cpipe, catch) 25713675Sdyson struct pipe *cpipe; 25813907Sdyson int catch; 25913675Sdyson{ 26013776Sdyson int error; 26113675Sdyson while (cpipe->pipe_state & PIPE_LOCK) { 26213675Sdyson cpipe->pipe_state |= PIPE_LWANT; 26313907Sdyson if (error = tsleep( &cpipe->pipe_state, 26413907Sdyson catch?(PRIBIO|PCATCH):PRIBIO, "pipelk", 0)) { 26513776Sdyson return error; 26613675Sdyson } 26713675Sdyson } 26813675Sdyson cpipe->pipe_state |= PIPE_LOCK; 26913675Sdyson return 0; 27013675Sdyson} 27113675Sdyson 27213675Sdyson/* 27313675Sdyson * unlock a pipe I/O lock 27413675Sdyson */ 27513675Sdysonstatic __inline void 27613675Sdysonpipeunlock(cpipe) 27713675Sdyson struct pipe *cpipe; 27813675Sdyson{ 27913675Sdyson cpipe->pipe_state &= ~PIPE_LOCK; 28013675Sdyson if (cpipe->pipe_state & PIPE_LWANT) { 28113675Sdyson cpipe->pipe_state &= ~PIPE_LWANT; 28213675Sdyson wakeup(&cpipe->pipe_state); 28313675Sdyson } 28413675Sdyson return; 28513675Sdyson} 28613675Sdyson 28713907Sdyson#if 0 28813907Sdysonstatic void 28913907Sdysonpipe_mark_pages_clean(cpipe) 29013907Sdyson struct pipe *cpipe; 29113907Sdyson{ 29213907Sdyson vm_size_t off; 29313907Sdyson vm_page_t m; 29413907Sdyson 29513907Sdyson for(off = 0; off < cpipe->pipe_buffer.object->size; off += 1) { 29613907Sdyson m = vm_page_lookup(cpipe->pipe_buffer.object, off); 29713907Sdyson if ((m != NULL) && (m->busy == 0) && (m->flags & PG_BUSY) == 0) { 29813907Sdyson m->dirty = 0; 29913907Sdyson pmap_clear_modify(VM_PAGE_TO_PHYS(m)); 30013907Sdyson } 30113907Sdyson } 30213907Sdyson} 30313907Sdyson#endif 30413907Sdyson 30513675Sdyson/* ARGSUSED */ 30613675Sdysonstatic int 30713675Sdysonpipe_read(fp, uio, cred) 30813675Sdyson struct file *fp; 30913675Sdyson struct uio *uio; 31013675Sdyson struct ucred *cred; 31113675Sdyson{ 31213675Sdyson 31313675Sdyson struct pipe *rpipe = (struct pipe *) fp->f_data; 31413675Sdyson int error = 0; 31513675Sdyson int nread = 0; 31613907Sdyson int size; 31713675Sdyson 31813675Sdyson ++rpipe->pipe_busy; 31913675Sdyson while (uio->uio_resid) { 32013907Sdyson /* 32113907Sdyson * normal pipe buffer receive 32213907Sdyson */ 32313675Sdyson if (rpipe->pipe_buffer.cnt > 0) { 32413675Sdyson int size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out; 32513675Sdyson if (size > rpipe->pipe_buffer.cnt) 32613675Sdyson size = rpipe->pipe_buffer.cnt; 32713675Sdyson if (size > uio->uio_resid) 32813675Sdyson size = uio->uio_resid; 32913907Sdyson if ((error = pipelock(rpipe,1)) == 0) { 33013675Sdyson error = uiomove( &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out], 33113675Sdyson size, uio); 33213675Sdyson pipeunlock(rpipe); 33313675Sdyson } 33413675Sdyson if (error) { 33513675Sdyson break; 33613675Sdyson } 33713675Sdyson rpipe->pipe_buffer.out += size; 33813675Sdyson if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size) 33913675Sdyson rpipe->pipe_buffer.out = 0; 34013675Sdyson 34113675Sdyson rpipe->pipe_buffer.cnt -= size; 34213675Sdyson nread += size; 34313675Sdyson rpipe->pipe_atime = time; 34413907Sdyson /* 34513907Sdyson * Direct copy, bypassing a kernel buffer. 34613907Sdyson */ 34713907Sdyson } else if ((size = rpipe->pipe_map.cnt) && 34813907Sdyson (rpipe->pipe_state & PIPE_DIRECTW)) { 34913907Sdyson caddr_t va; 35013907Sdyson if (size > uio->uio_resid) 35113907Sdyson size = uio->uio_resid; 35213907Sdyson if ((error = pipelock(rpipe,1)) == 0) { 35313907Sdyson va = (caddr_t) rpipe->pipe_map.kva + rpipe->pipe_map.pos; 35413907Sdyson error = uiomove(va, size, uio); 35513907Sdyson pipeunlock(rpipe); 35613907Sdyson } 35713907Sdyson if (error) 35813907Sdyson break; 35913907Sdyson nread += size; 36013907Sdyson rpipe->pipe_atime = time; 36113907Sdyson rpipe->pipe_map.pos += size; 36213907Sdyson rpipe->pipe_map.cnt -= size; 36313907Sdyson if (rpipe->pipe_map.cnt == 0) { 36413907Sdyson rpipe->pipe_state &= ~PIPE_DIRECTW; 36513907Sdyson wakeup(rpipe); 36613907Sdyson } 36713675Sdyson } else { 36813675Sdyson /* 36913675Sdyson * detect EOF condition 37013675Sdyson */ 37113675Sdyson if (rpipe->pipe_state & PIPE_EOF) { 37213675Sdyson break; 37313675Sdyson } 37413675Sdyson /* 37513675Sdyson * If the "write-side" has been blocked, wake it up now. 37613675Sdyson */ 37713675Sdyson if (rpipe->pipe_state & PIPE_WANTW) { 37813675Sdyson rpipe->pipe_state &= ~PIPE_WANTW; 37913675Sdyson wakeup(rpipe); 38013675Sdyson } 38113774Sdyson if (nread > 0) 38213675Sdyson break; 38313774Sdyson if (rpipe->pipe_state & PIPE_NBIO) { 38413774Sdyson error = EAGAIN; 38513774Sdyson break; 38613774Sdyson } 38713675Sdyson 38813675Sdyson /* 38913675Sdyson * If there is no more to read in the pipe, reset 39013675Sdyson * it's pointers to the beginning. This improves 39113675Sdyson * cache hit stats. 39213675Sdyson */ 39313675Sdyson 39413907Sdyson if ((error = pipelock(rpipe,1)) == 0) { 39513675Sdyson if (rpipe->pipe_buffer.cnt == 0) { 39613675Sdyson rpipe->pipe_buffer.in = 0; 39713675Sdyson rpipe->pipe_buffer.out = 0; 39813675Sdyson } 39913675Sdyson pipeunlock(rpipe); 40013675Sdyson } else { 40113675Sdyson break; 40213675Sdyson } 40313675Sdyson rpipe->pipe_state |= PIPE_WANTR; 40413776Sdyson if (error = tsleep(rpipe, PRIBIO|PCATCH, "piperd", 0)) { 40513675Sdyson break; 40613675Sdyson } 40713675Sdyson } 40813675Sdyson } 40913675Sdyson 41013675Sdyson --rpipe->pipe_busy; 41113675Sdyson if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) { 41213675Sdyson rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW); 41313675Sdyson wakeup(rpipe); 41413675Sdyson } else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) { 41513675Sdyson /* 41613675Sdyson * If there is no more to read in the pipe, reset 41713675Sdyson * it's pointers to the beginning. This improves 41813675Sdyson * cache hit stats. 41913675Sdyson */ 42013907Sdyson if ((error == 0) && (error = pipelock(rpipe,1)) == 0) { 42113675Sdyson if (rpipe->pipe_buffer.cnt == 0) { 42213907Sdyson#if 0 42313907Sdyson pipe_mark_pages_clean(rpipe); 42413907Sdyson#endif 42513675Sdyson rpipe->pipe_buffer.in = 0; 42613675Sdyson rpipe->pipe_buffer.out = 0; 42713675Sdyson } 42813675Sdyson pipeunlock(rpipe); 42913675Sdyson } 43013675Sdyson 43113675Sdyson /* 43213675Sdyson * If the "write-side" has been blocked, wake it up now. 43313675Sdyson */ 43413675Sdyson if (rpipe->pipe_state & PIPE_WANTW) { 43513675Sdyson rpipe->pipe_state &= ~PIPE_WANTW; 43613675Sdyson wakeup(rpipe); 43713675Sdyson } 43813675Sdyson } 43913675Sdyson if (rpipe->pipe_state & PIPE_SEL) { 44013675Sdyson rpipe->pipe_state &= ~PIPE_SEL; 44113675Sdyson selwakeup(&rpipe->pipe_sel); 44213675Sdyson } 44313675Sdyson return error; 44413675Sdyson} 44513675Sdyson 44613907Sdyson/* 44713907Sdyson * Map the sending processes' buffer into kernel space and wire it. 44813907Sdyson * This is similar to a physical write operation. 44913907Sdyson */ 45013675Sdysonstatic int 45113907Sdysonpipe_build_write_buffer(wpipe, uio) 45213907Sdyson struct pipe *wpipe; 45313675Sdyson struct uio *uio; 45413675Sdyson{ 45513907Sdyson int size; 45613907Sdyson int i; 45713907Sdyson vm_offset_t addr, endaddr, paddr; 45813907Sdyson 45913907Sdyson size = uio->uio_iov->iov_len; 46013907Sdyson if (size > wpipe->pipe_buffer.size) 46113907Sdyson size = wpipe->pipe_buffer.size; 46213907Sdyson 46313907Sdyson endaddr = round_page(uio->uio_iov->iov_base + size); 46413907Sdyson for(i = 0, addr = trunc_page(uio->uio_iov->iov_base); 46513907Sdyson addr < endaddr; 46613907Sdyson addr += PAGE_SIZE, i+=1) { 46713907Sdyson 46813907Sdyson vm_page_t m; 46913907Sdyson 47013909Sdyson vm_fault_quick( (caddr_t) addr, VM_PROT_READ); 47113907Sdyson paddr = pmap_kextract(addr); 47213907Sdyson if (!paddr) { 47313907Sdyson int j; 47413907Sdyson for(j=0;j<i;j++) 47513907Sdyson vm_page_unwire(wpipe->pipe_map.ms[j]); 47613907Sdyson return EFAULT; 47713907Sdyson } 47813907Sdyson 47913907Sdyson m = PHYS_TO_VM_PAGE(paddr); 48013907Sdyson vm_page_wire(m); 48113907Sdyson wpipe->pipe_map.ms[i] = m; 48213907Sdyson } 48313907Sdyson 48413907Sdyson/* 48513907Sdyson * set up the control block 48613907Sdyson */ 48713907Sdyson wpipe->pipe_map.npages = i; 48813907Sdyson wpipe->pipe_map.pos = ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK; 48913907Sdyson wpipe->pipe_map.cnt = size; 49013907Sdyson 49113907Sdyson/* 49213907Sdyson * and map the buffer 49313907Sdyson */ 49413907Sdyson if (wpipe->pipe_map.kva == 0) { 49513907Sdyson wpipe->pipe_map.kva = kmem_alloc_pageable(kernel_map, 49613907Sdyson wpipe->pipe_buffer.size); 49713907Sdyson amountpipekva += wpipe->pipe_buffer.size; 49813907Sdyson } 49913907Sdyson pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms, 50013907Sdyson wpipe->pipe_map.npages); 50113907Sdyson 50213907Sdyson/* 50313907Sdyson * and update the uio data 50413907Sdyson */ 50513907Sdyson 50613907Sdyson uio->uio_iov->iov_len -= size; 50713907Sdyson uio->uio_iov->iov_base += size; 50813907Sdyson if (uio->uio_iov->iov_len == 0) 50913907Sdyson uio->uio_iov++; 51013907Sdyson uio->uio_resid -= size; 51113907Sdyson uio->uio_offset += size; 51213907Sdyson return 0; 51313907Sdyson} 51413907Sdyson 51513907Sdyson/* 51613907Sdyson * unmap and unwire the process buffer 51713907Sdyson */ 51813907Sdysonstatic void 51913907Sdysonpipe_destroy_write_buffer(wpipe) 52013907Sdysonstruct pipe *wpipe; 52113907Sdyson{ 52213907Sdyson int i; 52313907Sdyson pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages); 52413907Sdyson 52513907Sdyson if (wpipe->pipe_map.kva) { 52613907Sdyson if (amountpipekva > MAXPIPEKVA) { 52713907Sdyson vm_offset_t kva = wpipe->pipe_map.kva; 52813907Sdyson wpipe->pipe_map.kva = 0; 52913907Sdyson kmem_free(kernel_map, kva, 53013907Sdyson wpipe->pipe_buffer.size); 53113907Sdyson amountpipekva -= wpipe->pipe_buffer.size; 53213907Sdyson } 53313907Sdyson } 53413907Sdyson for (i=0;i<wpipe->pipe_map.npages;i++) 53513907Sdyson vm_page_unwire(wpipe->pipe_map.ms[i]); 53613907Sdyson} 53713907Sdyson 53813907Sdyson/* 53913907Sdyson * In the case of a signal, the writing process might go away. This 54013907Sdyson * code copies the data into the circular buffer so that the source 54113907Sdyson * pages can be freed without loss of data. 54213907Sdyson */ 54313907Sdysonstatic void 54413907Sdysonpipe_clone_write_buffer(wpipe) 54513907Sdysonstruct pipe *wpipe; 54613907Sdyson{ 54713907Sdyson int size; 54813907Sdyson int pos; 54913907Sdyson 55013907Sdyson size = wpipe->pipe_map.cnt; 55113907Sdyson pos = wpipe->pipe_map.pos; 55213907Sdyson bcopy((caddr_t) wpipe->pipe_map.kva+pos, 55313907Sdyson (caddr_t) wpipe->pipe_buffer.buffer, 55413907Sdyson size); 55513907Sdyson 55613907Sdyson wpipe->pipe_buffer.in = size; 55713907Sdyson wpipe->pipe_buffer.out = 0; 55813907Sdyson wpipe->pipe_buffer.cnt = size; 55913907Sdyson wpipe->pipe_state &= ~PIPE_DIRECTW; 56013907Sdyson 56113907Sdyson pipe_destroy_write_buffer(wpipe); 56213907Sdyson} 56313907Sdyson 56413907Sdyson/* 56513907Sdyson * This implements the pipe buffer write mechanism. Note that only 56613907Sdyson * a direct write OR a normal pipe write can be pending at any given time. 56713907Sdyson * If there are any characters in the pipe buffer, the direct write will 56813907Sdyson * be deferred until the receiving process grabs all of the bytes from 56913907Sdyson * the pipe buffer. Then the direct mapping write is set-up. 57013907Sdyson */ 57113907Sdysonstatic int 57213907Sdysonpipe_direct_write(wpipe, uio) 57313907Sdyson struct pipe *wpipe; 57413907Sdyson struct uio *uio; 57513907Sdyson{ 57613907Sdyson int error; 57713907Sdyson while (wpipe->pipe_state & PIPE_DIRECTW) { 57813907Sdyson error = tsleep(wpipe, 57913907Sdyson PRIBIO|PCATCH, "pipdww", 0); 58013907Sdyson if (error || (wpipe->pipe_state & PIPE_EOF)) 58113907Sdyson goto error1; 58213907Sdyson } 58313907Sdyson wpipe->pipe_map.cnt = 0; /* transfer not ready yet */ 58413907Sdyson wpipe->pipe_state |= PIPE_DIRECTW; 58513907Sdyson while (wpipe->pipe_buffer.cnt > 0) { 58613907Sdyson error = tsleep(wpipe, 58713907Sdyson PRIBIO|PCATCH, "pipdwc", 0); 58813907Sdyson if (error || (wpipe->pipe_state & PIPE_EOF)) { 58913907Sdyson wpipe->pipe_state &= ~PIPE_DIRECTW; 59013907Sdyson if (error == 0) 59113907Sdyson error = EPIPE; 59213907Sdyson goto error1; 59313907Sdyson } 59413907Sdyson } 59513907Sdyson 59613907Sdyson error = pipe_build_write_buffer(wpipe, uio); 59713907Sdyson if (error) { 59813907Sdyson wpipe->pipe_state &= ~PIPE_DIRECTW; 59913907Sdyson goto error1; 60013907Sdyson } 60113907Sdyson 60213907Sdyson if (wpipe->pipe_state & PIPE_WANTR) { 60313907Sdyson wpipe->pipe_state &= ~PIPE_WANTR; 60413907Sdyson wakeup(wpipe); 60513907Sdyson } 60613907Sdyson 60713907Sdyson error = 0; 60813907Sdyson while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) { 60913907Sdyson if (wpipe->pipe_state & PIPE_EOF) { 61013907Sdyson pipelock(wpipe, 0); 61113907Sdyson pipe_destroy_write_buffer(wpipe); 61213907Sdyson pipeunlock(wpipe); 61313907Sdyson wakeup(wpipe); 61413907Sdyson return EPIPE; 61513907Sdyson } 61613907Sdyson error = tsleep(wpipe, PRIBIO|PCATCH, "pipdwt", 0); 61713907Sdyson } 61813907Sdyson 61913907Sdyson pipelock(wpipe,0); 62013907Sdyson if (wpipe->pipe_state & PIPE_DIRECTW) { 62113907Sdyson /* 62213907Sdyson * this bit of trickery substitutes a kernel buffer for 62313907Sdyson * the process that might be going away. 62413907Sdyson */ 62513907Sdyson pipe_clone_write_buffer(wpipe); 62613907Sdyson } else { 62713907Sdyson pipe_destroy_write_buffer(wpipe); 62813907Sdyson } 62913907Sdyson pipeunlock(wpipe); 63013907Sdyson return error; 63113907Sdyson 63213907Sdysonerror1: 63313907Sdyson wakeup(wpipe); 63413907Sdyson return error; 63513907Sdyson} 63613907Sdyson 63713907Sdysonstatic __inline int 63813907Sdysonpipewrite(wpipe, uio, nbio) 63913907Sdyson struct pipe *wpipe; 64013907Sdyson struct uio *uio; 64113907Sdyson int nbio; 64213907Sdyson{ 64313675Sdyson int error = 0; 64413675Sdyson 64513675Sdyson /* 64613675Sdyson * detect loss of pipe read side, issue SIGPIPE if lost. 64713675Sdyson */ 64813675Sdyson if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF)) { 64913774Sdyson return EPIPE; 65013675Sdyson } 65113675Sdyson 65213907Sdyson if( wpipe->pipe_buffer.buffer == NULL) { 65313907Sdyson if ((error = pipelock(wpipe,1)) == 0) { 65413907Sdyson pipespace(wpipe); 65513907Sdyson pipeunlock(wpipe); 65613907Sdyson } else { 65713907Sdyson return error; 65813907Sdyson } 65913907Sdyson } 66013907Sdyson 66113675Sdyson ++wpipe->pipe_busy; 66213675Sdyson while (uio->uio_resid) { 66313907Sdyson int space; 66413907Sdyson /* 66513907Sdyson * If the transfer is large, we can gain performance if 66613907Sdyson * we do process-to-process copies directly. 66713907Sdyson */ 66813907Sdyson if ((amountpipekva < LIMITPIPEKVA) && 66913907Sdyson (uio->uio_iov->iov_len >= PIPE_MINDIRECT)) { 67013907Sdyson error = pipe_direct_write( wpipe, uio); 67113907Sdyson if (error) { 67213907Sdyson break; 67313907Sdyson } 67413907Sdyson continue; 67513907Sdyson } 67613907Sdyson 67713907Sdyson /* 67813907Sdyson * Pipe buffered writes cannot be coincidental with 67913907Sdyson * direct writes. We wait until the currently executing 68013907Sdyson * direct write is completed before we start filling the 68113907Sdyson * pipe buffer. 68213907Sdyson */ 68313907Sdyson retrywrite: 68413907Sdyson while (wpipe->pipe_state & PIPE_DIRECTW) { 68513907Sdyson error = tsleep(wpipe, 68613907Sdyson PRIBIO|PCATCH, "pipbww", 0); 68713907Sdyson if (error) 68813907Sdyson break; 68913907Sdyson } 69013907Sdyson 69113907Sdyson space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; 69213907Sdyson 69313907Sdyson /* 69413907Sdyson * We must afford contiguous writes on buffers of size 69513907Sdyson * PIPE_BUF or less. 69613907Sdyson */ 69713907Sdyson if ((space > 0) && 69813907Sdyson ((uio->uio_resid > PIPE_BUF) || (uio->uio_resid <= space))) { 69913675Sdyson int size = wpipe->pipe_buffer.size - wpipe->pipe_buffer.in; 70013675Sdyson if (size > space) 70113675Sdyson size = space; 70213675Sdyson if (size > uio->uio_resid) 70313675Sdyson size = uio->uio_resid; 70413907Sdyson if ((error = pipelock(wpipe,1)) == 0) { 70513907Sdyson /* 70613907Sdyson * It is possible for a direct write to 70713907Sdyson * slip in on us... handle it here... 70813907Sdyson */ 70913907Sdyson if (wpipe->pipe_state & PIPE_DIRECTW) { 71013907Sdyson pipeunlock(wpipe); 71113907Sdyson goto retrywrite; 71213907Sdyson } 71313675Sdyson error = uiomove( &wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in], 71413675Sdyson size, uio); 71513675Sdyson pipeunlock(wpipe); 71613675Sdyson } 71713675Sdyson if (error) 71813675Sdyson break; 71913675Sdyson 72013675Sdyson wpipe->pipe_buffer.in += size; 72113675Sdyson if (wpipe->pipe_buffer.in >= wpipe->pipe_buffer.size) 72213675Sdyson wpipe->pipe_buffer.in = 0; 72313675Sdyson 72413675Sdyson wpipe->pipe_buffer.cnt += size; 72513675Sdyson wpipe->pipe_mtime = time; 72613675Sdyson } else { 72713675Sdyson /* 72813675Sdyson * If the "read-side" has been blocked, wake it up now. 72913675Sdyson */ 73013675Sdyson if (wpipe->pipe_state & PIPE_WANTR) { 73113675Sdyson wpipe->pipe_state &= ~PIPE_WANTR; 73213675Sdyson wakeup(wpipe); 73313675Sdyson } 73413675Sdyson /* 73513675Sdyson * don't block on non-blocking I/O 73613675Sdyson */ 73713907Sdyson if (nbio) { 73813907Sdyson error = EAGAIN; 73913675Sdyson break; 74013675Sdyson } 74113907Sdyson 74213675Sdyson wpipe->pipe_state |= PIPE_WANTW; 74313776Sdyson if (error = tsleep(wpipe, (PRIBIO+1)|PCATCH, "pipewr", 0)) { 74413675Sdyson break; 74513675Sdyson } 74613675Sdyson /* 74713675Sdyson * If read side wants to go away, we just issue a signal 74813675Sdyson * to ourselves. 74913675Sdyson */ 75013675Sdyson if (wpipe->pipe_state & PIPE_EOF) { 75113774Sdyson error = EPIPE; 75213907Sdyson break; 75313675Sdyson } 75413675Sdyson } 75513675Sdyson } 75613675Sdyson 75713675Sdyson if ((wpipe->pipe_busy == 0) && 75813675Sdyson (wpipe->pipe_state & PIPE_WANT)) { 75913675Sdyson wpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTR); 76013675Sdyson wakeup(wpipe); 76113675Sdyson } else if (wpipe->pipe_buffer.cnt > 0) { 76213675Sdyson /* 76313675Sdyson * If we have put any characters in the buffer, we wake up 76413675Sdyson * the reader. 76513675Sdyson */ 76613675Sdyson if (wpipe->pipe_state & PIPE_WANTR) { 76713675Sdyson wpipe->pipe_state &= ~PIPE_WANTR; 76813675Sdyson wakeup(wpipe); 76913675Sdyson } 77013675Sdyson } 77113909Sdyson 77213909Sdyson /* 77313909Sdyson * Don't return EPIPE if I/O was successful 77413909Sdyson */ 77513907Sdyson if ((wpipe->pipe_buffer.cnt == 0) && 77613907Sdyson (uio->uio_resid == 0) && 77713907Sdyson (error == EPIPE)) 77813907Sdyson error = 0; 77913907Sdyson 78013675Sdyson if (wpipe->pipe_state & PIPE_SEL) { 78113675Sdyson wpipe->pipe_state &= ~PIPE_SEL; 78213675Sdyson selwakeup(&wpipe->pipe_sel); 78313675Sdyson } 78413907Sdyson 78513907Sdyson --wpipe->pipe_busy; 78613675Sdyson return error; 78713675Sdyson} 78813675Sdyson 78913907Sdyson/* ARGSUSED */ 79013907Sdysonstatic int 79113907Sdysonpipe_write(fp, uio, cred) 79213907Sdyson struct file *fp; 79313907Sdyson struct uio *uio; 79413907Sdyson struct ucred *cred; 79513907Sdyson{ 79613907Sdyson struct pipe *rpipe = (struct pipe *) fp->f_data; 79713907Sdyson struct pipe *wpipe = rpipe->pipe_peer; 79813907Sdyson return pipewrite(wpipe, uio, (rpipe->pipe_state & PIPE_NBIO)?1:0); 79913907Sdyson} 80013907Sdyson 80113675Sdyson/* 80213675Sdyson * we implement a very minimal set of ioctls for compatibility with sockets. 80313675Sdyson */ 80413675Sdysonint 80513675Sdysonpipe_ioctl(fp, cmd, data, p) 80613675Sdyson struct file *fp; 80713675Sdyson int cmd; 80813675Sdyson register caddr_t data; 80913675Sdyson struct proc *p; 81013675Sdyson{ 81113675Sdyson register struct pipe *mpipe = (struct pipe *)fp->f_data; 81213675Sdyson 81313675Sdyson switch (cmd) { 81413675Sdyson 81513675Sdyson case FIONBIO: 81613675Sdyson if (*(int *)data) 81713675Sdyson mpipe->pipe_state |= PIPE_NBIO; 81813675Sdyson else 81913675Sdyson mpipe->pipe_state &= ~PIPE_NBIO; 82013675Sdyson return (0); 82113675Sdyson 82213675Sdyson case FIOASYNC: 82313675Sdyson if (*(int *)data) { 82413675Sdyson mpipe->pipe_state |= PIPE_ASYNC; 82513675Sdyson } else { 82613675Sdyson mpipe->pipe_state &= ~PIPE_ASYNC; 82713675Sdyson } 82813675Sdyson return (0); 82913675Sdyson 83013675Sdyson case FIONREAD: 83113675Sdyson *(int *)data = mpipe->pipe_buffer.cnt; 83213675Sdyson return (0); 83313675Sdyson 83413675Sdyson case SIOCSPGRP: 83513675Sdyson mpipe->pipe_pgid = *(int *)data; 83613675Sdyson return (0); 83713675Sdyson 83813675Sdyson case SIOCGPGRP: 83913675Sdyson *(int *)data = mpipe->pipe_pgid; 84013675Sdyson return (0); 84113675Sdyson 84213675Sdyson } 84313675Sdyson return ENOSYS; 84413675Sdyson} 84513675Sdyson 84613675Sdysonint 84713675Sdysonpipe_select(fp, which, p) 84813675Sdyson struct file *fp; 84913675Sdyson int which; 85013675Sdyson struct proc *p; 85113675Sdyson{ 85213675Sdyson register struct pipe *rpipe = (struct pipe *)fp->f_data; 85313675Sdyson struct pipe *wpipe; 85413675Sdyson register int s = splnet(); 85513675Sdyson 85613675Sdyson wpipe = rpipe->pipe_peer; 85713675Sdyson switch (which) { 85813675Sdyson 85913675Sdyson case FREAD: 86013907Sdyson if (rpipe->pipe_buffer.cnt > 0 || 86113907Sdyson (rpipe->pipe_state & PIPE_EOF)) { 86213675Sdyson splx(s); 86313675Sdyson return (1); 86413675Sdyson } 86513675Sdyson selrecord(p, &rpipe->pipe_sel); 86613675Sdyson rpipe->pipe_state |= PIPE_SEL; 86713675Sdyson break; 86813675Sdyson 86913675Sdyson case FWRITE: 87013907Sdyson if ((wpipe == NULL) || 87113907Sdyson (wpipe->pipe_state & PIPE_EOF) || 87213907Sdyson ((wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF)) { 87313675Sdyson splx(s); 87413675Sdyson return (1); 87513675Sdyson } 87613675Sdyson selrecord(p, &wpipe->pipe_sel); 87713675Sdyson wpipe->pipe_state |= PIPE_SEL; 87813675Sdyson break; 87913675Sdyson 88013675Sdyson case 0: 88113907Sdyson if ((rpipe->pipe_state & PIPE_EOF) || 88213907Sdyson (wpipe == NULL) || 88313907Sdyson (wpipe->pipe_state & PIPE_EOF)) { 88413907Sdyson splx(s); 88513907Sdyson return (1); 88613907Sdyson } 88713907Sdyson 88813675Sdyson selrecord(p, &rpipe->pipe_sel); 88913675Sdyson rpipe->pipe_state |= PIPE_SEL; 89013675Sdyson break; 89113675Sdyson } 89213675Sdyson splx(s); 89313675Sdyson return (0); 89413675Sdyson} 89513675Sdyson 89613675Sdysonint 89713675Sdysonpipe_stat(pipe, ub) 89813675Sdyson register struct pipe *pipe; 89913675Sdyson register struct stat *ub; 90013675Sdyson{ 90113675Sdyson bzero((caddr_t)ub, sizeof (*ub)); 90213675Sdyson ub->st_mode = S_IFSOCK; 90313907Sdyson ub->st_blksize = pipe->pipe_buffer.size; 90413675Sdyson ub->st_size = pipe->pipe_buffer.cnt; 90513675Sdyson ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize; 90613675Sdyson TIMEVAL_TO_TIMESPEC(&pipe->pipe_atime, &ub->st_atimespec); 90713675Sdyson TIMEVAL_TO_TIMESPEC(&pipe->pipe_mtime, &ub->st_mtimespec); 90813675Sdyson TIMEVAL_TO_TIMESPEC(&pipe->pipe_ctime, &ub->st_ctimespec); 90913675Sdyson return 0; 91013675Sdyson} 91113675Sdyson 91213675Sdyson/* ARGSUSED */ 91313675Sdysonstatic int 91413675Sdysonpipe_close(fp, p) 91513675Sdyson struct file *fp; 91613675Sdyson struct proc *p; 91713675Sdyson{ 91813675Sdyson int error = 0; 91913675Sdyson struct pipe *cpipe = (struct pipe *)fp->f_data; 92013675Sdyson pipeclose(cpipe); 92113675Sdyson fp->f_data = NULL; 92213675Sdyson return 0; 92313675Sdyson} 92413675Sdyson 92513675Sdyson/* 92613675Sdyson * shutdown the pipe 92713675Sdyson */ 92813675Sdysonstatic void 92913675Sdysonpipeclose(cpipe) 93013675Sdyson struct pipe *cpipe; 93113675Sdyson{ 93213907Sdyson struct pipe *ppipe; 93313675Sdyson if (cpipe) { 93413907Sdyson 93513907Sdyson if (cpipe->pipe_state & PIPE_SEL) { 93613907Sdyson cpipe->pipe_state &= ~PIPE_SEL; 93713907Sdyson selwakeup(&cpipe->pipe_sel); 93813907Sdyson } 93913907Sdyson 94013675Sdyson /* 94113675Sdyson * If the other side is blocked, wake it up saying that 94213675Sdyson * we want to close it down. 94313675Sdyson */ 94413675Sdyson while (cpipe->pipe_busy) { 94513675Sdyson wakeup(cpipe); 94613675Sdyson cpipe->pipe_state |= PIPE_WANT|PIPE_EOF; 94713675Sdyson tsleep(cpipe, PRIBIO, "pipecl", 0); 94813675Sdyson } 94913675Sdyson 95013675Sdyson /* 95113675Sdyson * Disconnect from peer 95213675Sdyson */ 95313907Sdyson if (ppipe = cpipe->pipe_peer) { 95413907Sdyson if (ppipe->pipe_state & PIPE_SEL) { 95513907Sdyson ppipe->pipe_state &= ~PIPE_SEL; 95613907Sdyson selwakeup(&ppipe->pipe_sel); 95713907Sdyson } 95813907Sdyson 95913907Sdyson ppipe->pipe_state |= PIPE_EOF; 96013907Sdyson wakeup(ppipe); 96113907Sdyson ppipe->pipe_peer = NULL; 96213675Sdyson } 96313675Sdyson 96413675Sdyson /* 96513675Sdyson * free resources 96613675Sdyson */ 96713907Sdyson if (cpipe->pipe_buffer.buffer) { 96813907Sdyson amountpipekva -= cpipe->pipe_buffer.size; 96913907Sdyson kmem_free(kernel_map, 97013907Sdyson (vm_offset_t)cpipe->pipe_buffer.buffer, 97113907Sdyson cpipe->pipe_buffer.size); 97213907Sdyson } 97313907Sdyson if (cpipe->pipe_map.kva) { 97413907Sdyson amountpipekva -= cpipe->pipe_buffer.size; 97513907Sdyson kmem_free(kernel_map, 97613907Sdyson cpipe->pipe_map.kva, 97713907Sdyson cpipe->pipe_buffer.size); 97813907Sdyson } 97913675Sdyson free(cpipe, M_TEMP); 98013675Sdyson } 98113675Sdyson} 98213675Sdyson#endif 983