sendfile.c revision 8348:4137e18bfaf0
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#include <sys/types.h>
28#include <sys/t_lock.h>
29#include <sys/param.h>
30#include <sys/systm.h>
31#include <sys/buf.h>
32#include <sys/conf.h>
33#include <sys/cred.h>
34#include <sys/kmem.h>
35#include <sys/sysmacros.h>
36#include <sys/vfs.h>
37#include <sys/vnode.h>
38#include <sys/debug.h>
39#include <sys/errno.h>
40#include <sys/time.h>
41#include <sys/file.h>
42#include <sys/open.h>
43#include <sys/user.h>
44#include <sys/termios.h>
45#include <sys/stream.h>
46#include <sys/strsubr.h>
47#include <sys/sunddi.h>
48#include <sys/esunddi.h>
49#include <sys/flock.h>
50#include <sys/modctl.h>
51#include <sys/cmn_err.h>
52#include <sys/vmsystm.h>
53
54#include <sys/socket.h>
55#include <sys/socketvar.h>
56#include <fs/sockfs/sockcommon.h>
57#include <fs/sockfs/socktpi.h>
58
59#include <netinet/in.h>
60#include <sys/sendfile.h>
61#include <sys/un.h>
62#include <sys/tihdr.h>
63#include <sys/atomic.h>
64
65#include <inet/common.h>
66#include <inet/ip.h>
67#include <inet/ip6.h>
68#include <inet/tcp.h>
69
70extern int sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *,
71		ssize32_t *);
72extern int nl7c_sendfilev(struct sonode *, u_offset_t *, struct sendfilevec *,
73		int, ssize_t *);
74extern int snf_segmap(file_t *, vnode_t *, u_offset_t, u_offset_t, ssize_t *,
75		boolean_t);
76extern sotpi_info_t *sotpi_sototpi(struct sonode *);
77
78#define	readflg	(V_WRITELOCK_FALSE)
79#define	rwflag	(V_WRITELOCK_TRUE)
80
81#define	SEND_MAX_CHUNK	16
82
83#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
84/*
85 * 64 bit offsets for 32 bit applications only running either on
86 * 64 bit kernel or 32 bit kernel. For 32 bit apps, we can't transfer
87 * more than 2GB of data.
88 */
89int
90sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv,
91    int copy_cnt, ssize32_t *count)
92{
93	struct vnode *vp;
94	ushort_t fflag;
95	int ioflag;
96	size32_t cnt;
97	ssize32_t sfv_len;
98	ssize32_t tmpcount;
99	u_offset_t sfv_off;
100	struct uio auio;
101	struct iovec aiov;
102	int i, error;
103
104	fflag = fp->f_flag;
105	vp = fp->f_vnode;
106	for (i = 0; i < copy_cnt; i++) {
107
108		if (ISSIG(curthread, JUSTLOOKING))
109			return (EINTR);
110
111		/*
112		 * Do similar checks as "write" as we are writing
113		 * sfv_len bytes into "vp".
114		 */
115		sfv_len = (ssize32_t)sfv->sfv_len;
116
117		if (sfv_len == 0) {
118			sfv++;
119			continue;
120		}
121
122		if (sfv_len < 0)
123			return (EINVAL);
124
125		if (vp->v_type == VREG) {
126			if (*fileoff >= curproc->p_fsz_ctl) {
127				mutex_enter(&curproc->p_lock);
128				(void) rctl_action(
129				    rctlproc_legacy[RLIMIT_FSIZE],
130				    curproc->p_rctls, curproc, RCA_SAFE);
131				mutex_exit(&curproc->p_lock);
132				return (EFBIG);
133			}
134
135			if (*fileoff >= OFFSET_MAX(fp))
136				return (EFBIG);
137
138			if (*fileoff + sfv_len > OFFSET_MAX(fp))
139				return (EINVAL);
140		}
141
142		tmpcount = *count + sfv_len;
143		if (tmpcount < 0)
144			return (EINVAL);
145
146		sfv_off = sfv->sfv_off;
147
148		auio.uio_extflg = UIO_COPY_DEFAULT;
149		if (sfv->sfv_fd == SFV_FD_SELF) {
150			aiov.iov_len = sfv_len;
151			aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
152			auio.uio_loffset = *fileoff;
153			auio.uio_iovcnt = 1;
154			auio.uio_resid = sfv_len;
155			auio.uio_iov = &aiov;
156			auio.uio_segflg = UIO_USERSPACE;
157			auio.uio_llimit = curproc->p_fsz_ctl;
158			auio.uio_fmode = fflag;
159			ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
160			while (sfv_len > 0) {
161				error = VOP_WRITE(vp, &auio, ioflag,
162				    fp->f_cred, NULL);
163				cnt = sfv_len - auio.uio_resid;
164				sfv_len -= cnt;
165				ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
166				if (vp->v_type == VREG)
167					*fileoff += cnt;
168				*count += cnt;
169				if (error != 0)
170					return (error);
171			}
172		} else {
173			file_t	*ffp;
174			vnode_t	*readvp;
175			size_t	size;
176			caddr_t	ptr;
177
178			if ((ffp = getf(sfv->sfv_fd)) == NULL)
179				return (EBADF);
180
181			if ((ffp->f_flag & FREAD) == 0) {
182				releasef(sfv->sfv_fd);
183				return (EBADF);
184			}
185
186			readvp = ffp->f_vnode;
187			if (readvp->v_type != VREG) {
188				releasef(sfv->sfv_fd);
189				return (EINVAL);
190			}
191
192			/*
193			 * No point reading and writing to same vp,
194			 * as long as both are regular files. readvp is not
195			 * locked; but since we got it from an open file the
196			 * contents will be valid during the time of access.
197			 */
198			if (vn_compare(vp, readvp)) {
199				releasef(sfv->sfv_fd);
200				return (EINVAL);
201			}
202
203			/*
204			 * Note: we assume readvp != vp. "vp" is already
205			 * locked, and "readvp" must not be.
206			 */
207			(void) VOP_RWLOCK(readvp, readflg, NULL);
208
209			/*
210			 * Same checks as in pread64.
211			 */
212			if (sfv_off > MAXOFFSET_T) {
213				VOP_RWUNLOCK(readvp, readflg, NULL);
214				releasef(sfv->sfv_fd);
215				return (EINVAL);
216			}
217
218			if (sfv_off + sfv_len > MAXOFFSET_T)
219				sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);
220
221			/* Find the native blocksize to transfer data */
222			size = MIN(vp->v_vfsp->vfs_bsize,
223			    readvp->v_vfsp->vfs_bsize);
224			size = sfv_len < size ? sfv_len : size;
225			ptr = kmem_alloc(size, KM_SLEEP);
226
227			while (sfv_len > 0) {
228				size_t	iov_len;
229
230				iov_len = MIN(size, sfv_len);
231				aiov.iov_base = ptr;
232				aiov.iov_len = iov_len;
233				auio.uio_loffset = sfv_off;
234				auio.uio_iov = &aiov;
235				auio.uio_iovcnt = 1;
236				auio.uio_resid = iov_len;
237				auio.uio_segflg = UIO_SYSSPACE;
238				auio.uio_llimit = MAXOFFSET_T;
239				auio.uio_fmode = ffp->f_flag;
240				ioflag = auio.uio_fmode &
241				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
242
243				/*
244				 * If read sync is not asked for,
245				 * filter sync flags
246				 */
247				if ((ioflag & FRSYNC) == 0)
248					ioflag &= ~(FSYNC|FDSYNC);
249				error = VOP_READ(readvp, &auio, ioflag,
250				    fp->f_cred, NULL);
251				if (error) {
252					kmem_free(ptr, size);
253					VOP_RWUNLOCK(readvp, readflg, NULL);
254					releasef(sfv->sfv_fd);
255					return (error);
256				}
257
258				/*
259				 * Check how must data was really read.
260				 * Decrement the 'len' and increment the
261				 * 'off' appropriately.
262				 */
263				cnt = iov_len - auio.uio_resid;
264				if (cnt == 0) {
265					/*
266					 * If we were reading a pipe (currently
267					 * not implemented), we may now lose
268					 * data.
269					 */
270					kmem_free(ptr, size);
271					VOP_RWUNLOCK(readvp, readflg, NULL);
272					releasef(sfv->sfv_fd);
273					return (EINVAL);
274				}
275				sfv_len -= cnt;
276				sfv_off += cnt;
277
278				aiov.iov_base = ptr;
279				aiov.iov_len = cnt;
280				auio.uio_loffset = *fileoff;
281				auio.uio_iov = &aiov;
282				auio.uio_iovcnt = 1;
283				auio.uio_resid = cnt;
284				auio.uio_segflg = UIO_SYSSPACE;
285				auio.uio_llimit = curproc->p_fsz_ctl;
286				auio.uio_fmode = fflag;
287				ioflag = auio.uio_fmode &
288				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
289				error = VOP_WRITE(vp, &auio, ioflag,
290				    fp->f_cred, NULL);
291
292				/*
293				 * Check how much data was written. Increment
294				 * the 'len' and decrement the 'off' if all
295				 * the data was not written.
296				 */
297				cnt -= auio.uio_resid;
298				sfv_len += auio.uio_resid;
299				sfv_off -= auio.uio_resid;
300				ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
301				if (vp->v_type == VREG)
302					*fileoff += cnt;
303				*count += cnt;
304				if (error != 0) {
305					kmem_free(ptr, size);
306					VOP_RWUNLOCK(readvp, readflg, NULL);
307					releasef(sfv->sfv_fd);
308					return (error);
309				}
310			}
311			VOP_RWUNLOCK(readvp, readflg, NULL);
312			releasef(sfv->sfv_fd);
313			kmem_free(ptr, size);
314		}
315		sfv++;
316	}
317	return (0);
318}
319
320ssize32_t
321sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt,
322	size32_t *xferred, int fildes)
323{
324	u_offset_t		fileoff;
325	int			copy_cnt;
326	const struct ksendfilevec64 *copy_vec;
327	struct ksendfilevec64 sfv[SEND_MAX_CHUNK];
328	struct vnode *vp;
329	int error;
330	ssize32_t count = 0;
331
332	vp = fp->f_vnode;
333	(void) VOP_RWLOCK(vp, rwflag, NULL);
334
335	copy_vec = vec;
336	fileoff = fp->f_offset;
337
338	do {
339		copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
340		if (copyin(copy_vec, sfv, copy_cnt *
341		    sizeof (struct ksendfilevec64))) {
342			error = EFAULT;
343			break;
344		}
345
346		/*
347		 * Optimize the regular file over
348		 * the socket case.
349		 */
350		if (vp->v_type == VSOCK && sfv->sfv_fd != SFV_FD_SELF) {
351			file_t *rfp;
352			vnode_t *rvp;
353
354			if ((rfp = getf(sfv->sfv_fd)) == NULL) {
355				error = EBADF;
356				break;
357			}
358			if ((rfp->f_flag & FREAD) == 0) {
359				releasef(sfv->sfv_fd);
360				error = EBADF;
361				break;
362			}
363			rvp = rfp->f_vnode;
364			if (rvp->v_type == VREG) {
365				error = sosendfile64(fp, rfp, sfv, &count);
366				if (error)
367					break;
368				copy_vec++;
369				sfvcnt--;
370				continue;
371			}
372			releasef(sfv->sfv_fd);
373		}
374		error = sendvec_chunk64(fp, &fileoff, sfv, copy_cnt, &count);
375		if (error != 0)
376			break;
377
378		copy_vec += copy_cnt;
379		sfvcnt -= copy_cnt;
380	} while (sfvcnt > 0);
381
382	if (vp->v_type == VREG)
383		fp->f_offset += count;
384
385	VOP_RWUNLOCK(vp, rwflag, NULL);
386	if (copyout(&count, xferred, sizeof (count)))
387		error = EFAULT;
388	releasef(fildes);
389	if (error != 0)
390		return (set_errno(error));
391	return (count);
392}
393#endif
394
395int
396sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
397    int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count)
398{
399	struct vnode *vp;
400	struct uio auio;
401	struct iovec aiov;
402	ushort_t fflag;
403	int ioflag;
404	int i, error;
405	size_t cnt;
406	ssize_t sfv_len;
407	u_offset_t sfv_off;
408#ifdef _SYSCALL32_IMPL
409	model_t model = get_udatamodel();
410	u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
411	    MAXOFF32_T : MAXOFFSET_T;
412#else
413	const u_offset_t maxoff = MAXOFF32_T;
414#endif
415	mblk_t *dmp = NULL;
416	int wroff;
417	int buf_left = 0;
418	size_t	iov_len;
419	mblk_t  *head, *tmp;
420	size_t  size = total_size;
421	size_t  extra;
422	int tail_len;
423	struct nmsghdr msg;
424
425	fflag = fp->f_flag;
426	vp = fp->f_vnode;
427
428	ASSERT(vp->v_type == VSOCK);
429	ASSERT(maxblk > 0);
430
431	/* If nothing to send, return */
432	if (total_size == 0)
433		return (0);
434
435	if (vp->v_stream != NULL) {
436		wroff = (int)vp->v_stream->sd_wroff;
437		tail_len = (int)vp->v_stream->sd_tail;
438	} else {
439		struct sonode *so;
440
441		so = VTOSO(vp);
442		wroff = so->so_proto_props.sopp_wroff;
443		tail_len = so->so_proto_props.sopp_tail;
444	}
445
446	extra = wroff + tail_len;
447
448	buf_left = MIN(total_size, maxblk);
449	head = dmp = allocb(buf_left + extra, BPRI_HI);
450	if (head == NULL)
451		return (ENOMEM);
452	head->b_wptr = head->b_rptr = head->b_rptr + wroff;
453	bzero(&msg, sizeof (msg));
454
455	auio.uio_extflg = UIO_COPY_DEFAULT;
456	for (i = 0; i < copy_cnt; i++) {
457		if (ISSIG(curthread, JUSTLOOKING)) {
458			freemsg(head);
459			return (EINTR);
460		}
461
462		/*
463		 * Do similar checks as "write" as we are writing
464		 * sfv_len bytes into "vp".
465		 */
466		sfv_len = (ssize_t)sfv->sfv_len;
467
468		if (sfv_len == 0) {
469			sfv++;
470			continue;
471		}
472
473		/* Check for overflow */
474#ifdef _SYSCALL32_IMPL
475		if (model == DATAMODEL_ILP32) {
476			if (((ssize32_t)(*count + sfv_len)) < 0) {
477				freemsg(head);
478				return (EINVAL);
479			}
480		} else
481#endif
482		if ((*count + sfv_len) < 0) {
483			freemsg(head);
484			return (EINVAL);
485		}
486
487		sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;
488
489		if (sfv->sfv_fd == SFV_FD_SELF) {
490			while (sfv_len > 0) {
491				if (buf_left == 0) {
492					tmp = dmp;
493					buf_left = MIN(total_size, maxblk);
494					iov_len = MIN(buf_left, sfv_len);
495					dmp = allocb(buf_left + extra, BPRI_HI);
496					if (dmp == NULL) {
497						freemsg(head);
498						return (ENOMEM);
499					}
500					dmp->b_wptr = dmp->b_rptr =
501					    dmp->b_rptr + wroff;
502					tmp->b_cont = dmp;
503				} else {
504					iov_len = MIN(buf_left, sfv_len);
505				}
506
507				aiov.iov_len = iov_len;
508				aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
509				auio.uio_loffset = *fileoff;
510				auio.uio_iovcnt = 1;
511				auio.uio_resid = iov_len;
512				auio.uio_iov = &aiov;
513				auio.uio_segflg = UIO_USERSPACE;
514				auio.uio_llimit = curproc->p_fsz_ctl;
515				auio.uio_fmode = fflag;
516
517				buf_left -= iov_len;
518				total_size -= iov_len;
519				sfv_len -= iov_len;
520				sfv_off += iov_len;
521
522				error = uiomove((caddr_t)dmp->b_wptr,
523				    iov_len, UIO_WRITE, &auio);
524				if (error != 0) {
525					freemsg(head);
526					return (error);
527				}
528				dmp->b_wptr += iov_len;
529			}
530		} else {
531			file_t	*ffp;
532			vnode_t	*readvp;
533
534			if ((ffp = getf(sfv->sfv_fd)) == NULL) {
535				freemsg(head);
536				return (EBADF);
537			}
538
539			if ((ffp->f_flag & FREAD) == 0) {
540				releasef(sfv->sfv_fd);
541				freemsg(head);
542				return (EACCES);
543			}
544
545			readvp = ffp->f_vnode;
546			if (readvp->v_type != VREG) {
547				releasef(sfv->sfv_fd);
548				freemsg(head);
549				return (EINVAL);
550			}
551
552			/*
553			 * No point reading and writing to same vp,
554			 * as long as both are regular files. readvp is not
555			 * locked; but since we got it from an open file the
556			 * contents will be valid during the time of access.
557			 */
558
559			if (vn_compare(vp, readvp)) {
560				releasef(sfv->sfv_fd);
561				freemsg(head);
562				return (EINVAL);
563			}
564
565			/*
566			 * Note: we assume readvp != vp. "vp" is already
567			 * locked, and "readvp" must not be.
568			 */
569
570			(void) VOP_RWLOCK(readvp, readflg, NULL);
571
572			/* Same checks as in pread */
573			if (sfv_off > maxoff) {
574				VOP_RWUNLOCK(readvp, readflg, NULL);
575				releasef(sfv->sfv_fd);
576				freemsg(head);
577				return (EINVAL);
578			}
579			if (sfv_off + sfv_len > maxoff) {
580				total_size -= (sfv_off + sfv_len - maxoff);
581				sfv_len = (ssize_t)((offset_t)maxoff -
582				    sfv_off);
583			}
584
585			while (sfv_len > 0) {
586				if (buf_left == 0) {
587					tmp = dmp;
588					buf_left = MIN(total_size, maxblk);
589					iov_len = MIN(buf_left, sfv_len);
590					dmp = allocb(buf_left + extra, BPRI_HI);
591					if (dmp == NULL) {
592						VOP_RWUNLOCK(readvp, readflg,
593						    NULL);
594						releasef(sfv->sfv_fd);
595						freemsg(head);
596						return (ENOMEM);
597					}
598					dmp->b_wptr = dmp->b_rptr =
599					    dmp->b_rptr + wroff;
600					tmp->b_cont = dmp;
601				} else {
602					iov_len = MIN(buf_left, sfv_len);
603				}
604				aiov.iov_base = (caddr_t)dmp->b_wptr;
605				aiov.iov_len = iov_len;
606				auio.uio_loffset = sfv_off;
607				auio.uio_iov = &aiov;
608				auio.uio_iovcnt = 1;
609				auio.uio_resid = iov_len;
610				auio.uio_segflg = UIO_SYSSPACE;
611				auio.uio_llimit = MAXOFFSET_T;
612				auio.uio_fmode = ffp->f_flag;
613				ioflag = auio.uio_fmode &
614				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
615
616				/*
617				 * If read sync is not asked for,
618				 * filter sync flags
619				 */
620				if ((ioflag & FRSYNC) == 0)
621					ioflag &= ~(FSYNC|FDSYNC);
622				error = VOP_READ(readvp, &auio, ioflag,
623				    fp->f_cred, NULL);
624				if (error != 0) {
625					/*
626					 * If we were reading a pipe (currently
627					 * not implemented), we may now loose
628					 * data.
629					 */
630					VOP_RWUNLOCK(readvp, readflg, NULL);
631					releasef(sfv->sfv_fd);
632					freemsg(head);
633					return (error);
634				}
635
636				/*
637				 * Check how much data was really read.
638				 * Decrement the 'len' and increment the
639				 * 'off' appropriately.
640				 */
641				cnt = iov_len - auio.uio_resid;
642				if (cnt == 0) {
643					VOP_RWUNLOCK(readvp, readflg, NULL);
644					releasef(sfv->sfv_fd);
645					freemsg(head);
646					return (EINVAL);
647				}
648				sfv_len -= cnt;
649				sfv_off += cnt;
650				total_size -= cnt;
651				buf_left -= cnt;
652
653				dmp->b_wptr += cnt;
654			}
655			VOP_RWUNLOCK(readvp, readflg, NULL);
656			releasef(sfv->sfv_fd);
657		}
658		sfv++;
659	}
660
661	ASSERT(total_size == 0);
662	error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &head);
663	if (error != 0) {
664		if (head != NULL)
665			freemsg(head);
666		return (error);
667	}
668	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size;
669	*count += size;
670
671	return (0);
672}
673
674
675int
676sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
677    int copy_cnt, ssize_t *count)
678{
679	struct vnode *vp;
680	struct uio auio;
681	struct iovec aiov;
682	ushort_t fflag;
683	int ioflag;
684	int i, error;
685	size_t cnt;
686	ssize_t sfv_len;
687	u_offset_t sfv_off;
688#ifdef _SYSCALL32_IMPL
689	model_t model = get_udatamodel();
690	u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
691	    MAXOFF32_T : MAXOFFSET_T;
692#else
693	const u_offset_t maxoff = MAXOFF32_T;
694#endif
695	mblk_t	*dmp = NULL;
696	char	*buf = NULL;
697	size_t  extra;
698	int maxblk, wroff, tail_len;
699	struct sonode *so;
700	stdata_t *stp;
701	struct nmsghdr msg;
702
703	fflag = fp->f_flag;
704	vp = fp->f_vnode;
705
706	if (vp->v_type == VSOCK) {
707		so = VTOSO(vp);
708		if (vp->v_stream != NULL) {
709			stp = vp->v_stream;
710			wroff = (int)stp->sd_wroff;
711			tail_len = (int)stp->sd_tail;
712			maxblk = (int)stp->sd_maxblk;
713		} else {
714			stp = NULL;
715			wroff = so->so_proto_props.sopp_wroff;
716			tail_len = so->so_proto_props.sopp_tail;
717			maxblk = so->so_proto_props.sopp_maxblk;
718		}
719		extra = wroff + tail_len;
720	}
721
722	bzero(&msg, sizeof (msg));
723	auio.uio_extflg = UIO_COPY_DEFAULT;
724	for (i = 0; i < copy_cnt; i++) {
725		if (ISSIG(curthread, JUSTLOOKING))
726			return (EINTR);
727
728		/*
729		 * Do similar checks as "write" as we are writing
730		 * sfv_len bytes into "vp".
731		 */
732		sfv_len = (ssize_t)sfv->sfv_len;
733
734		if (sfv_len == 0) {
735			sfv++;
736			continue;
737		}
738
739		if (vp->v_type == VREG) {
740			if (*fileoff >= curproc->p_fsz_ctl) {
741				mutex_enter(&curproc->p_lock);
742				(void) rctl_action(
743				    rctlproc_legacy[RLIMIT_FSIZE],
744				    curproc->p_rctls, curproc, RCA_SAFE);
745				mutex_exit(&curproc->p_lock);
746
747				return (EFBIG);
748			}
749
750			if (*fileoff >= maxoff)
751				return (EFBIG);
752
753			if (*fileoff + sfv_len > maxoff)
754				return (EINVAL);
755		}
756
757		/* Check for overflow */
758#ifdef _SYSCALL32_IMPL
759		if (model == DATAMODEL_ILP32) {
760			if (((ssize32_t)(*count + sfv_len)) < 0)
761				return (EINVAL);
762		} else
763#endif
764		if ((*count + sfv_len) < 0)
765			return (EINVAL);
766
767		sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;
768
769		if (sfv->sfv_fd == SFV_FD_SELF) {
770			if (vp->v_type == VSOCK) {
771				while (sfv_len > 0) {
772					size_t iov_len;
773
774					iov_len = sfv_len;
775					if (!SOCK_IS_NONSTR(so) &&
776					    SOTOTPI(so)->sti_kssl_ctx != NULL)
777						iov_len = MIN(iov_len, maxblk);
778
779					aiov.iov_len = iov_len;
780					aiov.iov_base =
781					    (caddr_t)(uintptr_t)sfv_off;
782
783					auio.uio_iov = &aiov;
784					auio.uio_iovcnt = 1;
785					auio.uio_loffset = *fileoff;
786					auio.uio_segflg = UIO_USERSPACE;
787					auio.uio_fmode = fflag;
788					auio.uio_llimit = curproc->p_fsz_ctl;
789					auio.uio_resid = iov_len;
790
791					dmp = allocb(iov_len + extra, BPRI_HI);
792					if (dmp == NULL)
793						return (ENOMEM);
794					dmp->b_wptr = dmp->b_rptr =
795					    dmp->b_rptr + wroff;
796					error = uiomove((caddr_t)dmp->b_wptr,
797					    iov_len, UIO_WRITE, &auio);
798					if (error != 0) {
799						freeb(dmp);
800						return (error);
801					}
802					dmp->b_wptr += iov_len;
803					error = socket_sendmblk(VTOSO(vp),
804					    &msg, fflag, CRED(), &dmp);
805
806					if (error != 0) {
807						if (dmp != NULL)
808							freeb(dmp);
809						return (error);
810					}
811					ttolwp(curthread)->lwp_ru.ioch +=
812					    (ulong_t)iov_len;
813					*count += iov_len;
814					sfv_len -= iov_len;
815					sfv_off += iov_len;
816				}
817			} else {
818				ttolwp(curthread)->lwp_ru.ioch +=
819				    (ulong_t)sfv_len;
820				*count += sfv_len;
821				aiov.iov_len = sfv_len;
822				aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
823
824				auio.uio_iov = &aiov;
825				auio.uio_iovcnt = 1;
826				auio.uio_loffset = *fileoff;
827				auio.uio_segflg = UIO_USERSPACE;
828				auio.uio_fmode = fflag;
829				auio.uio_llimit = curproc->p_fsz_ctl;
830				auio.uio_resid = sfv_len;
831
832				ioflag = auio.uio_fmode &
833				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
834				while (sfv_len > 0) {
835					error = VOP_WRITE(vp, &auio, ioflag,
836					    fp->f_cred, NULL);
837					cnt = sfv_len - auio.uio_resid;
838					sfv_len -= cnt;
839					ttolwp(curthread)->lwp_ru.ioch +=
840					    (ulong_t)cnt;
841					*fileoff += cnt;
842					*count += cnt;
843					if (error != 0)
844						return (error);
845				}
846			}
847		} else {
848			int segmapit = 0;
849			file_t	*ffp;
850			vnode_t	*readvp;
851			struct vnode *realvp;
852			size_t	size;
853			caddr_t	ptr;
854
855			if ((ffp = getf(sfv->sfv_fd)) == NULL)
856				return (EBADF);
857
858			if ((ffp->f_flag & FREAD) == 0) {
859				releasef(sfv->sfv_fd);
860				return (EBADF);
861			}
862
863			readvp = ffp->f_vnode;
864			if (VOP_REALVP(readvp, &realvp, NULL) == 0)
865				readvp = realvp;
866			if (readvp->v_type != VREG) {
867				releasef(sfv->sfv_fd);
868				return (EINVAL);
869			}
870
871			/*
872			 * No point reading and writing to same vp,
873			 * as long as both are regular files. readvp is not
874			 * locked; but since we got it from an open file the
875			 * contents will be valid during the time of access.
876			 */
877			if (vn_compare(vp, readvp)) {
878				releasef(sfv->sfv_fd);
879				return (EINVAL);
880			}
881
882			/*
883			 * Note: we assume readvp != vp. "vp" is already
884			 * locked, and "readvp" must not be.
885			 */
886			(void) VOP_RWLOCK(readvp, readflg, NULL);
887
888			/* Same checks as in pread */
889			if (sfv_off > maxoff) {
890				VOP_RWUNLOCK(readvp, readflg, NULL);
891				releasef(sfv->sfv_fd);
892				return (EINVAL);
893			}
894			if (sfv_off + sfv_len > maxoff) {
895				sfv_len = (ssize_t)((offset_t)maxoff -
896				    sfv_off);
897			}
898			/* Find the native blocksize to transfer data */
899			size = MIN(vp->v_vfsp->vfs_bsize,
900			    readvp->v_vfsp->vfs_bsize);
901			size = sfv_len < size ? sfv_len : size;
902
903			if (vp->v_type != VSOCK) {
904				segmapit = 0;
905				buf = kmem_alloc(size, KM_NOSLEEP);
906				if (buf == NULL) {
907					VOP_RWUNLOCK(readvp, readflg, NULL);
908					releasef(sfv->sfv_fd);
909					return (ENOMEM);
910				}
911			} else {
912				uint_t	copyflag;
913
914				copyflag = stp != NULL ? stp->sd_copyflag :
915				    so->so_proto_props.sopp_zcopyflag;
916				/*
917				 * For sockets acting as an SSL proxy, we
918				 * need to adjust the size to the maximum
919				 * SSL record size set in the stream head.
920				 */
921				if (!SOCK_IS_NONSTR(so) &&
922				    _SOTOTPI(so)->sti_kssl_ctx != NULL)
923					size = MIN(size, maxblk);
924
925				if (vn_has_flocks(readvp) ||
926				    readvp->v_flag & VNOMAP ||
927				    copyflag & STZCVMUNSAFE) {
928					segmapit = 0;
929				} else if (copyflag & STZCVMSAFE) {
930					segmapit = 1;
931				} else {
932					int on = 1;
933					if (socket_setsockopt(VTOSO(vp),
934					    SOL_SOCKET, SO_SND_COPYAVOID,
935					    &on, sizeof (on), CRED()) == 0)
936					segmapit = 1;
937				}
938			}
939
940			if (segmapit) {
941				boolean_t nowait;
942
943				nowait = (sfv->sfv_flag & SFV_NOWAIT) != 0;
944				error = snf_segmap(fp, readvp, sfv_off,
945				    (u_offset_t)sfv_len, (ssize_t *)&cnt,
946				    nowait);
947				releasef(sfv->sfv_fd);
948				*count += cnt;
949				if (error)
950					return (error);
951				sfv++;
952				continue;
953			}
954
955			while (sfv_len > 0) {
956				size_t	iov_len;
957
958				iov_len = MIN(size, sfv_len);
959
960				if (vp->v_type == VSOCK) {
961					dmp = allocb(iov_len + extra, BPRI_HI);
962					if (dmp == NULL) {
963						VOP_RWUNLOCK(readvp, readflg,
964						    NULL);
965						releasef(sfv->sfv_fd);
966						return (ENOMEM);
967					}
968					dmp->b_wptr = dmp->b_rptr =
969					    dmp->b_rptr + wroff;
970					ptr = (caddr_t)dmp->b_rptr;
971				} else {
972					ptr = buf;
973				}
974
975				aiov.iov_base = ptr;
976				aiov.iov_len = iov_len;
977				auio.uio_loffset = sfv_off;
978				auio.uio_iov = &aiov;
979				auio.uio_iovcnt = 1;
980				auio.uio_resid = iov_len;
981				auio.uio_segflg = UIO_SYSSPACE;
982				auio.uio_llimit = MAXOFFSET_T;
983				auio.uio_fmode = ffp->f_flag;
984				ioflag = auio.uio_fmode &
985				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
986
987				/*
988				 * If read sync is not asked for,
989				 * filter sync flags
990				 */
991				if ((ioflag & FRSYNC) == 0)
992					ioflag &= ~(FSYNC|FDSYNC);
993				error = VOP_READ(readvp, &auio, ioflag,
994				    fp->f_cred, NULL);
995				if (error != 0) {
996					/*
997					 * If we were reading a pipe (currently
998					 * not implemented), we may now lose
999					 * data.
1000					 */
1001					if (vp->v_type == VSOCK)
1002						freeb(dmp);
1003					else
1004						kmem_free(buf, size);
1005					VOP_RWUNLOCK(readvp, readflg, NULL);
1006					releasef(sfv->sfv_fd);
1007					return (error);
1008				}
1009
1010				/*
1011				 * Check how much data was really read.
1012				 * Decrement the 'len' and increment the
1013				 * 'off' appropriately.
1014				 */
1015				cnt = iov_len - auio.uio_resid;
1016				if (cnt == 0) {
1017					if (vp->v_type == VSOCK)
1018						freeb(dmp);
1019					else
1020						kmem_free(buf, size);
1021					VOP_RWUNLOCK(readvp, readflg, NULL);
1022					releasef(sfv->sfv_fd);
1023					return (EINVAL);
1024				}
1025				sfv_len -= cnt;
1026				sfv_off += cnt;
1027
1028				if (vp->v_type == VSOCK) {
1029					dmp->b_wptr = dmp->b_rptr + cnt;
1030
1031					error = socket_sendmblk(VTOSO(vp),
1032					    &msg, fflag, CRED(), &dmp);
1033
1034					if (error != 0) {
1035						if (dmp != NULL)
1036							freeb(dmp);
1037						VOP_RWUNLOCK(readvp, readflg,
1038						    NULL);
1039						releasef(sfv->sfv_fd);
1040						return (error);
1041					}
1042
1043					ttolwp(curthread)->lwp_ru.ioch +=
1044					    (ulong_t)cnt;
1045					*count += cnt;
1046				} else {
1047
1048					aiov.iov_base = ptr;
1049					aiov.iov_len = cnt;
1050					auio.uio_loffset = *fileoff;
1051					auio.uio_resid = cnt;
1052					auio.uio_iov = &aiov;
1053					auio.uio_iovcnt = 1;
1054					auio.uio_segflg = UIO_SYSSPACE;
1055					auio.uio_llimit = curproc->p_fsz_ctl;
1056					auio.uio_fmode = fflag;
1057					ioflag = auio.uio_fmode &
1058					    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1059					error = VOP_WRITE(vp, &auio, ioflag,
1060					    fp->f_cred, NULL);
1061
1062					/*
1063					 * Check how much data was written.
1064					 * Increment the 'len' and decrement the
1065					 * 'off' if all the data was not
1066					 * written.
1067					 */
1068					cnt -= auio.uio_resid;
1069					sfv_len += auio.uio_resid;
1070					sfv_off -= auio.uio_resid;
1071					ttolwp(curthread)->lwp_ru.ioch +=
1072					    (ulong_t)cnt;
1073					*fileoff += cnt;
1074					*count += cnt;
1075					if (error != 0) {
1076						kmem_free(buf, size);
1077						VOP_RWUNLOCK(readvp, readflg,
1078						    NULL);
1079						releasef(sfv->sfv_fd);
1080						return (error);
1081					}
1082				}
1083			}
1084			if (buf) {
1085				kmem_free(buf, size);
1086				buf = NULL;
1087			}
1088			VOP_RWUNLOCK(readvp, readflg, NULL);
1089			releasef(sfv->sfv_fd);
1090		}
1091		sfv++;
1092	}
1093	return (0);
1094}
1095
1096ssize_t
1097sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt,
1098    size_t *xferred)
1099{
1100	int error = 0;
1101	int first_vector_error = 0;
1102	file_t *fp;
1103	struct vnode *vp;
1104	struct sonode *so;
1105	u_offset_t fileoff;
1106	int copy_cnt;
1107	const struct sendfilevec *copy_vec;
1108	struct sendfilevec sfv[SEND_MAX_CHUNK];
1109	ssize_t count = 0;
1110#ifdef _SYSCALL32_IMPL
1111	struct ksendfilevec32 sfv32[SEND_MAX_CHUNK];
1112#endif
1113	ssize_t total_size;
1114	int i;
1115	boolean_t is_sock = B_FALSE;
1116	int maxblk = 0;
1117
1118	if (sfvcnt <= 0)
1119		return (set_errno(EINVAL));
1120
1121	if ((fp = getf(fildes)) == NULL)
1122		return (set_errno(EBADF));
1123
1124	if (((fp->f_flag) & FWRITE) == 0) {
1125		error = EBADF;
1126		goto err;
1127	}
1128
1129	fileoff = fp->f_offset;
1130	vp = fp->f_vnode;
1131
1132	switch (vp->v_type) {
1133	case VSOCK:
1134		so = VTOSO(vp);
1135		is_sock = B_TRUE;
1136		if (SOCK_IS_NONSTR(so)) {
1137			maxblk = so->so_proto_props.sopp_maxblk;
1138		} else {
1139			maxblk = (int)vp->v_stream->sd_maxblk;
1140		}
1141		break;
1142	case VREG:
1143		break;
1144	default:
1145		error = EINVAL;
1146		goto err;
1147	}
1148
1149	switch (opcode) {
1150	case SENDFILEV :
1151		break;
1152#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
1153	case SENDFILEV64 :
1154		return (sendvec64(fp, (struct ksendfilevec64 *)vec, sfvcnt,
1155		    (size32_t *)xferred, fildes));
1156#endif
1157	default :
1158		error = ENOSYS;
1159		break;
1160	}
1161
1162	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
1163	copy_vec = vec;
1164
1165	do {
1166		total_size = 0;
1167		copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
1168#ifdef _SYSCALL32_IMPL
1169		/* 32-bit callers need to have their iovec expanded. */
1170		if (get_udatamodel() == DATAMODEL_ILP32) {
1171			if (copyin(copy_vec, sfv32,
1172			    copy_cnt * sizeof (ksendfilevec32_t))) {
1173				error = EFAULT;
1174				break;
1175			}
1176
1177			for (i = 0; i < copy_cnt; i++) {
1178				sfv[i].sfv_fd = sfv32[i].sfv_fd;
1179				sfv[i].sfv_off =
1180				    (off_t)(uint32_t)sfv32[i].sfv_off;
1181				sfv[i].sfv_len = (size_t)sfv32[i].sfv_len;
1182				total_size += sfv[i].sfv_len;
1183				sfv[i].sfv_flag = sfv32[i].sfv_flag;
1184				/*
1185				 * Individual elements of the vector must not
1186				 * wrap or overflow, as later math is signed.
1187				 * Equally total_size needs to be checked after
1188				 * each vector is added in, to be sure that
1189				 * rogue values haven't overflowed the counter.
1190				 */
1191				if (((ssize32_t)sfv[i].sfv_len < 0) ||
1192				    ((ssize32_t)total_size < 0)) {
1193					/*
1194					 * Truncate the vector to send data
1195					 * described by elements before the
1196					 * error.
1197					 */
1198					copy_cnt = i;
1199					first_vector_error = EINVAL;
1200					/* total_size can't be trusted */
1201					if ((ssize32_t)total_size < 0)
1202						error = EINVAL;
1203					break;
1204				}
1205			}
1206			/* Nothing to do, process errors */
1207			if (copy_cnt == 0)
1208				break;
1209
1210		} else {
1211#endif
1212			if (copyin(copy_vec, sfv,
1213			    copy_cnt * sizeof (sendfilevec_t))) {
1214				error = EFAULT;
1215				break;
1216			}
1217
1218			for (i = 0; i < copy_cnt; i++) {
1219				total_size += sfv[i].sfv_len;
1220				/*
1221				 * Individual elements of the vector must not
1222				 * wrap or overflow, as later math is signed.
1223				 * Equally total_size needs to be checked after
1224				 * each vector is added in, to be sure that
1225				 * rogue values haven't overflowed the counter.
1226				 */
1227				if (((ssize_t)sfv[i].sfv_len < 0) ||
1228				    (total_size < 0)) {
1229					/*
1230					 * Truncate the vector to send data
1231					 * described by elements before the
1232					 * error.
1233					 */
1234					copy_cnt = i;
1235					first_vector_error = EINVAL;
1236					/* total_size can't be trusted */
1237					if (total_size < 0)
1238						error = EINVAL;
1239					break;
1240				}
1241			}
1242			/* Nothing to do, process errors */
1243			if (copy_cnt == 0)
1244				break;
1245#ifdef _SYSCALL32_IMPL
1246		}
1247#endif
1248
1249		/*
1250		 * The choice between using sendvec_small_chunk and
1251		 * sendvec_chunk is dependent on multiple things:
1252		 *
1253		 * i) latency is important for smaller files. So if the
1254		 * data is smaller than 'tcp_slow_start_initial' times
1255		 * maxblk, then use sendvec_small_chunk which creates
1256		 * maxblk size mblks and chains them together and sends
1257		 * them to TCP in one shot. It also leaves 'wroff' size
1258		 * space for the headers in each mblk.
1259		 *
1260		 * ii) for total size bigger than 'tcp_slow_start_initial'
1261		 * times maxblk, it's probably real file data which is
1262		 * dominating. So it's better to use sendvec_chunk because
1263		 * performance suffers badly if we don't do pagesize reads.
1264		 * sendvec_chunk will do pagesize reads and write them
1265		 * in pagesize mblks to TCP.
1266		 *
1267		 * Side Notes: A write to file has not been optimized.
1268		 * Future zero copy code will plug into sendvec_chunk
1269		 * only because doing zero copy for files smaller than
1270		 * pagesize is useless.
1271		 *
1272		 * Note, if socket has NL7C enabled then call NL7C's
1273		 * sendfilev() function to consume the sfv[].
1274		 */
1275		if (is_sock) {
1276			if (!SOCK_IS_NONSTR(so) &&
1277			    _SOTOTPI(so)->sti_nl7c_flags != 0) {
1278				error = nl7c_sendfilev(so, &fileoff,
1279				    sfv, copy_cnt, &count);
1280			} else if ((total_size <= (4 * maxblk)) &&
1281			    error == 0) {
1282				error = sendvec_small_chunk(fp,
1283				    &fileoff, sfv, copy_cnt,
1284				    total_size, maxblk, &count);
1285			} else {
1286				error = sendvec_chunk(fp, &fileoff,
1287				    sfv, copy_cnt, &count);
1288			}
1289		} else {
1290			ASSERT(vp->v_type == VREG);
1291			error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt,
1292			    &count);
1293		}
1294
1295
1296#ifdef _SYSCALL32_IMPL
1297	if (get_udatamodel() == DATAMODEL_ILP32)
1298		copy_vec = (const struct sendfilevec *)((char *)copy_vec +
1299		    (copy_cnt * sizeof (ksendfilevec32_t)));
1300	else
1301#endif
1302		copy_vec += copy_cnt;
1303		sfvcnt -= copy_cnt;
1304
1305	/* Process all vector members up to first error */
1306	} while ((sfvcnt > 0) && first_vector_error == 0 && error == 0);
1307
1308	if (vp->v_type == VREG)
1309		fp->f_offset += count;
1310
1311	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
1312
1313#ifdef _SYSCALL32_IMPL
1314	if (get_udatamodel() == DATAMODEL_ILP32) {
1315		ssize32_t count32 = (ssize32_t)count;
1316		if (copyout(&count32, xferred, sizeof (count32)))
1317			error = EFAULT;
1318		releasef(fildes);
1319		if (error != 0)
1320			return (set_errno(error));
1321		if (first_vector_error != 0)
1322			return (set_errno(first_vector_error));
1323		return (count32);
1324	}
1325#endif
1326	if (copyout(&count, xferred, sizeof (count)))
1327		error = EFAULT;
1328	releasef(fildes);
1329	if (error != 0)
1330		return (set_errno(error));
1331	if (first_vector_error != 0)
1332		return (set_errno(first_vector_error));
1333	return (count);
1334err:
1335	ASSERT(error != 0);
1336	releasef(fildes);
1337	return (set_errno(error));
1338}
1339