/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 2001 Dag-Erling Smørgrav
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer
 *    in this position and unchanged.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
3050120Swpaul
3150120Swpaul#include <sys/cdefs.h>
3250120Swpaul#include "opt_pseudofs.h"
3350120Swpaul
3450120Swpaul#include <sys/param.h>
3550120Swpaul#include <sys/kernel.h>
3650120Swpaul#include <sys/systm.h>
3750120Swpaul#include <sys/ctype.h>
38221407Smarius#include <sys/dirent.h>
39221407Smarius#include <sys/fcntl.h>
40221407Smarius#include <sys/limits.h>
41221407Smarius#include <sys/lock.h>
42221407Smarius#include <sys/malloc.h>
43221407Smarius#include <sys/mount.h>
44221407Smarius#include <sys/mutex.h>
45221407Smarius#include <sys/namei.h>
46221407Smarius#include <sys/proc.h>
4750120Swpaul#include <sys/sbuf.h>
4850120Swpaul#include <sys/sx.h>
49221407Smarius#include <sys/sysctl.h>
5061907Ssemenu#include <sys/vnode.h>
5150120Swpaul
5250577Swpaul#include <fs/pseudofs/pseudofs.h>
53178667Sjhb#include <fs/pseudofs/pseudofs_internal.h>
54221407Smarius
/*
 * Consistency assertions between a vnode's type and the type of the
 * pfs_node it refers to.  Each one expands to a single KASSERT whose
 * message names the calling function.
 */

/* A VDIR vnode must refer to a root, directory or procdir node. */
#define KASSERT_PN_IS_DIR(pn)						\
	KASSERT((pn)->pn_type == pfstype_root ||			\
	    (pn)->pn_type == pfstype_dir ||				\
	    (pn)->pn_type == pfstype_procdir,				\
	    ("%s(): VDIR vnode refers to non-directory pfs_node", __func__))

/* A VREG vnode must refer to a file node. */
#define KASSERT_PN_IS_FILE(pn)						\
	KASSERT((pn)->pn_type == pfstype_file,				\
	    ("%s(): VREG vnode refers to non-file pfs_node", __func__))

/* A VLNK vnode must refer to a symlink node. */
#define KASSERT_PN_IS_LINK(pn)						\
	KASSERT((pn)->pn_type == pfstype_symlink,			\
	    ("%s(): VLNK vnode refers to non-link pfs_node", __func__))
68221407Smarius
/*
 * Size limit, in bytes, used by this file for buffered I/O: pfs_read()
 * caps its sbuf at this size for nodes without PFS_AUTODRAIN, and
 * pfs_write() rejects requests larger than this.
 *
 * The expansion is parenthesized so the macro behaves as a single value
 * in any expression (e.g. next to '/', '%' or a unary operator).
 */
#define	PFS_MAXBUFSIZ		(1024 * 1024)
70221407Smarius
7150120Swpaul/*
7250120Swpaul * Returns the fileno, adjusted for target pid
73221407Smarius */
7459475Swpaulstatic uint32_t
7550120Swpaulpn_fileno(struct pfs_node *pn, pid_t pid)
76221407Smarius{
77221407Smarius
7850120Swpaul	KASSERT(pn->pn_fileno > 0,
79221407Smarius	    ("%s(): no fileno allocated", __func__));
80221407Smarius	if (pid != NO_PID)
8174129Sjlemon		return (pn->pn_fileno * NO_PID + pid);
82221407Smarius	return (pn->pn_fileno);
83221407Smarius}
84221407Smarius
8559475Swpaul/*
86165090Sscottl * Returns non-zero if given file is visible to given thread.
87221407Smarius */
88221407Smariusstatic int
89221407Smariuspfs_visible_proc(struct thread *td, struct pfs_node *pn, struct proc *proc)
90221407Smarius{
91221407Smarius
92221407Smarius	if (proc == NULL)
93221407Smarius		return (0);
94221407Smarius
95221407Smarius	PROC_LOCK_ASSERT(proc, MA_OWNED);
9650120Swpaul
97221407Smarius	if ((proc->p_flag & P_WEXIT) != 0)
98221407Smarius		return (0);
99221407Smarius	if (p_cansee(td, proc) != 0)
10050120Swpaul		return (0);
10150120Swpaul	return (pn_vis(td, proc, pn));
102221407Smarius}
103221407Smarius
104221407Smariusstatic int
105221407Smariuspfs_visible(struct thread *td, struct pfs_node *pn, pid_t pid,
106221407Smarius    struct proc **p)
107221407Smarius{
108221407Smarius	struct proc *proc;
10950120Swpaul
11050120Swpaul	PFS_TRACE(("%s (pid: %d, req: %d)",
11150120Swpaul	    pn->pn_name, pid, td->td_proc->p_pid));
11250120Swpaul
11350120Swpaul	if (p)
114179895Sdelphij		*p = NULL;
115206563Syongari	if (pid == NO_PID)
116179895Sdelphij		PFS_RETURN (pn_vis(td, NULL, pn));
117179895Sdelphij	proc = pfind(pid);
11861907Ssemenu	if (proc == NULL)
119221407Smarius		PFS_RETURN (0);
120221407Smarius	if (pfs_visible_proc(td, pn, proc)) {
121221407Smarius		if (p)
122221407Smarius			*p = proc;
123221407Smarius		else
124221407Smarius			PROC_UNLOCK(proc);
12561907Ssemenu		PFS_RETURN (1);
12650120Swpaul	}
127221407Smarius	PROC_UNLOCK(proc);
128221407Smarius	PFS_RETURN (0);
129221407Smarius}
130221407Smarius
131221407Smariusstatic int
13250120Swpaulpfs_lookup_proc(pid_t pid, struct proc **p)
133221407Smarius{
134221407Smarius	struct proc *proc;
135221407Smarius
136221407Smarius	proc = pfind(pid);
137190538Simp	if (proc == NULL)
138221407Smarius		return (0);
139221407Smarius	if ((proc->p_flag & P_WEXIT) != 0) {
140179098Syongari		PROC_UNLOCK(proc);
141221407Smarius		return (0);
142221407Smarius	}
143221407Smarius	_PHOLD(proc);
144221407Smarius	PROC_UNLOCK(proc);
145221407Smarius	*p = proc;
146221407Smarius	return (1);
147221407Smarius}
148221407Smarius
149227906Smarius/*
150221407Smarius * Verify permissions
151221407Smarius */
152221407Smariusstatic int
153221407Smariuspfs_access(struct vop_access_args *va)
154221407Smarius{
155221407Smarius	struct vnode *vn = va->a_vp;
156221407Smarius	struct pfs_vdata *pvd = vn->v_data;
157221407Smarius	struct vattr vattr;
158221407Smarius	int error;
159221407Smarius
160221407Smarius	PFS_TRACE(("%s", pvd->pvd_pn->pn_name));
161221407Smarius	(void)pvd;
162221407Smarius
163221407Smarius	error = VOP_GETATTR(vn, &vattr, va->a_cred);
164221407Smarius	if (error)
165221407Smarius		PFS_RETURN (error);
166221407Smarius	error = vaccess(vn->v_type, vattr.va_mode, vattr.va_uid, vattr.va_gid,
167221407Smarius	    va->a_accmode, va->a_cred);
168227906Smarius	PFS_RETURN (error);
169221407Smarius}
170221407Smarius
171221407Smarius/*
172221407Smarius * Close a file or directory
173221407Smarius */
174221407Smariusstatic int
175227906Smariuspfs_close(struct vop_close_args *va)
176221407Smarius{
177221407Smarius	struct vnode *vn = va->a_vp;
178221407Smarius	struct pfs_vdata *pvd = vn->v_data;
179221407Smarius	struct pfs_node *pn = pvd->pvd_pn;
180221407Smarius	struct proc *proc;
181221407Smarius	int error;
182221407Smarius
183221713Syongari	PFS_TRACE(("%s", pn->pn_name));
184221407Smarius	pfs_assert_not_owned(pn);
185226870Syongari
186221407Smarius	/*
18750577Swpaul	 * Do nothing unless this is the last close and the node has a
188135048Swpaul	 * last-close handler.
189221407Smarius	 */
190221407Smarius	if (vrefcnt(vn) > 1 || pn->pn_close == NULL)
191221407Smarius		PFS_RETURN (0);
192221407Smarius
193221407Smarius	if (pvd->pvd_pid != NO_PID) {
194221407Smarius		proc = pfind(pvd->pvd_pid);
195221407Smarius	} else {
196135048Swpaul		proc = NULL;
19750120Swpaul	}
198221407Smarius
199221407Smarius	error = pn_close(va->a_td, proc, pn);
200221407Smarius
201221407Smarius	if (proc != NULL)
20250120Swpaul		PROC_UNLOCK(proc);
203221407Smarius
204221407Smarius	PFS_RETURN (error);
205221407Smarius}
206221407Smarius
207221407Smarius/*
20850120Swpaul * Get file attributes
209221407Smarius */
210221407Smariusstatic int
211221407Smariuspfs_getattr(struct vop_getattr_args *va)
212221407Smarius{
21350120Swpaul	struct vnode *vn = va->a_vp;
214221407Smarius	struct pfs_vdata *pvd = vn->v_data;
215221407Smarius	struct pfs_node *pn = pvd->pvd_pn;
216221407Smarius	struct vattr *vap = va->a_vap;
217221407Smarius	struct proc *proc;
218221407Smarius	int error = 0;
219221407Smarius
220221407Smarius	PFS_TRACE(("%s", pn->pn_name));
221221407Smarius	pfs_assert_not_owned(pn);
222221407Smarius
223221407Smarius	if (!pfs_visible(curthread, pn, pvd->pvd_pid, &proc))
224227906Smarius		PFS_RETURN (ENOENT);
225227906Smarius
226160637Syongari	vap->va_type = vn->v_type;
227227906Smarius	vap->va_fileid = pn_fileno(pn, pvd->pvd_pid);
228227906Smarius	vap->va_flags = 0;
22977078Swpaul	vap->va_blocksize = PAGE_SIZE;
230221407Smarius	vap->va_bytes = vap->va_size = 0;
23177078Swpaul	vap->va_filerev = 0;
232179335Syongari	vap->va_fsid = vn->v_mount->mnt_stat.f_fsid.val[0];
233179335Syongari	vap->va_nlink = 1;
234179335Syongari	nanotime(&vap->va_ctime);
235179335Syongari	vap->va_atime = vap->va_mtime = vap->va_ctime;
23650120Swpaul
23750120Swpaul	switch (pn->pn_type) {
238221407Smarius	case pfstype_procdir:
239221407Smarius	case pfstype_root:
240221407Smarius	case pfstype_dir:
241221407Smarius#if 0
242221407Smarius		pfs_lock(pn);
243221407Smarius		/* compute link count */
24450120Swpaul		pfs_unlock(pn);
245221407Smarius#endif
246221407Smarius		vap->va_mode = 0555;
247221407Smarius		break;
248221407Smarius	case pfstype_file:
249221407Smarius	case pfstype_symlink:
250221407Smarius		vap->va_mode = 0444;
251221407Smarius		break;
252221407Smarius	default:
253221407Smarius		printf("shouldn't be here!\n");
254221407Smarius		vap->va_mode = 0;
255221407Smarius		break;
256227906Smarius	}
257221407Smarius
258221407Smarius	if (proc != NULL) {
259221407Smarius		vap->va_uid = proc->p_ucred->cr_ruid;
260223688Simp		vap->va_gid = proc->p_ucred->cr_rgid;
261221407Smarius	} else {
262221407Smarius		vap->va_uid = 0;
263227906Smarius		vap->va_gid = 0;
264221407Smarius	}
265221407Smarius
266221407Smarius	if (pn->pn_attr != NULL)
267221407Smarius		error = pn_attr(curthread, proc, pn, vap);
268221407Smarius
269221407Smarius	if(proc != NULL)
270227906Smarius		PROC_UNLOCK(proc);
271227906Smarius
272227906Smarius	PFS_RETURN (error);
273227906Smarius}
27450120Swpaul
275221407Smarius/*
276221407Smarius * Perform an ioctl
277221407Smarius */
278221407Smariusstatic int
279221407Smariuspfs_ioctl(struct vop_ioctl_args *va)
280221407Smarius{
281221407Smarius	struct vnode *vn;
28250120Swpaul	struct pfs_vdata *pvd;
283221407Smarius	struct pfs_node *pn;
284221407Smarius	struct proc *proc;
285221407Smarius	int error;
286221407Smarius
287221407Smarius	vn = va->a_vp;
288221407Smarius	vn_lock(vn, LK_SHARED | LK_RETRY);
28950120Swpaul	if (VN_IS_DOOMED(vn)) {
290221407Smarius		VOP_UNLOCK(vn);
29150120Swpaul		return (EBADF);
292216828Syongari	}
293216828Syongari	pvd = vn->v_data;
294216828Syongari	pn = pvd->pvd_pn;
295221407Smarius
296221407Smarius	PFS_TRACE(("%s: %lx", pn->pn_name, va->a_command));
297221407Smarius	pfs_assert_not_owned(pn);
298221407Smarius
299221407Smarius	if (vn->v_type != VREG) {
300221407Smarius		VOP_UNLOCK(vn);
30194149Swpaul		PFS_RETURN (EINVAL);
302221407Smarius	}
303221407Smarius	KASSERT_PN_IS_FILE(pn);
304221407Smarius
305221407Smarius	if (pn->pn_ioctl == NULL) {
30650120Swpaul		VOP_UNLOCK(vn);
30750120Swpaul		PFS_RETURN (ENOTTY);
308221407Smarius	}
30950120Swpaul
31050120Swpaul	/*
311221407Smarius	 * This is necessary because process' privileges may
312221407Smarius	 * have changed since the open() call.
313221407Smarius	 */
31459475Swpaul	if (!pfs_visible(curthread, pn, pvd->pvd_pid, &proc)) {
315221407Smarius		VOP_UNLOCK(vn);
316221407Smarius		PFS_RETURN (EIO);
317221407Smarius	}
31875353Smjacob
319221407Smarius	error = pn_ioctl(curthread, proc, pn, va->a_command, va->a_data);
320221407Smarius
321	if (proc != NULL)
322		PROC_UNLOCK(proc);
323
324	VOP_UNLOCK(vn);
325	PFS_RETURN (error);
326}
327
328/*
329 * Perform getextattr
330 */
331static int
332pfs_getextattr(struct vop_getextattr_args *va)
333{
334	struct vnode *vn = va->a_vp;
335	struct pfs_vdata *pvd = vn->v_data;
336	struct pfs_node *pn = pvd->pvd_pn;
337	struct proc *proc;
338	int error;
339
340	PFS_TRACE(("%s", pn->pn_name));
341	pfs_assert_not_owned(pn);
342
343	/*
344	 * This is necessary because either process' privileges may
345	 * have changed since the open() call.
346	 */
347	if (!pfs_visible(curthread, pn, pvd->pvd_pid, &proc))
348		PFS_RETURN (EIO);
349
350	if (pn->pn_getextattr == NULL)
351		error = EOPNOTSUPP;
352	else
353		error = pn_getextattr(curthread, proc, pn,
354		    va->a_attrnamespace, va->a_name, va->a_uio,
355		    va->a_size, va->a_cred);
356
357	if (proc != NULL)
358		PROC_UNLOCK(proc);
359
360	PFS_RETURN (error);
361}
362
363/*
364 * Convert a vnode to its component name
365 */
366static int
367pfs_vptocnp(struct vop_vptocnp_args *ap)
368{
369	struct vnode *vp = ap->a_vp;
370	struct vnode **dvp = ap->a_vpp;
371	struct pfs_vdata *pvd = vp->v_data;
372	struct pfs_node *pd = pvd->pvd_pn;
373	struct pfs_node *pn;
374	struct mount *mp;
375	char *buf = ap->a_buf;
376	size_t *buflen = ap->a_buflen;
377	char pidbuf[PFS_NAMELEN];
378	pid_t pid = pvd->pvd_pid;
379	int len, i, error, locked;
380
381	i = *buflen;
382	error = 0;
383
384	pfs_lock(pd);
385
386	if (vp->v_type == VDIR && pd->pn_type == pfstype_root) {
387		*dvp = vp;
388		vhold(*dvp);
389		pfs_unlock(pd);
390		PFS_RETURN (0);
391	} else if (vp->v_type == VDIR && pd->pn_type == pfstype_procdir) {
392		len = snprintf(pidbuf, sizeof(pidbuf), "%d", pid);
393		i -= len;
394		if (i < 0) {
395			error = ENOMEM;
396			goto failed;
397		}
398		bcopy(pidbuf, buf + i, len);
399	} else {
400		len = strlen(pd->pn_name);
401		i -= len;
402		if (i < 0) {
403			error = ENOMEM;
404			goto failed;
405		}
406		bcopy(pd->pn_name, buf + i, len);
407	}
408
409	pn = pd->pn_parent;
410	pfs_unlock(pd);
411
412	mp = vp->v_mount;
413	error = vfs_busy(mp, 0);
414	if (error)
415		return (error);
416
417	/*
418	 * vp is held by caller.
419	 */
420	locked = VOP_ISLOCKED(vp);
421	VOP_UNLOCK(vp);
422
423	error = pfs_vncache_alloc(mp, dvp, pn, pid);
424	if (error) {
425		vn_lock(vp, locked | LK_RETRY);
426		vfs_unbusy(mp);
427		PFS_RETURN(error);
428	}
429
430	*buflen = i;
431	VOP_UNLOCK(*dvp);
432	vn_lock(vp, locked | LK_RETRY);
433	vfs_unbusy(mp);
434
435	PFS_RETURN (0);
436failed:
437	pfs_unlock(pd);
438	PFS_RETURN(error);
439}
440
441/*
442 * Look up a file or directory
443 */
444static int
445pfs_lookup(struct vop_cachedlookup_args *va)
446{
447	struct vnode *vn = va->a_dvp;
448	struct vnode **vpp = va->a_vpp;
449	struct componentname *cnp = va->a_cnp;
450	struct pfs_vdata *pvd = vn->v_data;
451	struct pfs_node *pd = pvd->pvd_pn;
452	struct pfs_node *pn, *pdn = NULL;
453	struct mount *mp;
454	pid_t pid = pvd->pvd_pid;
455	char *pname;
456	int error, i, namelen, visible;
457
458	PFS_TRACE(("%.*s", (int)cnp->cn_namelen, cnp->cn_nameptr));
459	pfs_assert_not_owned(pd);
460
461	if (vn->v_type != VDIR)
462		PFS_RETURN (ENOTDIR);
463	KASSERT_PN_IS_DIR(pd);
464
465	/*
466	 * Don't support DELETE or RENAME.  CREATE is supported so
467	 * that O_CREAT will work, but the lookup will still fail if
468	 * the file does not exist.
469	 */
470	if ((cnp->cn_flags & ISLASTCN) &&
471	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
472		PFS_RETURN (EOPNOTSUPP);
473
474	/* shortcut: check if the name is too long */
475	if (cnp->cn_namelen >= PFS_NAMELEN)
476		PFS_RETURN (ENOENT);
477
478	/* check that parent directory is visible... */
479	if (!pfs_visible(curthread, pd, pvd->pvd_pid, NULL))
480		PFS_RETURN (ENOENT);
481
482	/* self */
483	namelen = cnp->cn_namelen;
484	pname = cnp->cn_nameptr;
485	if (namelen == 1 && pname[0] == '.') {
486		pn = pd;
487		*vpp = vn;
488		VREF(vn);
489		PFS_RETURN (0);
490	}
491
492	mp = vn->v_mount;
493
494	/* parent */
495	if (cnp->cn_flags & ISDOTDOT) {
496		if (pd->pn_type == pfstype_root)
497			PFS_RETURN (EIO);
498		error = vfs_busy(mp, MBF_NOWAIT);
499		if (error != 0) {
500			vfs_ref(mp);
501			VOP_UNLOCK(vn);
502			error = vfs_busy(mp, 0);
503			vn_lock(vn, LK_EXCLUSIVE | LK_RETRY);
504			vfs_rel(mp);
505			if (error != 0)
506				PFS_RETURN(ENOENT);
507			if (VN_IS_DOOMED(vn)) {
508				vfs_unbusy(mp);
509				PFS_RETURN(ENOENT);
510			}
511		}
512		VOP_UNLOCK(vn);
513		KASSERT(pd->pn_parent != NULL,
514		    ("%s(): non-root directory has no parent", __func__));
515		/*
516		 * This one is tricky.  Descendents of procdir nodes
517		 * inherit their parent's process affinity, but
518		 * there's no easy reverse mapping.  For simplicity,
519		 * we assume that if this node is a procdir, its
520		 * parent isn't (which is correct as long as
521		 * descendents of procdir nodes are never procdir
522		 * nodes themselves)
523		 */
524		if (pd->pn_type == pfstype_procdir)
525			pid = NO_PID;
526		pfs_lock(pd);
527		pn = pd->pn_parent;
528		pfs_unlock(pd);
529		goto got_pnode;
530	}
531
532	pfs_lock(pd);
533
534	/* named node */
535	for (pn = pd->pn_nodes; pn != NULL; pn = pn->pn_next)
536		if (pn->pn_type == pfstype_procdir)
537			pdn = pn;
538		else if (strncmp(pname, pn->pn_name, namelen) == 0 &&
539		    pn->pn_name[namelen] == '\0') {
540			pfs_unlock(pd);
541			goto got_pnode;
542		}
543
544	/* process dependent node */
545	if ((pn = pdn) != NULL) {
546		pid = 0;
547		for (pid = 0, i = 0; i < namelen && isdigit(pname[i]); ++i)
548			if ((pid = pid * 10 + pname[i] - '0') > PID_MAX)
549				break;
550		if (i == cnp->cn_namelen) {
551			pfs_unlock(pd);
552			goto got_pnode;
553		}
554	}
555
556	pfs_unlock(pd);
557
558	PFS_RETURN (ENOENT);
559
560 got_pnode:
561	pfs_assert_not_owned(pd);
562	pfs_assert_not_owned(pn);
563	visible = pfs_visible(curthread, pn, pid, NULL);
564	if (!visible) {
565		error = ENOENT;
566		goto failed;
567	}
568
569	error = pfs_vncache_alloc(mp, vpp, pn, pid);
570	if (error)
571		goto failed;
572
573	if (cnp->cn_flags & ISDOTDOT) {
574		vfs_unbusy(mp);
575		vn_lock(vn, LK_EXCLUSIVE | LK_RETRY);
576		if (VN_IS_DOOMED(vn)) {
577			vput(*vpp);
578			*vpp = NULL;
579			PFS_RETURN(ENOENT);
580		}
581	}
582	if (cnp->cn_flags & MAKEENTRY && !VN_IS_DOOMED(vn))
583		cache_enter(vn, *vpp, cnp);
584	PFS_RETURN (0);
585 failed:
586	if (cnp->cn_flags & ISDOTDOT) {
587		vfs_unbusy(mp);
588		vn_lock(vn, LK_EXCLUSIVE | LK_RETRY);
589		*vpp = NULL;
590	}
591	PFS_RETURN(error);
592}
593
594/*
595 * Open a file or directory.
596 */
597static int
598pfs_open(struct vop_open_args *va)
599{
600	struct vnode *vn = va->a_vp;
601	struct pfs_vdata *pvd = vn->v_data;
602	struct pfs_node *pn = pvd->pvd_pn;
603	int mode = va->a_mode;
604
605	PFS_TRACE(("%s (mode 0x%x)", pn->pn_name, mode));
606	pfs_assert_not_owned(pn);
607
608	/* check if the requested mode is permitted */
609	if (((mode & FREAD) && !(mode & PFS_RD)) ||
610	    ((mode & FWRITE) && !(mode & PFS_WR)))
611		PFS_RETURN (EPERM);
612
613	/* we don't support locking */
614	if ((mode & O_SHLOCK) || (mode & O_EXLOCK))
615		PFS_RETURN (EOPNOTSUPP);
616
617	PFS_RETURN (0);
618}
619
620struct sbuf_seek_helper {
621	off_t		skip_bytes;
622	struct uio	*uio;
623};
624
625static int
626pfs_sbuf_uio_drain(void *arg, const char *data, int len)
627{
628	struct sbuf_seek_helper *ssh;
629	struct uio *uio;
630	int error, skipped;
631
632	ssh = arg;
633	uio = ssh->uio;
634	skipped = 0;
635
636	/* Need to discard first uio_offset bytes. */
637	if (ssh->skip_bytes > 0) {
638		if (ssh->skip_bytes >= len) {
639			ssh->skip_bytes -= len;
640			return (len);
641		}
642
643		data += ssh->skip_bytes;
644		len -= ssh->skip_bytes;
645		skipped = ssh->skip_bytes;
646		ssh->skip_bytes = 0;
647	}
648
649	error = uiomove(__DECONST(void *, data), len, uio);
650	if (error != 0)
651		return (-error);
652
653	/*
654	 * The fill function has more to emit, but the reader is finished.
655	 * This is similar to the truncated read case for non-draining PFS
656	 * sbufs, and should be handled appropriately in fill-routines.
657	 */
658	if (uio->uio_resid == 0)
659		return (-ENOBUFS);
660
661	return (skipped + len);
662}
663
664/*
665 * Read from a file
666 */
667static int
668pfs_read(struct vop_read_args *va)
669{
670	struct vnode *vn = va->a_vp;
671	struct pfs_vdata *pvd = vn->v_data;
672	struct pfs_node *pn = pvd->pvd_pn;
673	struct uio *uio = va->a_uio;
674	struct proc *proc;
675	struct sbuf *sb = NULL;
676	int error, locked;
677	off_t buflen, buflim;
678	struct sbuf_seek_helper ssh;
679
680	PFS_TRACE(("%s", pn->pn_name));
681	pfs_assert_not_owned(pn);
682
683	if (vn->v_type != VREG)
684		PFS_RETURN (EINVAL);
685	KASSERT_PN_IS_FILE(pn);
686
687	if (!(pn->pn_flags & PFS_RD))
688		PFS_RETURN (EBADF);
689
690	if (pn->pn_fill == NULL)
691		PFS_RETURN (EIO);
692
693	/*
694	 * This is necessary because either process' privileges may
695	 * have changed since the open() call.
696	 */
697	if (!pfs_visible(curthread, pn, pvd->pvd_pid, &proc))
698		PFS_RETURN (EIO);
699	if (proc != NULL) {
700		_PHOLD(proc);
701		PROC_UNLOCK(proc);
702	}
703
704	vhold(vn);
705	locked = VOP_ISLOCKED(vn);
706	VOP_UNLOCK(vn);
707
708	if (pn->pn_flags & PFS_RAWRD) {
709		PFS_TRACE(("%zd resid", uio->uio_resid));
710		error = pn_fill(curthread, proc, pn, NULL, uio);
711		PFS_TRACE(("%zd resid", uio->uio_resid));
712		goto ret;
713	}
714
715	if (uio->uio_resid < 0 || uio->uio_offset < 0 ||
716	    uio->uio_resid > OFF_MAX - uio->uio_offset) {
717		error = EINVAL;
718		goto ret;
719	}
720	buflen = uio->uio_offset + uio->uio_resid + 1;
721	if (pn->pn_flags & PFS_AUTODRAIN)
722		/*
723		 * We can use a smaller buffer if we can stream output to the
724		 * consumer.
725		 */
726		buflim = PAGE_SIZE;
727	else
728		buflim = PFS_MAXBUFSIZ;
729	if (buflen > buflim)
730		buflen = buflim;
731
732	sb = sbuf_new(sb, NULL, buflen, 0);
733	if (sb == NULL) {
734		error = EIO;
735		goto ret;
736	}
737
738	if (pn->pn_flags & PFS_AUTODRAIN) {
739		ssh.skip_bytes = uio->uio_offset;
740		ssh.uio = uio;
741		sbuf_set_drain(sb, pfs_sbuf_uio_drain, &ssh);
742	}
743
744	error = pn_fill(curthread, proc, pn, sb, uio);
745
746	if (error) {
747		sbuf_delete(sb);
748		goto ret;
749	}
750
751	/*
752	 * XXX: If the buffer overflowed, sbuf_len() will not return
753	 * the data length. Then just use the full length because an
754	 * overflowed sbuf must be full.
755	 */
756	error = sbuf_finish(sb);
757	if ((pn->pn_flags & PFS_AUTODRAIN)) {
758		/*
759		 * ENOBUFS just indicates early termination of the fill
760		 * function as the caller's buffer was already filled.  Squash
761		 * to zero.
762		 */
763		if (uio->uio_resid == 0 && error == ENOBUFS)
764			error = 0;
765	} else {
766		if (error == 0)
767			buflen = sbuf_len(sb);
768		else
769			/* The trailing byte is not valid. */
770			buflen--;
771		error = uiomove_frombuf(sbuf_data(sb), buflen, uio);
772	}
773	sbuf_delete(sb);
774ret:
775	vn_lock(vn, locked | LK_RETRY);
776	vdrop(vn);
777	if (proc != NULL)
778		PRELE(proc);
779	PFS_RETURN (error);
780}
781
782/*
783 * Iterate through directory entries
784 */
785static int
786pfs_iterate(struct thread *td, struct proc *proc, struct pfs_node *pd,
787	    struct pfs_node **pn, struct proc **p)
788{
789	int visible;
790
791	sx_assert(&allproc_lock, SX_SLOCKED);
792	pfs_assert_owned(pd);
793 again:
794	if (*pn == NULL) {
795		/* first node */
796		*pn = pd->pn_nodes;
797	} else if ((*pn)->pn_type != pfstype_procdir) {
798		/* next node */
799		*pn = (*pn)->pn_next;
800	}
801	if (*pn != NULL && (*pn)->pn_type == pfstype_procdir) {
802		/* next process */
803		if (*p == NULL)
804			*p = LIST_FIRST(&allproc);
805		else
806			*p = LIST_NEXT(*p, p_list);
807		/* out of processes: next node */
808		if (*p == NULL)
809			*pn = (*pn)->pn_next;
810		else
811			PROC_LOCK(*p);
812	}
813
814	if ((*pn) == NULL)
815		return (-1);
816
817	if (*p != NULL) {
818		visible = pfs_visible_proc(td, *pn, *p);
819		PROC_UNLOCK(*p);
820	} else if (proc != NULL) {
821		visible = pfs_visible_proc(td, *pn, proc);
822	} else {
823		visible = pn_vis(td, NULL, *pn);
824	}
825	if (!visible)
826		goto again;
827
828	return (0);
829}
830
831/* Directory entry list */
832struct pfsentry {
833	STAILQ_ENTRY(pfsentry)	link;
834	struct dirent		entry;
835};
836STAILQ_HEAD(pfsdirentlist, pfsentry);
837
838/*
839 * Return directory entries.
840 */
841static int
842pfs_readdir(struct vop_readdir_args *va)
843{
844	struct vnode *vn = va->a_vp;
845	struct pfs_vdata *pvd = vn->v_data;
846	struct pfs_node *pd = pvd->pvd_pn;
847	pid_t pid = pvd->pvd_pid;
848	struct proc *p, *proc;
849	struct pfs_node *pn;
850	struct uio *uio;
851	struct pfsentry *pfsent, *pfsent2;
852	struct pfsdirentlist lst;
853	off_t offset;
854	int error, i, resid;
855
856	STAILQ_INIT(&lst);
857	error = 0;
858	KASSERT(pd->pn_info == vn->v_mount->mnt_data,
859	    ("%s(): pn_info does not match mountpoint", __func__));
860	PFS_TRACE(("%s pid %lu", pd->pn_name, (unsigned long)pid));
861	pfs_assert_not_owned(pd);
862
863	if (vn->v_type != VDIR)
864		PFS_RETURN (ENOTDIR);
865	KASSERT_PN_IS_DIR(pd);
866	uio = va->a_uio;
867
868	/* only allow reading entire entries */
869	offset = uio->uio_offset;
870	resid = uio->uio_resid;
871	if (offset < 0 || offset % PFS_DELEN != 0 ||
872	    (resid && resid < PFS_DELEN))
873		PFS_RETURN (EINVAL);
874	if (resid == 0)
875		PFS_RETURN (0);
876
877	proc = NULL;
878	if (pid != NO_PID && !pfs_lookup_proc(pid, &proc))
879		PFS_RETURN (ENOENT);
880
881	sx_slock(&allproc_lock);
882	pfs_lock(pd);
883
884	KASSERT(pid == NO_PID || proc != NULL,
885	    ("%s(): no process for pid %lu", __func__, (unsigned long)pid));
886
887	if (pid != NO_PID) {
888		PROC_LOCK(proc);
889
890		/* check if the directory is visible to the caller */
891		if (!pfs_visible_proc(curthread, pd, proc)) {
892			_PRELE(proc);
893			PROC_UNLOCK(proc);
894			pfs_unlock(pd);
895			sx_sunlock(&allproc_lock);
896			PFS_RETURN (ENOENT);
897		}
898	}
899
900	/* skip unwanted entries */
901	for (pn = NULL, p = NULL; offset > 0; offset -= PFS_DELEN) {
902		if (pfs_iterate(curthread, proc, pd, &pn, &p) == -1) {
903			/* nothing left... */
904			if (proc != NULL) {
905				_PRELE(proc);
906				PROC_UNLOCK(proc);
907			}
908			pfs_unlock(pd);
909			sx_sunlock(&allproc_lock);
910			PFS_RETURN (0);
911		}
912	}
913
914	/* fill in entries */
915	while (pfs_iterate(curthread, proc, pd, &pn, &p) != -1 &&
916	    resid >= PFS_DELEN) {
917		if ((pfsent = malloc(sizeof(struct pfsentry), M_IOV,
918		    M_NOWAIT | M_ZERO)) == NULL) {
919			error = ENOMEM;
920			break;
921		}
922		pfsent->entry.d_reclen = PFS_DELEN;
923		pfsent->entry.d_fileno = pn_fileno(pn, pid);
924		/* PFS_DELEN was picked to fit PFS_NAMLEN */
925		for (i = 0; i < PFS_NAMELEN - 1 && pn->pn_name[i] != '\0'; ++i)
926			pfsent->entry.d_name[i] = pn->pn_name[i];
927		pfsent->entry.d_namlen = i;
928		/* NOTE: d_off is the offset of the *next* entry. */
929		pfsent->entry.d_off = offset + PFS_DELEN;
930		switch (pn->pn_type) {
931		case pfstype_procdir:
932			KASSERT(p != NULL,
933			    ("reached procdir node with p == NULL"));
934			pfsent->entry.d_namlen = snprintf(pfsent->entry.d_name,
935			    PFS_NAMELEN, "%d", p->p_pid);
936			/* fall through */
937		case pfstype_root:
938		case pfstype_dir:
939		case pfstype_this:
940		case pfstype_parent:
941			pfsent->entry.d_type = DT_DIR;
942			break;
943		case pfstype_file:
944			pfsent->entry.d_type = DT_REG;
945			break;
946		case pfstype_symlink:
947			pfsent->entry.d_type = DT_LNK;
948			break;
949		default:
950			panic("%s has unexpected node type: %d", pn->pn_name, pn->pn_type);
951		}
952		PFS_TRACE(("%s", pfsent->entry.d_name));
953		dirent_terminate(&pfsent->entry);
954		STAILQ_INSERT_TAIL(&lst, pfsent, link);
955		offset += PFS_DELEN;
956		resid -= PFS_DELEN;
957	}
958	if (proc != NULL) {
959		_PRELE(proc);
960		PROC_UNLOCK(proc);
961	}
962	pfs_unlock(pd);
963	sx_sunlock(&allproc_lock);
964	i = 0;
965	STAILQ_FOREACH_SAFE(pfsent, &lst, link, pfsent2) {
966		if (error == 0)
967			error = uiomove(&pfsent->entry, PFS_DELEN, uio);
968		free(pfsent, M_IOV);
969		i++;
970	}
971	PFS_TRACE(("%ju bytes", (uintmax_t)(i * PFS_DELEN)));
972	PFS_RETURN (error);
973}
974
975/*
976 * Read a symbolic link
977 */
978static int
979pfs_readlink(struct vop_readlink_args *va)
980{
981	struct vnode *vn = va->a_vp;
982	struct pfs_vdata *pvd = vn->v_data;
983	struct pfs_node *pn = pvd->pvd_pn;
984	struct uio *uio = va->a_uio;
985	struct proc *proc = NULL;
986	char buf[PATH_MAX];
987	struct sbuf sb;
988	int error, locked;
989
990	PFS_TRACE(("%s", pn->pn_name));
991	pfs_assert_not_owned(pn);
992
993	if (vn->v_type != VLNK)
994		PFS_RETURN (EINVAL);
995	KASSERT_PN_IS_LINK(pn);
996
997	if (pn->pn_fill == NULL)
998		PFS_RETURN (EIO);
999
1000	if (pvd->pvd_pid != NO_PID) {
1001		if ((proc = pfind(pvd->pvd_pid)) == NULL)
1002			PFS_RETURN (EIO);
1003		if (proc->p_flag & P_WEXIT) {
1004			PROC_UNLOCK(proc);
1005			PFS_RETURN (EIO);
1006		}
1007		_PHOLD(proc);
1008		PROC_UNLOCK(proc);
1009	}
1010	vhold(vn);
1011	locked = VOP_ISLOCKED(vn);
1012	VOP_UNLOCK(vn);
1013
1014	/* sbuf_new() can't fail with a static buffer */
1015	sbuf_new(&sb, buf, sizeof buf, 0);
1016
1017	error = pn_fill(curthread, proc, pn, &sb, NULL);
1018
1019	if (proc != NULL)
1020		PRELE(proc);
1021	vn_lock(vn, locked | LK_RETRY);
1022	vdrop(vn);
1023
1024	if (error) {
1025		sbuf_delete(&sb);
1026		PFS_RETURN (error);
1027	}
1028
1029	if (sbuf_finish(&sb) != 0) {
1030		sbuf_delete(&sb);
1031		PFS_RETURN (ENAMETOOLONG);
1032	}
1033
1034	error = uiomove_frombuf(sbuf_data(&sb), sbuf_len(&sb), uio);
1035	sbuf_delete(&sb);
1036	PFS_RETURN (error);
1037}
1038
1039/*
1040 * Reclaim a vnode
1041 */
1042static int
1043pfs_reclaim(struct vop_reclaim_args *va)
1044{
1045	struct vnode *vn = va->a_vp;
1046	struct pfs_vdata *pvd = vn->v_data;
1047	struct pfs_node *pn = pvd->pvd_pn;
1048
1049	PFS_TRACE(("%s", pn->pn_name));
1050	pfs_assert_not_owned(pn);
1051
1052	return (pfs_vncache_free(va->a_vp));
1053}
1054
1055/*
1056 * Set attributes
1057 */
1058static int
1059pfs_setattr(struct vop_setattr_args *va)
1060{
1061	struct vnode *vn = va->a_vp;
1062	struct pfs_vdata *pvd = vn->v_data;
1063	struct pfs_node *pn = pvd->pvd_pn;
1064
1065	PFS_TRACE(("%s", pn->pn_name));
1066	pfs_assert_not_owned(pn);
1067
1068	/* Silently ignore unchangeable attributes. */
1069	PFS_RETURN (0);
1070}
1071
1072/*
1073 * Write to a file
1074 */
1075static int
1076pfs_write(struct vop_write_args *va)
1077{
1078	struct vnode *vn = va->a_vp;
1079	struct pfs_vdata *pvd = vn->v_data;
1080	struct pfs_node *pn = pvd->pvd_pn;
1081	struct uio *uio = va->a_uio;
1082	struct proc *proc;
1083	struct sbuf sb;
1084	int error;
1085
1086	PFS_TRACE(("%s", pn->pn_name));
1087	pfs_assert_not_owned(pn);
1088
1089	if (vn->v_type != VREG)
1090		PFS_RETURN (EINVAL);
1091	KASSERT_PN_IS_FILE(pn);
1092
1093	if (!(pn->pn_flags & PFS_WR))
1094		PFS_RETURN (EBADF);
1095
1096	if (pn->pn_fill == NULL)
1097		PFS_RETURN (EIO);
1098
1099	if (uio->uio_resid > PFS_MAXBUFSIZ)
1100		PFS_RETURN (EIO);
1101
1102	/*
1103	 * This is necessary because either process' privileges may
1104	 * have changed since the open() call.
1105	 */
1106	if (!pfs_visible(curthread, pn, pvd->pvd_pid, &proc))
1107		PFS_RETURN (EIO);
1108	if (proc != NULL) {
1109		_PHOLD(proc);
1110		PROC_UNLOCK(proc);
1111	}
1112
1113	if (pn->pn_flags & PFS_RAWWR) {
1114		error = pn_fill(curthread, proc, pn, NULL, uio);
1115		if (proc != NULL)
1116			PRELE(proc);
1117		PFS_RETURN (error);
1118	}
1119
1120	sbuf_uionew(&sb, uio, &error);
1121	if (error) {
1122		if (proc != NULL)
1123			PRELE(proc);
1124		PFS_RETURN (error);
1125	}
1126
1127	error = pn_fill(curthread, proc, pn, &sb, uio);
1128
1129	sbuf_delete(&sb);
1130	if (proc != NULL)
1131		PRELE(proc);
1132	PFS_RETURN (error);
1133}
1134
1135/*
1136 * Vnode operations
1137 */
1138struct vop_vector pfs_vnodeops = {
1139	.vop_default =		&default_vnodeops,
1140
1141	.vop_access =		pfs_access,
1142	.vop_cachedlookup =	pfs_lookup,
1143	.vop_close =		pfs_close,
1144	.vop_create =		VOP_EOPNOTSUPP,
1145	.vop_getattr =		pfs_getattr,
1146	.vop_getextattr =	pfs_getextattr,
1147	.vop_ioctl =		pfs_ioctl,
1148	.vop_link =		VOP_EOPNOTSUPP,
1149	.vop_lookup =		vfs_cache_lookup,
1150	.vop_mkdir =		VOP_EOPNOTSUPP,
1151	.vop_mknod =		VOP_EOPNOTSUPP,
1152	.vop_open =		pfs_open,
1153	.vop_read =		pfs_read,
1154	.vop_readdir =		pfs_readdir,
1155	.vop_readlink =		pfs_readlink,
1156	.vop_reclaim =		pfs_reclaim,
1157	.vop_remove =		VOP_EOPNOTSUPP,
1158	.vop_rename =		VOP_EOPNOTSUPP,
1159	.vop_rmdir =		VOP_EOPNOTSUPP,
1160	.vop_setattr =		pfs_setattr,
1161	.vop_symlink =		VOP_EOPNOTSUPP,
1162	.vop_vptocnp =		pfs_vptocnp,
1163	.vop_write =		pfs_write,
1164	.vop_add_writecount =	vop_stdadd_writecount_nomsync,
1165	/* XXX I've probably forgotten a few that need VOP_EOPNOTSUPP */
1166};
1167VFS_VOP_VECTOR_REGISTER(pfs_vnodeops);
1168