1/*
2 *  linux/fs/proc/base.c
3 *
4 *  Copyright (C) 1991, 1992 Linus Torvalds
5 *
6 *  proc base directory handling functions
7 *
8 *  1999, Al Viro. Rewritten. Now it covers the whole per-process part.
9 *  Instead of using magical inumbers to determine the kind of object
10 *  we allocate and fill in-core inodes upon lookup. They don't even
11 *  go into icache. We cache the reference to task_struct upon lookup too.
12 *  Eventually it should become a filesystem in its own. We don't use the
13 *  rest of procfs anymore.
14 *
15 *
16 *  Changelog:
17 *  17-Jan-2005
18 *  Allan Bezerra
19 *  Bruna Moreira <bruna.moreira@indt.org.br>
20 *  Edjard Mota <edjard.mota@indt.org.br>
21 *  Ilias Biris <ilias.biris@indt.org.br>
22 *  Mauricio Lin <mauricio.lin@indt.org.br>
23 *
24 *  Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
25 *
26 *  A new process specific entry (smaps) included in /proc. It shows the
27 *  size of rss for each memory area. The maps entry lacks information
28 *  about physical memory size (rss) for each mapped file, i.e.,
29 *  rss information for executables and library files.
30 *  This additional information is useful for any tools that need to know
31 *  about physical memory consumption for a process specific library.
32 *
33 *  Changelog:
34 *  21-Feb-2005
35 *  Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
36 *  Pud inclusion in the page table walking.
37 *
38 *  ChangeLog:
39 *  10-Mar-2005
40 *  10LE Instituto Nokia de Tecnologia - INdT:
 *  A better way to walk through the page table as suggested by Hugh Dickins.
42 *
43 *  Simo Piiroinen <simo.piiroinen@nokia.com>:
44 *  Smaps information related to shared, private, clean and dirty pages.
45 *
46 *  Paul Mundt <paul.mundt@nokia.com>:
47 *  Overall revision about smaps.
48 */
49
50#include <asm/uaccess.h>
51
52#include <linux/errno.h>
53#include <linux/time.h>
54#include <linux/proc_fs.h>
55#include <linux/stat.h>
56#include <linux/init.h>
57#include <linux/capability.h>
58#include <linux/file.h>
59#include <linux/string.h>
60#include <linux/seq_file.h>
61#include <linux/namei.h>
62#include <linux/mnt_namespace.h>
63#include <linux/mm.h>
64#include <linux/rcupdate.h>
65#include <linux/kallsyms.h>
66#include <linux/module.h>
67#include <linux/mount.h>
68#include <linux/security.h>
69#include <linux/ptrace.h>
70#include <linux/seccomp.h>
71#include <linux/cpuset.h>
72#include <linux/audit.h>
73#include <linux/poll.h>
74#include <linux/nsproxy.h>
75#include <linux/oom.h>
76#include "internal.h"
77
78/* NOTE:
79 *	Implementing inode permission operations in /proc is almost
80 *	certainly an error.  Permission checks need to happen during
81 *	each system call not at open time.  The reason is that most of
82 *	what we wish to check for permissions in /proc varies at runtime.
83 *
84 *	The classic example of a problem is opening file descriptors
85 *	in /proc for a task before it execs a suid executable.
86 */
87
88
/*
 * Worst case buffer size needed for holding an integer.
 * 13 bytes fit "-2147483648" (11 characters) plus a newline and the
 * terminating NUL, which is how the "%i\n" users in this file format it.
 */
#define PROC_NUMBUF 13
91
/*
 * Static description of one per-process /proc entry, normally built with
 * the NOD()/DIR()/LNK()/REG()/INF() initializer macros.
 */
struct pid_entry {
	char *name;				/* entry name */
	int len;				/* name length without the NUL (NOD uses sizeof(NAME) - 1) */
	mode_t mode;				/* file type bits or'ed with permission bits */
	const struct inode_operations *iop;	/* inode operations, NULL in REG()/INF() entries */
	const struct file_operations *fop;	/* file operations, NULL in LNK() entries */
	union proc_op op;			/* per-entry callback: LNK() sets .proc_get_link, INF() sets .proc_read */
};
100
/*
 * Generic pid_entry initializer.  NAME must be a string literal (or a
 * char array): .len is computed as sizeof(NAME) - 1, which is only the
 * string length for an array, not for a pointer.
 */
#define NOD(NAME, MODE, IOP, FOP, OP) {			\
	.name = (NAME),					\
	.len  = sizeof(NAME) - 1,			\
	.mode = MODE,					\
	.iop  = IOP,					\
	.fop  = FOP,					\
	.op   = OP,					\
}
109
/*
 * Directory entry: expects proc_<OTYPE>_inode_operations and
 * proc_<OTYPE>_operations to exist; no proc_op callback is set.
 */
#define DIR(NAME, MODE, OTYPE)							\
	NOD(NAME, (S_IFDIR|(MODE)),						\
		&proc_##OTYPE##_inode_operations, &proc_##OTYPE##_operations,	\
		{} )
/*
 * Symlink entry (mode S_IFLNK|S_IRWXUGO): uses the shared
 * proc_pid_link_inode_operations and resolves its target through
 * proc_<OTYPE>_link(), e.g. proc_cwd_link() or proc_root_link().
 */
#define LNK(NAME, OTYPE)					\
	NOD(NAME, (S_IFLNK|S_IRWXUGO),				\
		&proc_pid_link_inode_operations, NULL,		\
		{ .proc_get_link = &proc_##OTYPE##_link } )
/*
 * Regular file entry with its own file_operations table
 * proc_<OTYPE>_operations; no inode operations and no proc_op callback.
 */
#define REG(NAME, MODE, OTYPE)				\
	NOD(NAME, (S_IFREG|(MODE)), NULL,		\
		&proc_##OTYPE##_operations, {})
/*
 * Regular file entry served by the generic proc_info_file_operations;
 * the content comes from the proc_<OTYPE>() callback stored in
 * .proc_read (see proc_info_read()).
 */
#define INF(NAME, MODE, OTYPE)				\
	NOD(NAME, (S_IFREG|(MODE)), 			\
		NULL, &proc_info_file_operations,	\
		{ .proc_read = &proc_##OTYPE } )
125
/*
 * Global flag, exported to modules.  Nothing in this part of the file
 * reads it; NOTE(review): its consumers live elsewhere - confirm there.
 */
int maps_protect;
EXPORT_SYMBOL(maps_protect);
128
129static struct fs_struct *get_fs_struct(struct task_struct *task)
130{
131	struct fs_struct *fs;
132	task_lock(task);
133	fs = task->fs;
134	if(fs)
135		atomic_inc(&fs->count);
136	task_unlock(task);
137	return fs;
138}
139
140static int get_nr_threads(struct task_struct *tsk)
141{
142	/* Must be called with the rcu_read_lock held */
143	unsigned long flags;
144	int count = 0;
145
146	if (lock_task_sighand(tsk, &flags)) {
147		count = atomic_read(&tsk->signal->count);
148		unlock_task_sighand(tsk, &flags);
149	}
150	return count;
151}
152
153static int proc_cwd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
154{
155	struct task_struct *task = get_proc_task(inode);
156	struct fs_struct *fs = NULL;
157	int result = -ENOENT;
158
159	if (task) {
160		fs = get_fs_struct(task);
161		put_task_struct(task);
162	}
163	if (fs) {
164		read_lock(&fs->lock);
165		*mnt = mntget(fs->pwdmnt);
166		*dentry = dget(fs->pwd);
167		read_unlock(&fs->lock);
168		result = 0;
169		put_fs_struct(fs);
170	}
171	return result;
172}
173
174static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
175{
176	struct task_struct *task = get_proc_task(inode);
177	struct fs_struct *fs = NULL;
178	int result = -ENOENT;
179
180	if (task) {
181		fs = get_fs_struct(task);
182		put_task_struct(task);
183	}
184	if (fs) {
185		read_lock(&fs->lock);
186		*mnt = mntget(fs->rootmnt);
187		*dentry = dget(fs->root);
188		read_unlock(&fs->lock);
189		result = 0;
190		put_fs_struct(fs);
191	}
192	return result;
193}
194
/*
 * True when current may inspect @task ptrace-style: either @task is
 * current itself, or current is its parent, @task has PT_PTRACED set,
 * is in TASK_STOPPED or TASK_TRACED state, and security_ptrace()
 * returns 0.
 *
 * The argument is parenthesized so any pointer expression can be passed;
 * it is evaluated several times, so it must be free of side effects.
 */
#define MAY_PTRACE(task) \
	((task) == current || \
	((task)->parent == current && \
	((task)->ptrace & PT_PTRACED) && \
	 ((task)->state == TASK_STOPPED || (task)->state == TASK_TRACED) && \
	 security_ptrace(current, (task)) == 0))
201
202static int proc_pid_environ(struct task_struct *task, char * buffer)
203{
204	int res = 0;
205	struct mm_struct *mm = get_task_mm(task);
206	if (mm) {
207		unsigned int len = mm->env_end - mm->env_start;
208		if (len > PAGE_SIZE)
209			len = PAGE_SIZE;
210		res = access_process_vm(task, mm->env_start, buffer, len, 0);
211		if (!ptrace_may_attach(task))
212			res = -ESRCH;
213		mmput(mm);
214	}
215	return res;
216}
217
218static int proc_pid_cmdline(struct task_struct *task, char * buffer)
219{
220	int res = 0;
221	unsigned int len;
222	struct mm_struct *mm = get_task_mm(task);
223	if (!mm)
224		goto out;
225	if (!mm->arg_end)
226		goto out_mm;	/* Shh! No looking before we're done */
227
228 	len = mm->arg_end - mm->arg_start;
229
230	if (len > PAGE_SIZE)
231		len = PAGE_SIZE;
232
233	res = access_process_vm(task, mm->arg_start, buffer, len, 0);
234
235	// If the nul at the end of args has been overwritten, then
236	// assume application is using setproctitle(3).
237	if (res > 0 && buffer[res-1] != '\0' && len < PAGE_SIZE) {
238		len = strnlen(buffer, res);
239		if (len < res) {
240		    res = len;
241		} else {
242			len = mm->env_end - mm->env_start;
243			if (len > PAGE_SIZE - res)
244				len = PAGE_SIZE - res;
245			res += access_process_vm(task, mm->env_start, buffer+res, len, 0);
246			res = strnlen(buffer, res);
247		}
248	}
249out_mm:
250	mmput(mm);
251out:
252	return res;
253}
254
255static int proc_pid_auxv(struct task_struct *task, char *buffer)
256{
257	int res = 0;
258	struct mm_struct *mm = get_task_mm(task);
259	if (mm) {
260		unsigned int nwords = 0;
261		do
262			nwords += 2;
263		while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */
264		res = nwords * sizeof(mm->saved_auxv[0]);
265		if (res > PAGE_SIZE)
266			res = PAGE_SIZE;
267		memcpy(buffer, mm->saved_auxv, res);
268		mmput(mm);
269	}
270	return res;
271}
272
273
274#ifdef CONFIG_KALLSYMS
275/*
276 * Provides a wchan file via kallsyms in a proper one-value-per-file format.
277 * Returns the resolved symbol.  If that fails, simply return the address.
278 */
279static int proc_pid_wchan(struct task_struct *task, char *buffer)
280{
281	unsigned long wchan;
282	char symname[KSYM_NAME_LEN+1];
283
284	wchan = get_wchan(task);
285
286	if (lookup_symbol_name(wchan, symname) < 0)
287		return sprintf(buffer, "%lu", wchan);
288	else
289		return sprintf(buffer, "%s", symname);
290}
291#endif /* CONFIG_KALLSYMS */
292
293#ifdef CONFIG_SCHEDSTATS
/*
 * Provides /proc/PID/schedstat
 *
 * Prints three space-separated values from task->sched_info on one
 * line: cpu_time, run_delay and pcount.  Returns the number of
 * characters written to @buffer.
 */
static int proc_pid_schedstat(struct task_struct *task, char *buffer)
{
	return sprintf(buffer, "%llu %llu %lu\n",
			task->sched_info.cpu_time,
			task->sched_info.run_delay,
			task->sched_info.pcount);
}
304#endif
305
306/* The badness from the OOM killer */
307unsigned long badness(struct task_struct *p, unsigned long uptime);
308static int proc_oom_score(struct task_struct *task, char *buffer)
309{
310	unsigned long points;
311	struct timespec uptime;
312
313	do_posix_clock_monotonic_gettime(&uptime);
314	read_lock(&tasklist_lock);
315	points = badness(task, uptime.tv_sec);
316	read_unlock(&tasklist_lock);
317	return sprintf(buffer, "%lu\n", points);
318}
319
320/************************************************************************/
321/*                       Here the fs part begins                        */
322/************************************************************************/
323
324/* permission checks */
/*
 * Allow access to a task's file descriptors if it is us or we may use
 * ptrace attach to the process and find out that information.
 *
 * Returns the ptrace_may_attach() verdict for the task behind @inode,
 * or 0 when that task no longer exists.
 */
static int proc_fd_access_allowed(struct inode *inode)
{
	struct task_struct *task = get_proc_task(inode);
	int allowed;

	if (!task)
		return 0;

	allowed = ptrace_may_attach(task);
	put_task_struct(task);
	return allowed;
}
340
341static int proc_setattr(struct dentry *dentry, struct iattr *attr)
342{
343	int error;
344	struct inode *inode = dentry->d_inode;
345
346	if (attr->ia_valid & ATTR_MODE)
347		return -EPERM;
348
349	error = inode_change_ok(inode, attr);
350	if (!error)
351		error = inode_setattr(inode, attr);
352	return error;
353}
354
/*
 * Default inode operations installed by proc_pid_make_inode(): only
 * setattr is provided, so mode changes are rejected by proc_setattr().
 */
static const struct inode_operations proc_def_inode_operations = {
	.setattr	= proc_setattr,
};
358
/* seq_file iterator for the mounts file; defined outside this file. */
extern struct seq_operations mounts_op;

/*
 * Per-open state of the mounts file.
 *
 * 'm' must stay the first member: mounts_open() stores &p->m in
 * file->private_data and mounts_poll() reads private_data back as a
 * struct proc_mounts pointer.
 */
struct proc_mounts {
	struct seq_file m;
	int event;	/* namespace event count last seen by this open file */
};
364
365static int mounts_open(struct inode *inode, struct file *file)
366{
367	struct task_struct *task = get_proc_task(inode);
368	struct mnt_namespace *ns = NULL;
369	struct proc_mounts *p;
370	int ret = -EINVAL;
371
372	if (task) {
373		task_lock(task);
374		if (task->nsproxy) {
375			ns = task->nsproxy->mnt_ns;
376			if (ns)
377				get_mnt_ns(ns);
378		}
379		task_unlock(task);
380		put_task_struct(task);
381	}
382
383	if (ns) {
384		ret = -ENOMEM;
385		p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL);
386		if (p) {
387			file->private_data = &p->m;
388			ret = seq_open(file, &mounts_op);
389			if (!ret) {
390				p->m.private = ns;
391				p->event = ns->event;
392				return 0;
393			}
394			kfree(p);
395		}
396		put_mnt_ns(ns);
397	}
398	return ret;
399}
400
401static int mounts_release(struct inode *inode, struct file *file)
402{
403	struct seq_file *m = file->private_data;
404	struct mnt_namespace *ns = m->private;
405	put_mnt_ns(ns);
406	return seq_release(inode, file);
407}
408
409static unsigned mounts_poll(struct file *file, poll_table *wait)
410{
411	struct proc_mounts *p = file->private_data;
412	struct mnt_namespace *ns = p->m.private;
413	unsigned res = 0;
414
415	poll_wait(file, &ns->poll, wait);
416
417	spin_lock(&vfsmount_lock);
418	if (p->event != ns->event) {
419		p->event = ns->event;
420		res = POLLERR;
421	}
422	spin_unlock(&vfsmount_lock);
423
424	return res;
425}
426
/*
 * File operations of the mounts file: a seq_file with custom open and
 * release (mount namespace reference handling) plus poll support.
 */
static const struct file_operations proc_mounts_operations = {
	.open		= mounts_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= mounts_release,
	.poll		= mounts_poll,
};
434
435extern struct seq_operations mountstats_op;
436static int mountstats_open(struct inode *inode, struct file *file)
437{
438	int ret = seq_open(file, &mountstats_op);
439
440	if (!ret) {
441		struct seq_file *m = file->private_data;
442		struct mnt_namespace *mnt_ns = NULL;
443		struct task_struct *task = get_proc_task(inode);
444
445		if (task) {
446			task_lock(task);
447			if (task->nsproxy)
448				mnt_ns = task->nsproxy->mnt_ns;
449			if (mnt_ns)
450				get_mnt_ns(mnt_ns);
451			task_unlock(task);
452			put_task_struct(task);
453		}
454
455		if (mnt_ns)
456			m->private = mnt_ns;
457		else {
458			seq_release(inode, file);
459			ret = -EINVAL;
460		}
461	}
462	return ret;
463}
464
/*
 * File operations of the mountstats file.  Release is shared with the
 * mounts file: both keep a mount namespace reference in the seq_file's
 * private pointer.
 */
static const struct file_operations proc_mountstats_operations = {
	.open		= mountstats_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= mounts_release,
};
471
472#define PROC_BLOCK_SIZE	(3*1024)		/* 4K page size but our output routines use some slack for overruns */
473
474static ssize_t proc_info_read(struct file * file, char __user * buf,
475			  size_t count, loff_t *ppos)
476{
477	struct inode * inode = file->f_path.dentry->d_inode;
478	unsigned long page;
479	ssize_t length;
480	struct task_struct *task = get_proc_task(inode);
481
482	length = -ESRCH;
483	if (!task)
484		goto out_no_task;
485
486	if (count > PROC_BLOCK_SIZE)
487		count = PROC_BLOCK_SIZE;
488
489	length = -ENOMEM;
490	if (!(page = __get_free_page(GFP_KERNEL)))
491		goto out;
492
493	length = PROC_I(inode)->op.proc_read(task, (char*)page);
494
495	if (length >= 0)
496		length = simple_read_from_buffer(buf, count, ppos, (char *)page, length);
497	free_page(page);
498out:
499	put_task_struct(task);
500out_no_task:
501	return length;
502}
503
/* Read-only file operations shared by every INF() entry. */
static const struct file_operations proc_info_file_operations = {
	.read		= proc_info_read,
};
507
508static int mem_open(struct inode* inode, struct file* file)
509{
510	file->private_data = (void*)((long)current->self_exec_id);
511	return 0;
512}
513
/*
 * read() for the mem file: copy @count bytes of the target task's
 * address space, starting at address *@ppos, to the user buffer @buf.
 *
 * Returns the number of bytes copied, or:
 *   -ESRCH  the task is gone, or the caller fails the MAY_PTRACE() /
 *           ptrace_may_attach() checks before the copy starts;
 *   -ENOMEM no bounce page could be allocated;
 *   0       the task has no mm;
 *   -EIO    the reader's self_exec_id differs from the one recorded by
 *           mem_open(), or nothing could be read at all;
 *   -EFAULT copying to @buf failed.
 * *@ppos is advanced past the bytes that were copied.
 */
static ssize_t mem_read(struct file * file, char __user * buf,
			size_t count, loff_t *ppos)
{
	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
	char *page;
	unsigned long src = *ppos;
	int ret = -ESRCH;
	struct mm_struct *mm;

	if (!task)
		goto out_no_task;

	/* ret is still -ESRCH here: a refused reader sees "no such process" */
	if (!MAY_PTRACE(task) || !ptrace_may_attach(task))
		goto out;

	ret = -ENOMEM;
	/* bounce page between the target's memory and the user buffer */
	page = (char *)__get_free_page(GFP_USER);
	if (!page)
		goto out;

	ret = 0;

	mm = get_task_mm(task);
	if (!mm)
		goto out_free;

	ret = -EIO;

	/* the reader's exec id must match the one saved at open time */
	if (file->private_data != (void*)((long)current->self_exec_id))
		goto out_put;

	ret = 0;

	while (count > 0) {
		int this_len, retval;

		/* at most one page per iteration */
		this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
		retval = access_process_vm(task, src, page, this_len, 0);
		/* permissions are re-checked after every chunk */
		if (!retval || !MAY_PTRACE(task) || !ptrace_may_attach(task)) {
			/* report -EIO only if nothing was copied so far */
			if (!ret)
				ret = -EIO;
			break;
		}

		if (copy_to_user(buf, page, retval)) {
			ret = -EFAULT;
			break;
		}

		ret += retval;
		src += retval;
		buf += retval;
		count -= retval;
	}
	*ppos = src;

out_put:
	mmput(mm);
out_free:
	free_page((unsigned long) page);
out:
	put_task_struct(task);
out_no_task:
	return ret;
}
579
/*
 * Writing to the mem file is disabled: mem_write is defined as NULL, so
 * the implementation below is never compiled and proc_mem_operations
 * gets a NULL .write.
 */
#define mem_write NULL

#ifndef mem_write
/* This is a security hazard */
/*
 * Disabled counterpart of mem_read(): copies @count bytes from the user
 * buffer @buf into the target task's address space at address *@ppos,
 * one page at a time.  Would return the number of bytes written or a
 * negative errno (-ESRCH, -ENOMEM, -EFAULT, -EIO).
 */
static ssize_t mem_write(struct file * file, const char __user *buf,
			 size_t count, loff_t *ppos)
{
	int copied;
	char *page;
	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
	unsigned long dst = *ppos;

	copied = -ESRCH;
	if (!task)
		goto out_no_task;

	if (!MAY_PTRACE(task) || !ptrace_may_attach(task))
		goto out;

	copied = -ENOMEM;
	page = (char *)__get_free_page(GFP_USER);
	if (!page)
		goto out;

	copied = 0;
	while (count > 0) {
		int this_len, retval;

		this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
		if (copy_from_user(page, buf, this_len)) {
			copied = -EFAULT;
			break;
		}
		retval = access_process_vm(task, dst, page, this_len, 1);
		if (!retval) {
			/* report -EIO only if nothing was written so far */
			if (!copied)
				copied = -EIO;
			break;
		}
		copied += retval;
		buf += retval;
		dst += retval;
		count -= retval;
	}
	*ppos = dst;
	free_page((unsigned long) page);
out:
	put_task_struct(task);
out_no_task:
	return copied;
}
#endif
632
633static loff_t mem_lseek(struct file * file, loff_t offset, int orig)
634{
635	switch (orig) {
636	case 0:
637		file->f_pos = offset;
638		break;
639	case 1:
640		file->f_pos += offset;
641		break;
642	default:
643		return -EINVAL;
644	}
645	force_successful_syscall_return();
646	return file->f_pos;
647}
648
/*
 * File operations of the mem file.  .write is NULL because mem_write is
 * #defined to NULL above.
 */
static const struct file_operations proc_mem_operations = {
	.llseek		= mem_lseek,
	.read		= mem_read,
	.write		= mem_write,
	.open		= mem_open,
};
655
656static ssize_t oom_adjust_read(struct file *file, char __user *buf,
657				size_t count, loff_t *ppos)
658{
659	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
660	char buffer[PROC_NUMBUF];
661	size_t len;
662	int oom_adjust;
663
664	if (!task)
665		return -ESRCH;
666	oom_adjust = task->oomkilladj;
667	put_task_struct(task);
668
669	len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust);
670
671	return simple_read_from_buffer(buf, count, ppos, buffer, len);
672}
673
674static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
675				size_t count, loff_t *ppos)
676{
677	struct task_struct *task;
678	char buffer[PROC_NUMBUF], *end;
679	int oom_adjust;
680
681	memset(buffer, 0, sizeof(buffer));
682	if (count > sizeof(buffer) - 1)
683		count = sizeof(buffer) - 1;
684	if (copy_from_user(buffer, buf, count))
685		return -EFAULT;
686	oom_adjust = simple_strtol(buffer, &end, 0);
687	if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) &&
688	     oom_adjust != OOM_DISABLE)
689		return -EINVAL;
690	if (*end == '\n')
691		end++;
692	task = get_proc_task(file->f_path.dentry->d_inode);
693	if (!task)
694		return -ESRCH;
695	if (oom_adjust < task->oomkilladj && !capable(CAP_SYS_RESOURCE)) {
696		put_task_struct(task);
697		return -EACCES;
698	}
699	task->oomkilladj = oom_adjust;
700	put_task_struct(task);
701	if (end - buffer == 0)
702		return -EIO;
703	return end - buffer;
704}
705
/* File operations of the OOM adjustment file (readable and writable). */
static const struct file_operations proc_oom_adjust_operations = {
	.read		= oom_adjust_read,
	.write		= oom_adjust_write,
};
710
711#ifdef CONFIG_MMU
712static ssize_t clear_refs_write(struct file *file, const char __user *buf,
713				size_t count, loff_t *ppos)
714{
715	struct task_struct *task;
716	char buffer[PROC_NUMBUF], *end;
717	struct mm_struct *mm;
718
719	memset(buffer, 0, sizeof(buffer));
720	if (count > sizeof(buffer) - 1)
721		count = sizeof(buffer) - 1;
722	if (copy_from_user(buffer, buf, count))
723		return -EFAULT;
724	if (!simple_strtol(buffer, &end, 0))
725		return -EINVAL;
726	if (*end == '\n')
727		end++;
728	task = get_proc_task(file->f_path.dentry->d_inode);
729	if (!task)
730		return -ESRCH;
731	mm = get_task_mm(task);
732	if (mm) {
733		clear_refs_smap(mm);
734		mmput(mm);
735	}
736	put_task_struct(task);
737	if (end - buffer == 0)
738		return -EIO;
739	return end - buffer;
740}
741
742static struct file_operations proc_clear_refs_operations = {
743	.write		= clear_refs_write,
744};
745#endif
746
747#ifdef CONFIG_AUDITSYSCALL
748#define TMPBUFLEN 21
749static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
750				  size_t count, loff_t *ppos)
751{
752	struct inode * inode = file->f_path.dentry->d_inode;
753	struct task_struct *task = get_proc_task(inode);
754	ssize_t length;
755	char tmpbuf[TMPBUFLEN];
756
757	if (!task)
758		return -ESRCH;
759	length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
760				audit_get_loginuid(task->audit_context));
761	put_task_struct(task);
762	return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
763}
764
765static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
766				   size_t count, loff_t *ppos)
767{
768	struct inode * inode = file->f_path.dentry->d_inode;
769	char *page, *tmp;
770	ssize_t length;
771	uid_t loginuid;
772
773	if (!capable(CAP_AUDIT_CONTROL))
774		return -EPERM;
775
776	if (current != pid_task(proc_pid(inode), PIDTYPE_PID))
777		return -EPERM;
778
779	if (count >= PAGE_SIZE)
780		count = PAGE_SIZE - 1;
781
782	if (*ppos != 0) {
783		/* No partial writes. */
784		return -EINVAL;
785	}
786	page = (char*)__get_free_page(GFP_USER);
787	if (!page)
788		return -ENOMEM;
789	length = -EFAULT;
790	if (copy_from_user(page, buf, count))
791		goto out_free_page;
792
793	page[count] = '\0';
794	loginuid = simple_strtoul(page, &tmp, 10);
795	if (tmp == page) {
796		length = -EINVAL;
797		goto out_free_page;
798
799	}
800	length = audit_set_loginuid(current, loginuid);
801	if (likely(length == 0))
802		length = count;
803
804out_free_page:
805	free_page((unsigned long) page);
806	return length;
807}
808
/* File operations of the loginuid file (readable and writable). */
static const struct file_operations proc_loginuid_operations = {
	.read		= proc_loginuid_read,
	.write		= proc_loginuid_write,
};
813#endif
814
815#ifdef CONFIG_SECCOMP
816static ssize_t seccomp_read(struct file *file, char __user *buf,
817			    size_t count, loff_t *ppos)
818{
819	struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode);
820	char __buf[20];
821	size_t len;
822
823	if (!tsk)
824		return -ESRCH;
825	/* no need to print the trailing zero, so use only len */
826	len = sprintf(__buf, "%u\n", tsk->seccomp.mode);
827	put_task_struct(tsk);
828
829	return simple_read_from_buffer(buf, count, ppos, __buf, len);
830}
831
832static ssize_t seccomp_write(struct file *file, const char __user *buf,
833			     size_t count, loff_t *ppos)
834{
835	struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode);
836	char __buf[20], *end;
837	unsigned int seccomp_mode;
838	ssize_t result;
839
840	result = -ESRCH;
841	if (!tsk)
842		goto out_no_task;
843
844	/* can set it only once to be even more secure */
845	result = -EPERM;
846	if (unlikely(tsk->seccomp.mode))
847		goto out;
848
849	result = -EFAULT;
850	memset(__buf, 0, sizeof(__buf));
851	count = min(count, sizeof(__buf) - 1);
852	if (copy_from_user(__buf, buf, count))
853		goto out;
854
855	seccomp_mode = simple_strtoul(__buf, &end, 0);
856	if (*end == '\n')
857		end++;
858	result = -EINVAL;
859	if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
860		tsk->seccomp.mode = seccomp_mode;
861		set_tsk_thread_flag(tsk, TIF_SECCOMP);
862	} else
863		goto out;
864	result = -EIO;
865	if (unlikely(!(end - __buf)))
866		goto out;
867	result = end - __buf;
868out:
869	put_task_struct(tsk);
870out_no_task:
871	return result;
872}
873
/* File operations of the seccomp file (readable and writable). */
static const struct file_operations proc_seccomp_operations = {
	.read		= seccomp_read,
	.write		= seccomp_write,
};
878#endif /* CONFIG_SECCOMP */
879
880#ifdef CONFIG_FAULT_INJECTION
881static ssize_t proc_fault_inject_read(struct file * file, char __user * buf,
882				      size_t count, loff_t *ppos)
883{
884	struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
885	char buffer[PROC_NUMBUF];
886	size_t len;
887	int make_it_fail;
888
889	if (!task)
890		return -ESRCH;
891	make_it_fail = task->make_it_fail;
892	put_task_struct(task);
893
894	len = snprintf(buffer, sizeof(buffer), "%i\n", make_it_fail);
895
896	return simple_read_from_buffer(buf, count, ppos, buffer, len);
897}
898
899static ssize_t proc_fault_inject_write(struct file * file,
900			const char __user * buf, size_t count, loff_t *ppos)
901{
902	struct task_struct *task;
903	char buffer[PROC_NUMBUF], *end;
904	int make_it_fail;
905
906	if (!capable(CAP_SYS_RESOURCE))
907		return -EPERM;
908	memset(buffer, 0, sizeof(buffer));
909	if (count > sizeof(buffer) - 1)
910		count = sizeof(buffer) - 1;
911	if (copy_from_user(buffer, buf, count))
912		return -EFAULT;
913	make_it_fail = simple_strtol(buffer, &end, 0);
914	if (*end == '\n')
915		end++;
916	task = get_proc_task(file->f_dentry->d_inode);
917	if (!task)
918		return -ESRCH;
919	task->make_it_fail = make_it_fail;
920	put_task_struct(task);
921	if (end - buffer == 0)
922		return -EIO;
923	return end - buffer;
924}
925
/* File operations of the fault injection file (readable and writable). */
static const struct file_operations proc_fault_inject_operations = {
	.read		= proc_fault_inject_read,
	.write		= proc_fault_inject_write,
};
930#endif
931
932#ifdef CONFIG_SCHED_DEBUG
933/*
934 * Print out various scheduling related per-task fields:
935 */
936static int sched_show(struct seq_file *m, void *v)
937{
938	struct inode *inode = m->private;
939	struct task_struct *p;
940
941	WARN_ON(!inode);
942
943	p = get_proc_task(inode);
944	if (!p)
945		return -ESRCH;
946	proc_sched_show_task(p, m);
947
948	put_task_struct(p);
949
950	return 0;
951}
952
953static ssize_t
954sched_write(struct file *file, const char __user *buf,
955	    size_t count, loff_t *offset)
956{
957	struct inode *inode = file->f_path.dentry->d_inode;
958	struct task_struct *p;
959
960	WARN_ON(!inode);
961
962	p = get_proc_task(inode);
963	if (!p)
964		return -ESRCH;
965	proc_sched_set_task(p);
966
967	put_task_struct(p);
968
969	return count;
970}
971
972static int sched_open(struct inode *inode, struct file *filp)
973{
974	int ret;
975
976	ret = single_open(filp, sched_show, NULL);
977	if (!ret) {
978		struct seq_file *m = filp->private_data;
979
980		m->private = inode;
981	}
982	return ret;
983}
984
985static const struct file_operations proc_pid_sched_operations = {
986	.open		= sched_open,
987	.read		= seq_read,
988	.write		= sched_write,
989	.llseek		= seq_lseek,
990	.release	= seq_release,
991};
992
993#endif
994
995static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
996{
997	struct inode *inode = dentry->d_inode;
998	int error = -EACCES;
999
1000	/* We don't need a base pointer in the /proc filesystem */
1001	path_release(nd);
1002
1003	/* Are we allowed to snoop on the tasks file descriptors? */
1004	if (!proc_fd_access_allowed(inode))
1005		goto out;
1006
1007	error = PROC_I(inode)->op.proc_get_link(inode, &nd->dentry, &nd->mnt);
1008	nd->last_type = LAST_BIND;
1009out:
1010	return ERR_PTR(error);
1011}
1012
1013static int do_proc_readlink(struct dentry *dentry, struct vfsmount *mnt,
1014			    char __user *buffer, int buflen)
1015{
1016	struct inode * inode;
1017	char *tmp = (char*)__get_free_page(GFP_KERNEL), *path;
1018	int len;
1019
1020	if (!tmp)
1021		return -ENOMEM;
1022
1023	inode = dentry->d_inode;
1024	path = d_path(dentry, mnt, tmp, PAGE_SIZE);
1025	len = PTR_ERR(path);
1026	if (IS_ERR(path))
1027		goto out;
1028	len = tmp + PAGE_SIZE - 1 - path;
1029
1030	if (len > buflen)
1031		len = buflen;
1032	if (copy_to_user(buffer, path, len))
1033		len = -EFAULT;
1034 out:
1035	free_page((unsigned long)tmp);
1036	return len;
1037}
1038
1039static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen)
1040{
1041	int error = -EACCES;
1042	struct inode *inode = dentry->d_inode;
1043	struct dentry *de;
1044	struct vfsmount *mnt = NULL;
1045
1046	/* Are we allowed to snoop on the tasks file descriptors? */
1047	if (!proc_fd_access_allowed(inode))
1048		goto out;
1049
1050	error = PROC_I(inode)->op.proc_get_link(inode, &de, &mnt);
1051	if (error)
1052		goto out;
1053
1054	error = do_proc_readlink(de, mnt, buffer, buflen);
1055	dput(de);
1056	mntput(mnt);
1057out:
1058	return error;
1059}
1060
/* Inode operations shared by every LNK() entry. */
static const struct inode_operations proc_pid_link_inode_operations = {
	.readlink	= proc_pid_readlink,
	.follow_link	= proc_pid_follow_link,
	.setattr	= proc_setattr,
};
1066
1067
1068/* building an inode */
1069
1070static int task_dumpable(struct task_struct *task)
1071{
1072	int dumpable = 0;
1073	struct mm_struct *mm;
1074
1075	task_lock(task);
1076	mm = task->mm;
1077	if (mm)
1078		dumpable = mm->dumpable;
1079	task_unlock(task);
1080	if(dumpable == 1)
1081		return 1;
1082	return 0;
1083}
1084
1085
1086static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task)
1087{
1088	struct inode * inode;
1089	struct proc_inode *ei;
1090
1091	/* We need a new inode */
1092
1093	inode = new_inode(sb);
1094	if (!inode)
1095		goto out;
1096
1097	/* Common stuff */
1098	ei = PROC_I(inode);
1099	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
1100	inode->i_op = &proc_def_inode_operations;
1101
1102	/*
1103	 * grab the reference to task.
1104	 */
1105	ei->pid = get_task_pid(task, PIDTYPE_PID);
1106	if (!ei->pid)
1107		goto out_unlock;
1108
1109	inode->i_uid = 0;
1110	inode->i_gid = 0;
1111	if (task_dumpable(task)) {
1112		inode->i_uid = task->euid;
1113		inode->i_gid = task->egid;
1114	}
1115	security_task_to_inode(task, inode);
1116
1117out:
1118	return inode;
1119
1120out_unlock:
1121	iput(inode);
1122	return NULL;
1123}
1124
1125static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
1126{
1127	struct inode *inode = dentry->d_inode;
1128	struct task_struct *task;
1129	generic_fillattr(inode, stat);
1130
1131	rcu_read_lock();
1132	stat->uid = 0;
1133	stat->gid = 0;
1134	task = pid_task(proc_pid(inode), PIDTYPE_PID);
1135	if (task) {
1136		if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
1137		    task_dumpable(task)) {
1138			stat->uid = task->euid;
1139			stat->gid = task->egid;
1140		}
1141	}
1142	rcu_read_unlock();
1143	return 0;
1144}
1145
1146/* dentry stuff */
1147
1148/*
1149 *	Exceptional case: normally we are not allowed to unhash a busy
1150 * directory. In this case, however, we can do it - no aliasing problems
1151 * due to the way we treat inodes.
1152 *
1153 * Rewrite the inode's ownerships here because the owning task may have
1154 * performed a setuid(), etc.
1155 *
1156 * Before the /proc/pid/status file was created the only way to read
1157 * the effective uid of a /process was to stat /proc/pid.  Reading
1158 * /proc/pid/status is slow enough that procps and other packages
1159 * kept stating /proc/pid.  To keep the rules in /proc simple I have
1160 * made this apply to all per process world readable and executable
1161 * directories.
1162 */
1163static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
1164{
1165	struct inode *inode = dentry->d_inode;
1166	struct task_struct *task = get_proc_task(inode);
1167	if (task) {
1168		if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
1169		    task_dumpable(task)) {
1170			inode->i_uid = task->euid;
1171			inode->i_gid = task->egid;
1172		} else {
1173			inode->i_uid = 0;
1174			inode->i_gid = 0;
1175		}
1176		inode->i_mode &= ~(S_ISUID | S_ISGID);
1177		security_task_to_inode(task, inode);
1178		put_task_struct(task);
1179		return 1;
1180	}
1181	d_drop(dentry);
1182	return 0;
1183}
1184
1185static int pid_delete_dentry(struct dentry * dentry)
1186{
1187	/* Is the task we represent dead?
1188	 * If so, then don't put the dentry on the lru list,
1189	 * kill it immediately.
1190	 */
1191	return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first;
1192}
1193
/*
 * Dentry operations for per-process entries: ownership is refreshed on
 * revalidation, and dentries of dead tasks are not kept around.
 */
static struct dentry_operations pid_dentry_operations =
{
	.d_revalidate	= pid_revalidate,
	.d_delete	= pid_delete_dentry,
};
1199
1200/* Lookups */
1201
/*
 * Callback type used by proc_fill_cache() to attach an inode to a newly
 * allocated dentry.  Arguments: the directory inode, the new dentry, the
 * task, and an opaque per-entry pointer.  As proc_fill_cache() uses it,
 * a NULL return means the passed dentry is to be used, while a non-NULL
 * return (possibly an ERR_PTR) replaces it.
 */
typedef struct dentry *instantiate_t(struct inode *, struct dentry *,
				struct task_struct *, const void *);
1204
1205/*
1206 * Fill a directory entry.
1207 *
1208 * If possible create the dcache entry and derive our inode number and
1209 * file type from dcache entry.
1210 *
1211 * Since all of the proc inode numbers are dynamically generated, the inode
1212 * numbers do not exist until the inode is cache.  This means creating the
1213 * the dcache entry in readdir is necessary to keep the inode numbers
1214 * reported by readdir in sync with the inode numbers reported
1215 * by stat.
1216 */
1217static int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
1218	char *name, int len,
1219	instantiate_t instantiate, struct task_struct *task, const void *ptr)
1220{
1221	struct dentry *child, *dir = filp->f_path.dentry;
1222	struct inode *inode;
1223	struct qstr qname;
1224	ino_t ino = 0;
1225	unsigned type = DT_UNKNOWN;
1226
1227	qname.name = name;
1228	qname.len  = len;
1229	qname.hash = full_name_hash(name, len);
1230
1231	child = d_lookup(dir, &qname);
1232	if (!child) {
1233		struct dentry *new;
1234		new = d_alloc(dir, &qname);
1235		if (new) {
1236			child = instantiate(dir->d_inode, new, task, ptr);
1237			if (child)
1238				dput(new);
1239			else
1240				child = new;
1241		}
1242	}
1243	if (!child || IS_ERR(child) || !child->d_inode)
1244		goto end_instantiate;
1245	inode = child->d_inode;
1246	if (inode) {
1247		ino = inode->i_ino;
1248		type = inode->i_mode >> 12;
1249	}
1250	dput(child);
1251end_instantiate:
1252	if (!ino)
1253		ino = find_inode_number(dir, &qname);
1254	if (!ino)
1255		ino = 1;
1256	return filldir(dirent, name, len, filp->f_pos, ino, type);
1257}
1258
1259static unsigned name_to_int(struct dentry *dentry)
1260{
1261	const char *name = dentry->d_name.name;
1262	int len = dentry->d_name.len;
1263	unsigned n = 0;
1264
1265	if (len > 1 && *name == '0')
1266		goto out;
1267	while (len-- > 0) {
1268		unsigned c = *name++ - '0';
1269		if (c > 9)
1270			goto out;
1271		if (n >= (~0U-9)/10)
1272			goto out;
1273		n *= 10;
1274		n += c;
1275	}
1276	return n;
1277out:
1278	return ~0U;
1279}
1280
1281#define PROC_FDINFO_MAX 64
1282
1283static int proc_fd_info(struct inode *inode, struct dentry **dentry,
1284			struct vfsmount **mnt, char *info)
1285{
1286	struct task_struct *task = get_proc_task(inode);
1287	struct files_struct *files = NULL;
1288	struct file *file;
1289	int fd = proc_fd(inode);
1290
1291	if (task) {
1292		files = get_files_struct(task);
1293		put_task_struct(task);
1294	}
1295	if (files) {
1296		/*
1297		 * We are not taking a ref to the file structure, so we must
1298		 * hold ->file_lock.
1299		 */
1300		spin_lock(&files->file_lock);
1301		file = fcheck_files(files, fd);
1302		if (file) {
1303			if (mnt)
1304				*mnt = mntget(file->f_path.mnt);
1305			if (dentry)
1306				*dentry = dget(file->f_path.dentry);
1307			if (info)
1308				snprintf(info, PROC_FDINFO_MAX,
1309					 "pos:\t%lli\n"
1310					 "flags:\t0%o\n",
1311					 (long long) file->f_pos,
1312					 file->f_flags);
1313			spin_unlock(&files->file_lock);
1314			put_files_struct(files);
1315			return 0;
1316		}
1317		spin_unlock(&files->file_lock);
1318		put_files_struct(files);
1319	}
1320	return -ENOENT;
1321}
1322
/*
 * Link resolver for /proc/<pid>/fd/<n>: fetch the dentry and vfsmount
 * of the open file, without the textual info.
 */
static int proc_fd_link(struct inode *inode, struct dentry **dentry,
			struct vfsmount **mnt)
{
	return proc_fd_info(inode, dentry, mnt, NULL);
}
1328
1329static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
1330{
1331	struct inode *inode = dentry->d_inode;
1332	struct task_struct *task = get_proc_task(inode);
1333	int fd = proc_fd(inode);
1334	struct files_struct *files;
1335
1336	if (task) {
1337		files = get_files_struct(task);
1338		if (files) {
1339			rcu_read_lock();
1340			if (fcheck_files(files, fd)) {
1341				rcu_read_unlock();
1342				put_files_struct(files);
1343				if (task_dumpable(task)) {
1344					inode->i_uid = task->euid;
1345					inode->i_gid = task->egid;
1346				} else {
1347					inode->i_uid = 0;
1348					inode->i_gid = 0;
1349				}
1350				inode->i_mode &= ~(S_ISUID | S_ISGID);
1351				security_task_to_inode(task, inode);
1352				put_task_struct(task);
1353				return 1;
1354			}
1355			rcu_read_unlock();
1356			put_files_struct(files);
1357		}
1358		put_task_struct(task);
1359	}
1360	d_drop(dentry);
1361	return 0;
1362}
1363
/*
 * Dentry operations for fd/fdinfo entries: revalidation also checks
 * that the descriptor is still open.
 */
static struct dentry_operations tid_fd_dentry_operations =
{
	.d_revalidate	= tid_fd_revalidate,
	.d_delete	= pid_delete_dentry,
};
1369
1370static struct dentry *proc_fd_instantiate(struct inode *dir,
1371	struct dentry *dentry, struct task_struct *task, const void *ptr)
1372{
1373	unsigned fd = *(const unsigned *)ptr;
1374	struct file *file;
1375	struct files_struct *files;
1376 	struct inode *inode;
1377 	struct proc_inode *ei;
1378	struct dentry *error = ERR_PTR(-ENOENT);
1379
1380	inode = proc_pid_make_inode(dir->i_sb, task);
1381	if (!inode)
1382		goto out;
1383	ei = PROC_I(inode);
1384	ei->fd = fd;
1385	files = get_files_struct(task);
1386	if (!files)
1387		goto out_iput;
1388	inode->i_mode = S_IFLNK;
1389
1390	/*
1391	 * We are not taking a ref to the file structure, so we must
1392	 * hold ->file_lock.
1393	 */
1394	spin_lock(&files->file_lock);
1395	file = fcheck_files(files, fd);
1396	if (!file)
1397		goto out_unlock;
1398	if (file->f_mode & 1)
1399		inode->i_mode |= S_IRUSR | S_IXUSR;
1400	if (file->f_mode & 2)
1401		inode->i_mode |= S_IWUSR | S_IXUSR;
1402	spin_unlock(&files->file_lock);
1403	put_files_struct(files);
1404
1405	inode->i_op = &proc_pid_link_inode_operations;
1406	inode->i_size = 64;
1407	ei->op.proc_get_link = proc_fd_link;
1408	dentry->d_op = &tid_fd_dentry_operations;
1409	d_add(dentry, inode);
1410	/* Close the race of the process dying before we return the dentry */
1411	if (tid_fd_revalidate(dentry, NULL))
1412		error = NULL;
1413
1414 out:
1415	return error;
1416out_unlock:
1417	spin_unlock(&files->file_lock);
1418	put_files_struct(files);
1419out_iput:
1420	iput(inode);
1421	goto out;
1422}
1423
1424static struct dentry *proc_lookupfd_common(struct inode *dir,
1425					   struct dentry *dentry,
1426					   instantiate_t instantiate)
1427{
1428	struct task_struct *task = get_proc_task(dir);
1429	unsigned fd = name_to_int(dentry);
1430	struct dentry *result = ERR_PTR(-ENOENT);
1431
1432	if (!task)
1433		goto out_no_task;
1434	if (fd == ~0U)
1435		goto out;
1436
1437	result = instantiate(dir, dentry, task, &fd);
1438out:
1439	put_task_struct(task);
1440out_no_task:
1441	return result;
1442}
1443
1444static int proc_readfd_common(struct file * filp, void * dirent,
1445			      filldir_t filldir, instantiate_t instantiate)
1446{
1447	struct dentry *dentry = filp->f_path.dentry;
1448	struct inode *inode = dentry->d_inode;
1449	struct task_struct *p = get_proc_task(inode);
1450	unsigned int fd, tid, ino;
1451	int retval;
1452	struct files_struct * files;
1453	struct fdtable *fdt;
1454
1455	retval = -ENOENT;
1456	if (!p)
1457		goto out_no_task;
1458	retval = 0;
1459	tid = p->pid;
1460
1461	fd = filp->f_pos;
1462	switch (fd) {
1463		case 0:
1464			if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
1465				goto out;
1466			filp->f_pos++;
1467		case 1:
1468			ino = parent_ino(dentry);
1469			if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
1470				goto out;
1471			filp->f_pos++;
1472		default:
1473			files = get_files_struct(p);
1474			if (!files)
1475				goto out;
1476			rcu_read_lock();
1477			fdt = files_fdtable(files);
1478			for (fd = filp->f_pos-2;
1479			     fd < fdt->max_fds;
1480			     fd++, filp->f_pos++) {
1481				char name[PROC_NUMBUF];
1482				int len;
1483
1484				if (!fcheck_files(files, fd))
1485					continue;
1486				rcu_read_unlock();
1487
1488				len = snprintf(name, sizeof(name), "%d", fd);
1489				if (proc_fill_cache(filp, dirent, filldir,
1490						    name, len, instantiate,
1491						    p, &fd) < 0) {
1492					rcu_read_lock();
1493					break;
1494				}
1495				rcu_read_lock();
1496			}
1497			rcu_read_unlock();
1498			put_files_struct(files);
1499	}
1500out:
1501	put_task_struct(p);
1502out_no_task:
1503	return retval;
1504}
1505
/* ->lookup for /proc/<pid>/fd: entries are built by proc_fd_instantiate(). */
static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry,
				    struct nameidata *nd)
{
	return proc_lookupfd_common(dir, dentry, proc_fd_instantiate);
}
1511
/* ->readdir for /proc/<pid>/fd: entries are built by proc_fd_instantiate(). */
static int proc_readfd(struct file *filp, void *dirent, filldir_t filldir)
{
	return proc_readfd_common(filp, dirent, filldir, proc_fd_instantiate);
}
1516
1517static ssize_t proc_fdinfo_read(struct file *file, char __user *buf,
1518				      size_t len, loff_t *ppos)
1519{
1520	char tmp[PROC_FDINFO_MAX];
1521	int err = proc_fd_info(file->f_path.dentry->d_inode, NULL, NULL, tmp);
1522	if (!err)
1523		err = simple_read_from_buffer(buf, len, ppos, tmp, strlen(tmp));
1524	return err;
1525}
1526
/* File operations of a single /proc/<pid>/fdinfo/<n> file. */
static const struct file_operations proc_fdinfo_file_operations = {
	.open		= nonseekable_open,
	.read		= proc_fdinfo_read,
};
1531
/* File operations of the /proc/<pid>/fd directory. */
static const struct file_operations proc_fd_operations = {
	.read		= generic_read_dir,
	.readdir	= proc_readfd,
};
1536
1537/*
1538 * /proc/pid/fd needs a special permission handler so that a process can still
1539 * access /proc/self/fd after it has executed a setuid().
1540 */
1541static int proc_fd_permission(struct inode *inode, int mask,
1542				struct nameidata *nd)
1543{
1544	int rv;
1545
1546	rv = generic_permission(inode, mask, NULL);
1547	if (rv == 0)
1548		return 0;
1549	if (task_pid(current) == proc_pid(inode))
1550		rv = 0;
1551	return rv;
1552}
1553
/*
 * proc directories can do almost nothing..
 *
 * Inode operations of the /proc/<pid>/fd directory.
 */
static const struct inode_operations proc_fd_inode_operations = {
	.lookup		= proc_lookupfd,
	.permission	= proc_fd_permission,
	.setattr	= proc_setattr,
};
1562
1563static struct dentry *proc_fdinfo_instantiate(struct inode *dir,
1564	struct dentry *dentry, struct task_struct *task, const void *ptr)
1565{
1566	unsigned fd = *(unsigned *)ptr;
1567 	struct inode *inode;
1568 	struct proc_inode *ei;
1569	struct dentry *error = ERR_PTR(-ENOENT);
1570
1571	inode = proc_pid_make_inode(dir->i_sb, task);
1572	if (!inode)
1573		goto out;
1574	ei = PROC_I(inode);
1575	ei->fd = fd;
1576	inode->i_mode = S_IFREG | S_IRUSR;
1577	inode->i_fop = &proc_fdinfo_file_operations;
1578	dentry->d_op = &tid_fd_dentry_operations;
1579	d_add(dentry, inode);
1580	/* Close the race of the process dying before we return the dentry */
1581	if (tid_fd_revalidate(dentry, NULL))
1582		error = NULL;
1583
1584 out:
1585	return error;
1586}
1587
/* ->lookup for /proc/<pid>/fdinfo: entries are built by proc_fdinfo_instantiate(). */
static struct dentry *proc_lookupfdinfo(struct inode *dir,
					struct dentry *dentry,
					struct nameidata *nd)
{
	return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate);
}
1594
/* ->readdir for /proc/<pid>/fdinfo: entries are built by proc_fdinfo_instantiate(). */
static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir)
{
	return proc_readfd_common(filp, dirent, filldir,
				  proc_fdinfo_instantiate);
}
1600
/* File operations of the /proc/<pid>/fdinfo directory. */
static const struct file_operations proc_fdinfo_operations = {
	.read		= generic_read_dir,
	.readdir	= proc_readfdinfo,
};
1605
/*
 * proc directories can do almost nothing..
 *
 * Inode operations of the /proc/<pid>/fdinfo directory.
 */
static const struct inode_operations proc_fdinfo_inode_operations = {
	.lookup		= proc_lookupfdinfo,
	.setattr	= proc_setattr,
};
1613
1614
1615static struct dentry *proc_pident_instantiate(struct inode *dir,
1616	struct dentry *dentry, struct task_struct *task, const void *ptr)
1617{
1618	const struct pid_entry *p = ptr;
1619	struct inode *inode;
1620	struct proc_inode *ei;
1621	struct dentry *error = ERR_PTR(-EINVAL);
1622
1623	inode = proc_pid_make_inode(dir->i_sb, task);
1624	if (!inode)
1625		goto out;
1626
1627	ei = PROC_I(inode);
1628	inode->i_mode = p->mode;
1629	if (S_ISDIR(inode->i_mode))
1630		inode->i_nlink = 2;	/* Use getattr to fix if necessary */
1631	if (p->iop)
1632		inode->i_op = p->iop;
1633	if (p->fop)
1634		inode->i_fop = p->fop;
1635	ei->op = p->op;
1636	dentry->d_op = &pid_dentry_operations;
1637	d_add(dentry, inode);
1638	/* Close the race of the process dying before we return the dentry */
1639	if (pid_revalidate(dentry, NULL))
1640		error = NULL;
1641out:
1642	return error;
1643}
1644
1645static struct dentry *proc_pident_lookup(struct inode *dir,
1646					 struct dentry *dentry,
1647					 const struct pid_entry *ents,
1648					 unsigned int nents)
1649{
1650	struct inode *inode;
1651	struct dentry *error;
1652	struct task_struct *task = get_proc_task(dir);
1653	const struct pid_entry *p, *last;
1654
1655	error = ERR_PTR(-ENOENT);
1656	inode = NULL;
1657
1658	if (!task)
1659		goto out_no_task;
1660
1661	/*
1662	 * Yes, it does not scale. And it should not. Don't add
1663	 * new entries into /proc/<tgid>/ without very good reasons.
1664	 */
1665	last = &ents[nents - 1];
1666	for (p = ents; p <= last; p++) {
1667		if (p->len != dentry->d_name.len)
1668			continue;
1669		if (!memcmp(dentry->d_name.name, p->name, p->len))
1670			break;
1671	}
1672	if (p > last)
1673		goto out;
1674
1675	error = proc_pident_instantiate(dir, dentry, task, p);
1676out:
1677	put_task_struct(task);
1678out_no_task:
1679	return error;
1680}
1681
/* Emit the directory entry for table entry @p via proc_fill_cache(). */
static int proc_pident_fill_cache(struct file *filp, void *dirent,
	filldir_t filldir, struct task_struct *task, const struct pid_entry *p)
{
	return proc_fill_cache(filp, dirent, filldir, p->name, p->len,
				proc_pident_instantiate, task, p);
}
1688
1689static int proc_pident_readdir(struct file *filp,
1690		void *dirent, filldir_t filldir,
1691		const struct pid_entry *ents, unsigned int nents)
1692{
1693	int i;
1694	int pid;
1695	struct dentry *dentry = filp->f_path.dentry;
1696	struct inode *inode = dentry->d_inode;
1697	struct task_struct *task = get_proc_task(inode);
1698	const struct pid_entry *p, *last;
1699	ino_t ino;
1700	int ret;
1701
1702	ret = -ENOENT;
1703	if (!task)
1704		goto out_no_task;
1705
1706	ret = 0;
1707	pid = task->pid;
1708	i = filp->f_pos;
1709	switch (i) {
1710	case 0:
1711		ino = inode->i_ino;
1712		if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
1713			goto out;
1714		i++;
1715		filp->f_pos++;
1716		/* fall through */
1717	case 1:
1718		ino = parent_ino(dentry);
1719		if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
1720			goto out;
1721		i++;
1722		filp->f_pos++;
1723		/* fall through */
1724	default:
1725		i -= 2;
1726		if (i >= nents) {
1727			ret = 1;
1728			goto out;
1729		}
1730		p = ents + i;
1731		last = &ents[nents - 1];
1732		while (p <= last) {
1733			if (proc_pident_fill_cache(filp, dirent, filldir, task, p) < 0)
1734				goto out;
1735			filp->f_pos++;
1736			p++;
1737		}
1738	}
1739
1740	ret = 1;
1741out:
1742	put_task_struct(task);
1743out_no_task:
1744	return ret;
1745}
1746
1747#ifdef CONFIG_SECURITY
1748static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
1749				  size_t count, loff_t *ppos)
1750{
1751	struct inode * inode = file->f_path.dentry->d_inode;
1752	char *p = NULL;
1753	ssize_t length;
1754	struct task_struct *task = get_proc_task(inode);
1755
1756	if (!task)
1757		return -ESRCH;
1758
1759	length = security_getprocattr(task,
1760				      (char*)file->f_path.dentry->d_name.name,
1761				      &p);
1762	put_task_struct(task);
1763	if (length > 0)
1764		length = simple_read_from_buffer(buf, count, ppos, p, length);
1765	kfree(p);
1766	return length;
1767}
1768
1769static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
1770				   size_t count, loff_t *ppos)
1771{
1772	struct inode * inode = file->f_path.dentry->d_inode;
1773	char *page;
1774	ssize_t length;
1775	struct task_struct *task = get_proc_task(inode);
1776
1777	length = -ESRCH;
1778	if (!task)
1779		goto out_no_task;
1780	if (count > PAGE_SIZE)
1781		count = PAGE_SIZE;
1782
1783	/* No partial writes. */
1784	length = -EINVAL;
1785	if (*ppos != 0)
1786		goto out;
1787
1788	length = -ENOMEM;
1789	page = (char*)__get_free_page(GFP_USER);
1790	if (!page)
1791		goto out;
1792
1793	length = -EFAULT;
1794	if (copy_from_user(page, buf, count))
1795		goto out_free;
1796
1797	length = security_setprocattr(task,
1798				      (char*)file->f_path.dentry->d_name.name,
1799				      (void*)page, count);
1800out_free:
1801	free_page((unsigned long) page);
1802out:
1803	put_task_struct(task);
1804out_no_task:
1805	return length;
1806}
1807
/* File operations of the files in /proc/<pid>/attr. */
static const struct file_operations proc_pid_attr_operations = {
	.read		= proc_pid_attr_read,
	.write		= proc_pid_attr_write,
};
1812
/*
 * Contents of the attr directory.  The entry name doubles as the
 * attribute name passed to the security module by the read/write
 * handlers.
 */
static const struct pid_entry attr_dir_stuff[] = {
	REG("current",    S_IRUGO|S_IWUGO, pid_attr),
	REG("prev",       S_IRUGO,	   pid_attr),
	REG("exec",       S_IRUGO|S_IWUGO, pid_attr),
	REG("fscreate",   S_IRUGO|S_IWUGO, pid_attr),
	REG("keycreate",  S_IRUGO|S_IWUGO, pid_attr),
	REG("sockcreate", S_IRUGO|S_IWUGO, pid_attr),
};
1821
/* ->readdir for the attr directory, driven by attr_dir_stuff[]. */
static int proc_attr_dir_readdir(struct file * filp,
			     void * dirent, filldir_t filldir)
{
	return proc_pident_readdir(filp,dirent,filldir,
				   attr_dir_stuff,ARRAY_SIZE(attr_dir_stuff));
}
1828
/* File operations of the attr directory. */
static const struct file_operations proc_attr_dir_operations = {
	.read		= generic_read_dir,
	.readdir	= proc_attr_dir_readdir,
};
1833
/* ->lookup for the attr directory, driven by attr_dir_stuff[]. */
static struct dentry *proc_attr_dir_lookup(struct inode *dir,
				struct dentry *dentry, struct nameidata *nd)
{
	return proc_pident_lookup(dir, dentry,
				  attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff));
}
1840
/* Inode operations of the attr directory. */
static const struct inode_operations proc_attr_dir_inode_operations = {
	.lookup		= proc_attr_dir_lookup,
	.getattr	= pid_getattr,
	.setattr	= proc_setattr,
};
1846
1847#endif
1848
1849/*
1850 * /proc/self:
1851 */
1852static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
1853			      int buflen)
1854{
1855	char tmp[PROC_NUMBUF];
1856	sprintf(tmp, "%d", current->tgid);
1857	return vfs_readlink(dentry,buffer,buflen,tmp);
1858}
1859
1860static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
1861{
1862	char tmp[PROC_NUMBUF];
1863	sprintf(tmp, "%d", current->tgid);
1864	return ERR_PTR(vfs_follow_link(nd,tmp));
1865}
1866
/* Inode operations of the /proc/self symlink. */
static const struct inode_operations proc_self_inode_operations = {
	.readlink	= proc_self_readlink,
	.follow_link	= proc_self_follow_link,
};
1871
/*
 * proc base
 *
 * These are the directory entries in the root directory of /proc
 * that properly belong to the /proc filesystem, as they
 * describe something that is process related.
 */
static const struct pid_entry proc_base_stuff[] = {
	NOD("self", S_IFLNK|S_IRWXUGO,
		&proc_self_inode_operations, NULL, {}),
};
1883
1884/*
1885 *	Exceptional case: normally we are not allowed to unhash a busy
1886 * directory. In this case, however, we can do it - no aliasing problems
1887 * due to the way we treat inodes.
1888 */
1889static int proc_base_revalidate(struct dentry *dentry, struct nameidata *nd)
1890{
1891	struct inode *inode = dentry->d_inode;
1892	struct task_struct *task = get_proc_task(inode);
1893	if (task) {
1894		put_task_struct(task);
1895		return 1;
1896	}
1897	d_drop(dentry);
1898	return 0;
1899}
1900
/* Dentry operations for the entries listed in proc_base_stuff[]. */
static struct dentry_operations proc_base_dentry_operations =
{
	.d_revalidate	= proc_base_revalidate,
	.d_delete	= pid_delete_dentry,
};
1906
1907static struct dentry *proc_base_instantiate(struct inode *dir,
1908	struct dentry *dentry, struct task_struct *task, const void *ptr)
1909{
1910	const struct pid_entry *p = ptr;
1911	struct inode *inode;
1912	struct proc_inode *ei;
1913	struct dentry *error = ERR_PTR(-EINVAL);
1914
1915	/* Allocate the inode */
1916	error = ERR_PTR(-ENOMEM);
1917	inode = new_inode(dir->i_sb);
1918	if (!inode)
1919		goto out;
1920
1921	/* Initialize the inode */
1922	ei = PROC_I(inode);
1923	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
1924
1925	/*
1926	 * grab the reference to the task.
1927	 */
1928	ei->pid = get_task_pid(task, PIDTYPE_PID);
1929	if (!ei->pid)
1930		goto out_iput;
1931
1932	inode->i_uid = 0;
1933	inode->i_gid = 0;
1934	inode->i_mode = p->mode;
1935	if (S_ISDIR(inode->i_mode))
1936		inode->i_nlink = 2;
1937	if (S_ISLNK(inode->i_mode))
1938		inode->i_size = 64;
1939	if (p->iop)
1940		inode->i_op = p->iop;
1941	if (p->fop)
1942		inode->i_fop = p->fop;
1943	ei->op = p->op;
1944	dentry->d_op = &proc_base_dentry_operations;
1945	d_add(dentry, inode);
1946	error = NULL;
1947out:
1948	return error;
1949out_iput:
1950	iput(inode);
1951	goto out;
1952}
1953
1954static struct dentry *proc_base_lookup(struct inode *dir, struct dentry *dentry)
1955{
1956	struct dentry *error;
1957	struct task_struct *task = get_proc_task(dir);
1958	const struct pid_entry *p, *last;
1959
1960	error = ERR_PTR(-ENOENT);
1961
1962	if (!task)
1963		goto out_no_task;
1964
1965	/* Lookup the directory entry */
1966	last = &proc_base_stuff[ARRAY_SIZE(proc_base_stuff) - 1];
1967	for (p = proc_base_stuff; p <= last; p++) {
1968		if (p->len != dentry->d_name.len)
1969			continue;
1970		if (!memcmp(dentry->d_name.name, p->name, p->len))
1971			break;
1972	}
1973	if (p > last)
1974		goto out;
1975
1976	error = proc_base_instantiate(dir, dentry, task, p);
1977
1978out:
1979	put_task_struct(task);
1980out_no_task:
1981	return error;
1982}
1983
/* Emit the directory entry for proc_base_stuff[] entry @p via proc_fill_cache(). */
static int proc_base_fill_cache(struct file *filp, void *dirent,
	filldir_t filldir, struct task_struct *task, const struct pid_entry *p)
{
	return proc_fill_cache(filp, dirent, filldir, p->name, p->len,
				proc_base_instantiate, task, p);
}
1990
1991#ifdef CONFIG_TASK_IO_ACCOUNTING
/*
 * Format the I/O accounting counters of @task into @buffer, one
 * "name: value" line per counter.  The rchar/wchar/syscr/syscw lines
 * are only present with CONFIG_TASK_XACCT.
 *
 * Returns the number of characters written (the sprintf() result).
 */
static int proc_pid_io_accounting(struct task_struct *task, char *buffer)
{
	return sprintf(buffer,
#ifdef CONFIG_TASK_XACCT
			"rchar: %llu\n"
			"wchar: %llu\n"
			"syscr: %llu\n"
			"syscw: %llu\n"
#endif
			"read_bytes: %llu\n"
			"write_bytes: %llu\n"
			"cancelled_write_bytes: %llu\n",
#ifdef CONFIG_TASK_XACCT
			(unsigned long long)task->rchar,
			(unsigned long long)task->wchar,
			(unsigned long long)task->syscr,
			(unsigned long long)task->syscw,
#endif
			(unsigned long long)task->ioac.read_bytes,
			(unsigned long long)task->ioac.write_bytes,
			(unsigned long long)task->ioac.cancelled_write_bytes);
}
2014#endif
2015
/*
 * Thread groups
 */
/* Forward declarations: the definitions are not in this part of the file. */
static const struct file_operations proc_task_operations;
static const struct inode_operations proc_task_inode_operations;
2021
/*
 * Contents of a /proc/<tgid> directory.  Used for both lookup and
 * readdir; several entries depend on kernel configuration options.
 */
static const struct pid_entry tgid_base_stuff[] = {
	DIR("task",       S_IRUGO|S_IXUGO, task),
	DIR("fd",         S_IRUSR|S_IXUSR, fd),
	DIR("fdinfo",     S_IRUSR|S_IXUSR, fdinfo),
	INF("environ",    S_IRUSR, pid_environ),
	INF("auxv",       S_IRUSR, pid_auxv),
	INF("status",     S_IRUGO, pid_status),
#ifdef CONFIG_SCHED_DEBUG
	REG("sched",      S_IRUGO|S_IWUSR, pid_sched),
#endif
	INF("cmdline",    S_IRUGO, pid_cmdline),
	INF("stat",       S_IRUGO, tgid_stat),
	INF("statm",      S_IRUGO, pid_statm),
	REG("maps",       S_IRUGO, maps),
#ifdef CONFIG_NUMA
	REG("numa_maps",  S_IRUGO, numa_maps),
#endif
	REG("mem",        S_IRUSR|S_IWUSR, mem),
#ifdef CONFIG_SECCOMP
	REG("seccomp",    S_IRUSR|S_IWUSR, seccomp),
#endif
	LNK("cwd",        cwd),
	LNK("root",       root),
	LNK("exe",        exe),
	REG("mounts",     S_IRUGO, mounts),
	REG("mountstats", S_IRUSR, mountstats),
#ifdef CONFIG_MMU
	REG("clear_refs", S_IWUSR, clear_refs),
	REG("smaps",      S_IRUGO, smaps),
#endif
#ifdef CONFIG_SECURITY
	DIR("attr",       S_IRUGO|S_IXUGO, attr_dir),
#endif
#ifdef CONFIG_KALLSYMS
	INF("wchan",      S_IRUGO, pid_wchan),
#endif
#ifdef CONFIG_SCHEDSTATS
	INF("schedstat",  S_IRUGO, pid_schedstat),
#endif
#ifdef CONFIG_CPUSETS
	REG("cpuset",     S_IRUGO, cpuset),
#endif
	INF("oom_score",  S_IRUGO, oom_score),
	REG("oom_adj",    S_IRUGO|S_IWUSR, oom_adjust),
#ifdef CONFIG_AUDITSYSCALL
	REG("loginuid",   S_IWUSR|S_IRUGO, loginuid),
#endif
#ifdef CONFIG_FAULT_INJECTION
	REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject),
#endif
#ifdef CONFIG_TASK_IO_ACCOUNTING
	INF("io",	S_IRUGO, pid_io_accounting),
#endif
};
2076
/* ->readdir for /proc/<tgid>, driven by tgid_base_stuff[]. */
static int proc_tgid_base_readdir(struct file * filp,
			     void * dirent, filldir_t filldir)
{
	return proc_pident_readdir(filp,dirent,filldir,
				   tgid_base_stuff,ARRAY_SIZE(tgid_base_stuff));
}
2083
/* File operations of a /proc/<tgid> directory. */
static const struct file_operations proc_tgid_base_operations = {
	.read		= generic_read_dir,
	.readdir	= proc_tgid_base_readdir,
};
2088
/* ->lookup for /proc/<tgid>, driven by tgid_base_stuff[]. */
static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){
	return proc_pident_lookup(dir, dentry,
				  tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
}
2093
/* Inode operations of a /proc/<tgid> directory. */
static const struct inode_operations proc_tgid_base_inode_operations = {
	.lookup		= proc_tgid_base_lookup,
	.getattr	= pid_getattr,
	.setattr	= proc_setattr,
};
2099
2100/**
2101 * proc_flush_task -  Remove dcache entries for @task from the /proc dcache.
2102 *
2103 * @task: task that should be flushed.
2104 *
2105 * Looks in the dcache for
2106 * /proc/@pid
2107 * /proc/@tgid/task/@pid
2108 * if either directory is present flushes it and all of it'ts children
2109 * from the dcache.
2110 *
2111 * It is safe and reasonable to cache /proc entries for a task until
2112 * that task exits.  After that they just clog up the dcache with
2113 * useless entries, possibly causing useful dcache entries to be
2114 * flushed instead.  This routine is proved to flush those useless
2115 * dcache entries at process exit time.
2116 *
2117 * NOTE: This routine is just an optimization so it does not guarantee
2118 *       that no dcache entries will exist at process exit time it
2119 *       just makes it very unlikely that any will persist.
2120 */
2121void proc_flush_task(struct task_struct *task)
2122{
2123	struct dentry *dentry, *leader, *dir;
2124	char buf[PROC_NUMBUF];
2125	struct qstr name;
2126
2127	name.name = buf;
2128	name.len = snprintf(buf, sizeof(buf), "%d", task->pid);
2129	dentry = d_hash_and_lookup(proc_mnt->mnt_root, &name);
2130	if (dentry) {
2131		shrink_dcache_parent(dentry);
2132		d_drop(dentry);
2133		dput(dentry);
2134	}
2135
2136	if (thread_group_leader(task))
2137		goto out;
2138
2139	name.name = buf;
2140	name.len = snprintf(buf, sizeof(buf), "%d", task->tgid);
2141	leader = d_hash_and_lookup(proc_mnt->mnt_root, &name);
2142	if (!leader)
2143		goto out;
2144
2145	name.name = "task";
2146	name.len = strlen(name.name);
2147	dir = d_hash_and_lookup(leader, &name);
2148	if (!dir)
2149		goto out_put_leader;
2150
2151	name.name = buf;
2152	name.len = snprintf(buf, sizeof(buf), "%d", task->pid);
2153	dentry = d_hash_and_lookup(dir, &name);
2154	if (dentry) {
2155		shrink_dcache_parent(dentry);
2156		d_drop(dentry);
2157		dput(dentry);
2158	}
2159
2160	dput(dir);
2161out_put_leader:
2162	dput(leader);
2163out:
2164	return;
2165}
2166
2167static struct dentry *proc_pid_instantiate(struct inode *dir,
2168					   struct dentry * dentry,
2169					   struct task_struct *task, const void *ptr)
2170{
2171	struct dentry *error = ERR_PTR(-ENOENT);
2172	struct inode *inode;
2173
2174	inode = proc_pid_make_inode(dir->i_sb, task);
2175	if (!inode)
2176		goto out;
2177
2178	inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO;
2179	inode->i_op = &proc_tgid_base_inode_operations;
2180	inode->i_fop = &proc_tgid_base_operations;
2181	inode->i_flags|=S_IMMUTABLE;
2182	inode->i_nlink = 5;
2183#ifdef CONFIG_SECURITY
2184	inode->i_nlink += 1;
2185#endif
2186
2187	dentry->d_op = &pid_dentry_operations;
2188
2189	d_add(dentry, inode);
2190	/* Close the race of the process dying before we return the dentry */
2191	if (pid_revalidate(dentry, NULL))
2192		error = NULL;
2193out:
2194	return error;
2195}
2196
2197struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
2198{
2199	struct dentry *result = ERR_PTR(-ENOENT);
2200	struct task_struct *task;
2201	unsigned tgid;
2202
2203	result = proc_base_lookup(dir, dentry);
2204	if (!IS_ERR(result) || PTR_ERR(result) != -ENOENT)
2205		goto out;
2206
2207	tgid = name_to_int(dentry);
2208	if (tgid == ~0U)
2209		goto out;
2210
2211	rcu_read_lock();
2212	task = find_task_by_pid(tgid);
2213	if (task)
2214		get_task_struct(task);
2215	rcu_read_unlock();
2216	if (!task)
2217		goto out;
2218
2219	result = proc_pid_instantiate(dir, dentry, task, NULL);
2220	put_task_struct(task);
2221out:
2222	return result;
2223}
2224
/*
 * Find the first task with tgid >= tgid
 *
 * Returns the task with an extra reference taken (the caller must drop
 * it with put_task_struct()), or NULL when find_ge_pid() finds no
 * further pid.
 */
static struct task_struct *next_tgid(unsigned int tgid)
{
	struct task_struct *task;
	struct pid *pid;

	rcu_read_lock();
retry:
	task = NULL;
	pid = find_ge_pid(tgid);
	if (pid) {
		/* Where to continue searching if this pid is rejected. */
		tgid = pid->nr + 1;
		task = pid_task(pid, PIDTYPE_PID);
		/* What we want to know is whether the pid we have found
		 * is the pid of a thread_group_leader.  Testing for task
		 * being a thread_group_leader is the obvious thing
		 * to do but there is a window when it fails, due to
		 * the pid transfer logic in de_thread.
		 *
		 * So we perform the straightforward test of seeing
		 * if the pid we have found is the pid of a thread
		 * group leader, and don't worry if the task we have
		 * found doesn't happen to be a thread group leader.
		 * As we don't care in the case of readdir.
		 */
		if (!task || !has_group_leader_pid(task))
			goto retry;
		get_task_struct(task);
	}
	rcu_read_unlock();
	return task;
}
2260
/*
 * File position of tgid 0 in the /proc root directory: process entries
 * start at FIRST_PROCESS_ENTRY and the proc_base_stuff[] entries come
 * first.
 */
#define TGID_OFFSET (FIRST_PROCESS_ENTRY + ARRAY_SIZE(proc_base_stuff))

/* Emit the /proc/<tgid> directory entry for @task via proc_fill_cache(). */
static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
	struct task_struct *task, int tgid)
{
	char name[PROC_NUMBUF];
	int len = snprintf(name, sizeof(name), "%d", tgid);
	return proc_fill_cache(filp, dirent, filldir, name, len,
				proc_pid_instantiate, task, NULL);
}
2271
/*
 * for the /proc/ directory itself, after non-process stuff has been done
 *
 * Emits the proc_base_stuff[] entries and then one entry per thread
 * group, in increasing pid order.  filp->f_pos encodes the progress:
 * positions below TGID_OFFSET index proc_base_stuff[], later positions
 * are "tgid + TGID_OFFSET".  Always returns 0.
 */
int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
{
	unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY;
	struct task_struct *reaper = get_proc_task(filp->f_path.dentry->d_inode);
	struct task_struct *task;
	int tgid;

	if (!reaper)
		goto out_no_task;

	for (; nr < ARRAY_SIZE(proc_base_stuff); filp->f_pos++, nr++) {
		const struct pid_entry *p = &proc_base_stuff[nr];
		if (proc_base_fill_cache(filp, dirent, filldir, reaper, p) < 0)
			goto out;
	}

	tgid = filp->f_pos - TGID_OFFSET;
	/*
	 * Each task comes back from next_tgid() with a reference held; it
	 * is dropped in the loop increment, or explicitly before bailing
	 * out early.
	 */
	for (task = next_tgid(tgid);
	     task;
	     put_task_struct(task), task = next_tgid(tgid + 1)) {
		tgid = task->pid;
		filp->f_pos = tgid + TGID_OFFSET;
		if (proc_pid_fill_cache(filp, dirent, filldir, task, tgid) < 0) {
			put_task_struct(task);
			goto out;
		}
	}
	/* All tasks were listed: park the position past any possible pid. */
	filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET;
out:
	put_task_struct(reaper);
out_no_task:
	return 0;
}
2306
/*
 * Tasks
 *
 * Contents of a per-thread directory.  Used for both lookup and
 * readdir; several entries depend on kernel configuration options.
 */
static const struct pid_entry tid_base_stuff[] = {
	DIR("fd",        S_IRUSR|S_IXUSR, fd),
	DIR("fdinfo",    S_IRUSR|S_IXUSR, fdinfo),
	INF("environ",   S_IRUSR, pid_environ),
	INF("auxv",      S_IRUSR, pid_auxv),
	INF("status",    S_IRUGO, pid_status),
#ifdef CONFIG_SCHED_DEBUG
	REG("sched",     S_IRUGO|S_IWUSR, pid_sched),
#endif
	INF("cmdline",   S_IRUGO, pid_cmdline),
	INF("stat",      S_IRUGO, tid_stat),
	INF("statm",     S_IRUGO, pid_statm),
	REG("maps",      S_IRUGO, maps),
#ifdef CONFIG_NUMA
	REG("numa_maps", S_IRUGO, numa_maps),
#endif
	REG("mem",       S_IRUSR|S_IWUSR, mem),
#ifdef CONFIG_SECCOMP
	REG("seccomp",   S_IRUSR|S_IWUSR, seccomp),
#endif
	LNK("cwd",       cwd),
	LNK("root",      root),
	LNK("exe",       exe),
	REG("mounts",    S_IRUGO, mounts),
#ifdef CONFIG_MMU
	REG("clear_refs", S_IWUSR, clear_refs),
	REG("smaps",     S_IRUGO, smaps),
#endif
#ifdef CONFIG_SECURITY
	DIR("attr",      S_IRUGO|S_IXUGO, attr_dir),
#endif
#ifdef CONFIG_KALLSYMS
	INF("wchan",     S_IRUGO, pid_wchan),
#endif
#ifdef CONFIG_SCHEDSTATS
	INF("schedstat", S_IRUGO, pid_schedstat),
#endif
#ifdef CONFIG_CPUSETS
	REG("cpuset",    S_IRUGO, cpuset),
#endif
	INF("oom_score", S_IRUGO, oom_score),
	REG("oom_adj",   S_IRUGO|S_IWUSR, oom_adjust),
#ifdef CONFIG_AUDITSYSCALL
	REG("loginuid",  S_IWUSR|S_IRUGO, loginuid),
#endif
#ifdef CONFIG_FAULT_INJECTION
	REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject),
#endif
};
2359
2360static int proc_tid_base_readdir(struct file * filp,
2361			     void * dirent, filldir_t filldir)
2362{
2363	return proc_pident_readdir(filp,dirent,filldir,
2364				   tid_base_stuff,ARRAY_SIZE(tid_base_stuff));
2365}
2366
2367static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){
2368	return proc_pident_lookup(dir, dentry,
2369				  tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
2370}
2371
2372static const struct file_operations proc_tid_base_operations = {
2373	.read		= generic_read_dir,
2374	.readdir	= proc_tid_base_readdir,
2375};
2376
2377static const struct inode_operations proc_tid_base_inode_operations = {
2378	.lookup		= proc_tid_base_lookup,
2379	.getattr	= pid_getattr,
2380	.setattr	= proc_setattr,
2381};
2382
2383static struct dentry *proc_task_instantiate(struct inode *dir,
2384	struct dentry *dentry, struct task_struct *task, const void *ptr)
2385{
2386	struct dentry *error = ERR_PTR(-ENOENT);
2387	struct inode *inode;
2388	inode = proc_pid_make_inode(dir->i_sb, task);
2389
2390	if (!inode)
2391		goto out;
2392	inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO;
2393	inode->i_op = &proc_tid_base_inode_operations;
2394	inode->i_fop = &proc_tid_base_operations;
2395	inode->i_flags|=S_IMMUTABLE;
2396	inode->i_nlink = 4;
2397#ifdef CONFIG_SECURITY
2398	inode->i_nlink += 1;
2399#endif
2400
2401	dentry->d_op = &pid_dentry_operations;
2402
2403	d_add(dentry, inode);
2404	/* Close the race of the process dying before we return the dentry */
2405	if (pid_revalidate(dentry, NULL))
2406		error = NULL;
2407out:
2408	return error;
2409}
2410
2411static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
2412{
2413	struct dentry *result = ERR_PTR(-ENOENT);
2414	struct task_struct *task;
2415	struct task_struct *leader = get_proc_task(dir);
2416	unsigned tid;
2417
2418	if (!leader)
2419		goto out_no_task;
2420
2421	tid = name_to_int(dentry);
2422	if (tid == ~0U)
2423		goto out;
2424
2425	rcu_read_lock();
2426	task = find_task_by_pid(tid);
2427	if (task)
2428		get_task_struct(task);
2429	rcu_read_unlock();
2430	if (!task)
2431		goto out;
2432	if (leader->tgid != task->tgid)
2433		goto out_drop_task;
2434
2435	result = proc_task_instantiate(dir, dentry, task, NULL);
2436out_drop_task:
2437	put_task_struct(task);
2438out:
2439	put_task_struct(leader);
2440out_no_task:
2441	return result;
2442}
2443
2444/*
2445 * Find the first tid of a thread group to return to user space.
2446 *
2447 * Usually this is just the thread group leader, but if the users
2448 * buffer was too small or there was a seek into the middle of the
2449 * directory we have more work todo.
2450 *
2451 * In the case of a short read we start with find_task_by_pid.
2452 *
2453 * In the case of a seek we start with the leader and walk nr
2454 * threads past it.
2455 */
2456static struct task_struct *first_tid(struct task_struct *leader,
2457					int tid, int nr)
2458{
2459	struct task_struct *pos;
2460
2461	rcu_read_lock();
2462	/* Attempt to start with the pid of a thread */
2463	if (tid && (nr > 0)) {
2464		pos = find_task_by_pid(tid);
2465		if (pos && (pos->group_leader == leader))
2466			goto found;
2467	}
2468
2469	/* If nr exceeds the number of threads there is nothing todo */
2470	pos = NULL;
2471	if (nr && nr >= get_nr_threads(leader))
2472		goto out;
2473
2474	/* If we haven't found our starting place yet start
2475	 * with the leader and walk nr threads forward.
2476	 */
2477	for (pos = leader; nr > 0; --nr) {
2478		pos = next_thread(pos);
2479		if (pos == leader) {
2480			pos = NULL;
2481			goto out;
2482		}
2483	}
2484found:
2485	get_task_struct(pos);
2486out:
2487	rcu_read_unlock();
2488	return pos;
2489}
2490
2491/*
2492 * Find the next thread in the thread list.
2493 * Return NULL if there is an error or no next thread.
2494 *
2495 * The reference to the input task_struct is released.
2496 */
2497static struct task_struct *next_tid(struct task_struct *start)
2498{
2499	struct task_struct *pos = NULL;
2500	rcu_read_lock();
2501	if (pid_alive(start)) {
2502		pos = next_thread(start);
2503		if (thread_group_leader(pos))
2504			pos = NULL;
2505		else
2506			get_task_struct(pos);
2507	}
2508	rcu_read_unlock();
2509	put_task_struct(start);
2510	return pos;
2511}
2512
2513static int proc_task_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
2514	struct task_struct *task, int tid)
2515{
2516	char name[PROC_NUMBUF];
2517	int len = snprintf(name, sizeof(name), "%d", tid);
2518	return proc_fill_cache(filp, dirent, filldir, name, len,
2519				proc_task_instantiate, task, NULL);
2520}
2521
2522/* for the /proc/TGID/task/ directories */
2523static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir)
2524{
2525	struct dentry *dentry = filp->f_path.dentry;
2526	struct inode *inode = dentry->d_inode;
2527	struct task_struct *leader = NULL;
2528	struct task_struct *task;
2529	int retval = -ENOENT;
2530	ino_t ino;
2531	int tid;
2532	unsigned long pos = filp->f_pos;  /* avoiding "long long" filp->f_pos */
2533
2534	task = get_proc_task(inode);
2535	if (!task)
2536		goto out_no_task;
2537	rcu_read_lock();
2538	if (pid_alive(task)) {
2539		leader = task->group_leader;
2540		get_task_struct(leader);
2541	}
2542	rcu_read_unlock();
2543	put_task_struct(task);
2544	if (!leader)
2545		goto out_no_task;
2546	retval = 0;
2547
2548	switch (pos) {
2549	case 0:
2550		ino = inode->i_ino;
2551		if (filldir(dirent, ".", 1, pos, ino, DT_DIR) < 0)
2552			goto out;
2553		pos++;
2554		/* fall through */
2555	case 1:
2556		ino = parent_ino(dentry);
2557		if (filldir(dirent, "..", 2, pos, ino, DT_DIR) < 0)
2558			goto out;
2559		pos++;
2560		/* fall through */
2561	}
2562
2563	/* f_version caches the tgid value that the last readdir call couldn't
2564	 * return. lseek aka telldir automagically resets f_version to 0.
2565	 */
2566	tid = filp->f_version;
2567	filp->f_version = 0;
2568	for (task = first_tid(leader, tid, pos - 2);
2569	     task;
2570	     task = next_tid(task), pos++) {
2571		tid = task->pid;
2572		if (proc_task_fill_cache(filp, dirent, filldir, task, tid) < 0) {
2573			/* returning this tgid failed, save it as the first
2574			 * pid for the next readir call */
2575			filp->f_version = tid;
2576			put_task_struct(task);
2577			break;
2578		}
2579	}
2580out:
2581	filp->f_pos = pos;
2582	put_task_struct(leader);
2583out_no_task:
2584	return retval;
2585}
2586
2587static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
2588{
2589	struct inode *inode = dentry->d_inode;
2590	struct task_struct *p = get_proc_task(inode);
2591	generic_fillattr(inode, stat);
2592
2593	if (p) {
2594		rcu_read_lock();
2595		stat->nlink += get_nr_threads(p);
2596		rcu_read_unlock();
2597		put_task_struct(p);
2598	}
2599
2600	return 0;
2601}
2602
2603static const struct inode_operations proc_task_inode_operations = {
2604	.lookup		= proc_task_lookup,
2605	.getattr	= proc_task_getattr,
2606	.setattr	= proc_setattr,
2607};
2608
2609static const struct file_operations proc_task_operations = {
2610	.read		= generic_read_dir,
2611	.readdir	= proc_task_readdir,
2612};
2613