shm.c revision 11bac800
/*
 * linux/ipc/shm.c
 * Copyright (C) 1992, 1993 Krishna Balasubramanian
 *	 Many improvements/fixes by Bruno Haible.
 * Replaced `struct shm_desc' by `struct vm_area_struct', July 1994.
 * Fixed the shm swap deallocation (shm_unuse()), August 1998 Andrea Arcangeli.
 *
 * /proc/sysvipc/shm support (c) 1999 Dragos Acostachioaie <dragos@iname.com>
 * BIGMEM support, Andrea Arcangeli <andrea@suse.de>
 * SMP thread shm, Jean-Luc Boyard <jean-luc.boyard@siemens.fr>
 * HIGHMEM support, Ingo Molnar <mingo@redhat.com>
 * Make shmmax, shmall, shmmni sysctl'able, Christoph Rohland <cr@sap.com>
 * Shared /dev/zero support, Kanoj Sarcar <kanoj@sgi.com>
 * Move the mm functionality over to mm/shmem.c, Christoph Rohland <cr@sap.com>
 *
 * support for audit of ipc object properties and permission changes
 * Dustin Kirkland <dustin.kirkland@us.ibm.com>
 *
 * namespaces support
 * OpenVZ, SWsoft Inc.
 * Pavel Emelianov <xemul@openvz.org>
 *
 * Better ipc lock (kern_ipc_perm.lock) handling
 * Davidlohr Bueso <davidlohr.bueso@hp.com>, June 2013.
 */

#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/init.h>
#include <linux/file.h>
#include <linux/mman.h>
#include <linux/shmem_fs.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/audit.h>
#include <linux/capability.h>
#include <linux/ptrace.h>
#include <linux/seq_file.h>
#include <linux/rwsem.h>
#include <linux/nsproxy.h>
#include <linux/mount.h>
#include <linux/ipc_namespace.h>

#include <linux/uaccess.h>

#include "util.h"

struct shm_file_data {
	int id;
	struct ipc_namespace *ns;
	struct file *file;
	const struct vm_operations_struct *vm_ops;
};

#define shm_file_data(file) (*((struct shm_file_data **)&(file)->private_data))

static const struct file_operations shm_file_operations;
static const struct vm_operations_struct shm_vm_ops;

#define shm_ids(ns)	((ns)->ids[IPC_SHM_IDS])

#define shm_unlock(shp)			\
	ipc_unlock(&(shp)->shm_perm)

static int newseg(struct ipc_namespace *, struct ipc_params *);
static void shm_open(struct vm_area_struct *vma);
static void shm_close(struct vm_area_struct *vma);
static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp);
#ifdef CONFIG_PROC_FS
static int sysvipc_shm_proc_show(struct seq_file *s, void *it);
#endif

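/*
 * Initialize the SysV shm state of a freshly created ipc namespace: seed the
 * shmmax/shmall/shmmni limits with their compile-time defaults (all of them
 * changeable later via sysctl) and set up the id allocator.
 */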
void shm_init_ns(struct ipc_namespace *ns)
{
	ns->shm_ctlmax = SHMMAX;
	ns->shm_ctlall = SHMALL;
	ns->shm_ctlmni = SHMMNI;
	ns->shm_rmid_forced = 0;
	ns->shm_tot = 0;
	ipc_init_ids(&shm_ids(ns));
}

/*
 * Called with shm_ids.rwsem (writer) and the shp structure locked.
 * Only shm_ids.rwsem remains locked on exit.
 */
static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
{
	struct shmid_kernel *shp;

	shp = container_of(ipcp, struct shmid_kernel, shm_perm);

	if (shp->shm_nattch) {
		shp->shm_perm.mode |= SHM_DEST;
		/* Ensure it can no longer be found by key */
		shp->shm_perm.key = IPC_PRIVATE;
		shm_unlock(shp);
	} else
		shm_destroy(ns, shp);
}

#ifdef CONFIG_IPC_NS
void shm_exit_ns(struct ipc_namespace *ns)
{
	free_ipcs(ns, &shm_ids(ns), do_shm_rmid);
	idr_destroy(&ns->ids[IPC_SHM_IDS].ipcs_idr);
}
#endif

static int __init ipc_ns_init(void)
{
	shm_init_ns(&init_ipc_ns);
	return 0;
}

pure_initcall(ipc_ns_init);

void __init shm_init(void)
{
	ipc_init_proc_interface("sysvipc/shm",
#if BITS_PER_LONG <= 32
				"       key      shmid perms       size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime        rss       swap\n",
#else
				"       key      shmid perms                  size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime                   rss                  swap\n",
#endif
				IPC_SHM_IDS, sysvipc_shm_proc_show);
}

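/*
 * shm_obtain_object() and shm_obtain_object_check() look up a segment by id
 * under RCU without taking the ipc object lock; the _check variant also
 * validates the id's sequence number, guarding against id reuse.
 */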
static inline struct shmid_kernel *shm_obtain_object(struct ipc_namespace *ns, int id)
{
	struct kern_ipc_perm *ipcp = ipc_obtain_object_idr(&shm_ids(ns), id);

	if (IS_ERR(ipcp))
		return ERR_CAST(ipcp);

	return container_of(ipcp, struct shmid_kernel, shm_perm);
}

static inline struct shmid_kernel *shm_obtain_object_check(struct ipc_namespace *ns, int id)
{
	struct kern_ipc_perm *ipcp = ipc_obtain_object_check(&shm_ids(ns), id);

	if (IS_ERR(ipcp))
		return ERR_CAST(ipcp);

	return container_of(ipcp, struct shmid_kernel, shm_perm);
}

/*
 * shm_lock_(check_) routines are called in the paths where the rwsem
 * is not necessarily held.
 */
static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id)
{
	struct kern_ipc_perm *ipcp = ipc_lock(&shm_ids(ns), id);

	/*
	 * Callers of shm_lock() must validate the status of the returned ipc
	 * object pointer (as returned by ipc_lock()), and error out as
	 * appropriate.
	 */
	if (IS_ERR(ipcp))
		return (void *)ipcp;
	return container_of(ipcp, struct shmid_kernel, shm_perm);
}

static inline void shm_lock_by_ptr(struct shmid_kernel *ipcp)
{
	rcu_read_lock();
	ipc_lock_object(&ipcp->shm_perm);
}

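/*
 * RCU callback that releases the security blob and then the shmid_kernel
 * itself, once all pre-existing RCU readers are done with the object.
 */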
static void shm_rcu_free(struct rcu_head *head)
{
	struct ipc_rcu *p = container_of(head, struct ipc_rcu, rcu);
	struct shmid_kernel *shp = ipc_rcu_to_struct(p);

	security_shm_free(shp);
	ipc_rcu_free(head);
}

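/*
 * Unlink the segment from its creator's shm_clist and remove it from the
 * namespace's id tree; callers hold shm_ids.rwsem (writer) and the object
 * lock.
 */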
static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s)
{
	list_del(&s->shm_clist);
	ipc_rmid(&shm_ids(ns), &s->shm_perm);
}


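/*
 * Account a new attach: bump shm_nattch and update the attach time and pid.
 * Fails with the shm_lock() error if the id has already been torn down;
 * shm_mmap() propagates that error, while shm_open() treats it as a bug.
 */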
static int __shm_open(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct shm_file_data *sfd = shm_file_data(file);
	struct shmid_kernel *shp;

	shp = shm_lock(sfd->ns, sfd->id);

	if (IS_ERR(shp))
		return PTR_ERR(shp);

	shp->shm_atim = get_seconds();
	shp->shm_lprid = task_tgid_vnr(current);
	shp->shm_nattch++;
	shm_unlock(shp);
	return 0;
}

/* This is called by fork, once for every shm attach. */
static void shm_open(struct vm_area_struct *vma)
{
	int err = __shm_open(vma);
	/*
	 * We raced in the idr lookup or with shm_destroy().
	 * Either way, the ID is busted.
	 */
	WARN_ON_ONCE(err);
}

/*
 * shm_destroy - free the struct shmid_kernel
 *
 * @ns: namespace
 * @shp: struct to free
 *
 * It has to be called with shp and shm_ids.rwsem (writer) locked,
 * but returns with shp unlocked and freed.
 */
static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
{
	struct file *shm_file;

	shm_file = shp->shm_file;
	shp->shm_file = NULL;
	ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT;
	shm_rmid(ns, shp);
	shm_unlock(shp);
	if (!is_file_hugepages(shm_file))
		shmem_lock(shm_file, 0, shp->mlock_user);
	else if (shp->mlock_user)
		user_shm_unlock(i_size_read(file_inode(shm_file)),
				shp->mlock_user);
	fput(shm_file);
	ipc_rcu_putref(shp, shm_rcu_free);
}

/*
 * shm_may_destroy - identifies whether shm segment should be destroyed now
 *
 * Returns true if and only if there are no active users of the segment and
 * one of the following is true:
 *
 * 1) shmctl(id, IPC_RMID, NULL) was called for this shp
 *
 * 2) sysctl kernel.shm_rmid_forced is set to 1.
 */
static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
{
	return (shp->shm_nattch == 0) &&
	       (ns->shm_rmid_forced ||
		(shp->shm_perm.mode & SHM_DEST));
}

/*
 * remove the attach descriptor vma.
 * free memory for segment if it is marked destroyed.
 * The descriptor has already been removed from the current->mm->mmap list
 * and will later be kfree()d.
 */
static void shm_close(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct shm_file_data *sfd = shm_file_data(file);
	struct shmid_kernel *shp;
	struct ipc_namespace *ns = sfd->ns;

	down_write(&shm_ids(ns).rwsem);
	/* remove from the list of attaches of the shm segment */
	shp = shm_lock(ns, sfd->id);

	/*
	 * We raced in the idr lookup or with shm_destroy().
	 * Either way, the ID is busted.
	 */
	if (WARN_ON_ONCE(IS_ERR(shp)))
		goto done; /* no-op */

	shp->shm_lprid = task_tgid_vnr(current);
	shp->shm_dtim = get_seconds();
	shp->shm_nattch--;
	if (shm_may_destroy(ns, shp))
		shm_destroy(ns, shp);
	else
		shm_unlock(shp);
done:
	up_write(&shm_ids(ns).rwsem);
}

/* Called with shm_ids(ns).rwsem (writer) held */
static int shm_try_destroy_orphaned(int id, void *p, void *data)
{
	struct ipc_namespace *ns = data;
	struct kern_ipc_perm *ipcp = p;
	struct shmid_kernel *shp = container_of(ipcp, struct shmid_kernel, shm_perm);

	/*
	 * We want to destroy segments that have no users and whose
	 * originating process has already exited.
	 *
	 * As the shp fields are only changed under the rwsem, it is safe
	 * to skip taking the shp lock.
	 */
	if (shp->shm_creator != NULL)
		return 0;

	if (shm_may_destroy(ns, shp)) {
		shm_lock_by_ptr(shp);
		shm_destroy(ns, shp);
	}
	return 0;
}

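/*
 * Sweep the whole namespace and destroy any segment orphaned by an exited
 * creator; typically invoked when the kernel.shm_rmid_forced sysctl is
 * switched on, so segments left behind earlier are reaped retroactively.
 */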
void shm_destroy_orphaned(struct ipc_namespace *ns)
{
	down_write(&shm_ids(ns).rwsem);
	if (shm_ids(ns).in_use)
		idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_orphaned, ns);
	up_write(&shm_ids(ns).rwsem);
}

/* Locking assumes this will only be called with task == current */
void exit_shm(struct task_struct *task)
{
	struct ipc_namespace *ns = task->nsproxy->ipc_ns;
	struct shmid_kernel *shp, *n;

	if (list_empty(&task->sysvshm.shm_clist))
		return;

	/*
	 * If kernel.shm_rmid_forced is not set then only keep track of
	 * which shmids are orphaned, so that a later set of the sysctl
	 * can clean them up.
	 */
	if (!ns->shm_rmid_forced) {
		down_read(&shm_ids(ns).rwsem);
		list_for_each_entry(shp, &task->sysvshm.shm_clist, shm_clist)
			shp->shm_creator = NULL;
		/*
		 * We hold only the read lock, but since we are called only
		 * for current, no other task can be walking this list
		 * concurrently.
		 */
		list_del(&task->sysvshm.shm_clist);
		up_read(&shm_ids(ns).rwsem);
		return;
	}

	/*
	 * Destroy all segments that were created but never mapped, and
	 * mark any still-mapped ones as orphaned so that toggling the
	 * sysctl later still covers them. Destruction is skipped whenever
	 * shm_may_destroy() returns false.
	 */
	down_write(&shm_ids(ns).rwsem);
	list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) {
		shp->shm_creator = NULL;

		if (shm_may_destroy(ns, shp)) {
			shm_lock_by_ptr(shp);
			shm_destroy(ns, shp);
		}
	}

	/* Remove the list head from any segments still attached. */
	list_del(&task->sysvshm.shm_clist);
	up_write(&shm_ids(ns).rwsem);
}

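/*
 * Page faults on a SysV shm mapping are forwarded verbatim to the fault
 * handler of the underlying shmem (or hugetlbfs) file.
 */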
static int shm_fault(struct vm_fault *vmf)
{
	struct file *file = vmf->vma->vm_file;
	struct shm_file_data *sfd = shm_file_data(file);

	return sfd->vm_ops->fault(vmf);
}

#ifdef CONFIG_NUMA
static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
{
	struct file *file = vma->vm_file;
	struct shm_file_data *sfd = shm_file_data(file);
	int err = 0;

	if (sfd->vm_ops->set_policy)
		err = sfd->vm_ops->set_policy(vma, new);
	return err;
}

static struct mempolicy *shm_get_policy(struct vm_area_struct *vma,
					unsigned long addr)
{
	struct file *file = vma->vm_file;
	struct shm_file_data *sfd = shm_file_data(file);
	struct mempolicy *pol = NULL;

	if (sfd->vm_ops->get_policy)
		pol = sfd->vm_ops->get_policy(vma, addr);
	else if (vma->vm_policy)
		pol = vma->vm_policy;

	return pol;
}
#endif

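/*
 * Map the segment into the caller's address space: register the attach via
 * __shm_open(), let the backing file build the mapping, then interpose
 * shm_vm_ops so that attach/detach accounting follows vma open/close.
 */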
static int shm_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct shm_file_data *sfd = shm_file_data(file);
	int ret;

	/*
	 * In case of remap_file_pages() emulation, the file can represent a
	 * removed IPC ID: propagate the shm_lock() error to the caller.
	 */
	ret = __shm_open(vma);
	if (ret)
		return ret;

	ret = sfd->file->f_op->mmap(sfd->file, vma);
	if (ret) {
		shm_close(vma);
		return ret;
	}
	sfd->vm_ops = vma->vm_ops;
#ifdef CONFIG_MMU
	WARN_ON(!sfd->vm_ops->fault);
#endif
	vma->vm_ops = &shm_vm_ops;
	return 0;
}

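/*
 * Called when the last reference to the attach-time file goes away: drop
 * the namespace reference and free the per-attach shm_file_data.
 */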
static int shm_release(struct inode *ino, struct file *file)
{
	struct shm_file_data *sfd = shm_file_data(file);

	put_ipc_ns(sfd->ns);
	shm_file_data(file) = NULL;
	kfree(sfd);
	return 0;
}

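/*
 * The remaining file operations forward to the corresponding method of the
 * underlying shmem/hugetlbfs file; fsync and fallocate report an error when
 * the backing file does not provide the method.
 */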
static int shm_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
	struct shm_file_data *sfd = shm_file_data(file);

	if (!sfd->file->f_op->fsync)
		return -EINVAL;
	return sfd->file->f_op->fsync(sfd->file, start, end, datasync);
}

static long shm_fallocate(struct file *file, int mode, loff_t offset,
			  loff_t len)
{
	struct shm_file_data *sfd = shm_file_data(file);

	if (!sfd->file->f_op->fallocate)
		return -EOPNOTSUPP;
	return sfd->file->f_op->fallocate(file, mode, offset, len);
}

static unsigned long shm_get_unmapped_area(struct file *file,
	unsigned long addr, unsigned long len, unsigned long pgoff,
	unsigned long flags)
{
	struct shm_file_data *sfd = shm_file_data(file);

	return sfd->file->f_op->get_unmapped_area(sfd->file, addr, len,
						pgoff, flags);
}

static const struct file_operations shm_file_operations = {
	.mmap		= shm_mmap,
	.fsync		= shm_fsync,
	.release	= shm_release,
	.get_unmapped_area	= shm_get_unmapped_area,
	.llseek		= noop_llseek,
	.fallocate	= shm_fallocate,
};

/*
 * shm_file_operations_huge is now identical to shm_file_operations,
 * but we keep it distinct for the sake of is_file_shm_hugepages().
 */
static const struct file_operations shm_file_operations_huge = {
	.mmap		= shm_mmap,
	.fsync		= shm_fsync,
	.release	= shm_release,
	.get_unmapped_area	= shm_get_unmapped_area,
	.llseek		= noop_llseek,
	.fallocate	= shm_fallocate,
};

bool is_file_shm_hugepages(struct file *file)
{
	return file->f_op == &shm_file_operations_huge;
}

static const struct vm_operations_struct shm_vm_ops = {
	.open	= shm_open,	/* callback for a new vm-area open */
	.close	= shm_close,	/* callback for when the vm-area is released */
	.fault	= shm_fault,
#if defined(CONFIG_NUMA)
	.set_policy = shm_set_policy,
	.get_policy = shm_get_policy,
#endif
};

/**
 * newseg - Create a new shared memory segment
 * @ns: namespace
 * @params: ptr to the structure that contains key, size and shmflg
 *
 * Called with shm_ids.rwsem held as a writer.
 */
static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
{
	key_t key = params->key;
	int shmflg = params->flg;
	size_t size = params->u.size;
	int error;
	struct shmid_kernel *shp;
	size_t numpages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	struct file *file;
	char name[13];
	int id;
	vm_flags_t acctflag = 0;

	if (size < SHMMIN || size > ns->shm_ctlmax)
		return -EINVAL;

	if (numpages << PAGE_SHIFT < size)
		return -ENOSPC;

	if (ns->shm_tot + numpages < ns->shm_tot ||
			ns->shm_tot + numpages > ns->shm_ctlall)
		return -ENOSPC;

	shp = ipc_rcu_alloc(sizeof(*shp));
	if (!shp)
		return -ENOMEM;

	shp->shm_perm.key = key;
	shp->shm_perm.mode = (shmflg & S_IRWXUGO);
	shp->mlock_user = NULL;

	shp->shm_perm.security = NULL;
	error = security_shm_alloc(shp);
	if (error) {
		ipc_rcu_putref(shp, ipc_rcu_free);
		return error;
	}

	sprintf(name, "SYSV%08x", key);
	if (shmflg & SHM_HUGETLB) {
		struct hstate *hs;
		size_t hugesize;

		hs = hstate_sizelog((shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
		if (!hs) {
			error = -EINVAL;
			goto no_file;
		}
		hugesize = ALIGN(size, huge_page_size(hs));

		/* hugetlb_file_setup applies strict accounting */
		if (shmflg & SHM_NORESERVE)
			acctflag = VM_NORESERVE;
		file = hugetlb_file_setup(name, hugesize, acctflag,
				  &shp->mlock_user, HUGETLB_SHMFS_INODE,
				(shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
	} else {
		/*
		 * Do not honour SHM_NORESERVE (i.e. no accounting) when the
		 * overcommit policy is OVERCOMMIT_NEVER, even if it was
		 * asked for.
		 */
		if  ((shmflg & SHM_NORESERVE) &&
				sysctl_overcommit_memory != OVERCOMMIT_NEVER)
			acctflag = VM_NORESERVE;
		file = shmem_kernel_file_setup(name, size, acctflag);
	}
	error = PTR_ERR(file);
	if (IS_ERR(file))
		goto no_file;

	shp->shm_cprid = task_tgid_vnr(current);
	shp->shm_lprid = 0;
	shp->shm_atim = shp->shm_dtim = 0;
	shp->shm_ctim = get_seconds();
	shp->shm_segsz = size;
	shp->shm_nattch = 0;
	shp->shm_file = file;
	shp->shm_creator = current;

	id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni);
	if (id < 0) {
		error = id;
		goto no_id;
	}

	list_add(&shp->shm_clist, &current->sysvshm.shm_clist);

	/*
	 * shmid gets reported as "inode#" in /proc/pid/maps.
	 * proc-ps tools use this. Changing this will break them.
	 */
	file_inode(file)->i_ino = shp->shm_perm.id;

	ns->shm_tot += numpages;
	error = shp->shm_perm.id;

	ipc_unlock_object(&shp->shm_perm);
	rcu_read_unlock();
	return error;

no_id:
	if (is_file_hugepages(file) && shp->mlock_user)
		user_shm_unlock(size, shp->mlock_user);
	fput(file);
no_file:
	ipc_rcu_putref(shp, shm_rcu_free);
	return error;
}

/*
 * Called with shm_ids.rwsem and ipcp locked.
 */
static inline int shm_security(struct kern_ipc_perm *ipcp, int shmflg)
{
	struct shmid_kernel *shp;

	shp = container_of(ipcp, struct shmid_kernel, shm_perm);
	return security_shm_associate(shp, shmflg);
}

/*
 * Called with shm_ids.rwsem and ipcp locked.
 */
static inline int shm_more_checks(struct kern_ipc_perm *ipcp,
				struct ipc_params *params)
{
	struct shmid_kernel *shp;

	shp = container_of(ipcp, struct shmid_kernel, shm_perm);
	if (shp->shm_segsz < params->u.size)
		return -EINVAL;

	return 0;
}

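/*
 * shmget(2): find or create a segment for @key. The heavy lifting is done
 * by ipcget(), which dispatches to newseg(), shm_security() and
 * shm_more_checks() through the shm_ops table below.
 *
 * A minimal, purely illustrative user-space call might look like:
 *
 *	int id = shmget(ftok("/some/path", 42), 4096, IPC_CREAT | 0600);
 *	if (id < 0)
 *		perror("shmget");
 */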
SYSCALL_DEFINE3(shmget, key_t, key, size_t, size, int, shmflg)
{
	struct ipc_namespace *ns;
	static const struct ipc_ops shm_ops = {
		.getnew = newseg,
		.associate = shm_security,
		.more_checks = shm_more_checks,
	};
	struct ipc_params shm_params;

	ns = current->nsproxy->ipc_ns;

	shm_params.key = key;
	shm_params.flg = shmflg;
	shm_params.u.size = size;

	return ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params);
}

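/*
 * The copy_shmid and copy_shminfo helpers translate between the in-kernel
 * 64-bit structures and whichever user-space layout (IPC_64 or the legacy
 * IPC_OLD) the caller's shmctl() version selected.
 */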
static inline unsigned long copy_shmid_to_user(void __user *buf, struct shmid64_ds *in, int version)
{
	switch (version) {
	case IPC_64:
		return copy_to_user(buf, in, sizeof(*in));
	case IPC_OLD:
	    {
		struct shmid_ds out;

		memset(&out, 0, sizeof(out));
		ipc64_perm_to_ipc_perm(&in->shm_perm, &out.shm_perm);
		out.shm_segsz	= in->shm_segsz;
		out.shm_atime	= in->shm_atime;
		out.shm_dtime	= in->shm_dtime;
		out.shm_ctime	= in->shm_ctime;
		out.shm_cpid	= in->shm_cpid;
		out.shm_lpid	= in->shm_lpid;
		out.shm_nattch	= in->shm_nattch;

		return copy_to_user(buf, &out, sizeof(out));
	    }
	default:
		return -EINVAL;
	}
}

static inline unsigned long
copy_shmid_from_user(struct shmid64_ds *out, void __user *buf, int version)
{
	switch (version) {
	case IPC_64:
		if (copy_from_user(out, buf, sizeof(*out)))
			return -EFAULT;
		return 0;
	case IPC_OLD:
	    {
		struct shmid_ds tbuf_old;

		if (copy_from_user(&tbuf_old, buf, sizeof(tbuf_old)))
			return -EFAULT;

		out->shm_perm.uid	= tbuf_old.shm_perm.uid;
		out->shm_perm.gid	= tbuf_old.shm_perm.gid;
		out->shm_perm.mode	= tbuf_old.shm_perm.mode;

		return 0;
	    }
	default:
		return -EINVAL;
	}
}

static inline unsigned long copy_shminfo_to_user(void __user *buf, struct shminfo64 *in, int version)
{
	switch (version) {
	case IPC_64:
		return copy_to_user(buf, in, sizeof(*in));
	case IPC_OLD:
	    {
		struct shminfo out;

		if (in->shmmax > INT_MAX)
			out.shmmax = INT_MAX;
		else
			out.shmmax = (int)in->shmmax;

		out.shmmin	= in->shmmin;
		out.shmmni	= in->shmmni;
		out.shmseg	= in->shmseg;
		out.shmall	= in->shmall;

		return copy_to_user(buf, &out, sizeof(out));
	    }
	default:
		return -EINVAL;
	}
}

/*
 * Calculate and add used RSS and swap pages of a shm.
 * Called with shm_ids.rwsem held as a reader
 */
static void shm_add_rss_swap(struct shmid_kernel *shp,
	unsigned long *rss_add, unsigned long *swp_add)
{
	struct inode *inode;

	inode = file_inode(shp->shm_file);

	if (is_file_hugepages(shp->shm_file)) {
		struct address_space *mapping = inode->i_mapping;
		struct hstate *h = hstate_file(shp->shm_file);
		*rss_add += pages_per_huge_page(h) * mapping->nrpages;
	} else {
#ifdef CONFIG_SHMEM
		struct shmem_inode_info *info = SHMEM_I(inode);

		spin_lock_irq(&info->lock);
		*rss_add += inode->i_mapping->nrpages;
		*swp_add += info->swapped;
		spin_unlock_irq(&info->lock);
#else
		*rss_add += inode->i_mapping->nrpages;
#endif
	}
}

/*
 * Called with shm_ids.rwsem held as a reader
 */
static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss,
		unsigned long *swp)
{
	int next_id;
	int total, in_use;

	*rss = 0;
	*swp = 0;

	in_use = shm_ids(ns).in_use;

	for (total = 0, next_id = 0; total < in_use; next_id++) {
		struct kern_ipc_perm *ipc;
		struct shmid_kernel *shp;

		ipc = idr_find(&shm_ids(ns).ipcs_idr, next_id);
		if (ipc == NULL)
			continue;
		shp = container_of(ipc, struct shmid_kernel, shm_perm);

		shm_add_rss_swap(shp, rss, swp);

		total++;
	}
}

/*
 * This function handles some shmctl commands which require the rwsem
 * to be held in write mode.
 * NOTE: no locks must be held, the rwsem is taken inside this function.
 */
static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd,
		       struct shmid_ds __user *buf, int version)
{
	struct kern_ipc_perm *ipcp;
	struct shmid64_ds shmid64;
	struct shmid_kernel *shp;
	int err;

	if (cmd == IPC_SET) {
		if (copy_shmid_from_user(&shmid64, buf, version))
			return -EFAULT;
	}

	down_write(&shm_ids(ns).rwsem);
	rcu_read_lock();

	ipcp = ipcctl_pre_down_nolock(ns, &shm_ids(ns), shmid, cmd,
				      &shmid64.shm_perm, 0);
	if (IS_ERR(ipcp)) {
		err = PTR_ERR(ipcp);
		goto out_unlock1;
	}

	shp = container_of(ipcp, struct shmid_kernel, shm_perm);

	err = security_shm_shmctl(shp, cmd);
	if (err)
		goto out_unlock1;

	switch (cmd) {
	case IPC_RMID:
		ipc_lock_object(&shp->shm_perm);
		/* do_shm_rmid unlocks the ipc object and rcu */
		do_shm_rmid(ns, ipcp);
		goto out_up;
	case IPC_SET:
		ipc_lock_object(&shp->shm_perm);
		err = ipc_update_perm(&shmid64.shm_perm, ipcp);
		if (err)
			goto out_unlock0;
		shp->shm_ctim = get_seconds();
		break;
	default:
		err = -EINVAL;
		goto out_unlock1;
	}

out_unlock0:
	ipc_unlock_object(&shp->shm_perm);
out_unlock1:
	rcu_read_unlock();
out_up:
	up_write(&shm_ids(ns).rwsem);
	return err;
}

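/*
 * Handle the read-only shmctl commands (IPC_INFO, SHM_INFO, SHM_STAT and
 * IPC_STAT), which need at most the rwsem as a reader or plain RCU.
 */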
static int shmctl_nolock(struct ipc_namespace *ns, int shmid,
			 int cmd, int version, void __user *buf)
{
	int err;
	struct shmid_kernel *shp;

	/* preliminary security checks for *_INFO */
	if (cmd == IPC_INFO || cmd == SHM_INFO) {
		err = security_shm_shmctl(NULL, cmd);
		if (err)
			return err;
	}

	switch (cmd) {
	case IPC_INFO:
	{
		struct shminfo64 shminfo;

		memset(&shminfo, 0, sizeof(shminfo));
		shminfo.shmmni = shminfo.shmseg = ns->shm_ctlmni;
		shminfo.shmmax = ns->shm_ctlmax;
		shminfo.shmall = ns->shm_ctlall;

		shminfo.shmmin = SHMMIN;
		if (copy_shminfo_to_user(buf, &shminfo, version))
			return -EFAULT;

		down_read(&shm_ids(ns).rwsem);
		err = ipc_get_maxid(&shm_ids(ns));
		up_read(&shm_ids(ns).rwsem);

		if (err < 0)
			err = 0;
		goto out;
	}
	case SHM_INFO:
	{
		struct shm_info shm_info;

		memset(&shm_info, 0, sizeof(shm_info));
		down_read(&shm_ids(ns).rwsem);
		shm_info.used_ids = shm_ids(ns).in_use;
		shm_get_stat(ns, &shm_info.shm_rss, &shm_info.shm_swp);
		shm_info.shm_tot = ns->shm_tot;
		shm_info.swap_attempts = 0;
		shm_info.swap_successes = 0;
		err = ipc_get_maxid(&shm_ids(ns));
		up_read(&shm_ids(ns).rwsem);
		if (copy_to_user(buf, &shm_info, sizeof(shm_info))) {
			err = -EFAULT;
			goto out;
		}

		err = err < 0 ? 0 : err;
		goto out;
	}
	case SHM_STAT:
	case IPC_STAT:
	{
		struct shmid64_ds tbuf;
		int result;

		rcu_read_lock();
		if (cmd == SHM_STAT) {
			shp = shm_obtain_object(ns, shmid);
			if (IS_ERR(shp)) {
				err = PTR_ERR(shp);
				goto out_unlock;
			}
			result = shp->shm_perm.id;
		} else {
			shp = shm_obtain_object_check(ns, shmid);
			if (IS_ERR(shp)) {
				err = PTR_ERR(shp);
				goto out_unlock;
			}
			result = 0;
		}

		err = -EACCES;
		if (ipcperms(ns, &shp->shm_perm, S_IRUGO))
			goto out_unlock;

		err = security_shm_shmctl(shp, cmd);
		if (err)
			goto out_unlock;

		memset(&tbuf, 0, sizeof(tbuf));
		kernel_to_ipc64_perm(&shp->shm_perm, &tbuf.shm_perm);
		tbuf.shm_segsz	= shp->shm_segsz;
		tbuf.shm_atime	= shp->shm_atim;
		tbuf.shm_dtime	= shp->shm_dtim;
		tbuf.shm_ctime	= shp->shm_ctim;
		tbuf.shm_cpid	= shp->shm_cprid;
		tbuf.shm_lpid	= shp->shm_lprid;
		tbuf.shm_nattch	= shp->shm_nattch;
		rcu_read_unlock();

		if (copy_shmid_to_user(buf, &tbuf, version))
			err = -EFAULT;
		else
			err = result;
		goto out;
	}
	default:
		return -EINVAL;
	}

out_unlock:
	rcu_read_unlock();
out:
	return err;
}

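/*
 * shmctl(2): route the lockless queries to shmctl_nolock(), the destructive
 * commands to shmctl_down(), and handle SHM_LOCK/SHM_UNLOCK inline.
 */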
SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf)
{
	struct shmid_kernel *shp;
	int err, version;
	struct ipc_namespace *ns;

	if (cmd < 0 || shmid < 0)
		return -EINVAL;

	version = ipc_parse_version(&cmd);
	ns = current->nsproxy->ipc_ns;

	switch (cmd) {
	case IPC_INFO:
	case SHM_INFO:
	case SHM_STAT:
	case IPC_STAT:
		return shmctl_nolock(ns, shmid, cmd, version, buf);
	case IPC_RMID:
	case IPC_SET:
		return shmctl_down(ns, shmid, cmd, buf, version);
	case SHM_LOCK:
	case SHM_UNLOCK:
	{
		struct file *shm_file;

		rcu_read_lock();
		shp = shm_obtain_object_check(ns, shmid);
		if (IS_ERR(shp)) {
			err = PTR_ERR(shp);
			goto out_unlock1;
		}

		audit_ipc_obj(&(shp->shm_perm));
		err = security_shm_shmctl(shp, cmd);
		if (err)
			goto out_unlock1;

		ipc_lock_object(&shp->shm_perm);

		/* check if shm_destroy() is tearing down shp */
		if (!ipc_valid_object(&shp->shm_perm)) {
			err = -EIDRM;
			goto out_unlock0;
		}

		if (!ns_capable(ns->user_ns, CAP_IPC_LOCK)) {
			kuid_t euid = current_euid();

			if (!uid_eq(euid, shp->shm_perm.uid) &&
			    !uid_eq(euid, shp->shm_perm.cuid)) {
				err = -EPERM;
				goto out_unlock0;
			}
			if (cmd == SHM_LOCK && !rlimit(RLIMIT_MEMLOCK)) {
				err = -EPERM;
				goto out_unlock0;
			}
		}

		shm_file = shp->shm_file;
		if (is_file_hugepages(shm_file))
			goto out_unlock0;

		if (cmd == SHM_LOCK) {
			struct user_struct *user = current_user();

			err = shmem_lock(shm_file, 1, user);
			if (!err && !(shp->shm_perm.mode & SHM_LOCKED)) {
				shp->shm_perm.mode |= SHM_LOCKED;
				shp->mlock_user = user;
			}
			goto out_unlock0;
		}

		/* SHM_UNLOCK */
		if (!(shp->shm_perm.mode & SHM_LOCKED))
			goto out_unlock0;
		shmem_lock(shm_file, 0, shp->mlock_user);
		shp->shm_perm.mode &= ~SHM_LOCKED;
		shp->mlock_user = NULL;
		get_file(shm_file);
		ipc_unlock_object(&shp->shm_perm);
		rcu_read_unlock();
		shmem_unlock_mapping(shm_file->f_mapping);

		fput(shm_file);
		return err;
	}
	default:
		return -EINVAL;
	}

out_unlock0:
	ipc_unlock_object(&shp->shm_perm);
out_unlock1:
	rcu_read_unlock();
	return err;
}

/*
 * Fix shmaddr, allocate descriptor, map shm, add attach descriptor to lists.
 *
 * NOTE! Despite the name, this is NOT a direct system call entrypoint. The
 * "raddr" thing points to kernel space, and there has to be a wrapper around
 * this.
 */
long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
	      unsigned long shmlba)
{
	struct shmid_kernel *shp;
	unsigned long addr;
	unsigned long size;
	struct file *file;
	int    err;
	unsigned long flags;
	unsigned long prot;
	int acc_mode;
	struct ipc_namespace *ns;
	struct shm_file_data *sfd;
	struct path path;
	fmode_t f_mode;
	unsigned long populate = 0;

	err = -EINVAL;
	if (shmid < 0)
		goto out;
	else if ((addr = (ulong)shmaddr)) {
		if (addr & (shmlba - 1)) {
			if (shmflg & SHM_RND)
				addr &= ~(shmlba - 1);	   /* round down */
			else
#ifndef __ARCH_FORCE_SHMLBA
				if (addr & ~PAGE_MASK)
#endif
					goto out;
		}
		flags = MAP_SHARED | MAP_FIXED;
	} else {
		if ((shmflg & SHM_REMAP))
			goto out;

		flags = MAP_SHARED;
	}

	if (shmflg & SHM_RDONLY) {
		prot = PROT_READ;
		acc_mode = S_IRUGO;
		f_mode = FMODE_READ;
	} else {
		prot = PROT_READ | PROT_WRITE;
		acc_mode = S_IRUGO | S_IWUGO;
		f_mode = FMODE_READ | FMODE_WRITE;
	}
	if (shmflg & SHM_EXEC) {
		prot |= PROT_EXEC;
		acc_mode |= S_IXUGO;
	}

	/*
	 * We cannot rely on the fs check since SYSV IPC does have an
	 * additional creator id...
	 */
	ns = current->nsproxy->ipc_ns;
	rcu_read_lock();
	shp = shm_obtain_object_check(ns, shmid);
	if (IS_ERR(shp)) {
		err = PTR_ERR(shp);
		goto out_unlock;
	}

	err = -EACCES;
	if (ipcperms(ns, &shp->shm_perm, acc_mode))
		goto out_unlock;

	err = security_shm_shmat(shp, shmaddr, shmflg);
	if (err)
		goto out_unlock;

	ipc_lock_object(&shp->shm_perm);

	/* check if shm_destroy() is tearing down shp */
	if (!ipc_valid_object(&shp->shm_perm)) {
		ipc_unlock_object(&shp->shm_perm);
		err = -EIDRM;
		goto out_unlock;
	}

	path = shp->shm_file->f_path;
	path_get(&path);
	shp->shm_nattch++;
	size = i_size_read(d_inode(path.dentry));
	ipc_unlock_object(&shp->shm_perm);
	rcu_read_unlock();

	err = -ENOMEM;
	sfd = kzalloc(sizeof(*sfd), GFP_KERNEL);
	if (!sfd) {
		path_put(&path);
		goto out_nattch;
	}

	file = alloc_file(&path, f_mode,
			  is_file_hugepages(shp->shm_file) ?
				&shm_file_operations_huge :
				&shm_file_operations);
	err = PTR_ERR(file);
	if (IS_ERR(file)) {
		kfree(sfd);
		path_put(&path);
		goto out_nattch;
	}

	file->private_data = sfd;
	file->f_mapping = shp->shm_file->f_mapping;
	sfd->id = shp->shm_perm.id;
	sfd->ns = get_ipc_ns(ns);
	sfd->file = shp->shm_file;
	sfd->vm_ops = NULL;

	err = security_mmap_file(file, prot, flags);
	if (err)
		goto out_fput;

	if (down_write_killable(&current->mm->mmap_sem)) {
		err = -EINTR;
		goto out_fput;
	}

	if (addr && !(shmflg & SHM_REMAP)) {
		err = -EINVAL;
		if (addr + size < addr)
			goto invalid;

		if (find_vma_intersection(current->mm, addr, addr + size))
			goto invalid;
	}

	addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate);
	*raddr = addr;
	err = 0;
	if (IS_ERR_VALUE(addr))
		err = (long)addr;
invalid:
	up_write(&current->mm->mmap_sem);
	if (populate)
		mm_populate(addr, populate);

out_fput:
	fput(file);

out_nattch:
	down_write(&shm_ids(ns).rwsem);
	shp = shm_lock(ns, shmid);
	shp->shm_nattch--;
	if (shm_may_destroy(ns, shp))
		shm_destroy(ns, shp);
	else
		shm_unlock(shp);
	up_write(&shm_ids(ns).rwsem);
	return err;

out_unlock:
	rcu_read_unlock();
out:
	return err;
}

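/*
 * shmat(2): thin wrapper around do_shmat() that returns the attach address
 * to user space as the syscall's return value.
 *
 * A minimal, purely illustrative user-space sequence: attach at a
 * kernel-chosen address, use the memory, then detach:
 *
 *	void *p = shmat(id, NULL, 0);
 *	...
 *	shmdt(p);
 */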
SYSCALL_DEFINE3(shmat, int, shmid, char __user *, shmaddr, int, shmflg)
{
	unsigned long ret;
	long err;

	err = do_shmat(shmid, shmaddr, shmflg, &ret, SHMLBA);
	if (err)
		return err;
	force_successful_syscall_return();
	return (long)ret;
}

/*
 * detach and kill segment if marked destroyed.
 * The work is done in shm_close.
 */
SYSCALL_DEFINE1(shmdt, char __user *, shmaddr)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long addr = (unsigned long)shmaddr;
	int retval = -EINVAL;
#ifdef CONFIG_MMU
	loff_t size = 0;
	struct file *file;
	struct vm_area_struct *next;
#endif

	if (addr & ~PAGE_MASK)
		return retval;

	if (down_write_killable(&mm->mmap_sem))
		return -EINTR;

	/*
	 * This function tries to be smart and unmap shm segments that
	 * were modified by partial mlock or munmap calls:
	 * - It first determines the size of the shm segment that should be
	 *   unmapped: it searches for a vma that is backed by shm and that
	 *   started at address shmaddr. It records its size and then unmaps
	 *   it.
	 * - Then it unmaps all shm vmas that started at shmaddr and that
	 *   are within the initially determined size and that are from the
	 *   same shm segment from which we determined the size.
	 * Errors from do_munmap are ignored: the function only fails if
	 * it's called with invalid parameters or if it's called to unmap
	 * a part of a vma. Both calls in this function are for full vmas,
	 * the parameters are directly copied from the vma itself and always
	 * valid - therefore do_munmap cannot fail. (famous last words?)
	 */
	/*
	 * If it had been mremap()'d, the starting address would not
	 * match the usual checks anyway. So assume all vma's are
	 * above the starting address given.
	 */
	vma = find_vma(mm, addr);

#ifdef CONFIG_MMU
	while (vma) {
		next = vma->vm_next;

		/*
		 * Check if the starting address would match, i.e. it's
		 * a fragment created by mprotect() and/or munmap(), or
		 * otherwise one that starts at this address with no
		 * hassles.
		 */
		if ((vma->vm_ops == &shm_vm_ops) &&
			(vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) {

			/*
			 * Record the file of the shm segment being
			 * unmapped.  With mremap(), someone could place
			 * pages from another segment, but with equal
			 * offsets, in the range we are unmapping.
			 */
			file = vma->vm_file;
			size = i_size_read(file_inode(vma->vm_file));
			do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
			/*
			 * We discovered the size of the shm segment, so
			 * break out of here and fall through to the next
			 * loop that uses the size information to stop
			 * searching for matching vma's.
			 */
			retval = 0;
			vma = next;
			break;
		}
		vma = next;
	}

	/*
	 * We need look no further than the maximum address a fragment
	 * could possibly have landed at. Also cast things to loff_t to
	 * prevent overflows and make comparisons vs. equal-width types.
	 */
	size = PAGE_ALIGN(size);
	while (vma && (loff_t)(vma->vm_end - addr) <= size) {
		next = vma->vm_next;

		/* finding a matching vma now does not alter retval */
		if ((vma->vm_ops == &shm_vm_ops) &&
		    ((vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) &&
		    (vma->vm_file == file))
			do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
		vma = next;
	}

#else	/* CONFIG_MMU */
	/* under NOMMU conditions, the exact address to be destroyed must be
	 * given
	 */
	if (vma && vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) {
		do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
		retval = 0;
	}

#endif

	up_write(&mm->mmap_sem);
	return retval;
}

#ifdef CONFIG_PROC_FS
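/*
 * Emit one /proc/sysvipc/shm row per segment; the column layout must stay
 * in sync with the header string passed to ipc_init_proc_interface() in
 * shm_init() above.
 */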
static int sysvipc_shm_proc_show(struct seq_file *s, void *it)
{
	struct user_namespace *user_ns = seq_user_ns(s);
	struct shmid_kernel *shp = it;
	unsigned long rss = 0, swp = 0;

	shm_add_rss_swap(shp, &rss, &swp);

#if BITS_PER_LONG <= 32
#define SIZE_SPEC "%10lu"
#else
#define SIZE_SPEC "%21lu"
#endif

	seq_printf(s,
		   "%10d %10d  %4o " SIZE_SPEC " %5u %5u  "
		   "%5lu %5u %5u %5u %5u %10lu %10lu %10lu "
		   SIZE_SPEC " " SIZE_SPEC "\n",
		   shp->shm_perm.key,
		   shp->shm_perm.id,
		   shp->shm_perm.mode,
		   shp->shm_segsz,
		   shp->shm_cprid,
		   shp->shm_lprid,
		   shp->shm_nattch,
		   from_kuid_munged(user_ns, shp->shm_perm.uid),
		   from_kgid_munged(user_ns, shp->shm_perm.gid),
		   from_kuid_munged(user_ns, shp->shm_perm.cuid),
		   from_kgid_munged(user_ns, shp->shm_perm.cgid),
		   shp->shm_atim,
		   shp->shm_dtim,
		   shp->shm_ctim,
		   rss * PAGE_SIZE,
		   swp * PAGE_SIZE);

	return 0;
}
#endif