11da177e4SLinus Torvalds/*
21da177e4SLinus Torvalds * POSIX message queues filesystem for Linux.
31da177e4SLinus Torvalds *
41da177e4SLinus Torvalds * Copyright (C) 2003,2004  Krzysztof Benedyczak    (golbi@mat.uni.torun.pl)
5f66e928bSMichal Wronski *                          Michal Wronski          (michal.wronski@gmail.com)
61da177e4SLinus Torvalds *
71da177e4SLinus Torvalds * Spinlocks:               Mohamed Abbas           (abbas.mohamed@intel.com)
81da177e4SLinus Torvalds * Lockless receive & send, fd based notify:
9239521f3SManfred Spraul *			    Manfred Spraul	    (manfred@colorfullife.com)
101da177e4SLinus Torvalds *
1120ca73bcSGeorge C. Wilson * Audit:                   George Wilson           (ltcgcw@us.ibm.com)
1220ca73bcSGeorge C. Wilson *
131da177e4SLinus Torvalds * This file is released under the GPL.
141da177e4SLinus Torvalds */
151da177e4SLinus Torvalds
16c59ede7bSRandy Dunlap#include <linux/capability.h>
171da177e4SLinus Torvalds#include <linux/init.h>
181da177e4SLinus Torvalds#include <linux/pagemap.h>
191da177e4SLinus Torvalds#include <linux/file.h>
201da177e4SLinus Torvalds#include <linux/mount.h>
21935c6912SDavid Howells#include <linux/fs_context.h>
221da177e4SLinus Torvalds#include <linux/namei.h>
231da177e4SLinus Torvalds#include <linux/sysctl.h>
241da177e4SLinus Torvalds#include <linux/poll.h>
251da177e4SLinus Torvalds#include <linux/mqueue.h>
261da177e4SLinus Torvalds#include <linux/msg.h>
271da177e4SLinus Torvalds#include <linux/skbuff.h>
285b5c4d1aSDoug Ledford#include <linux/vmalloc.h>
291da177e4SLinus Torvalds#include <linux/netlink.h>
301da177e4SLinus Torvalds#include <linux/syscalls.h>
3120ca73bcSGeorge C. Wilson#include <linux/audit.h>
327ed20e1aSJesper Juhl#include <linux/signal.h>
335f921ae9SIngo Molnar#include <linux/mutex.h>
34b488893aSPavel Emelyanov#include <linux/nsproxy.h>
35b488893aSPavel Emelyanov#include <linux/pid.h>
36614b84cfSSerge E. Hallyn#include <linux/ipc_namespace.h>
376b550f94SSerge E. Hallyn#include <linux/user_namespace.h>
385a0e3ad6STejun Heo#include <linux/slab.h>
3984f001e1SIngo Molnar#include <linux/sched/wake_q.h>
403f07c014SIngo Molnar#include <linux/sched/signal.h>
418703e8a4SIngo Molnar#include <linux/sched/user.h>
425f921ae9SIngo Molnar
431da177e4SLinus Torvalds#include <net/sock.h>
441da177e4SLinus Torvalds#include "util.h"
451da177e4SLinus Torvalds
/*
 * mqueue-specific filesystem context data.
 * Holds a pointer to the IPC namespace associated with the context.
 */
struct mqueue_fs_context {
	struct ipc_namespace *ipc_ns;
};
49935c6912SDavid Howells
/* Filesystem constants (presumably superblock magic and per-entry sizes). */
#define MQUEUE_MAGIC 0x19800202
#define DIRENT_SIZE 20
#define FILENT_SIZE 80

/*
 * Direction selectors; presumably the indices into
 * mqueue_inode_info::e_wait_q[] -- TODO confirm against the callers.
 */
#define SEND 0
#define RECV 1

/* Values stored in ext_wait_queue::state. */
#define STATE_NONE 0
#define STATE_READY 1
591da177e4SLinus Torvalds
60d6629859SDoug Ledfordstruct posix_msg_tree_node {
61d6629859SDoug Ledford	struct rb_node		rb_node;
62d6629859SDoug Ledford	struct list_head	msg_list;
63d6629859SDoug Ledford	int			priority;
64d6629859SDoug Ledford};
65d6629859SDoug Ledford
66c5b2cbdbSManfred Spraul/*
67c5b2cbdbSManfred Spraul * Locking:
68c5b2cbdbSManfred Spraul *
69c5b2cbdbSManfred Spraul * Accesses to a message queue are synchronized by acquiring info->lock.
70c5b2cbdbSManfred Spraul *
71c5b2cbdbSManfred Spraul * There are two notable exceptions:
72c5b2cbdbSManfred Spraul * - The actual wakeup of a sleeping task is performed using the wake_q
73c5b2cbdbSManfred Spraul *   framework. info->lock is already released when wake_up_q is called.
74c5b2cbdbSManfred Spraul * - The exit codepaths after sleeping check ext_wait_queue->state without
75c5b2cbdbSManfred Spraul *   any locks. If it is STATE_READY, then the syscall is completed without
76c5b2cbdbSManfred Spraul *   acquiring info->lock.
77c5b2cbdbSManfred Spraul *
78c5b2cbdbSManfred Spraul * MQ_BARRIER:
79c5b2cbdbSManfred Spraul * To achieve proper release/acquire memory barrier pairing, the state is set to
80c5b2cbdbSManfred Spraul * STATE_READY with smp_store_release(), and it is read with READ_ONCE followed
81c5b2cbdbSManfred Spraul * by smp_acquire__after_ctrl_dep(). In addition, wake_q_add_safe() is used.
82c5b2cbdbSManfred Spraul *
83c5b2cbdbSManfred Spraul * This prevents the following races:
84c5b2cbdbSManfred Spraul *
85c5b2cbdbSManfred Spraul * 1) With the simple wake_q_add(), the task could be gone already before
86c5b2cbdbSManfred Spraul *    the increase of the reference happens
87c5b2cbdbSManfred Spraul * Thread A
88c5b2cbdbSManfred Spraul *				Thread B
89c5b2cbdbSManfred Spraul * WRITE_ONCE(wait.state, STATE_NONE);
90c5b2cbdbSManfred Spraul * schedule_hrtimeout()
91c5b2cbdbSManfred Spraul *				wake_q_add(A)
92c5b2cbdbSManfred Spraul *				if (cmpxchg()) // success
93c5b2cbdbSManfred Spraul *				   ->state = STATE_READY (reordered)
94c5b2cbdbSManfred Spraul * <timeout returns>
95c5b2cbdbSManfred Spraul * if (wait.state == STATE_READY) return;
96c5b2cbdbSManfred Spraul * sysret to user space
97c5b2cbdbSManfred Spraul * sys_exit()
98c5b2cbdbSManfred Spraul *				get_task_struct() // UaF
99c5b2cbdbSManfred Spraul *
100c5b2cbdbSManfred Spraul * Solution: Use wake_q_add_safe() and perform the get_task_struct() before
101c5b2cbdbSManfred Spraul * the smp_store_release() that does ->state = STATE_READY.
102c5b2cbdbSManfred Spraul *
103c5b2cbdbSManfred Spraul * 2) Without proper _release/_acquire barriers, the woken up task
104c5b2cbdbSManfred Spraul *    could read stale data
105c5b2cbdbSManfred Spraul *
106c5b2cbdbSManfred Spraul * Thread A
107c5b2cbdbSManfred Spraul *				Thread B
108c5b2cbdbSManfred Spraul * do_mq_timedreceive
109c5b2cbdbSManfred Spraul * WRITE_ONCE(wait.state, STATE_NONE);
110c5b2cbdbSManfred Spraul * schedule_hrtimeout()
111c5b2cbdbSManfred Spraul *				state = STATE_READY;
112c5b2cbdbSManfred Spraul * <timeout returns>
113c5b2cbdbSManfred Spraul * if (wait.state == STATE_READY) return;
114c5b2cbdbSManfred Spraul * msg_ptr = wait.msg;		// Access to stale data!
115c5b2cbdbSManfred Spraul *				receiver->msg = message; (reordered)
116c5b2cbdbSManfred Spraul *
117c5b2cbdbSManfred Spraul * Solution: use _release and _acquire barriers.
118c5b2cbdbSManfred Spraul *
 * 3) There is intentionally no barrier when setting current->state
 *    to TASK_INTERRUPTIBLE: spin_unlock(&info->lock) provides the
 *    release memory barrier, and the wakeup is triggered while holding
 *    info->lock, i.e. spin_lock(&info->lock) provides the pairing
 *    acquire memory barrier.
124c5b2cbdbSManfred Spraul */
125c5b2cbdbSManfred Spraul
1261da177e4SLinus Torvaldsstruct ext_wait_queue {		/* queue of sleeping tasks */
1271da177e4SLinus Torvalds	struct task_struct *task;
1281da177e4SLinus Torvalds	struct list_head list;
1291da177e4SLinus Torvalds	struct msg_msg *msg;	/* ptr of loaded message */
1301da177e4SLinus Torvalds	int state;		/* one of STATE_* values */
1311da177e4SLinus Torvalds};
1321da177e4SLinus Torvalds
1331da177e4SLinus Torvaldsstruct mqueue_inode_info {
1341da177e4SLinus Torvalds	spinlock_t lock;
1351da177e4SLinus Torvalds	struct inode vfs_inode;
1361da177e4SLinus Torvalds	wait_queue_head_t wait_q;
1371da177e4SLinus Torvalds
138d6629859SDoug Ledford	struct rb_root msg_tree;
139a5091fdaSDavidlohr Bueso	struct rb_node *msg_tree_rightmost;
140ce2d52ccSDoug Ledford	struct posix_msg_tree_node *node_cache;
1411da177e4SLinus Torvalds	struct mq_attr attr;
1421da177e4SLinus Torvalds
1431da177e4SLinus Torvalds	struct sigevent notify;
144239521f3SManfred Spraul	struct pid *notify_owner;
145b5f20061SOleg Nesterov	u32 notify_self_exec_id;
1466f9ac6d9SEric W. Biederman	struct user_namespace *notify_user_ns;
1476e52a9f0SAlexey Gladkov	struct ucounts *ucounts;	/* user who created, for accounting */
1481da177e4SLinus Torvalds	struct sock *notify_sock;
1491da177e4SLinus Torvalds	struct sk_buff *notify_cookie;
1501da177e4SLinus Torvalds
1511da177e4SLinus Torvalds	/* for tasks waiting for free space and messages, respectively */
1521da177e4SLinus Torvalds	struct ext_wait_queue e_wait_q[2];
1531da177e4SLinus Torvalds
1541da177e4SLinus Torvalds	unsigned long qsize; /* size of queue in memory (sum of all msgs) */
1551da177e4SLinus Torvalds};
1561da177e4SLinus Torvalds
157935c6912SDavid Howellsstatic struct file_system_type mqueue_fs_type;
15892e1d5beSArjan van de Venstatic const struct inode_operations mqueue_dir_inode_operations;
1599a32144eSArjan van de Venstatic const struct file_operations mqueue_file_operations;
160b87221deSAlexey Dobriyanstatic const struct super_operations mqueue_super_ops;
161935c6912SDavid Howellsstatic const struct fs_context_operations mqueue_fs_context_ops;
1621da177e4SLinus Torvaldsstatic void remove_notification(struct mqueue_inode_info *info);
1631da177e4SLinus Torvalds
164e18b890bSChristoph Lameterstatic struct kmem_cache *mqueue_inode_cachep;
1651da177e4SLinus Torvalds
166239521f3SManfred Spraulstatic struct ctl_table_header *mq_sysctl_table;
1671da177e4SLinus Torvalds
1681da177e4SLinus Torvaldsstatic inline struct mqueue_inode_info *MQUEUE_I(struct inode *inode)
1691da177e4SLinus Torvalds{
1701da177e4SLinus Torvalds	return container_of(inode, struct mqueue_inode_info, vfs_inode);
1711da177e4SLinus Torvalds}
1721da177e4SLinus Torvalds
1737eafd7c7SSerge E. Hallyn/*
1747eafd7c7SSerge E. Hallyn * This routine should be called with the mq_lock held.
1757eafd7c7SSerge E. Hallyn */
1767eafd7c7SSerge E. Hallynstatic inline struct ipc_namespace *__get_ns_from_inode(struct inode *inode)
177614b84cfSSerge E. Hallyn{
1787eafd7c7SSerge E. Hallyn	return get_ipc_ns(inode->i_sb->s_fs_info);
179614b84cfSSerge E. Hallyn}
180614b84cfSSerge E. Hallyn
1817eafd7c7SSerge E. Hallynstatic struct ipc_namespace *get_ns_from_inode(struct inode *inode)
182614b84cfSSerge E. Hallyn{
1837eafd7c7SSerge E. Hallyn	struct ipc_namespace *ns;
1847eafd7c7SSerge E. Hallyn
1857eafd7c7SSerge E. Hallyn	spin_lock(&mq_lock);
1867eafd7c7SSerge E. Hallyn	ns = __get_ns_from_inode(inode);
1877eafd7c7SSerge E. Hallyn	spin_unlock(&mq_lock);
1887eafd7c7SSerge E. Hallyn	return ns;
189614b84cfSSerge E. Hallyn}
190614b84cfSSerge E. Hallyn
191d6629859SDoug Ledford/* Auxiliary functions to manipulate messages' list */
192d6629859SDoug Ledfordstatic int msg_insert(struct msg_msg *msg, struct mqueue_inode_info *info)
193d6629859SDoug Ledford{
194d6629859SDoug Ledford	struct rb_node **p, *parent = NULL;
195d6629859SDoug Ledford	struct posix_msg_tree_node *leaf;
196a5091fdaSDavidlohr Bueso	bool rightmost = true;
197d6629859SDoug Ledford
198d6629859SDoug Ledford	p = &info->msg_tree.rb_node;
199d6629859SDoug Ledford	while (*p) {
200d6629859SDoug Ledford		parent = *p;
201d6629859SDoug Ledford		leaf = rb_entry(parent, struct posix_msg_tree_node, rb_node);
202d6629859SDoug Ledford
203d6629859SDoug Ledford		if (likely(leaf->priority == msg->m_type))
204d6629859SDoug Ledford			goto insert_msg;
205a5091fdaSDavidlohr Bueso		else if (msg->m_type < leaf->priority) {
206d6629859SDoug Ledford			p = &(*p)->rb_left;
207a5091fdaSDavidlohr Bueso			rightmost = false;
208a5091fdaSDavidlohr Bueso		} else
209d6629859SDoug Ledford			p = &(*p)->rb_right;
210d6629859SDoug Ledford	}
211ce2d52ccSDoug Ledford	if (info->node_cache) {
212ce2d52ccSDoug Ledford		leaf = info->node_cache;
213ce2d52ccSDoug Ledford		info->node_cache = NULL;
214ce2d52ccSDoug Ledford	} else {
215ce2d52ccSDoug Ledford		leaf = kmalloc(sizeof(*leaf), GFP_ATOMIC);
216ce2d52ccSDoug Ledford		if (!leaf)
217ce2d52ccSDoug Ledford			return -ENOMEM;
218ce2d52ccSDoug Ledford		INIT_LIST_HEAD(&leaf->msg_list);
219ce2d52ccSDoug Ledford	}
220d6629859SDoug Ledford	leaf->priority = msg->m_type;
221a5091fdaSDavidlohr Bueso
222a5091fdaSDavidlohr Bueso	if (rightmost)
223a5091fdaSDavidlohr Bueso		info->msg_tree_rightmost = &leaf->rb_node;
224a5091fdaSDavidlohr Bueso
225d6629859SDoug Ledford	rb_link_node(&leaf->rb_node, parent, p);
226d6629859SDoug Ledford	rb_insert_color(&leaf->rb_node, &info->msg_tree);