1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * NET4:	Implementation of BSD Unix domain sockets.
4 *
5 * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
6 *
7 * Fixes:
8 *		Linus Torvalds	:	Assorted bug cures.
9 *		Niibe Yutaka	:	async I/O support.
10 *		Carsten Paeth	:	PF_UNIX check, address fixes.
11 *		Alan Cox	:	Limit size of allocated blocks.
12 *		Alan Cox	:	Fixed the stupid socketpair bug.
13 *		Alan Cox	:	BSD compatibility fine tuning.
14 *		Alan Cox	:	Fixed a bug in connect when interrupted.
15 *		Alan Cox	:	Sorted out a proper draft version of
16 *					file descriptor passing hacked up from
17 *					Mike Shaver's work.
18 *		Marty Leisner	:	Fixes to fd passing
19 *		Nick Nevin	:	recvmsg bugfix.
20 *		Alan Cox	:	Started proper garbage collector
21 *		Heiko EiBfeldt	:	Missing verify_area check
22 *		Alan Cox	:	Started POSIXisms
23 *		Andreas Schwab	:	Replace inode by dentry for proper
24 *					reference counting
25 *		Kirk Petersen	:	Made this a module
26 *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
27 *					Lots of bug fixes.
28 *	     Alexey Kuznetosv	:	Repaired (I hope) bugs introduces
29 *					by above two patches.
30 *	     Andrea Arcangeli	:	If possible we block in connect(2)
31 *					if the max backlog of the listen socket
32 *					is been reached. This won't break
33 *					old apps and it will avoid huge amount
34 *					of socks hashed (this for unix_gc()
35 *					performances reasons).
36 *					Security fix that limits the max
37 *					number of socks to 2*max_files and
38 *					the number of skb queueable in the
39 *					dgram receiver.
40 *		Artur Skawina   :	Hash function optimizations
41 *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
42 *	      Malcolm Beattie   :	Set peercred for socketpair
43 *	     Michal Ostrowski   :       Module initialization cleanup.
44 *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
45 *	     				the core infrastructure is doing that
46 *	     				for all net proto families now (2.5.69+)
47 *
48 * Known differences from reference BSD that was tested:
49 *
50 *	[TO FIX]
51 *	ECONNREFUSED is not returned from one end of a connected() socket to the
52 *		other the moment one end closes.
53 *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
54 *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
55 *	[NOT TO FIX]
56 *	accept() returns a path name even if the connecting socket has closed
57 *		in the meantime (BSD loses the path and gives up).
58 *	accept() returns 0 length path for an unbound connector. BSD returns 16
59 *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
61 *	BSD af_unix apparently has connect forgetting to block properly.
62 *		(need to check this with the POSIX spec in detail)
63 *
64 * Differences from 2.0.0-11-... (ANK)
65 *	Bug fixes and improvements.
66 *		- client shutdown killed server socket.
67 *		- removed all useless cli/sti pairs.
68 *
69 *	Semantic changes/extensions.
70 *		- generic control message passing.
71 *		- SCM_CREDENTIALS control message.
72 *		- "Abstract" (not FS based) socket bindings.
73 *		  Abstract names are sequences of bytes (not zero terminated)
74 *		  started by 0, so that this name space does not intersect
75 *		  with BSD names.
76 */
77
78#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79
80#include <linux/module.h>
81#include <linux/kernel.h>
82#include <linux/signal.h>
83#include <linux/sched/signal.h>
84#include <linux/errno.h>
85#include <linux/string.h>
86#include <linux/stat.h>
87#include <linux/dcache.h>
88#include <linux/namei.h>
89#include <linux/socket.h>
90#include <linux/un.h>
91#include <linux/fcntl.h>
92#include <linux/filter.h>
93#include <linux/termios.h>
94#include <linux/sockios.h>
95#include <linux/net.h>
96#include <linux/in.h>
97#include <linux/fs.h>
98#include <linux/slab.h>
99#include <linux/uaccess.h>
100#include <linux/skbuff.h>
101#include <linux/netdevice.h>
102#include <net/net_namespace.h>
103#include <net/sock.h>
104#include <net/tcp_states.h>
105#include <net/af_unix.h>
106#include <linux/proc_fs.h>
107#include <linux/seq_file.h>
108#include <net/scm.h>
109#include <linux/init.h>
110#include <linux/poll.h>
111#include <linux/rtnetlink.h>
112#include <linux/mount.h>
113#include <net/checksum.h>
114#include <linux/security.h>
115#include <linux/splice.h>
116#include <linux/freezer.h>
117#include <linux/file.h>
118#include <linux/btf_ids.h>
119#include <linux/bpf-cgroup.h>
120
121static atomic_long_t unix_nr_socks;
122static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
123static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
124
125/* SMP locking strategy:
126 *    hash table is protected with spinlock.
127 *    each socket state is protected by separate spinlock.
128 */
129
130static unsigned int unix_unbound_hash(struct sock *sk)
131{
132	unsigned long hash = (unsigned long)sk;
133
134	hash ^= hash >> 16;
135	hash ^= hash >> 8;
136	hash ^= sk->sk_type;
137
138	return hash & UNIX_HASH_MOD;
139}
140
141static unsigned int unix_bsd_hash(struct inode *i)
142{
143	return i->i_ino & UNIX_HASH_MOD;
144}
145
146static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
147				       int addr_len, int type)
148{
149	__wsum csum = csum_partial(sunaddr, addr_len, 0);
150	unsigned int hash;
151
152	hash = (__force unsigned int)csum_fold(csum);
153	hash ^= hash >> 8;
154	hash ^= type;
155
156	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
157}
158
159static void unix_table_double_lock(struct net *net,
160				   unsigned int hash1, unsigned int hash2)
161{
162	if (hash1 == hash2) {
163		spin_lock(&net->unx.table.locks[hash1]);
164		return;
165	}
166
167	if (hash1 > hash2)
168		swap(hash1, hash2);
169
170	spin_lock(&net->unx.table.locks[hash1]);
171	spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
172}
173
174static void unix_table_double_unlock(struct net *net,
175				     unsigned int hash1, unsigned int hash2)
176{
177	if (hash1 == hash2) {
178		spin_unlock(&net->unx.table.locks[hash1]);
179		return;
180	}
181
182	spin_unlock(&net->unx.table.locks[hash1]);
183	spin_unlock(&net->unx.table.locks[hash2]);
184}
185
186#ifdef CONFIG_SECURITY_NETWORK
187static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
188{
189	UNIXCB(skb).secid = scm->secid;
190}
191
192static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
193{
194	scm->secid = UNIXCB(skb).secid;
195}
196
197static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
198{
199	return (scm->secid == UNIXCB(skb).secid);
200}
201#else
202static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
203{ }
204
205static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
206{ }
207
208static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
209{
210	return true;
211}
212#endif /* CONFIG_SECURITY_NETWORK */
213
214static inline int unix_our_peer(struct sock *sk, struct sock *osk)
215{
216	return unix_peer(osk) == sk;
217}
218
219static inline int unix_may_send(struct sock *sk, struct sock *osk)
220{
221	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
222}
223
224static inline int unix_recvq_full(const struct sock *sk)
225{
226	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
227}
228
229static inline int unix_recvq_full_lockless(const struct sock *sk)
230{
231	return skb_queue_len_lockless(&sk->sk_receive_queue) >
232		READ_ONCE(sk->sk_max_ack_backlog);
233}
234
235struct sock *unix_peer_get(struct sock *s)
236{
237	struct sock *peer;
238
239	unix_state_lock(s);
240	peer = unix_peer(s);
241	if (peer)
242		sock_hold(peer);
243	unix_state_unlock(s);
244	return peer;
245}
246EXPORT_SYMBOL_GPL(unix_peer_get);
247
248static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
249					     int addr_len)
250{
251	struct unix_address *addr;
252
253	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
254	if (!addr)
255		return NULL;
256
257	refcount_set(&addr->refcnt, 1);
258	addr->len = addr_len;
259	memcpy(addr->name, sunaddr, addr_len);
260
261	return addr;
262}
263
264static inline void unix_release_addr(struct unix_address *addr)
265{
266	if (refcount_dec_and_test(&addr->refcnt))
267		kfree(addr);
268}
269
270/*
271 *	Check unix socket name:
272 *		- should be not zero length.
273 *	        - if started by not zero, should be NULL terminated (FS object)
274 *		- if started by zero, it is abstract name.
275 */
276
277static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
278{
279	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
280	    addr_len > sizeof(*sunaddr))
281		return -EINVAL;
282
283	if (sunaddr->sun_family != AF_UNIX)
284		return -EINVAL;
285
286	return 0;
287}
288
289static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
290{
291	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
292	short offset = offsetof(struct sockaddr_storage, __data);
293
294	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
295
296	/* This may look like an off by one error but it is a bit more
297	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
298	 * sun_path[108] doesn't as such exist.  However in kernel space
299	 * we are guaranteed that it is a valid memory location in our
300	 * kernel address buffer because syscall functions always pass
301	 * a pointer of struct sockaddr_storage which has a bigger buffer
302	 * than 108.  Also, we must terminate sun_path for strlen() in
303	 * getname_kernel().
304	 */
305	addr->__data[addr_len - offset] = 0;
306
307	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
308	 * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
309	 * know the actual buffer.
310	 */
311	return strlen(addr->__data) + offset + 1;
312}
313
314static void __unix_remove_socket(struct sock *sk)
315{
316	sk_del_node_init(sk);
317}
318
319static void __unix_insert_socket(struct net *net, struct sock *sk)
320{
321	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
322	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
323}
324
325static void __unix_set_addr_hash(struct net *net, struct sock *sk,
326				 struct unix_address *addr, unsigned int hash)
327{
328	__unix_remove_socket(sk);
329	smp_store_release(&unix_sk(sk)->addr, addr);
330
331	sk->sk_hash = hash;
332	__unix_insert_socket(net, sk);
333}
334
335static void unix_remove_socket(struct net *net, struct sock *sk)
336{
337	spin_lock(&net->unx.table.locks[sk->sk_hash]);
338	__unix_remove_socket(sk);
339	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
340}
341
342static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
343{
344	spin_lock(&net->unx.table.locks[sk->sk_hash]);
345	__unix_insert_socket(net, sk);
346	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
347}
348
349static void unix_insert_bsd_socket(struct sock *sk)
350{
351	spin_lock(&bsd_socket_locks[sk->sk_hash]);
352	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
353	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
354}
355
356static void unix_remove_bsd_socket(struct sock *sk)
357{
358	if (!hlist_unhashed(&sk->sk_bind_node)) {
359		spin_lock(&bsd_socket_locks[sk->sk_hash]);
360		__sk_del_bind_node(sk);
361		spin_unlock(&bsd_socket_locks[sk->sk_hash]);
362
363		sk_node_init(&sk->sk_bind_node);
364	}
365}
366
367static struct sock *__unix_find_socket_byname(struct net *net,
368					      struct sockaddr_un *sunname,
369					      int len, unsigned int hash)
370{
371	struct sock *s;
372
373	sk_for_each(s, &net->unx.table.buckets[hash]) {
374		struct unix_sock *u = unix_sk(s);
375
376		if (u->addr->len == len &&
377		    !memcmp(u->addr->name, sunname, len))
378			return s;
379	}
380	return NULL;
381}
382
383static inline struct sock *unix_find_socket_byname(struct net *net,
384						   struct sockaddr_un *sunname,
385						   int len, unsigned int hash)
386{
387	struct sock *s;
388
389	spin_lock(&net->unx.table.locks[hash]);
390	s = __unix_find_socket_byname(net, sunname, len, hash);
391	if (s)
392		sock_hold(s);
393	spin_unlock(&net->unx.table.locks[hash]);
394	return s;
395}
396
397static struct sock *unix_find_socket_byinode(struct inode *i)
398{
399	unsigned int hash = unix_bsd_hash(i);
400	struct sock *s;
401
402	spin_lock(&bsd_socket_locks[hash]);
403	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
404		struct dentry *dentry = unix_sk(s)->path.dentry;
405
406		if (dentry && d_backing_inode(dentry) == i) {
407			sock_hold(s);
408			spin_unlock(&bsd_socket_locks[hash]);
409			return s;
410		}
411	}
412	spin_unlock(&bsd_socket_locks[hash]);
413	return NULL;
414}
415
416/* Support code for asymmetrically connected dgram sockets
417 *
418 * If a datagram socket is connected to a socket not itself connected
419 * to the first socket (eg, /dev/log), clients may only enqueue more
420 * messages if the present receive queue of the server socket is not
421 * "too large". This means there's a second writeability condition
422 * poll and sendmsg need to test. The dgram recv code will do a wake
423 * up on the peer_wait wait queue of a socket upon reception of a
424 * datagram which needs to be propagated to sleeping would-be writers
425 * since these might not have sent anything so far. This can't be
426 * accomplished via poll_wait because the lifetime of the server
427 * socket might be less than that of its clients if these break their
428 * association with it or if the server socket is closed while clients
429 * are still connected to it and there's no way to inform "a polling
430 * implementation" that it should let go of a certain wait queue
431 *
432 * In order to propagate a wake up, a wait_queue_entry_t of the client
433 * socket is enqueued on the peer_wait queue of the server socket
434 * whose wake function does a wake_up on the ordinary client socket
435 * wait queue. This connection is established whenever a write (or
436 * poll for write) hit the flow control condition and broken when the
437 * association to the server socket is dissolved or after a wake up
438 * was relayed.
439 */
440
441static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
442				      void *key)
443{
444	struct unix_sock *u;
445	wait_queue_head_t *u_sleep;
446
447	u = container_of(q, struct unix_sock, peer_wake);
448
449	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
450			    q);
451	u->peer_wake.private = NULL;
452
453	/* relaying can only happen while the wq still exists */
454	u_sleep = sk_sleep(&u->sk);
455	if (u_sleep)
456		wake_up_interruptible_poll(u_sleep, key_to_poll(key));
457
458	return 0;
459}
460
461static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
462{
463	struct unix_sock *u, *u_other;
464	int rc;
465
466	u = unix_sk(sk);
467	u_other = unix_sk(other);
468	rc = 0;
469	spin_lock(&u_other->peer_wait.lock);
470
471	if (!u->peer_wake.private) {
472		u->peer_wake.private = other;
473		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
474
475		rc = 1;
476	}
477
478	spin_unlock(&u_other->peer_wait.lock);
479	return rc;
480}
481
482static void unix_dgram_peer_wake_disconnect(struct sock *sk,
483					    struct sock *other)
484{
485	struct unix_sock *u, *u_other;
486
487	u = unix_sk(sk);
488	u_other = unix_sk(other);
489	spin_lock(&u_other->peer_wait.lock);
490
491	if (u->peer_wake.private == other) {
492		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
493		u->peer_wake.private = NULL;
494	}
495
496	spin_unlock(&u_other->peer_wait.lock);
497}
498
499static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
500						   struct sock *other)
501{
502	unix_dgram_peer_wake_disconnect(sk, other);
503	wake_up_interruptible_poll(sk_sleep(sk),
504				   EPOLLOUT |
505				   EPOLLWRNORM |
506				   EPOLLWRBAND);
507}
508
509/* preconditions:
510 *	- unix_peer(sk) == other
511 *	- association is stable
512 */
513static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
514{
515	int connected;
516
517	connected = unix_dgram_peer_wake_connect(sk, other);
518
519	/* If other is SOCK_DEAD, we want to make sure we signal
520	 * POLLOUT, such that a subsequent write() can get a
521	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
522	 * to other and its full, we will hang waiting for POLLOUT.
523	 */
524	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
525		return 1;
526
527	if (connected)
528		unix_dgram_peer_wake_disconnect(sk, other);
529
530	return 0;
531}
532
533static int unix_writable(const struct sock *sk)
534{
535	return sk->sk_state != TCP_LISTEN &&
536	       (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
537}
538
539static void unix_write_space(struct sock *sk)
540{
541	struct socket_wq *wq;
542
543	rcu_read_lock();
544	if (unix_writable(sk)) {
545		wq = rcu_dereference(sk->sk_wq);
546		if (skwq_has_sleeper(wq))
547			wake_up_interruptible_sync_poll(&wq->wait,
548				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
549		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
550	}
551	rcu_read_unlock();
552}
553
554/* When dgram socket disconnects (or changes its peer), we clear its receive
555 * queue of packets arrived from previous peer. First, it allows to do
556 * flow control based only on wmem_alloc; second, sk connected to peer
557 * may receive messages only from that peer. */
558static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
559{
560	if (!skb_queue_empty(&sk->sk_receive_queue)) {
561		skb_queue_purge(&sk->sk_receive_queue);
562		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
563
564		/* If one link of bidirectional dgram pipe is disconnected,
565		 * we signal error. Messages are lost. Do not make this,
566		 * when peer was not connected to us.
567		 */
568		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
569			WRITE_ONCE(other->sk_err, ECONNRESET);
570			sk_error_report(other);
571		}
572	}
573	other->sk_state = TCP_CLOSE;
574}
575
576static void unix_sock_destructor(struct sock *sk)
577{
578	struct unix_sock *u = unix_sk(sk);
579
580	skb_queue_purge(&sk->sk_receive_queue);
581
582	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
583	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
584	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
585	if (!sock_flag(sk, SOCK_DEAD)) {
586		pr_info("Attempt to release alive unix socket: %p\n", sk);
587		return;
588	}
589
590	if (u->addr)
591		unix_release_addr(u->addr);
592
593	atomic_long_dec(&unix_nr_socks);
594	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
595#ifdef UNIX_REFCNT_DEBUG
596	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
597		atomic_long_read(&unix_nr_socks));
598#endif
599}
600
601static void unix_release_sock(struct sock *sk, int embrion)
602{
603	struct unix_sock *u = unix_sk(sk);
604	struct sock *skpair;
605	struct sk_buff *skb;
606	struct path path;
607	int state;
608
609	unix_remove_socket(sock_net(sk), sk);
610	unix_remove_bsd_socket(sk);
611
612	/* Clear state */
613	unix_state_lock(sk);
614	sock_orphan(sk);
615	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
616	path	     = u->path;
617	u->path.dentry = NULL;
618	u->path.mnt = NULL;
619	state = sk->sk_state;
620	sk->sk_state = TCP_CLOSE;
621
622	skpair = unix_peer(sk);
623	unix_peer(sk) = NULL;
624
625	unix_state_unlock(sk);
626
627#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
628	if (u->oob_skb) {
629		kfree_skb(u->oob_skb);
630		u->oob_skb = NULL;
631	}
632#endif
633
634	wake_up_interruptible_all(&u->peer_wait);
635
636	if (skpair != NULL) {
637		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
638			unix_state_lock(skpair);
639			/* No more writes */
640			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
641			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
642				WRITE_ONCE(skpair->sk_err, ECONNRESET);
643			unix_state_unlock(skpair);
644			skpair->sk_state_change(skpair);
645			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
646		}
647
648		unix_dgram_peer_wake_disconnect(sk, skpair);
649		sock_put(skpair); /* It may now die */
650	}
651
652	/* Try to flush out this socket. Throw out buffers at least */
653
654	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
655		if (state == TCP_LISTEN)
656			unix_release_sock(skb->sk, 1);
657		/* passed fds are erased in the kfree_skb hook	      */
658		UNIXCB(skb).consumed = skb->len;
659		kfree_skb(skb);
660	}
661
662	if (path.dentry)
663		path_put(&path);
664
665	sock_put(sk);
666
667	/* ---- Socket is dead now and most probably destroyed ---- */
668
669	/*
670	 * Fixme: BSD difference: In BSD all sockets connected to us get
671	 *	  ECONNRESET and we die on the spot. In Linux we behave
672	 *	  like files and pipes do and wait for the last
673	 *	  dereference.
674	 *
675	 * Can't we simply set sock->err?
676	 *
677	 *	  What the above comment does talk about? --ANK(980817)
678	 */
679
680	if (READ_ONCE(unix_tot_inflight))
681		unix_gc();		/* Garbage collect fds */
682}
683
684static void init_peercred(struct sock *sk)
685{
686	const struct cred *old_cred;
687	struct pid *old_pid;
688
689	spin_lock(&sk->sk_peer_lock);
690	old_pid = sk->sk_peer_pid;
691	old_cred = sk->sk_peer_cred;
692	sk->sk_peer_pid  = get_pid(task_tgid(current));
693	sk->sk_peer_cred = get_current_cred();
694	spin_unlock(&sk->sk_peer_lock);
695
696	put_pid(old_pid);
697	put_cred(old_cred);
698}
699
700static void copy_peercred(struct sock *sk, struct sock *peersk)
701{
702	const struct cred *old_cred;
703	struct pid *old_pid;
704
705	if (sk < peersk) {
706		spin_lock(&sk->sk_peer_lock);
707		spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
708	} else {
709		spin_lock(&peersk->sk_peer_lock);
710		spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
711	}
712	old_pid = sk->sk_peer_pid;
713	old_cred = sk->sk_peer_cred;
714	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
715	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
716
717	spin_unlock(&sk->sk_peer_lock);
718	spin_unlock(&peersk->sk_peer_lock);
719
720	put_pid(old_pid);
721	put_cred(old_cred);
722}
723
724static int unix_listen(struct socket *sock, int backlog)
725{
726	int err;
727	struct sock *sk = sock->sk;
728	struct unix_sock *u = unix_sk(sk);
729
730	err = -EOPNOTSUPP;
731	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
732		goto out;	/* Only stream/seqpacket sockets accept */
733	err = -EINVAL;
734	if (!u->addr)
735		goto out;	/* No listens on an unbound socket */
736	unix_state_lock(sk);
737	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
738		goto out_unlock;
739	if (backlog > sk->sk_max_ack_backlog)
740		wake_up_interruptible_all(&u->peer_wait);
741	sk->sk_max_ack_backlog	= backlog;
742	sk->sk_state		= TCP_LISTEN;
743	/* set credentials so connect can copy them */
744	init_peercred(sk);
745	err = 0;
746
747out_unlock:
748	unix_state_unlock(sk);
749out:
750	return err;
751}
752
753static int unix_release(struct socket *);
754static int unix_bind(struct socket *, struct sockaddr *, int);
755static int unix_stream_connect(struct socket *, struct sockaddr *,
756			       int addr_len, int flags);
757static int unix_socketpair(struct socket *, struct socket *);
758static int unix_accept(struct socket *, struct socket *, int, bool);
759static int unix_getname(struct socket *, struct sockaddr *, int);
760static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
761static __poll_t unix_dgram_poll(struct file *, struct socket *,
762				    poll_table *);
763static int unix_ioctl(struct socket *, unsigned int, unsigned long);
764#ifdef CONFIG_COMPAT
765static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
766#endif
767static int unix_shutdown(struct socket *, int);
768static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
769static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
770static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
771				       struct pipe_inode_info *, size_t size,
772				       unsigned int flags);
773static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
774static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
775static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
776static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
777static int unix_dgram_connect(struct socket *, struct sockaddr *,
778			      int, int);
779static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
780static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
781				  int);
782
783#ifdef CONFIG_PROC_FS
784static int unix_count_nr_fds(struct sock *sk)
785{
786	struct sk_buff *skb;
787	struct unix_sock *u;
788	int nr_fds = 0;
789
790	spin_lock(&sk->sk_receive_queue.lock);
791	skb = skb_peek(&sk->sk_receive_queue);
792	while (skb) {
793		u = unix_sk(skb->sk);
794		nr_fds += atomic_read(&u->scm_stat.nr_fds);
795		skb = skb_peek_next(skb, &sk->sk_receive_queue);
796	}
797	spin_unlock(&sk->sk_receive_queue.lock);
798
799	return nr_fds;
800}
801
802static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
803{
804	struct sock *sk = sock->sk;
805	unsigned char s_state;
806	struct unix_sock *u;
807	int nr_fds = 0;
808
809	if (sk) {
810		s_state = READ_ONCE(sk->sk_state);
811		u = unix_sk(sk);
812
813		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
814		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
815		 * SOCK_DGRAM is ordinary. So, no lock is needed.
816		 */
817		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
818			nr_fds = atomic_read(&u->scm_stat.nr_fds);
819		else if (s_state == TCP_LISTEN)
820			nr_fds = unix_count_nr_fds(sk);
821
822		seq_printf(m, "scm_fds: %u\n", nr_fds);
823	}
824}
825#else
826#define unix_show_fdinfo NULL
827#endif
828
829static const struct proto_ops unix_stream_ops = {
830	.family =	PF_UNIX,
831	.owner =	THIS_MODULE,
832	.release =	unix_release,
833	.bind =		unix_bind,
834	.connect =	unix_stream_connect,
835	.socketpair =	unix_socketpair,
836	.accept =	unix_accept,
837	.getname =	unix_getname,
838	.poll =		unix_poll,
839	.ioctl =	unix_ioctl,
840#ifdef CONFIG_COMPAT
841	.compat_ioctl =	unix_compat_ioctl,
842#endif
843	.listen =	unix_listen,
844	.shutdown =	unix_shutdown,
845	.sendmsg =	unix_stream_sendmsg,
846	.recvmsg =	unix_stream_recvmsg,
847	.read_skb =	unix_stream_read_skb,
848	.mmap =		sock_no_mmap,
849	.splice_read =	unix_stream_splice_read,
850	.set_peek_off =	sk_set_peek_off,
851	.show_fdinfo =	unix_show_fdinfo,
852};
853
854static const struct proto_ops unix_dgram_ops = {
855	.family =	PF_UNIX,
856	.owner =	THIS_MODULE,
857	.release =	unix_release,
858	.bind =		unix_bind,
859	.connect =	unix_dgram_connect,
860	.socketpair =	unix_socketpair,
861	.accept =	sock_no_accept,
862	.getname =	unix_getname,
863	.poll =		unix_dgram_poll,
864	.ioctl =	unix_ioctl,
865#ifdef CONFIG_COMPAT
866	.compat_ioctl =	unix_compat_ioctl,
867#endif
868	.listen =	sock_no_listen,
869	.shutdown =	unix_shutdown,
870	.sendmsg =	unix_dgram_sendmsg,
871	.read_skb =	unix_read_skb,
872	.recvmsg =	unix_dgram_recvmsg,
873	.mmap =		sock_no_mmap,
874	.set_peek_off =	sk_set_peek_off,
875	.show_fdinfo =	unix_show_fdinfo,
876};
877
878static const struct proto_ops unix_seqpacket_ops = {
879	.family =	PF_UNIX,
880	.owner =	THIS_MODULE,
881	.release =	unix_release,
882	.bind =		unix_bind,
883	.connect =	unix_stream_connect,
884	.socketpair =	unix_socketpair,
885	.accept =	unix_accept,
886	.getname =	unix_getname,
887	.poll =		unix_dgram_poll,
888	.ioctl =	unix_ioctl,
889#ifdef CONFIG_COMPAT
890	.compat_ioctl =	unix_compat_ioctl,
891#endif
892	.listen =	unix_listen,
893	.shutdown =	unix_shutdown,
894	.sendmsg =	unix_seqpacket_sendmsg,
895	.recvmsg =	unix_seqpacket_recvmsg,
896	.mmap =		sock_no_mmap,
897	.set_peek_off =	sk_set_peek_off,
898	.show_fdinfo =	unix_show_fdinfo,
899};
900
901static void unix_close(struct sock *sk, long timeout)
902{
903	/* Nothing to do here, unix socket does not need a ->close().
904	 * This is merely for sockmap.
905	 */
906}
907
908static void unix_unhash(struct sock *sk)
909{
910	/* Nothing to do here, unix socket does not need a ->unhash().
911	 * This is merely for sockmap.
912	 */
913}
914
915static bool unix_bpf_bypass_getsockopt(int level, int optname)
916{
917	if (level == SOL_SOCKET) {
918		switch (optname) {
919		case SO_PEERPIDFD:
920			return true;
921		default:
922			return false;
923		}
924	}
925
926	return false;
927}
928
929struct proto unix_dgram_proto = {
930	.name			= "UNIX",
931	.owner			= THIS_MODULE,
932	.obj_size		= sizeof(struct unix_sock),
933	.close			= unix_close,
934	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
935#ifdef CONFIG_BPF_SYSCALL
936	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
937#endif
938};
939
940struct proto unix_stream_proto = {
941	.name			= "UNIX-STREAM",
942	.owner			= THIS_MODULE,
943	.obj_size		= sizeof(struct unix_sock),
944	.close			= unix_close,
945	.unhash			= unix_unhash,
946	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
947#ifdef CONFIG_BPF_SYSCALL
948	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
949#endif
950};
951
952static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
953{
954	struct unix_sock *u;
955	struct sock *sk;
956	int err;
957
958	atomic_long_inc(&unix_nr_socks);
959	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
960		err = -ENFILE;
961		goto err;
962	}
963
964	if (type == SOCK_STREAM)
965		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
966	else /*dgram and  seqpacket */
967		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
968
969	if (!sk) {
970		err = -ENOMEM;
971		goto err;
972	}
973
974	sock_init_data(sock, sk);
975
976	sk->sk_hash		= unix_unbound_hash(sk);
977	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
978	sk->sk_write_space	= unix_write_space;
979	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
980	sk->sk_destruct		= unix_sock_destructor;
981	u = unix_sk(sk);
982	u->inflight = 0;
983	u->path.dentry = NULL;
984	u->path.mnt = NULL;
985	spin_lock_init(&u->lock);
986	INIT_LIST_HEAD(&u->link);
987	mutex_init(&u->iolock); /* single task reading lock */
988	mutex_init(&u->bindlock); /* single task binding lock */
989	init_waitqueue_head(&u->peer_wait);
990	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
991	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
992	unix_insert_unbound_socket(net, sk);
993
994	sock_prot_inuse_add(net, sk->sk_prot, 1);
995
996	return sk;
997
998err:
999	atomic_long_dec(&unix_nr_socks);
1000	return ERR_PTR(err);
1001}
1002
1003static int unix_create(struct net *net, struct socket *sock, int protocol,
1004		       int kern)
1005{
1006	struct sock *sk;
1007
1008	if (protocol && protocol != PF_UNIX)
1009		return -EPROTONOSUPPORT;
1010
1011	sock->state = SS_UNCONNECTED;
1012
1013	switch (sock->type) {
1014	case SOCK_STREAM:
1015		sock->ops = &unix_stream_ops;
1016		break;
1017		/*
1018		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
1019		 *	nothing uses it.
1020		 */
1021	case SOCK_RAW:
1022		sock->type = SOCK_DGRAM;
1023		fallthrough;
1024	case SOCK_DGRAM:
1025		sock->ops = &unix_dgram_ops;
1026		break;
1027	case SOCK_SEQPACKET:
1028		sock->ops = &unix_seqpacket_ops;
1029		break;
1030	default:
1031		return -ESOCKTNOSUPPORT;
1032	}
1033
1034	sk = unix_create1(net, sock, kern, sock->type);
1035	if (IS_ERR(sk))
1036		return PTR_ERR(sk);
1037
1038	return 0;
1039}
1040
1041static int unix_release(struct socket *sock)
1042{
1043	struct sock *sk = sock->sk;
1044
1045	if (!sk)
1046		return 0;
1047
1048	sk->sk_prot->close(sk, 0);
1049	unix_release_sock(sk, 0);
1050	sock->sk = NULL;
1051
1052	return 0;
1053}
1054
1055static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
1056				  int type)
1057{
1058	struct inode *inode;
1059	struct path path;
1060	struct sock *sk;
1061	int err;
1062
1063	unix_mkname_bsd(sunaddr, addr_len);
1064	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
1065	if (err)
1066		goto fail;
1067
1068	err = path_permission(&path, MAY_WRITE);
1069	if (err)
1070		goto path_put;
1071
1072	err = -ECONNREFUSED;
1073	inode = d_backing_inode(path.dentry);
1074	if (!S_ISSOCK(inode->i_mode))
1075		goto path_put;
1076
1077	sk = unix_find_socket_byinode(inode);
1078	if (!sk)
1079		goto path_put;
1080
1081	err = -EPROTOTYPE;
1082	if (sk->sk_type == type)
1083		touch_atime(&path);
1084	else
1085		goto sock_put;
1086
1087	path_put(&path);
1088
1089	return sk;
1090
1091sock_put:
1092	sock_put(sk);
1093path_put:
1094	path_put(&path);
1095fail:
1096	return ERR_PTR(err);
1097}
1098
1099static struct sock *unix_find_abstract(struct net *net,
1100				       struct sockaddr_un *sunaddr,
1101				       int addr_len, int type)
1102{
1103	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
1104	struct dentry *dentry;
1105	struct sock *sk;
1106
1107	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
1108	if (!sk)
1109		return ERR_PTR(-ECONNREFUSED);
1110
1111	dentry = unix_sk(sk)->path.dentry;
1112	if (dentry)
1113		touch_atime(&unix_sk(sk)->path);
1114
1115	return sk;
1116}
1117
1118static struct sock *unix_find_other(struct net *net,
1119				    struct sockaddr_un *sunaddr,
1120				    int addr_len, int type)
1121{
1122	struct sock *sk;
1123
1124	if (sunaddr->sun_path[0])
1125		sk = unix_find_bsd(sunaddr, addr_len, type);
1126	else
1127		sk = unix_find_abstract(net, sunaddr, addr_len, type);
1128
1129	return sk;
1130}
1131
1132static int unix_autobind(struct sock *sk)
1133{
1134	unsigned int new_hash, old_hash = sk->sk_hash;
1135	struct unix_sock *u = unix_sk(sk);
1136	struct net *net = sock_net(sk);
1137	struct unix_address *addr;
1138	u32 lastnum, ordernum;
1139	int err;
1140
1141	err = mutex_lock_interruptible(&u->bindlock);
1142	if (err)
1143		return err;
1144
1145	if (u->addr)
1146		goto out;
1147
1148	err = -ENOMEM;
1149	addr = kzalloc(sizeof(*addr) +
1150		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1151	if (!addr)
1152		goto out;
1153
1154	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1155	addr->name->sun_family = AF_UNIX;
1156	refcount_set(&addr->refcnt, 1);
1157
1158	ordernum = get_random_u32();
1159	lastnum = ordernum & 0xFFFFF;
1160retry:
1161	ordernum = (ordernum + 1) & 0xFFFFF;
1162	sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1163
1164	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1165	unix_table_double_lock(net, old_hash, new_hash);
1166
1167	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1168		unix_table_double_unlock(net, old_hash, new_hash);
1169
1170		/* __unix_find_socket_byname() may take long time if many names
1171		 * are already in use.
1172		 */
1173		cond_resched();
1174
1175		if (ordernum == lastnum) {
1176			/* Give up if all names seems to be in use. */
1177			err = -ENOSPC;
1178			unix_release_addr(addr);
1179			goto out;
1180		}
1181
1182		goto retry;
1183	}
1184
1185	__unix_set_addr_hash(net, sk, addr, new_hash);
1186	unix_table_double_unlock(net, old_hash, new_hash);
1187	err = 0;
1188
1189out:	mutex_unlock(&u->bindlock);
1190	return err;
1191}
1192
1193static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1194			 int addr_len)
1195{
1196	umode_t mode = S_IFSOCK |
1197	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1198	unsigned int new_hash, old_hash = sk->sk_hash;
1199	struct unix_sock *u = unix_sk(sk);
1200	struct net *net = sock_net(sk);
1201	struct mnt_idmap *idmap;
1202	struct unix_address *addr;
1203	struct dentry *dentry;
1204	struct path parent;
1205	int err;
1206
1207	addr_len = unix_mkname_bsd(sunaddr, addr_len);
1208	addr = unix_create_addr(sunaddr, addr_len);
1209	if (!addr)
1210		return -ENOMEM;
1211
1212	/*
1213	 * Get the parent directory, calculate the hash for last
1214	 * component.
1215	 */
1216	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1217	if (IS_ERR(dentry)) {
1218		err = PTR_ERR(dentry);
1219		goto out;
1220	}
1221
1222	/*
1223	 * All right, let's create it.
1224	 */
1225	idmap = mnt_idmap(parent.mnt);
1226	err = security_path_mknod(&parent, dentry, mode, 0);
1227	if (!err)
1228		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1229	if (err)
1230		goto out_path;
1231	err = mutex_lock_interruptible(&u->bindlock);
1232	if (err)
1233		goto out_unlink;
1234	if (u->addr)
1235		goto out_unlock;
1236
1237	new_hash = unix_bsd_hash(d_backing_inode(dentry));
1238	unix_table_double_lock(net, old_hash, new_hash);
1239	u->path.mnt = mntget(parent.mnt);
1240	u->path.dentry = dget(dentry);
1241	__unix_set_addr_hash(net, sk, addr, new_hash);
1242	unix_table_double_unlock(net, old_hash, new_hash);
1243	unix_insert_bsd_socket(sk);
1244	mutex_unlock(&u->bindlock);
1245	done_path_create(&parent, dentry);
1246	return 0;
1247
1248out_unlock:
1249	mutex_unlock(&u->bindlock);
1250	err = -EINVAL;
1251out_unlink:
1252	/* failed after successful mknod?  unlink what we'd created... */
1253	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1254out_path:
1255	done_path_create(&parent, dentry);
1256out:
1257	unix_release_addr(addr);
1258	return err == -EEXIST ? -EADDRINUSE : err;
1259}
1260
1261static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1262			      int addr_len)
1263{
1264	unsigned int new_hash, old_hash = sk->sk_hash;
1265	struct unix_sock *u = unix_sk(sk);
1266	struct net *net = sock_net(sk);
1267	struct unix_address *addr;
1268	int err;
1269
1270	addr = unix_create_addr(sunaddr, addr_len);
1271	if (!addr)
1272		return -ENOMEM;
1273
1274	err = mutex_lock_interruptible(&u->bindlock);
1275	if (err)
1276		goto out;
1277
1278	if (u->addr) {
1279		err = -EINVAL;
1280		goto out_mutex;
1281	}
1282
1283	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1284	unix_table_double_lock(net, old_hash, new_hash);
1285
1286	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1287		goto out_spin;
1288
1289	__unix_set_addr_hash(net, sk, addr, new_hash);
1290	unix_table_double_unlock(net, old_hash, new_hash);
1291	mutex_unlock(&u->bindlock);
1292	return 0;
1293
1294out_spin:
1295	unix_table_double_unlock(net, old_hash, new_hash);
1296	err = -EADDRINUSE;
1297out_mutex:
1298	mutex_unlock(&u->bindlock);
1299out:
1300	unix_release_addr(addr);
1301	return err;
1302}
1303
1304static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1305{
1306	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1307	struct sock *sk = sock->sk;
1308	int err;
1309
1310	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1311	    sunaddr->sun_family == AF_UNIX)
1312		return unix_autobind(sk);
1313
1314	err = unix_validate_addr(sunaddr, addr_len);
1315	if (err)
1316		return err;
1317
1318	if (sunaddr->sun_path[0])
1319		err = unix_bind_bsd(sk, sunaddr, addr_len);
1320	else
1321		err = unix_bind_abstract(sk, sunaddr, addr_len);
1322
1323	return err;
1324}
1325
1326static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1327{
1328	if (unlikely(sk1 == sk2) || !sk2) {
1329		unix_state_lock(sk1);
1330		return;
1331	}
1332	if (sk1 > sk2)
1333		swap(sk1, sk2);
1334
1335	unix_state_lock(sk1);
1336	unix_state_lock_nested(sk2, U_LOCK_SECOND);
1337}
1338
1339static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1340{
1341	if (unlikely(sk1 == sk2) || !sk2) {
1342		unix_state_unlock(sk1);
1343		return;
1344	}
1345	unix_state_unlock(sk1);
1346	unix_state_unlock(sk2);
1347}
1348
1349static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1350			      int alen, int flags)
1351{
1352	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1353	struct sock *sk = sock->sk;
1354	struct sock *other;
1355	int err;
1356
1357	err = -EINVAL;
1358	if (alen < offsetofend(struct sockaddr, sa_family))
1359		goto out;
1360
1361	if (addr->sa_family != AF_UNSPEC) {
1362		err = unix_validate_addr(sunaddr, alen);
1363		if (err)
1364			goto out;
1365
1366		err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
1367		if (err)
1368			goto out;
1369
1370		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1371		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1372		    !unix_sk(sk)->addr) {
1373			err = unix_autobind(sk);
1374			if (err)
1375				goto out;
1376		}
1377
1378restart:
1379		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1380		if (IS_ERR(other)) {
1381			err = PTR_ERR(other);
1382			goto out;
1383		}
1384
1385		unix_state_double_lock(sk, other);
1386
1387		/* Apparently VFS overslept socket death. Retry. */
1388		if (sock_flag(other, SOCK_DEAD)) {
1389			unix_state_double_unlock(sk, other);
1390			sock_put(other);
1391			goto restart;
1392		}
1393
1394		err = -EPERM;
1395		if (!unix_may_send(sk, other))
1396			goto out_unlock;
1397
1398		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1399		if (err)
1400			goto out_unlock;
1401
1402		sk->sk_state = other->sk_state = TCP_ESTABLISHED;
1403	} else {
1404		/*
1405		 *	1003.1g breaking connected state with AF_UNSPEC
1406		 */
1407		other = NULL;
1408		unix_state_double_lock(sk, other);
1409	}
1410
1411	/*
1412	 * If it was connected, reconnect.
1413	 */
1414	if (unix_peer(sk)) {
1415		struct sock *old_peer = unix_peer(sk);
1416
1417		unix_peer(sk) = other;
1418		if (!other)
1419			sk->sk_state = TCP_CLOSE;
1420		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1421
1422		unix_state_double_unlock(sk, other);
1423
1424		if (other != old_peer)
1425			unix_dgram_disconnected(sk, old_peer);
1426		sock_put(old_peer);
1427	} else {
1428		unix_peer(sk) = other;
1429		unix_state_double_unlock(sk, other);
1430	}
1431
1432	return 0;
1433
1434out_unlock:
1435	unix_state_double_unlock(sk, other);
1436	sock_put(other);
1437out:
1438	return err;
1439}
1440
1441static long unix_wait_for_peer(struct sock *other, long timeo)
1442	__releases(&unix_sk(other)->lock)
1443{
1444	struct unix_sock *u = unix_sk(other);
1445	int sched;
1446	DEFINE_WAIT(wait);
1447
1448	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1449
1450	sched = !sock_flag(other, SOCK_DEAD) &&
1451		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1452		unix_recvq_full_lockless(other);
1453
1454	unix_state_unlock(other);
1455
1456	if (sched)
1457		timeo = schedule_timeout(timeo);
1458
1459	finish_wait(&u->peer_wait, &wait);
1460	return timeo;
1461}
1462
1463static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1464			       int addr_len, int flags)
1465{
1466	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1467	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1468	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1469	struct net *net = sock_net(sk);
1470	struct sk_buff *skb = NULL;
1471	long timeo;
1472	int err;
1473	int st;
1474
1475	err = unix_validate_addr(sunaddr, addr_len);
1476	if (err)
1477		goto out;
1478
1479	err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
1480	if (err)
1481		goto out;
1482
1483	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1484	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
1485		err = unix_autobind(sk);
1486		if (err)
1487			goto out;
1488	}
1489
1490	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1491
1492	/* First of all allocate resources.
1493	   If we will make it after state is locked,
1494	   we will have to recheck all again in any case.
1495	 */
1496
1497	/* create new sock for complete connection */
1498	newsk = unix_create1(net, NULL, 0, sock->type);
1499	if (IS_ERR(newsk)) {
1500		err = PTR_ERR(newsk);
1501		newsk = NULL;
1502		goto out;
1503	}
1504
1505	err = -ENOMEM;
1506
1507	/* Allocate skb for sending to listening sock */
1508	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1509	if (skb == NULL)
1510		goto out;
1511
1512restart:
1513	/*  Find listening sock. */
1514	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1515	if (IS_ERR(other)) {
1516		err = PTR_ERR(other);
1517		other = NULL;
1518		goto out;
1519	}
1520
1521	/* Latch state of peer */
1522	unix_state_lock(other);
1523
1524	/* Apparently VFS overslept socket death. Retry. */
1525	if (sock_flag(other, SOCK_DEAD)) {
1526		unix_state_unlock(other);
1527		sock_put(other);
1528		goto restart;
1529	}
1530
1531	err = -ECONNREFUSED;
1532	if (other->sk_state != TCP_LISTEN)
1533		goto out_unlock;
1534	if (other->sk_shutdown & RCV_SHUTDOWN)
1535		goto out_unlock;
1536
1537	if (unix_recvq_full(other)) {
1538		err = -EAGAIN;
1539		if (!timeo)
1540			goto out_unlock;
1541
1542		timeo = unix_wait_for_peer(other, timeo);
1543
1544		err = sock_intr_errno(timeo);
1545		if (signal_pending(current))
1546			goto out;
1547		sock_put(other);
1548		goto restart;
1549	}
1550
1551	/* Latch our state.
1552
1553	   It is tricky place. We need to grab our state lock and cannot
1554	   drop lock on peer. It is dangerous because deadlock is
1555	   possible. Connect to self case and simultaneous
1556	   attempt to connect are eliminated by checking socket
1557	   state. other is TCP_LISTEN, if sk is TCP_LISTEN we
1558	   check this before attempt to grab lock.
1559
1560	   Well, and we have to recheck the state after socket locked.
1561	 */
1562	st = sk->sk_state;
1563
1564	switch (st) {
1565	case TCP_CLOSE:
1566		/* This is ok... continue with connect */
1567		break;
1568	case TCP_ESTABLISHED:
1569		/* Socket is already connected */
1570		err = -EISCONN;
1571		goto out_unlock;
1572	default:
1573		err = -EINVAL;
1574		goto out_unlock;
1575	}
1576
1577	unix_state_lock_nested(sk, U_LOCK_SECOND);
1578
1579	if (sk->sk_state != st) {
1580		unix_state_unlock(sk);
1581		unix_state_unlock(other);
1582		sock_put(other);
1583		goto restart;
1584	}
1585
1586	err = security_unix_stream_connect(sk, other, newsk);
1587	if (err) {
1588		unix_state_unlock(sk);
1589		goto out_unlock;
1590	}
1591
1592	/* The way is open! Fastly set all the necessary fields... */
1593
1594	sock_hold(sk);
1595	unix_peer(newsk)	= sk;
1596	newsk->sk_state		= TCP_ESTABLISHED;
1597	newsk->sk_type		= sk->sk_type;
1598	init_peercred(newsk);
1599	newu = unix_sk(newsk);
1600	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1601	otheru = unix_sk(other);
1602
1603	/* copy address information from listening to new sock
1604	 *
1605	 * The contents of *(otheru->addr) and otheru->path
1606	 * are seen fully set up here, since we have found
1607	 * otheru in hash under its lock.  Insertion into the
1608	 * hash chain we'd found it in had been done in an
1609	 * earlier critical area protected by the chain's lock,
1610	 * the same one where we'd set *(otheru->addr) contents,
1611	 * as well as otheru->path and otheru->addr itself.
1612	 *
1613	 * Using smp_store_release() here to set newu->addr
1614	 * is enough to make those stores, as well as stores
1615	 * to newu->path visible to anyone who gets newu->addr
1616	 * by smp_load_acquire().  IOW, the same warranties
1617	 * as for unix_sock instances bound in unix_bind() or
1618	 * in unix_autobind().
1619	 */
1620	if (otheru->path.dentry) {
1621		path_get(&otheru->path);
1622		newu->path = otheru->path;
1623	}
1624	refcount_inc(&otheru->addr->refcnt);
1625	smp_store_release(&newu->addr, otheru->addr);
1626
1627	/* Set credentials */
1628	copy_peercred(sk, other);
1629
1630	sock->state	= SS_CONNECTED;
1631	sk->sk_state	= TCP_ESTABLISHED;
1632	sock_hold(newsk);
1633
1634	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
1635	unix_peer(sk)	= newsk;
1636
1637	unix_state_unlock(sk);
1638
1639	/* take ten and send info to listening sock */
1640	spin_lock(&other->sk_receive_queue.lock);
1641	__skb_queue_tail(&other->sk_receive_queue, skb);
1642	spin_unlock(&other->sk_receive_queue.lock);
1643	unix_state_unlock(other);
1644	other->sk_data_ready(other);
1645	sock_put(other);
1646	return 0;
1647
1648out_unlock:
1649	if (other)
1650		unix_state_unlock(other);
1651
1652out:
1653	kfree_skb(skb);
1654	if (newsk)
1655		unix_release_sock(newsk, 0);
1656	if (other)
1657		sock_put(other);
1658	return err;
1659}
1660
1661static int unix_socketpair(struct socket *socka, struct socket *sockb)
1662{
1663	struct sock *ska = socka->sk, *skb = sockb->sk;
1664
1665	/* Join our sockets back to back */
1666	sock_hold(ska);
1667	sock_hold(skb);
1668	unix_peer(ska) = skb;
1669	unix_peer(skb) = ska;
1670	init_peercred(ska);
1671	init_peercred(skb);
1672
1673	ska->sk_state = TCP_ESTABLISHED;
1674	skb->sk_state = TCP_ESTABLISHED;
1675	socka->state  = SS_CONNECTED;
1676	sockb->state  = SS_CONNECTED;
1677	return 0;
1678}
1679
1680static void unix_sock_inherit_flags(const struct socket *old,
1681				    struct socket *new)
1682{
1683	if (test_bit(SOCK_PASSCRED, &old->flags))
1684		set_bit(SOCK_PASSCRED, &new->flags);
1685	if (test_bit(SOCK_PASSPIDFD, &old->flags))
1686		set_bit(SOCK_PASSPIDFD, &new->flags);
1687	if (test_bit(SOCK_PASSSEC, &old->flags))
1688		set_bit(SOCK_PASSSEC, &new->flags);
1689}
1690
1691static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1692		       bool kern)
1693{
1694	struct sock *sk = sock->sk;
1695	struct sock *tsk;
1696	struct sk_buff *skb;
1697	int err;
1698
1699	err = -EOPNOTSUPP;
1700	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1701		goto out;
1702
1703	err = -EINVAL;
1704	if (sk->sk_state != TCP_LISTEN)
1705		goto out;
1706
1707	/* If socket state is TCP_LISTEN it cannot change (for now...),
1708	 * so that no locks are necessary.
1709	 */
1710
1711	skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1712				&err);
1713	if (!skb) {
1714		/* This means receive shutdown. */
1715		if (err == 0)
1716			err = -EINVAL;
1717		goto out;
1718	}
1719
1720	tsk = skb->sk;
1721	skb_free_datagram(sk, skb);
1722	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1723
1724	/* attach accepted sock to socket */
1725	unix_state_lock(tsk);
1726	newsock->state = SS_CONNECTED;
1727	unix_sock_inherit_flags(sock, newsock);
1728	sock_graft(tsk, newsock);
1729	unix_state_unlock(tsk);
1730	return 0;
1731
1732out:
1733	return err;
1734}
1735
1736
1737static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1738{
1739	struct sock *sk = sock->sk;
1740	struct unix_address *addr;
1741	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1742	int err = 0;
1743
1744	if (peer) {
1745		sk = unix_peer_get(sk);
1746
1747		err = -ENOTCONN;
1748		if (!sk)
1749			goto out;
1750		err = 0;
1751	} else {
1752		sock_hold(sk);
1753	}
1754
1755	addr = smp_load_acquire(&unix_sk(sk)->addr);
1756	if (!addr) {
1757		sunaddr->sun_family = AF_UNIX;
1758		sunaddr->sun_path[0] = 0;
1759		err = offsetof(struct sockaddr_un, sun_path);
1760	} else {
1761		err = addr->len;
1762		memcpy(sunaddr, addr->name, addr->len);
1763
1764		if (peer)
1765			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1766					       CGROUP_UNIX_GETPEERNAME);
1767		else
1768			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1769					       CGROUP_UNIX_GETSOCKNAME);
1770	}
1771	sock_put(sk);
1772out:
1773	return err;
1774}
1775
1776/* The "user->unix_inflight" variable is protected by the garbage
1777 * collection lock, and we just read it locklessly here. If you go
1778 * over the limit, there might be a tiny race in actually noticing
1779 * it across threads. Tough.
1780 */
1781static inline bool too_many_unix_fds(struct task_struct *p)
1782{
1783	struct user_struct *user = current_user();
1784
1785	if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
1786		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
1787	return false;
1788}
1789
1790static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1791{
1792	int i;
1793
1794	if (too_many_unix_fds(current))
1795		return -ETOOMANYREFS;
1796
1797	/* Need to duplicate file references for the sake of garbage
1798	 * collection.  Otherwise a socket in the fps might become a
1799	 * candidate for GC while the skb is not yet queued.
1800	 */
1801	UNIXCB(skb).fp = scm_fp_dup(scm->fp);
1802	if (!UNIXCB(skb).fp)
1803		return -ENOMEM;
1804
1805	for (i = scm->fp->count - 1; i >= 0; i--)
1806		unix_inflight(scm->fp->user, scm->fp->fp[i]);
1807
1808	return 0;
1809}
1810
1811static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1812{
1813	int i;
1814
1815	scm->fp = UNIXCB(skb).fp;
1816	UNIXCB(skb).fp = NULL;
1817
1818	for (i = scm->fp->count - 1; i >= 0; i--)
1819		unix_notinflight(scm->fp->user, scm->fp->fp[i]);
1820}
1821
1822static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1823{
1824	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1825
1826	/*
1827	 * Garbage collection of unix sockets starts by selecting a set of
1828	 * candidate sockets which have reference only from being in flight
1829	 * (total_refs == inflight_refs).  This condition is checked once during
1830	 * the candidate collection phase, and candidates are marked as such, so
1831	 * that non-candidates can later be ignored.  While inflight_refs is
1832	 * protected by unix_gc_lock, total_refs (file count) is not, hence this
1833	 * is an instantaneous decision.
1834	 *
1835	 * Once a candidate, however, the socket must not be reinstalled into a
1836	 * file descriptor while the garbage collection is in progress.
1837	 *
1838	 * If the above conditions are met, then the directed graph of
1839	 * candidates (*) does not change while unix_gc_lock is held.
1840	 *
1841	 * Any operations that changes the file count through file descriptors
1842	 * (dup, close, sendmsg) does not change the graph since candidates are
1843	 * not installed in fds.
1844	 *
1845	 * Dequeing a candidate via recvmsg would install it into an fd, but
1846	 * that takes unix_gc_lock to decrement the inflight count, so it's
1847	 * serialized with garbage collection.
1848	 *
1849	 * MSG_PEEK is special in that it does not change the inflight count,
1850	 * yet does install the socket into an fd.  The following lock/unlock
1851	 * pair is to ensure serialization with garbage collection.  It must be
1852	 * done between incrementing the file count and installing the file into
1853	 * an fd.
1854	 *
1855	 * If garbage collection starts after the barrier provided by the
1856	 * lock/unlock, then it will see the elevated refcount and not mark this
1857	 * as a candidate.  If a garbage collection is already in progress
1858	 * before the file count was incremented, then the lock/unlock pair will
1859	 * ensure that garbage collection is finished before progressing to
1860	 * installing the fd.
1861	 *
1862	 * (*) A -> B where B is on the queue of A or B is on the queue of C
1863	 * which is on the queue of listening socket A.
1864	 */
1865	spin_lock(&unix_gc_lock);
1866	spin_unlock(&unix_gc_lock);
1867}
1868
1869static void unix_destruct_scm(struct sk_buff *skb)
1870{
1871	struct scm_cookie scm;
1872
1873	memset(&scm, 0, sizeof(scm));
1874	scm.pid  = UNIXCB(skb).pid;
1875	if (UNIXCB(skb).fp)
1876		unix_detach_fds(&scm, skb);
1877
1878	/* Alas, it calls VFS */
1879	/* So fscking what? fput() had been SMP-safe since the last Summer */
1880	scm_destroy(&scm);
1881	sock_wfree(skb);
1882}
1883
1884static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1885{
1886	int err = 0;
1887
1888	UNIXCB(skb).pid  = get_pid(scm->pid);
1889	UNIXCB(skb).uid = scm->creds.uid;
1890	UNIXCB(skb).gid = scm->creds.gid;
1891	UNIXCB(skb).fp = NULL;
1892	unix_get_secdata(scm, skb);
1893	if (scm->fp && send_fds)
1894		err = unix_attach_fds(scm, skb);
1895
1896	skb->destructor = unix_destruct_scm;
1897	return err;
1898}
1899
1900static bool unix_passcred_enabled(const struct socket *sock,
1901				  const struct sock *other)
1902{
1903	return test_bit(SOCK_PASSCRED, &sock->flags) ||
1904	       test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1905	       !other->sk_socket ||
1906	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1907	       test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1908}
1909
1910/*
1911 * Some apps rely on write() giving SCM_CREDENTIALS
1912 * We include credentials if source or destination socket
1913 * asserted SOCK_PASSCRED.
1914 */
1915static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1916			    const struct sock *other)
1917{
1918	if (UNIXCB(skb).pid)
1919		return;
1920	if (unix_passcred_enabled(sock, other)) {
1921		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1922		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1923	}
1924}
1925
1926static bool unix_skb_scm_eq(struct sk_buff *skb,
1927			    struct scm_cookie *scm)
1928{
1929	return UNIXCB(skb).pid == scm->pid &&
1930	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1931	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1932	       unix_secdata_eq(scm, skb);
1933}
1934
1935static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1936{
1937	struct scm_fp_list *fp = UNIXCB(skb).fp;
1938	struct unix_sock *u = unix_sk(sk);
1939
1940	if (unlikely(fp && fp->count))
1941		atomic_add(fp->count, &u->scm_stat.nr_fds);
1942}
1943
1944static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1945{
1946	struct scm_fp_list *fp = UNIXCB(skb).fp;
1947	struct unix_sock *u = unix_sk(sk);
1948
1949	if (unlikely(fp && fp->count))
1950		atomic_sub(fp->count, &u->scm_stat.nr_fds);
1951}
1952
1953/*
1954 *	Send AF_UNIX data.
1955 */
1956
1957static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1958			      size_t len)
1959{
1960	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1961	struct sock *sk = sock->sk, *other = NULL;
1962	struct unix_sock *u = unix_sk(sk);
1963	struct scm_cookie scm;
1964	struct sk_buff *skb;
1965	int data_len = 0;
1966	int sk_locked;
1967	long timeo;
1968	int err;
1969
1970	err = scm_send(sock, msg, &scm, false);
1971	if (err < 0)
1972		return err;
1973
1974	wait_for_unix_gc(scm.fp);
1975
1976	err = -EOPNOTSUPP;
1977	if (msg->msg_flags&MSG_OOB)
1978		goto out;
1979
1980	if (msg->msg_namelen) {
1981		err = unix_validate_addr(sunaddr, msg->msg_namelen);
1982		if (err)
1983			goto out;
1984
1985		err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
1986							    msg->msg_name,
1987							    &msg->msg_namelen,
1988							    NULL);
1989		if (err)
1990			goto out;
1991	} else {
1992		sunaddr = NULL;
1993		err = -ENOTCONN;
1994		other = unix_peer_get(sk);
1995		if (!other)
1996			goto out;
1997	}
1998
1999	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
2000	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
2001		err = unix_autobind(sk);
2002		if (err)
2003			goto out;
2004	}
2005
2006	err = -EMSGSIZE;
2007	if (len > sk->sk_sndbuf - 32)
2008		goto out;
2009
2010	if (len > SKB_MAX_ALLOC) {
2011		data_len = min_t(size_t,
2012				 len - SKB_MAX_ALLOC,
2013				 MAX_SKB_FRAGS * PAGE_SIZE);
2014		data_len = PAGE_ALIGN(data_len);
2015
2016		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
2017	}
2018
2019	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
2020				   msg->msg_flags & MSG_DONTWAIT, &err,
2021				   PAGE_ALLOC_COSTLY_ORDER);
2022	if (skb == NULL)
2023		goto out;
2024
2025	err = unix_scm_to_skb(&scm, skb, true);
2026	if (err < 0)
2027		goto out_free;
2028
2029	skb_put(skb, len - data_len);
2030	skb->data_len = data_len;
2031	skb->len = len;
2032	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
2033	if (err)
2034		goto out_free;
2035
2036	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
2037
2038restart:
2039	if (!other) {
2040		err = -ECONNRESET;
2041		if (sunaddr == NULL)
2042			goto out_free;
2043
2044		other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
2045					sk->sk_type);
2046		if (IS_ERR(other)) {
2047			err = PTR_ERR(other);
2048			other = NULL;
2049			goto out_free;
2050		}
2051	}
2052
2053	if (sk_filter(other, skb) < 0) {
2054		/* Toss the packet but do not return any error to the sender */
2055		err = len;
2056		goto out_free;
2057	}
2058
2059	sk_locked = 0;
2060	unix_state_lock(other);
2061restart_locked:
2062	err = -EPERM;
2063	if (!unix_may_send(sk, other))
2064		goto out_unlock;
2065
2066	if (unlikely(sock_flag(other, SOCK_DEAD))) {
2067		/*
2068		 *	Check with 1003.1g - what should a
2069		 *	datagram error return here?
2070		 */
2071		unix_state_unlock(other);
2072		sock_put(other);
2073
2074		if (!sk_locked)
2075			unix_state_lock(sk);
2076
2077		err = 0;
2078		if (sk->sk_type == SOCK_SEQPACKET) {
2079			/* We are here only when racing with unix_release_sock(),
2080			 * which is clearing @other. Unlike SOCK_DGRAM, never
2081			 * change the state to TCP_CLOSE.
2082			 */
2083			unix_state_unlock(sk);
2084			err = -EPIPE;
2085		} else if (unix_peer(sk) == other) {
2086			unix_peer(sk) = NULL;
2087			unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2088
2089			sk->sk_state = TCP_CLOSE;
2090			unix_state_unlock(sk);
2091
2092			unix_dgram_disconnected(sk, other);
2093			sock_put(other);
2094			err = -ECONNREFUSED;
2095		} else {
2096			unix_state_unlock(sk);
2097		}
2098
2099		other = NULL;
2100		if (err)
2101			goto out_free;
2102		goto restart;
2103	}
2104
2105	err = -EPIPE;
2106	if (other->sk_shutdown & RCV_SHUTDOWN)
2107		goto out_unlock;
2108
2109	if (sk->sk_type != SOCK_SEQPACKET) {
2110		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2111		if (err)
2112			goto out_unlock;
2113	}
2114
2115		/* other == sk && unix_peer(other) != sk if
2116		 * - unix_peer(sk) == NULL, and the destination address is bound to sk
2117		 * - unix_peer(sk) == sk when fetched, but it disconnected before the lock
2118		 */
2119	if (other != sk &&
2120	    unlikely(unix_peer(other) != sk &&
2121	    unix_recvq_full_lockless(other))) {
2122		if (timeo) {
2123			timeo = unix_wait_for_peer(other, timeo);
2124
2125			err = sock_intr_errno(timeo);
2126			if (signal_pending(current))
2127				goto out_free;
2128
2129			goto restart;
2130		}
2131
2132		if (!sk_locked) {
2133			unix_state_unlock(other);
2134			unix_state_double_lock(sk, other);
2135		}
2136
2137		if (unix_peer(sk) != other ||
2138		    unix_dgram_peer_wake_me(sk, other)) {
2139			err = -EAGAIN;
2140			sk_locked = 1;
2141			goto out_unlock;
2142		}
2143
2144		if (!sk_locked) {
2145			sk_locked = 1;
2146			goto restart_locked;
2147		}
2148	}
2149
2150	if (unlikely(sk_locked))
2151		unix_state_unlock(sk);
2152
2153	if (sock_flag(other, SOCK_RCVTSTAMP))
2154		__net_timestamp(skb);
2155	maybe_add_creds(skb, sock, other);
2156	scm_stat_add(other, skb);
2157	skb_queue_tail(&other->sk_receive_queue, skb);
2158	unix_state_unlock(other);
2159	other->sk_data_ready(other);
2160	sock_put(other);
2161	scm_destroy(&scm);
2162	return len;
2163
2164out_unlock:
2165	if (sk_locked)
2166		unix_state_unlock(sk);
2167	unix_state_unlock(other);
2168out_free:
2169	kfree_skb(skb);
2170out:
2171	if (other)
2172		sock_put(other);
2173	scm_destroy(&scm);
2174	return err;
2175}
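/* Illustrative only - a hedged userspace sketch, not part of this file: the
 * unconnected path of unix_dgram_sendmsg() above is what a plain sendto()
 * to a (hypothetical) filesystem address exercises.
 *
 *	struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *	int fd = socket(AF_UNIX, SOCK_DGRAM, 0);
 *
 *	strncpy(sun.sun_path, "/tmp/server.sock", sizeof(sun.sun_path) - 1);
 *	sendto(fd, "ping", 4, 0, (struct sockaddr *)&sun, sizeof(sun));
 */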
2176
2177	/* We use paged skbs for stream sockets, and limit occupancy to 32768
2178	 * bytes, with a minimum of one full page.
2179	 */
2180#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
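/* Worked example of the limit above (assuming the usual get_order()
 * semantics): with 4 KiB pages, get_order(32768) == 3, so
 * UNIX_SKB_FRAGS_SZ == 4096 << 3 == 32768 bytes; with 64 KiB pages,
 * get_order(32768) == 0 and the macro still yields one full 65536-byte
 * page - the "minimum of one full page" noted above.
 */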
2181
2182#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2183static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2184		     struct scm_cookie *scm, bool fds_sent)
2185{
2186	struct unix_sock *ousk = unix_sk(other);
2187	struct sk_buff *skb;
2188	int err = 0;
2189
2190	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2191
2192	if (!skb)
2193		return err;
2194
2195	err = unix_scm_to_skb(scm, skb, !fds_sent);
2196	if (err < 0) {
2197		kfree_skb(skb);
2198		return err;
2199	}
2200	skb_put(skb, 1);
2201	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2202
2203	if (err) {
2204		kfree_skb(skb);
2205		return err;
2206	}
2207
2208	unix_state_lock(other);
2209
2210	if (sock_flag(other, SOCK_DEAD) ||
2211	    (other->sk_shutdown & RCV_SHUTDOWN)) {
2212		unix_state_unlock(other);
2213		kfree_skb(skb);
2214		return -EPIPE;
2215	}
2216
2217	maybe_add_creds(skb, sock, other);
2218	skb_get(skb);
2219
2220	if (ousk->oob_skb)
2221		consume_skb(ousk->oob_skb);
2222
2223	WRITE_ONCE(ousk->oob_skb, skb);
2224
2225	scm_stat_add(other, skb);
2226	skb_queue_tail(&other->sk_receive_queue, skb);
2227	sk_send_sigurg(other);
2228	unix_state_unlock(other);
2229	other->sk_data_ready(other);
2230
2231	return err;
2232}
2233#endif
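/* Illustrative only - a hedged userspace sketch, not part of this file:
 * with CONFIG_AF_UNIX_OOB the single out-of-band byte queued by
 * queue_oob() above is used much like TCP urgent data on a connected
 * SOCK_STREAM pair.
 *
 *	int at_mark = 0;
 *	char c;
 *
 *	send(peer_fd, "!", 1, MSG_OOB);
 *
 *	ioctl(fd, SIOCATMARK, &at_mark);	(1 when the next byte is OOB)
 *	recv(fd, &c, 1, MSG_OOB);
 */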
2234
2235static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2236			       size_t len)
2237{
2238	struct sock *sk = sock->sk;
2239	struct sock *other = NULL;
2240	int err, size;
2241	struct sk_buff *skb;
2242	int sent = 0;
2243	struct scm_cookie scm;
2244	bool fds_sent = false;
2245	int data_len;
2246
2247	err = scm_send(sock, msg, &scm, false);
2248	if (err < 0)
2249		return err;
2250
2251	wait_for_unix_gc(scm.fp);
2252
2253	err = -EOPNOTSUPP;
2254	if (msg->msg_flags & MSG_OOB) {
2255#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2256		if (len)
2257			len--;
2258		else
2259#endif
2260			goto out_err;
2261	}
2262
2263	if (msg->msg_namelen) {
2264		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2265		goto out_err;
2266	} else {
2267		err = -ENOTCONN;
2268		other = unix_peer(sk);
2269		if (!other)
2270			goto out_err;
2271	}
2272
2273	if (sk->sk_shutdown & SEND_SHUTDOWN)
2274		goto pipe_err;
2275
2276	while (sent < len) {
2277		size = len - sent;
2278
2279		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2280			skb = sock_alloc_send_pskb(sk, 0, 0,
2281						   msg->msg_flags & MSG_DONTWAIT,
2282						   &err, 0);
2283		} else {
2284			/* Keep two messages in the pipe so it schedules better */
2285			size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
2286
2287			/* allow fallback to order-0 allocations */
2288			size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2289
2290			data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2291
2292			data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2293
2294			skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2295						   msg->msg_flags & MSG_DONTWAIT, &err,
2296						   get_order(UNIX_SKB_FRAGS_SZ));
2297		}
2298		if (!skb)
2299			goto out_err;
2300
2301		/* Only send the fds in the first buffer */
2302		err = unix_scm_to_skb(&scm, skb, !fds_sent);
2303		if (err < 0) {
2304			kfree_skb(skb);
2305			goto out_err;
2306		}
2307		fds_sent = true;
2308
2309		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2310			err = skb_splice_from_iter(skb, &msg->msg_iter, size,
2311						   sk->sk_allocation);
2312			if (err < 0) {
2313				kfree_skb(skb);
2314				goto out_err;
2315			}
2316			size = err;
2317			refcount_add(size, &sk->sk_wmem_alloc);
2318		} else {
2319			skb_put(skb, size - data_len);
2320			skb->data_len = data_len;
2321			skb->len = size;
2322			err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2323			if (err) {
2324				kfree_skb(skb);
2325				goto out_err;
2326			}
2327		}
2328
2329		unix_state_lock(other);
2330
2331		if (sock_flag(other, SOCK_DEAD) ||
2332		    (other->sk_shutdown & RCV_SHUTDOWN))
2333			goto pipe_err_free;
2334
2335		maybe_add_creds(skb, sock, other);
2336		scm_stat_add(other, skb);
2337		skb_queue_tail(&other->sk_receive_queue, skb);
2338		unix_state_unlock(other);
2339		other->sk_data_ready(other);
2340		sent += size;
2341	}
2342
2343#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2344	if (msg->msg_flags & MSG_OOB) {
2345		err = queue_oob(sock, msg, other, &scm, fds_sent);
2346		if (err)
2347			goto out_err;
2348		sent++;
2349	}
2350#endif
2351
2352	scm_destroy(&scm);
2353
2354	return sent;
2355
2356pipe_err_free:
2357	unix_state_unlock(other);
2358	kfree_skb(skb);
2359pipe_err:
2360	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2361		send_sig(SIGPIPE, current, 0);
2362	err = -EPIPE;
2363out_err:
2364	scm_destroy(&scm);
2365	return sent ? : err;
2366}
2367
2368static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2369				  size_t len)
2370{
2371	int err;
2372	struct sock *sk = sock->sk;
2373
2374	err = sock_error(sk);
2375	if (err)
2376		return err;
2377
2378	if (sk->sk_state != TCP_ESTABLISHED)
2379		return -ENOTCONN;
2380
2381	if (msg->msg_namelen)
2382		msg->msg_namelen = 0;
2383
2384	return unix_dgram_sendmsg(sock, msg, len);
2385}
2386
2387static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2388				  size_t size, int flags)
2389{
2390	struct sock *sk = sock->sk;
2391
2392	if (sk->sk_state != TCP_ESTABLISHED)
2393		return -ENOTCONN;
2394
2395	return unix_dgram_recvmsg(sock, msg, size, flags);
2396}
2397
2398static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2399{
2400	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2401
2402	if (addr) {
2403		msg->msg_namelen = addr->len;
2404		memcpy(msg->msg_name, addr->name, addr->len);
2405	}
2406}
2407
2408int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2409			 int flags)
2410{
2411	struct scm_cookie scm;
2412	struct socket *sock = sk->sk_socket;
2413	struct unix_sock *u = unix_sk(sk);
2414	struct sk_buff *skb, *last;
2415	long timeo;
2416	int skip;
2417	int err;
2418
2419	err = -EOPNOTSUPP;
2420	if (flags&MSG_OOB)
2421		goto out;
2422
2423	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2424
2425	do {
2426		mutex_lock(&u->iolock);
2427
2428		skip = sk_peek_offset(sk, flags);
2429		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2430					      &skip, &err, &last);
2431		if (skb) {
2432			if (!(flags & MSG_PEEK))
2433				scm_stat_del(sk, skb);
2434			break;
2435		}
2436
2437		mutex_unlock(&u->iolock);
2438
2439		if (err != -EAGAIN)
2440			break;
2441	} while (timeo &&
2442		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2443					      &err, &timeo, last));
2444
2445	if (!skb) { /* implies iolock unlocked */
2446		unix_state_lock(sk);
2447		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2448		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2449		    (sk->sk_shutdown & RCV_SHUTDOWN))
2450			err = 0;
2451		unix_state_unlock(sk);
2452		goto out;
2453	}
2454
2455	if (wq_has_sleeper(&u->peer_wait))
2456		wake_up_interruptible_sync_poll(&u->peer_wait,
2457						EPOLLOUT | EPOLLWRNORM |
2458						EPOLLWRBAND);
2459
2460	if (msg->msg_name) {
2461		unix_copy_addr(msg, skb->sk);
2462
2463		BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2464						      msg->msg_name,
2465						      &msg->msg_namelen);
2466	}
2467
2468	if (size > skb->len - skip)
2469		size = skb->len - skip;
2470	else if (size < skb->len - skip)
2471		msg->msg_flags |= MSG_TRUNC;
2472
2473	err = skb_copy_datagram_msg(skb, skip, msg, size);
2474	if (err)
2475		goto out_free;
2476
2477	if (sock_flag(sk, SOCK_RCVTSTAMP))
2478		__sock_recv_timestamp(msg, sk, skb);
2479
2480	memset(&scm, 0, sizeof(scm));
2481
2482	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2483	unix_set_secdata(&scm, skb);
2484
2485	if (!(flags & MSG_PEEK)) {
2486		if (UNIXCB(skb).fp)
2487			unix_detach_fds(&scm, skb);
2488
2489		sk_peek_offset_bwd(sk, skb->len);
2490	} else {
2491		/* It is questionable: on PEEK we could:
2492		   - not return fds - good, but too simple 8)
2493		   - return fds, and not return them on read (old strategy,
2494		     apparently wrong)
2495		   - clone fds (chosen for now, as it is the most universal
2496		     solution)
2497	
2498		   POSIX 1003.1g does not actually define this clearly
2499		   at all - but then, POSIX 1003.1g doesn't define a lot
2500		   of things clearly!
2501	
2502		*/
2503
2504		sk_peek_offset_fwd(sk, size);
2505
2506		if (UNIXCB(skb).fp)
2507			unix_peek_fds(&scm, skb);
2508	}
2509	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2510
2511	scm_recv_unix(sock, msg, &scm, flags);
2512
2513out_free:
2514	skb_free_datagram(sk, skb);
2515	mutex_unlock(&u->iolock);
2516out:
2517	return err;
2518}
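/* Illustrative only - a hedged userspace sketch, not part of this file:
 * file descriptors detached above arrive as an SCM_RIGHTS control message;
 * as the PEEK note above explains, peeking clones them, so the same fds are
 * delivered again by the later non-peeking read.
 *
 *	int fds[2];
 *	char data[1], cbuf[CMSG_SPACE(sizeof(fds))];
 *	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
 *	struct msghdr mh = { .msg_iov = &iov, .msg_iovlen = 1,
 *			     .msg_control = cbuf, .msg_controllen = sizeof(cbuf) };
 *	struct cmsghdr *cm;
 *
 *	recvmsg(fd, &mh, 0);
 *	cm = CMSG_FIRSTHDR(&mh);
 *	if (cm && cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS)
 *		memcpy(fds, CMSG_DATA(cm), sizeof(fds));
 */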
2519
2520static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2521			      int flags)
2522{
2523	struct sock *sk = sock->sk;
2524
2525#ifdef CONFIG_BPF_SYSCALL
2526	const struct proto *prot = READ_ONCE(sk->sk_prot);
2527
2528	if (prot != &unix_dgram_proto)
2529		return prot->recvmsg(sk, msg, size, flags, NULL);
2530#endif
2531	return __unix_dgram_recvmsg(sk, msg, size, flags);
2532}
2533
2534static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2535{
2536	struct unix_sock *u = unix_sk(sk);
2537	struct sk_buff *skb;
2538	int err;
2539
2540	mutex_lock(&u->iolock);
2541	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2542	mutex_unlock(&u->iolock);
2543	if (!skb)
2544		return err;
2545
2546	return recv_actor(sk, skb);
2547}
2548
2549/*
2550	 *	Sleep until more data has arrived. But check for races.
2551 */
2552static long unix_stream_data_wait(struct sock *sk, long timeo,
2553				  struct sk_buff *last, unsigned int last_len,
2554				  bool freezable)
2555{
2556	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2557	struct sk_buff *tail;
2558	DEFINE_WAIT(wait);
2559
2560	unix_state_lock(sk);
2561
2562	for (;;) {
2563		prepare_to_wait(sk_sleep(sk), &wait, state);
2564
2565		tail = skb_peek_tail(&sk->sk_receive_queue);
2566		if (tail != last ||
2567		    (tail && tail->len != last_len) ||
2568		    sk->sk_err ||
2569		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
2570		    signal_pending(current) ||
2571		    !timeo)
2572			break;
2573
2574		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2575		unix_state_unlock(sk);
2576		timeo = schedule_timeout(timeo);
2577		unix_state_lock(sk);
2578
2579		if (sock_flag(sk, SOCK_DEAD))
2580			break;
2581
2582		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2583	}
2584
2585	finish_wait(sk_sleep(sk), &wait);
2586	unix_state_unlock(sk);
2587	return timeo;
2588}
2589
2590static unsigned int unix_skb_len(const struct sk_buff *skb)
2591{
2592	return skb->len - UNIXCB(skb).consumed;
2593}
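/* E.g. an skb with skb->len == 100 of which UNIXCB(skb).consumed == 40 has
 * already been copied out leaves unix_skb_len() == 60; the stream read loop
 * below only unlinks the skb once this reaches zero.
 */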
2594
2595struct unix_stream_read_state {
2596	int (*recv_actor)(struct sk_buff *, int, int,
2597			  struct unix_stream_read_state *);
2598	struct socket *socket;
2599	struct msghdr *msg;
2600	struct pipe_inode_info *pipe;
2601	size_t size;
2602	int flags;
2603	unsigned int splice_flags;
2604};
2605
2606#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2607static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2608{
2609	struct socket *sock = state->socket;
2610	struct sock *sk = sock->sk;
2611	struct unix_sock *u = unix_sk(sk);
2612	int chunk = 1;
2613	struct sk_buff *oob_skb;
2614
2615	mutex_lock(&u->iolock);
2616	unix_state_lock(sk);
2617
2618	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2619		unix_state_unlock(sk);
2620		mutex_unlock(&u->iolock);
2621		return -EINVAL;
2622	}
2623
2624	oob_skb = u->oob_skb;
2625
2626	if (!(state->flags & MSG_PEEK))
2627		WRITE_ONCE(u->oob_skb, NULL);
2628	else
2629		skb_get(oob_skb);
2630	unix_state_unlock(sk);
2631
2632	chunk = state->recv_actor(oob_skb, 0, chunk, state);
2633
2634	if (!(state->flags & MSG_PEEK))
2635		UNIXCB(oob_skb).consumed += 1;
2636
2637	consume_skb(oob_skb);
2638
2639	mutex_unlock(&u->iolock);
2640
2641	if (chunk < 0)
2642		return -EFAULT;
2643
2644	state->msg->msg_flags |= MSG_OOB;
2645	return 1;
2646}
2647
2648static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2649				  int flags, int copied)
2650{
2651	struct unix_sock *u = unix_sk(sk);
2652
2653	if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
2654		skb_unlink(skb, &sk->sk_receive_queue);
2655		consume_skb(skb);
2656		skb = NULL;
2657	} else {
2658		if (skb == u->oob_skb) {
2659			if (copied) {
2660				skb = NULL;
2661			} else if (sock_flag(sk, SOCK_URGINLINE)) {
2662				if (!(flags & MSG_PEEK)) {
2663					WRITE_ONCE(u->oob_skb, NULL);
2664					consume_skb(skb);
2665				}
2666			} else if (flags & MSG_PEEK) {
2667				skb = NULL;
2668			} else {
2669				skb_unlink(skb, &sk->sk_receive_queue);
2670				WRITE_ONCE(u->oob_skb, NULL);
2671				if (!WARN_ON_ONCE(skb_unref(skb)))
2672					kfree_skb(skb);
2673				skb = skb_peek(&sk->sk_receive_queue);
2674			}
2675		}
2676	}
2677	return skb;
2678}
2679#endif
2680
2681static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2682{
2683	if (unlikely(sk->sk_state != TCP_ESTABLISHED))
2684		return -ENOTCONN;
2685
2686	return unix_read_skb(sk, recv_actor);
2687}
2688
2689static int unix_stream_read_generic(struct unix_stream_read_state *state,
2690				    bool freezable)
2691{
2692	struct scm_cookie scm;
2693	struct socket *sock = state->socket;
2694	struct sock *sk = sock->sk;
2695	struct unix_sock *u = unix_sk(sk);
2696	int copied = 0;
2697	int flags = state->flags;
2698	int noblock = flags & MSG_DONTWAIT;
2699	bool check_creds = false;
2700	int target;
2701	int err = 0;
2702	long timeo;
2703	int skip;
2704	size_t size = state->size;
2705	unsigned int last_len;
2706
2707	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
2708		err = -EINVAL;
2709		goto out;
2710	}
2711
2712	if (unlikely(flags & MSG_OOB)) {
2713		err = -EOPNOTSUPP;
2714#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2715		err = unix_stream_recv_urg(state);
2716#endif
2717		goto out;
2718	}
2719
2720	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2721	timeo = sock_rcvtimeo(sk, noblock);
2722
2723	memset(&scm, 0, sizeof(scm));
2724
2725		/* Lock the socket to prevent queue disordering
2726		 * while we sleep copying data out to the message.
2727		 */
2728	mutex_lock(&u->iolock);
2729
2730	skip = max(sk_peek_offset(sk, flags), 0);
2731
2732	do {
2733		int chunk;
2734		bool drop_skb;
2735		struct sk_buff *skb, *last;
2736
2737redo:
2738		unix_state_lock(sk);
2739		if (sock_flag(sk, SOCK_DEAD)) {
2740			err = -ECONNRESET;
2741			goto unlock;
2742		}
2743		last = skb = skb_peek(&sk->sk_receive_queue);
2744		last_len = last ? last->len : 0;
2745
2746again:
2747#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2748		if (skb) {
2749			skb = manage_oob(skb, sk, flags, copied);
2750			if (!skb && copied) {
2751				unix_state_unlock(sk);
2752				break;
2753			}
2754		}
2755#endif
2756		if (skb == NULL) {
2757			if (copied >= target)
2758				goto unlock;
2759
2760			/*
2761			 *	POSIX 1003.1g mandates this order.
2762			 */
2763
2764			err = sock_error(sk);
2765			if (err)
2766				goto unlock;
2767			if (sk->sk_shutdown & RCV_SHUTDOWN)
2768				goto unlock;
2769
2770			unix_state_unlock(sk);
2771			if (!timeo) {
2772				err = -EAGAIN;
2773				break;
2774			}
2775
2776			mutex_unlock(&u->iolock);
2777
2778			timeo = unix_stream_data_wait(sk, timeo, last,
2779						      last_len, freezable);
2780
2781			if (signal_pending(current)) {
2782				err = sock_intr_errno(timeo);
2783				scm_destroy(&scm);
2784				goto out;
2785			}
2786
2787			mutex_lock(&u->iolock);
2788			goto redo;
2789unlock:
2790			unix_state_unlock(sk);
2791			break;
2792		}
2793
2794		while (skip >= unix_skb_len(skb)) {
2795			skip -= unix_skb_len(skb);
2796			last = skb;
2797			last_len = skb->len;
2798			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2799			if (!skb)
2800				goto again;
2801		}
2802
2803		unix_state_unlock(sk);
2804
2805		if (check_creds) {
2806			/* Never glue messages from different writers */
2807			if (!unix_skb_scm_eq(skb, &scm))
2808				break;
2809		} else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
2810			   test_bit(SOCK_PASSPIDFD, &sock->flags)) {
2811			/* Copy credentials */
2812			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2813			unix_set_secdata(&scm, skb);
2814			check_creds = true;
2815		}
2816
2817		/* Copy address just once */
2818		if (state->msg && state->msg->msg_name) {
2819			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2820					 state->msg->msg_name);
2821			unix_copy_addr(state->msg, skb->sk);
2822
2823			BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2824							      state->msg->msg_name,
2825							      &state->msg->msg_namelen);
2826
2827			sunaddr = NULL;
2828		}
2829
2830		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2831		skb_get(skb);
2832		chunk = state->recv_actor(skb, skip, chunk, state);
2833		drop_skb = !unix_skb_len(skb);
2834		/* skb is only safe to use if !drop_skb */
2835		consume_skb(skb);
2836		if (chunk < 0) {
2837			if (copied == 0)
2838				copied = -EFAULT;
2839			break;
2840		}
2841		copied += chunk;
2842		size -= chunk;
2843
2844		if (drop_skb) {
2845			/* The skb was touched by a concurrent reader;
2846			 * we should not expect anything more from it
2847			 * and must assume it is invalid - we can be
2848			 * sure it was dropped from the socket queue.
2849			 *
2850			 * Let's report a short read.
2851			 */
2852			err = 0;
2853			break;
2854		}
2855
2856		/* Mark read part of skb as used */
2857		if (!(flags & MSG_PEEK)) {
2858			UNIXCB(skb).consumed += chunk;
2859
2860			sk_peek_offset_bwd(sk, chunk);
2861
2862			if (UNIXCB(skb).fp) {
2863				scm_stat_del(sk, skb);
2864				unix_detach_fds(&scm, skb);
2865			}
2866
2867			if (unix_skb_len(skb))
2868				break;
2869
2870			skb_unlink(skb, &sk->sk_receive_queue);
2871			consume_skb(skb);
2872
2873			if (scm.fp)
2874				break;
2875		} else {
2876			/* It is questionable, see note in unix_dgram_recvmsg.
2877			 */
2878			if (UNIXCB(skb).fp)
2879				unix_peek_fds(&scm, skb);
2880
2881			sk_peek_offset_fwd(sk, chunk);
2882
2883			if (UNIXCB(skb).fp)
2884				break;
2885
2886			skip = 0;
2887			last = skb;
2888			last_len = skb->len;
2889			unix_state_lock(sk);
2890			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2891			if (skb)
2892				goto again;
2893			unix_state_unlock(sk);
2894			break;
2895		}
2896	} while (size);
2897
2898	mutex_unlock(&u->iolock);
2899	if (state->msg)
2900		scm_recv_unix(sock, state->msg, &scm, flags);
2901	else
2902		scm_destroy(&scm);
2903out:
2904	return copied ? : err;
2905}
2906
2907static int unix_stream_read_actor(struct sk_buff *skb,
2908				  int skip, int chunk,
2909				  struct unix_stream_read_state *state)
2910{
2911	int ret;
2912
2913	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2914				    state->msg, chunk);
2915	return ret ?: chunk;
2916}
2917
2918int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2919			  size_t size, int flags)
2920{
2921	struct unix_stream_read_state state = {
2922		.recv_actor = unix_stream_read_actor,
2923		.socket = sk->sk_socket,
2924		.msg = msg,
2925		.size = size,
2926		.flags = flags
2927	};
2928
2929	return unix_stream_read_generic(&state, true);
2930}
2931
2932static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2933			       size_t size, int flags)
2934{
2935	struct unix_stream_read_state state = {
2936		.recv_actor = unix_stream_read_actor,
2937		.socket = sock,
2938		.msg = msg,
2939		.size = size,
2940		.flags = flags
2941	};
2942
2943#ifdef CONFIG_BPF_SYSCALL
2944	struct sock *sk = sock->sk;
2945	const struct proto *prot = READ_ONCE(sk->sk_prot);
2946
2947	if (prot != &unix_stream_proto)
2948		return prot->recvmsg(sk, msg, size, flags, NULL);
2949#endif
2950	return unix_stream_read_generic(&state, true);
2951}
2952
2953static int unix_stream_splice_actor(struct sk_buff *skb,
2954				    int skip, int chunk,
2955				    struct unix_stream_read_state *state)
2956{
2957	return skb_splice_bits(skb, state->socket->sk,
2958			       UNIXCB(skb).consumed + skip,
2959			       state->pipe, chunk, state->splice_flags);
2960}
2961
2962static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2963				       struct pipe_inode_info *pipe,
2964				       size_t size, unsigned int flags)
2965{
2966	struct unix_stream_read_state state = {
2967		.recv_actor = unix_stream_splice_actor,
2968		.socket = sock,
2969		.pipe = pipe,
2970		.size = size,
2971		.splice_flags = flags,
2972	};
2973
2974	if (unlikely(*ppos))
2975		return -ESPIPE;
2976
2977	if (sock->file->f_flags & O_NONBLOCK ||
2978	    flags & SPLICE_F_NONBLOCK)
2979		state.flags = MSG_DONTWAIT;
2980
2981	return unix_stream_read_generic(&state, false);
2982}
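/* Illustrative only - a hedged userspace sketch, not part of this file:
 * the splice path above moves stream data into a pipe without an extra
 * copy through userspace buffers.
 *
 *	int p[2];
 *
 *	pipe(p);
 *	splice(sock_fd, NULL, p[1], NULL, 4096, SPLICE_F_NONBLOCK);
 */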
2983
2984static int unix_shutdown(struct socket *sock, int mode)
2985{
2986	struct sock *sk = sock->sk;
2987	struct sock *other;
2988
2989	if (mode < SHUT_RD || mode > SHUT_RDWR)
2990		return -EINVAL;
2991	/* This maps:
2992	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2993	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2994	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2995	 */
2996	++mode;
2997
2998	unix_state_lock(sk);
2999	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
3000	other = unix_peer(sk);
3001	if (other)
3002		sock_hold(other);
3003	unix_state_unlock(sk);
3004	sk->sk_state_change(sk);
3005
3006	if (other &&
3007		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
3008
3009		int peer_mode = 0;
3010		const struct proto *prot = READ_ONCE(other->sk_prot);
3011
3012		if (prot->unhash)
3013			prot->unhash(other);
3014		if (mode&RCV_SHUTDOWN)
3015			peer_mode |= SEND_SHUTDOWN;
3016		if (mode&SEND_SHUTDOWN)
3017			peer_mode |= RCV_SHUTDOWN;
3018		unix_state_lock(other);
3019		WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
3020		unix_state_unlock(other);
3021		other->sk_state_change(other);
3022		if (peer_mode == SHUTDOWN_MASK)
3023			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
3024		else if (peer_mode & RCV_SHUTDOWN)
3025			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
3026	}
3027	if (other)
3028		sock_put(other);
3029
3030	return 0;
3031}
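/* Worked example of the mapping above: shutdown(fd, SHUT_WR) arrives with
 * mode == 1, ++mode turns it into SEND_SHUTDOWN (2) on the caller, and the
 * stream/seqpacket peer is given RCV_SHUTDOWN, so a blocked read() on the
 * peer returns 0 (EOF) and its poll() reports EPOLLIN | EPOLLRDHUP.
 */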
3032
3033long unix_inq_len(struct sock *sk)
3034{
3035	struct sk_buff *skb;
3036	long amount = 0;
3037
3038	if (sk->sk_state == TCP_LISTEN)
3039		return -EINVAL;
3040
3041	spin_lock(&sk->sk_receive_queue.lock);
3042	if (sk->sk_type == SOCK_STREAM ||
3043	    sk->sk_type == SOCK_SEQPACKET) {
3044		skb_queue_walk(&sk->sk_receive_queue, skb)
3045			amount += unix_skb_len(skb);
3046	} else {
3047		skb = skb_peek(&sk->sk_receive_queue);
3048		if (skb)
3049			amount = skb->len;
3050	}
3051	spin_unlock(&sk->sk_receive_queue.lock);
3052
3053	return amount;
3054}
3055EXPORT_SYMBOL_GPL(unix_inq_len);
3056
3057long unix_outq_len(struct sock *sk)
3058{
3059	return sk_wmem_alloc_get(sk);
3060}
3061EXPORT_SYMBOL_GPL(unix_outq_len);
3062
3063static int unix_open_file(struct sock *sk)
3064{
3065	struct path path;
3066	struct file *f;
3067	int fd;
3068
3069	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3070		return -EPERM;
3071
3072	if (!smp_load_acquire(&unix_sk(sk)->addr))
3073		return -ENOENT;
3074
3075	path = unix_sk(sk)->path;
3076	if (!path.dentry)
3077		return -ENOENT;
3078
3079	path_get(&path);
3080
3081	fd = get_unused_fd_flags(O_CLOEXEC);
3082	if (fd < 0)
3083		goto out;
3084
3085	f = dentry_open(&path, O_PATH, current_cred());
3086	if (IS_ERR(f)) {
3087		put_unused_fd(fd);
3088		fd = PTR_ERR(f);
3089		goto out;
3090	}
3091
3092	fd_install(fd, f);
3093out:
3094	path_put(&path);
3095
3096	return fd;
3097}
3098
3099static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3100{
3101	struct sock *sk = sock->sk;
3102	long amount = 0;
3103	int err;
3104
3105	switch (cmd) {
3106	case SIOCOUTQ:
3107		amount = unix_outq_len(sk);
3108		err = put_user(amount, (int __user *)arg);
3109		break;
3110	case SIOCINQ:
3111		amount = unix_inq_len(sk);
3112		if (amount < 0)
3113			err = amount;
3114		else
3115			err = put_user(amount, (int __user *)arg);
3116		break;
3117	case SIOCUNIXFILE:
3118		err = unix_open_file(sk);
3119		break;
3120#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3121	case SIOCATMARK:
3122		{
3123			struct sk_buff *skb;
3124			int answ = 0;
3125
3126			skb = skb_peek(&sk->sk_receive_queue);
3127			if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3128				answ = 1;
3129			err = put_user(answ, (int __user *)arg);
3130		}
3131		break;
3132#endif
3133	default:
3134		err = -ENOIOCTLCMD;
3135		break;
3136	}
3137	return err;
3138}
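/* Illustrative only - a hedged userspace sketch, not part of this file:
 * SIOCINQ reports unread bytes in the receive queue, while SIOCUNIXFILE
 * (which needs CAP_NET_ADMIN, see unix_open_file() above) returns an
 * O_PATH fd for the inode the socket is bound to.
 *
 *	int queued, path_fd;
 *
 *	ioctl(fd, SIOCINQ, &queued);
 *	path_fd = ioctl(fd, SIOCUNIXFILE);
 */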
3139
3140#ifdef CONFIG_COMPAT
3141static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3142{
3143	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
3144}
3145#endif
3146
3147static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3148{
3149	struct sock *sk = sock->sk;
3150	__poll_t mask;
3151	u8 shutdown;
3152
3153	sock_poll_wait(file, sock, wait);
3154	mask = 0;
3155	shutdown = READ_ONCE(sk->sk_shutdown);
3156
3157	/* exceptional events? */
3158	if (READ_ONCE(sk->sk_err))
3159		mask |= EPOLLERR;
3160	if (shutdown == SHUTDOWN_MASK)
3161		mask |= EPOLLHUP;
3162	if (shutdown & RCV_SHUTDOWN)
3163		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3164
3165	/* readable? */
3166	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3167		mask |= EPOLLIN | EPOLLRDNORM;
3168	if (sk_is_readable(sk))
3169		mask |= EPOLLIN | EPOLLRDNORM;
3170#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3171	if (READ_ONCE(unix_sk(sk)->oob_skb))
3172		mask |= EPOLLPRI;
3173#endif
3174
3175	/* Connection-based sockets need to check for termination and startup */
3176	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3177	    sk->sk_state == TCP_CLOSE)
3178		mask |= EPOLLHUP;
3179
3180	/*
3181	 * We also set writable when the other side has shut down the
3182	 * connection. This prevents stuck sockets.
3183	 */
3184	if (unix_writable(sk))
3185		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3186
3187	return mask;
3188}
3189
3190static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3191				    poll_table *wait)
3192{
3193	struct sock *sk = sock->sk, *other;
3194	unsigned int writable;
3195	__poll_t mask;
3196	u8 shutdown;
3197
3198	sock_poll_wait(file, sock, wait);
3199	mask = 0;
3200	shutdown = READ_ONCE(sk->sk_shutdown);
3201
3202	/* exceptional events? */
3203	if (READ_ONCE(sk->sk_err) ||
3204	    !skb_queue_empty_lockless(&sk->sk_error_queue))
3205		mask |= EPOLLERR |
3206			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3207
3208	if (shutdown & RCV_SHUTDOWN)
3209		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3210	if (shutdown == SHUTDOWN_MASK)
3211		mask |= EPOLLHUP;
3212
3213	/* readable? */
3214	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3215		mask |= EPOLLIN | EPOLLRDNORM;
3216	if (sk_is_readable(sk))
3217		mask |= EPOLLIN | EPOLLRDNORM;
3218
3219	/* Connection-based sockets need to check for termination and startup */
3220	if (sk->sk_type == SOCK_SEQPACKET) {
3221		if (sk->sk_state == TCP_CLOSE)
3222			mask |= EPOLLHUP;
3223		/* connection hasn't started yet? */
3224		if (sk->sk_state == TCP_SYN_SENT)
3225			return mask;
3226	}
3227
3228	/* No write status requested, avoid expensive OUT tests. */
3229	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3230		return mask;
3231
3232	writable = unix_writable(sk);
3233	if (writable) {
3234		unix_state_lock(sk);
3235
3236		other = unix_peer(sk);
3237		if (other && unix_peer(other) != sk &&
3238		    unix_recvq_full_lockless(other) &&
3239		    unix_dgram_peer_wake_me(sk, other))
3240			writable = 0;
3241
3242		unix_state_unlock(sk);
3243	}
3244
3245	if (writable)
3246		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3247	else
3248		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3249
3250	return mask;
3251}
3252
3253#ifdef CONFIG_PROC_FS
3254
3255#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3256
3257#define get_bucket(x) ((x) >> BUCKET_SPACE)
3258#define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3259#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
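/* Worked example (the value of UNIX_HASH_BITS here is purely illustrative):
 * on a 64-bit kernel with UNIX_HASH_BITS == 8, BUCKET_SPACE == 64 - 9 - 1
 * == 54, so set_bucket_offset(3, 5) == (3UL << 54) | 5, which decodes back
 * to get_bucket() == 3 and get_offset() == 5.
 */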
3260
3261static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3262{
3263	unsigned long offset = get_offset(*pos);
3264	unsigned long bucket = get_bucket(*pos);
3265	unsigned long count = 0;
3266	struct sock *sk;
3267
3268	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3269	     sk; sk = sk_next(sk)) {
3270		if (++count == offset)
3271			break;
3272	}
3273
3274	return sk;
3275}
3276
3277static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3278{
3279	unsigned long bucket = get_bucket(*pos);
3280	struct net *net = seq_file_net(seq);
3281	struct sock *sk;
3282
3283	while (bucket < UNIX_HASH_SIZE) {
3284		spin_lock(&net->unx.table.locks[bucket]);
3285
3286		sk = unix_from_bucket(seq, pos);
3287		if (sk)
3288			return sk;
3289
3290		spin_unlock(&net->unx.table.locks[bucket]);
3291
3292		*pos = set_bucket_offset(++bucket, 1);
3293	}
3294
3295	return NULL;
3296}
3297
3298static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3299				  loff_t *pos)
3300{
3301	unsigned long bucket = get_bucket(*pos);
3302
3303	sk = sk_next(sk);
3304	if (sk)
3305		return sk;
3306	
3308	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3309
3310	*pos = set_bucket_offset(++bucket, 1);
3311
3312	return unix_get_first(seq, pos);
3313}
3314
3315static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3316{
3317	if (!*pos)
3318		return SEQ_START_TOKEN;
3319
3320	return unix_get_first(seq, pos);
3321}
3322
3323static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3324{
3325	++*pos;
3326
3327	if (v == SEQ_START_TOKEN)
3328		return unix_get_first(seq, pos);
3329
3330	return unix_get_next(seq, v, pos);
3331}
3332
3333static void unix_seq_stop(struct seq_file *seq, void *v)
3334{
3335	struct sock *sk = v;
3336
3337	if (sk)
3338		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3339}
3340
3341static int unix_seq_show(struct seq_file *seq, void *v)
3342{
3344	if (v == SEQ_START_TOKEN)
3345		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3346			 "Inode Path\n");
3347	else {
3348		struct sock *s = v;
3349		struct unix_sock *u = unix_sk(s);
3350		unix_state_lock(s);
3351
3352		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3353			s,
3354			refcount_read(&s->sk_refcnt),
3355			0,
3356			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3357			s->sk_type,
3358			s->sk_socket ?
3359			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3360			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3361			sock_i_ino(s));
3362
3363		if (u->addr) {	/* under a hash table lock here */
3364			int i, len;
3365			seq_putc(seq, ' ');
3366
3367			i = 0;
3368			len = u->addr->len -
3369				offsetof(struct sockaddr_un, sun_path);
3370			if (u->addr->name->sun_path[0]) {
3371				len--;
3372			} else {
3373				seq_putc(seq, '@');
3374				i++;
3375			}
3376			for ( ; i < len; i++)
3377				seq_putc(seq, u->addr->name->sun_path[i] ?:
3378					 '@');
3379		}
3380		unix_state_unlock(s);
3381		seq_putc(seq, '\n');
3382	}
3383
3384	return 0;
3385}
3386
3387static const struct seq_operations unix_seq_ops = {
3388	.start  = unix_seq_start,
3389	.next   = unix_seq_next,
3390	.stop   = unix_seq_stop,
3391	.show   = unix_seq_show,
3392};
3393
3394#ifdef CONFIG_BPF_SYSCALL
3395struct bpf_unix_iter_state {
3396	struct seq_net_private p;
3397	unsigned int cur_sk;
3398	unsigned int end_sk;
3399	unsigned int max_sk;
3400	struct sock **batch;
3401	bool st_bucket_done;
3402};
3403
3404struct bpf_iter__unix {
3405	__bpf_md_ptr(struct bpf_iter_meta *, meta);
3406	__bpf_md_ptr(struct unix_sock *, unix_sk);
3407	uid_t uid __aligned(8);
3408};
3409
3410static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3411			      struct unix_sock *unix_sk, uid_t uid)
3412{
3413	struct bpf_iter__unix ctx;
3414
3415	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3416	ctx.meta = meta;
3417	ctx.unix_sk = unix_sk;
3418	ctx.uid = uid;
3419	return bpf_iter_run_prog(prog, &ctx);
3420}
3421
3422	static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3423	{
3425	struct bpf_unix_iter_state *iter = seq->private;
3426	unsigned int expected = 1;
3427	struct sock *sk;
3428
3429	sock_hold(start_sk);
3430	iter->batch[iter->end_sk++] = start_sk;
3431
3432	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3433		if (iter->end_sk < iter->max_sk) {
3434			sock_hold(sk);
3435			iter->batch[iter->end_sk++] = sk;
3436		}
3437
3438		expected++;
3439	}
3440
3441	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3442
3443	return expected;
3444}
3445
3446static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3447{
3448	while (iter->cur_sk < iter->end_sk)
3449		sock_put(iter->batch[iter->cur_sk++]);
3450}
3451
3452static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3453				       unsigned int new_batch_sz)
3454{
3455	struct sock **new_batch;
3456
3457	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3458			     GFP_USER | __GFP_NOWARN);
3459	if (!new_batch)
3460		return -ENOMEM;
3461
3462	bpf_iter_unix_put_batch(iter);
3463	kvfree(iter->batch);
3464	iter->batch = new_batch;
3465	iter->max_sk = new_batch_sz;
3466
3467	return 0;
3468}
3469
3470static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3471					loff_t *pos)
3472{
3473	struct bpf_unix_iter_state *iter = seq->private;
3474	unsigned int expected;
3475	bool resized = false;
3476	struct sock *sk;
3477
3478	if (iter->st_bucket_done)
3479		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3480
3481again:
3482	/* Get a new batch */
3483	iter->cur_sk = 0;
3484	iter->end_sk = 0;
3485
3486	sk = unix_get_first(seq, pos);
3487	if (!sk)
3488		return NULL; /* Done */
3489
3490	expected = bpf_iter_unix_hold_batch(seq, sk);
3491
3492	if (iter->end_sk == expected) {
3493		iter->st_bucket_done = true;
3494		return sk;
3495	}
3496
3497	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3498		resized = true;
3499		goto again;
3500	}
3501
3502	return sk;
3503}
3504
3505static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3506{
3507	if (!*pos)
3508		return SEQ_START_TOKEN;
3509
3510		/* bpf iter does not support lseek, so it always
3511		 * continues from where it was stop()-ped.
3512		 */
3513	return bpf_iter_unix_batch(seq, pos);
3514}
3515
3516static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3517{
3518	struct bpf_unix_iter_state *iter = seq->private;
3519	struct sock *sk;
3520
3521	/* Whenever seq_next() is called, the iter->cur_sk is
3522	 * done with seq_show(), so advance to the next sk in
3523	 * the batch.
3524	 */
3525	if (iter->cur_sk < iter->end_sk)
3526		sock_put(iter->batch[iter->cur_sk++]);
3527
3528	++*pos;
3529
3530	if (iter->cur_sk < iter->end_sk)
3531		sk = iter->batch[iter->cur_sk];
3532	else
3533		sk = bpf_iter_unix_batch(seq, pos);
3534
3535	return sk;
3536}
3537
3538static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3539{
3540	struct bpf_iter_meta meta;
3541	struct bpf_prog *prog;
3542	struct sock *sk = v;
3543	uid_t uid;
3544	bool slow;
3545	int ret;
3546
3547	if (v == SEQ_START_TOKEN)
3548		return 0;
3549
3550	slow = lock_sock_fast(sk);
3551
3552	if (unlikely(sk_unhashed(sk))) {
3553		ret = SEQ_SKIP;
3554		goto unlock;
3555	}
3556
3557	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3558	meta.seq = seq;
3559	prog = bpf_iter_get_info(&meta, false);
3560	ret = unix_prog_seq_show(prog, &meta, v, uid);
3561unlock:
3562	unlock_sock_fast(sk, slow);
3563	return ret;
3564}
3565
3566static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3567{
3568	struct bpf_unix_iter_state *iter = seq->private;
3569	struct bpf_iter_meta meta;
3570	struct bpf_prog *prog;
3571
3572	if (!v) {
3573		meta.seq = seq;
3574		prog = bpf_iter_get_info(&meta, true);
3575		if (prog)
3576			(void)unix_prog_seq_show(prog, &meta, v, 0);
3577	}
3578
3579	if (iter->cur_sk < iter->end_sk)
3580		bpf_iter_unix_put_batch(iter);
3581}
3582
3583static const struct seq_operations bpf_iter_unix_seq_ops = {
3584	.start	= bpf_iter_unix_seq_start,
3585	.next	= bpf_iter_unix_seq_next,
3586	.stop	= bpf_iter_unix_seq_stop,
3587	.show	= bpf_iter_unix_seq_show,
3588};
3589#endif
3590#endif
3591
3592static const struct net_proto_family unix_family_ops = {
3593	.family = PF_UNIX,
3594	.create = unix_create,
3595	.owner	= THIS_MODULE,
3596};
3597	
3599static int __net_init unix_net_init(struct net *net)
3600{
3601	int i;
3602
3603	net->unx.sysctl_max_dgram_qlen = 10;
3604	if (unix_sysctl_register(net))
3605		goto out;
3606
3607#ifdef CONFIG_PROC_FS
3608	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3609			     sizeof(struct seq_net_private)))
3610		goto err_sysctl;
3611#endif
3612
3613	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3614					      sizeof(spinlock_t), GFP_KERNEL);
3615	if (!net->unx.table.locks)
3616		goto err_proc;
3617
3618	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3619						sizeof(struct hlist_head),
3620						GFP_KERNEL);
3621	if (!net->unx.table.buckets)
3622		goto free_locks;
3623
3624	for (i = 0; i < UNIX_HASH_SIZE; i++) {
3625		spin_lock_init(&net->unx.table.locks[i]);
3626		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3627	}
3628
3629	return 0;
3630
3631free_locks:
3632	kvfree(net->unx.table.locks);
3633err_proc:
3634#ifdef CONFIG_PROC_FS
3635	remove_proc_entry("unix", net->proc_net);
3636err_sysctl:
3637#endif
3638	unix_sysctl_unregister(net);
3639out:
3640	return -ENOMEM;
3641}
3642
3643static void __net_exit unix_net_exit(struct net *net)
3644{
3645	kvfree(net->unx.table.buckets);
3646	kvfree(net->unx.table.locks);
3647	unix_sysctl_unregister(net);
3648	remove_proc_entry("unix", net->proc_net);
3649}
3650
3651static struct pernet_operations unix_net_ops = {
3652	.init = unix_net_init,
3653	.exit = unix_net_exit,
3654};
3655
3656#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3657DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3658		     struct unix_sock *unix_sk, uid_t uid)
3659
3660#define INIT_BATCH_SZ 16
3661
3662static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3663{
3664	struct bpf_unix_iter_state *iter = priv_data;
3665	int err;
3666
3667	err = bpf_iter_init_seq_net(priv_data, aux);
3668	if (err)
3669		return err;
3670
3671	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3672	if (err) {
3673		bpf_iter_fini_seq_net(priv_data);
3674		return err;
3675	}
3676
3677	return 0;
3678}
3679
3680static void bpf_iter_fini_unix(void *priv_data)
3681{
3682	struct bpf_unix_iter_state *iter = priv_data;
3683
3684	bpf_iter_fini_seq_net(priv_data);
3685	kvfree(iter->batch);
3686}
3687
3688static const struct bpf_iter_seq_info unix_seq_info = {
3689	.seq_ops		= &bpf_iter_unix_seq_ops,
3690	.init_seq_private	= bpf_iter_init_unix,
3691	.fini_seq_private	= bpf_iter_fini_unix,
3692	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
3693};
3694
3695static const struct bpf_func_proto *
3696bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3697			     const struct bpf_prog *prog)
3698{
3699	switch (func_id) {
3700	case BPF_FUNC_setsockopt:
3701		return &bpf_sk_setsockopt_proto;
3702	case BPF_FUNC_getsockopt:
3703		return &bpf_sk_getsockopt_proto;
3704	default:
3705		return NULL;
3706	}
3707}
3708
3709static struct bpf_iter_reg unix_reg_info = {
3710	.target			= "unix",
3711	.ctx_arg_info_size	= 1,
3712	.ctx_arg_info		= {
3713		{ offsetof(struct bpf_iter__unix, unix_sk),
3714		  PTR_TO_BTF_ID_OR_NULL },
3715	},
3716	.get_func_proto         = bpf_iter_unix_get_func_proto,
3717	.seq_info		= &unix_seq_info,
3718};
3719
3720static void __init bpf_iter_register(void)
3721{
3722	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3723	if (bpf_iter_reg_target(&unix_reg_info))
3724		pr_warn("Warning: could not register bpf iterator unix\n");
3725}
3726#endif
3727
3728static int __init af_unix_init(void)
3729{
3730	int i, rc = -1;
3731
3732	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3733
3734	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3735		spin_lock_init(&bsd_socket_locks[i]);
3736		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3737	}
3738
3739	rc = proto_register(&unix_dgram_proto, 1);
3740	if (rc != 0) {
3741		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3742		goto out;
3743	}
3744
3745	rc = proto_register(&unix_stream_proto, 1);
3746	if (rc != 0) {
3747		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3748		proto_unregister(&unix_dgram_proto);
3749		goto out;
3750	}
3751
3752	sock_register(&unix_family_ops);
3753	register_pernet_subsys(&unix_net_ops);
3754	unix_bpf_build_proto();
3755
3756#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3757	bpf_iter_register();
3758#endif
3759
3760out:
3761	return rc;
3762}
3763
3764/* Later than subsys_initcall() because we depend on stuff initialised there */
3765fs_initcall(af_unix_init);
3766