1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * userdlm.c
4 *
5 * Code which implements the kernel side of a minimal userspace
6 * interface to our DLM.
7 *
8 * Many of the functions here are pared down versions of dlmglue.c
9 * functions.
10 *
11 * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
12 */
13
14#include <linux/signal.h>
15#include <linux/sched/signal.h>
16
17#include <linux/module.h>
18#include <linux/fs.h>
19#include <linux/types.h>
20#include <linux/crc32.h>
21
22#include "../ocfs2_lockingver.h"
23#include "../stackglue.h"
24#include "userdlm.h"
25
26#define MLOG_MASK_PREFIX ML_DLMFS
27#include "../cluster/masklog.h"
28
29
30static inline struct user_lock_res *user_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb)
31{
32	return container_of(lksb, struct user_lock_res, l_lksb);
33}
34
35static inline int user_check_wait_flag(struct user_lock_res *lockres,
36				       int flag)
37{
38	int ret;
39
40	spin_lock(&lockres->l_lock);
41	ret = lockres->l_flags & flag;
42	spin_unlock(&lockres->l_lock);
43
44	return ret;
45}
46
47static inline void user_wait_on_busy_lock(struct user_lock_res *lockres)
48
49{
50	wait_event(lockres->l_event,
51		   !user_check_wait_flag(lockres, USER_LOCK_BUSY));
52}
53
54static inline void user_wait_on_blocked_lock(struct user_lock_res *lockres)
55
56{
57	wait_event(lockres->l_event,
58		   !user_check_wait_flag(lockres, USER_LOCK_BLOCKED));
59}
60
61/* I heart container_of... */
62static inline struct ocfs2_cluster_connection *
63cluster_connection_from_user_lockres(struct user_lock_res *lockres)
64{
65	struct dlmfs_inode_private *ip;
66
67	ip = container_of(lockres,
68			  struct dlmfs_inode_private,
69			  ip_lockres);
70	return ip->ip_conn;
71}
72
73static struct inode *
74user_dlm_inode_from_user_lockres(struct user_lock_res *lockres)
75{
76	struct dlmfs_inode_private *ip;
77
78	ip = container_of(lockres,
79			  struct dlmfs_inode_private,
80			  ip_lockres);
81	return &ip->ip_vfs_inode;
82}
83
84static inline void user_recover_from_dlm_error(struct user_lock_res *lockres)
85{
86	spin_lock(&lockres->l_lock);
87	lockres->l_flags &= ~USER_LOCK_BUSY;
88	spin_unlock(&lockres->l_lock);
89}
90
/*
 * Log a DLM failure at ML_ERROR: the status code, the name of the call
 * that failed, and the resource name (printed with its explicit length).
 *
 * Every argument is parenthesized in the expansion so that an arbitrary
 * expression can be passed without precedence surprises.  Note that
 * _lockres is evaluated twice; pass an expression without side effects.
 */
#define user_log_dlm_error(_func, _stat, _lockres) do {			\
	mlog(ML_ERROR, "Dlm error %d while calling %s on "		\
		"resource %.*s\n", (_stat), (_func),			\
		(_lockres)->l_namelen, (_lockres)->l_name);		\
} while (0)
96
97/* WARNING: This function lives in a world where the only three lock
98 * levels are EX, PR, and NL. It *will* have to be adjusted when more
99 * lock types are added. */
100static inline int user_highest_compat_lock_level(int level)
101{
102	int new_level = DLM_LOCK_EX;
103
104	if (level == DLM_LOCK_EX)
105		new_level = DLM_LOCK_NL;
106	else if (level == DLM_LOCK_PR)
107		new_level = DLM_LOCK_PR;
108	return new_level;
109}
110
111static void user_ast(struct ocfs2_dlm_lksb *lksb)
112{
113	struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
114	int status;
115
116	mlog(ML_BASTS, "AST fired for lockres %.*s, level %d => %d\n",
117	     lockres->l_namelen, lockres->l_name, lockres->l_level,
118	     lockres->l_requested);
119
120	spin_lock(&lockres->l_lock);
121
122	status = ocfs2_dlm_lock_status(&lockres->l_lksb);
123	if (status) {
124		mlog(ML_ERROR, "lksb status value of %u on lockres %.*s\n",
125		     status, lockres->l_namelen, lockres->l_name);
126		spin_unlock(&lockres->l_lock);
127		return;
128	}
129
130	mlog_bug_on_msg(lockres->l_requested == DLM_LOCK_IV,
131			"Lockres %.*s, requested ivmode. flags 0x%x\n",
132			lockres->l_namelen, lockres->l_name, lockres->l_flags);
133
134	/* we're downconverting. */
135	if (lockres->l_requested < lockres->l_level) {
136		if (lockres->l_requested <=
137		    user_highest_compat_lock_level(lockres->l_blocking)) {
138			lockres->l_blocking = DLM_LOCK_NL;
139			lockres->l_flags &= ~USER_LOCK_BLOCKED;
140		}
141	}
142
143	lockres->l_level = lockres->l_requested;
144	lockres->l_requested = DLM_LOCK_IV;
145	lockres->l_flags |= USER_LOCK_ATTACHED;
146	lockres->l_flags &= ~USER_LOCK_BUSY;
147
148	spin_unlock(&lockres->l_lock);
149
150	wake_up(&lockres->l_event);
151}
152
/*
 * Take a reference on the inode that owns this lock resource.
 * A NULL return from igrab() is treated as fatal (BUG()).
 */
static inline void user_dlm_grab_inode_ref(struct user_lock_res *lockres)
{
	struct inode *held;

	held = igrab(user_dlm_inode_from_user_lockres(lockres));
	if (!held)
		BUG();
}
160
161static void user_dlm_unblock_lock(struct work_struct *work);
162
163static void __user_dlm_queue_lockres(struct user_lock_res *lockres)
164{
165	if (!(lockres->l_flags & USER_LOCK_QUEUED)) {
166		user_dlm_grab_inode_ref(lockres);
167
168		INIT_WORK(&lockres->l_work, user_dlm_unblock_lock);
169
170		queue_work(user_dlm_worker, &lockres->l_work);
171		lockres->l_flags |= USER_LOCK_QUEUED;
172	}
173}
174
175static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres)
176{
177	int queue = 0;
178
179	if (!(lockres->l_flags & USER_LOCK_BLOCKED))
180		return;
181
182	switch (lockres->l_blocking) {
183	case DLM_LOCK_EX:
184		if (!lockres->l_ex_holders && !lockres->l_ro_holders)
185			queue = 1;
186		break;
187	case DLM_LOCK_PR:
188		if (!lockres->l_ex_holders)
189			queue = 1;
190		break;
191	default:
192		BUG();
193	}
194
195	if (queue)
196		__user_dlm_queue_lockres(lockres);
197}
198
199static void user_bast(struct ocfs2_dlm_lksb *lksb, int level)
200{
201	struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
202
203	mlog(ML_BASTS, "BAST fired for lockres %.*s, blocking %d, level %d\n",
204	     lockres->l_namelen, lockres->l_name, level, lockres->l_level);
205
206	spin_lock(&lockres->l_lock);
207	lockres->l_flags |= USER_LOCK_BLOCKED;
208	if (level > lockres->l_blocking)
209		lockres->l_blocking = level;
210
211	__user_dlm_queue_lockres(lockres);
212	spin_unlock(&lockres->l_lock);
213
214	wake_up(&lockres->l_event);
215}
216
217static void user_unlock_ast(struct ocfs2_dlm_lksb *lksb, int status)
218{
219	struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
220
221	mlog(ML_BASTS, "UNLOCK AST fired for lockres %.*s, flags 0x%x\n",
222	     lockres->l_namelen, lockres->l_name, lockres->l_flags);
223
224	if (status)
225		mlog(ML_ERROR, "dlm returns status %d\n", status);
226
227	spin_lock(&lockres->l_lock);
228	/* The teardown flag gets set early during the unlock process,
229	 * so test the cancel flag to make sure that this ast isn't
230	 * for a concurrent cancel. */
231	if (lockres->l_flags & USER_LOCK_IN_TEARDOWN
232	    && !(lockres->l_flags & USER_LOCK_IN_CANCEL)) {
233		lockres->l_level = DLM_LOCK_IV;
234	} else if (status == DLM_CANCELGRANT) {
235		/* We tried to cancel a convert request, but it was
236		 * already granted. Don't clear the busy flag - the
237		 * ast should've done this already. */
238		BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL));
239		lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
240		goto out_noclear;
241	} else {
242		BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL));
243		/* Cancel succeeded, we want to re-queue */
244		lockres->l_requested = DLM_LOCK_IV; /* cancel an
245						    * upconvert
246						    * request. */
247		lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
248		/* we want the unblock thread to look at it again
249		 * now. */
250		if (lockres->l_flags & USER_LOCK_BLOCKED)
251			__user_dlm_queue_lockres(lockres);
252	}
253
254	lockres->l_flags &= ~USER_LOCK_BUSY;
255out_noclear:
256	spin_unlock(&lockres->l_lock);
257
258	wake_up(&lockres->l_event);
259}
260
261/*
262 * This is the userdlmfs locking protocol version.
263 *
264 * See fs/ocfs2/dlmglue.c for more details on locking versions.
265 */
266static struct ocfs2_locking_protocol user_dlm_lproto = {
267	.lp_max_version = {
268		.pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
269		.pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
270	},
271	.lp_lock_ast		= user_ast,
272	.lp_blocking_ast	= user_bast,
273	.lp_unlock_ast		= user_unlock_ast,
274};
275
/*
 * Release one reference on the inode that owns this lock resource.
 */
static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres)
{
	iput(user_dlm_inode_from_user_lockres(lockres));
}
282
283static void user_dlm_unblock_lock(struct work_struct *work)
284{
285	int new_level, status;
286	struct user_lock_res *lockres =
287		container_of(work, struct user_lock_res, l_work);
288	struct ocfs2_cluster_connection *conn =
289		cluster_connection_from_user_lockres(lockres);
290
291	mlog(0, "lockres %.*s\n", lockres->l_namelen, lockres->l_name);
292
293	spin_lock(&lockres->l_lock);
294
295	mlog_bug_on_msg(!(lockres->l_flags & USER_LOCK_QUEUED),
296			"Lockres %.*s, flags 0x%x\n",
297			lockres->l_namelen, lockres->l_name, lockres->l_flags);
298
299	/* notice that we don't clear USER_LOCK_BLOCKED here. If it's
300	 * set, we want user_ast clear it. */
301	lockres->l_flags &= ~USER_LOCK_QUEUED;
302
303	/* It's valid to get here and no longer be blocked - if we get
304	 * several basts in a row, we might be queued by the first
305	 * one, the unblock thread might run and clear the queued
306	 * flag, and finally we might get another bast which re-queues
307	 * us before our ast for the downconvert is called. */
308	if (!(lockres->l_flags & USER_LOCK_BLOCKED)) {
309		mlog(ML_BASTS, "lockres %.*s USER_LOCK_BLOCKED\n",
310		     lockres->l_namelen, lockres->l_name);
311		spin_unlock(&lockres->l_lock);
312		goto drop_ref;
313	}
314
315	if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
316		mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_TEARDOWN\n",
317		     lockres->l_namelen, lockres->l_name);
318		spin_unlock(&lockres->l_lock);
319		goto drop_ref;
320	}
321
322	if (lockres->l_flags & USER_LOCK_BUSY) {
323		if (lockres->l_flags & USER_LOCK_IN_CANCEL) {
324			mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_CANCEL\n",
325			     lockres->l_namelen, lockres->l_name);
326			spin_unlock(&lockres->l_lock);
327			goto drop_ref;
328		}
329
330		lockres->l_flags |= USER_LOCK_IN_CANCEL;
331		spin_unlock(&lockres->l_lock);
332
333		status = ocfs2_dlm_unlock(conn, &lockres->l_lksb,
334					  DLM_LKF_CANCEL);
335		if (status)
336			user_log_dlm_error("ocfs2_dlm_unlock", status, lockres);
337		goto drop_ref;
338	}
339
340	/* If there are still incompat holders, we can exit safely
341	 * without worrying about re-queueing this lock as that will
342	 * happen on the last call to user_cluster_unlock. */
343	if ((lockres->l_blocking == DLM_LOCK_EX)
344	    && (lockres->l_ex_holders || lockres->l_ro_holders)) {
345		spin_unlock(&lockres->l_lock);
346		mlog(ML_BASTS, "lockres %.*s, EX/PR Holders %u,%u\n",
347		     lockres->l_namelen, lockres->l_name,
348		     lockres->l_ex_holders, lockres->l_ro_holders);
349		goto drop_ref;
350	}
351
352	if ((lockres->l_blocking == DLM_LOCK_PR)
353	    && lockres->l_ex_holders) {
354		spin_unlock(&lockres->l_lock);
355		mlog(ML_BASTS, "lockres %.*s, EX Holders %u\n",
356		     lockres->l_namelen, lockres->l_name,
357		     lockres->l_ex_holders);
358		goto drop_ref;
359	}
360
361	/* yay, we can downconvert now. */
362	new_level = user_highest_compat_lock_level(lockres->l_blocking);
363	lockres->l_requested = new_level;
364	lockres->l_flags |= USER_LOCK_BUSY;
365	mlog(ML_BASTS, "lockres %.*s, downconvert %d => %d\n",
366	     lockres->l_namelen, lockres->l_name, lockres->l_level, new_level);
367	spin_unlock(&lockres->l_lock);
368
369	/* need lock downconvert request now... */
370	status = ocfs2_dlm_lock(conn, new_level, &lockres->l_lksb,
371				DLM_LKF_CONVERT|DLM_LKF_VALBLK,
372				lockres->l_name,
373				lockres->l_namelen);
374	if (status) {
375		user_log_dlm_error("ocfs2_dlm_lock", status, lockres);
376		user_recover_from_dlm_error(lockres);
377	}
378
379drop_ref:
380	user_dlm_drop_inode_ref(lockres);
381}
382
383static inline void user_dlm_inc_holders(struct user_lock_res *lockres,
384					int level)
385{
386	switch(level) {
387	case DLM_LOCK_EX:
388		lockres->l_ex_holders++;
389		break;
390	case DLM_LOCK_PR:
391		lockres->l_ro_holders++;
392		break;
393	default:
394		BUG();
395	}
396}
397
398/* predict what lock level we'll be dropping down to on behalf
399 * of another node, and return true if the currently wanted
400 * level will be compatible with it. */
401static inline int
402user_may_continue_on_blocked_lock(struct user_lock_res *lockres,
403				  int wanted)
404{
405	BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED));
406
407	return wanted <= user_highest_compat_lock_level(lockres->l_blocking);
408}
409
410int user_dlm_cluster_lock(struct user_lock_res *lockres,
411			  int level,
412			  int lkm_flags)
413{
414	int status, local_flags;
415	struct ocfs2_cluster_connection *conn =
416		cluster_connection_from_user_lockres(lockres);
417
418	if (level != DLM_LOCK_EX &&
419	    level != DLM_LOCK_PR) {
420		mlog(ML_ERROR, "lockres %.*s: invalid request!\n",
421		     lockres->l_namelen, lockres->l_name);
422		status = -EINVAL;
423		goto bail;
424	}
425
426	mlog(ML_BASTS, "lockres %.*s, level %d, flags = 0x%x\n",
427	     lockres->l_namelen, lockres->l_name, level, lkm_flags);
428
429again:
430	if (signal_pending(current)) {
431		status = -ERESTARTSYS;
432		goto bail;
433	}
434
435	spin_lock(&lockres->l_lock);
436	if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
437		spin_unlock(&lockres->l_lock);
438		status = -EAGAIN;
439		goto bail;
440	}
441
442	/* We only compare against the currently granted level
443	 * here. If the lock is blocked waiting on a downconvert,
444	 * we'll get caught below. */
445	if ((lockres->l_flags & USER_LOCK_BUSY) &&
446	    (level > lockres->l_level)) {
447		/* is someone sitting in dlm_lock? If so, wait on
448		 * them. */
449		spin_unlock(&lockres->l_lock);
450
451		user_wait_on_busy_lock(lockres);
452		goto again;
453	}
454
455	if ((lockres->l_flags & USER_LOCK_BLOCKED) &&
456	    (!user_may_continue_on_blocked_lock(lockres, level))) {
457		/* is the lock is currently blocked on behalf of
458		 * another node */
459		spin_unlock(&lockres->l_lock);
460
461		user_wait_on_blocked_lock(lockres);
462		goto again;
463	}
464
465	if (level > lockres->l_level) {
466		local_flags = lkm_flags | DLM_LKF_VALBLK;
467		if (lockres->l_level != DLM_LOCK_IV)
468			local_flags |= DLM_LKF_CONVERT;
469
470		lockres->l_requested = level;
471		lockres->l_flags |= USER_LOCK_BUSY;
472		spin_unlock(&lockres->l_lock);
473
474		BUG_ON(level == DLM_LOCK_IV);
475		BUG_ON(level == DLM_LOCK_NL);
476
477		/* call dlm_lock to upgrade lock now */
478		status = ocfs2_dlm_lock(conn, level, &lockres->l_lksb,
479					local_flags, lockres->l_name,
480					lockres->l_namelen);
481		if (status) {
482			if ((lkm_flags & DLM_LKF_NOQUEUE) &&
483			    (status != -EAGAIN))
484				user_log_dlm_error("ocfs2_dlm_lock",
485						   status, lockres);
486			user_recover_from_dlm_error(lockres);
487			goto bail;
488		}
489
490		user_wait_on_busy_lock(lockres);
491		goto again;
492	}
493
494	user_dlm_inc_holders(lockres, level);
495	spin_unlock(&lockres->l_lock);
496
497	status = 0;
498bail:
499	return status;
500}
501
502static inline void user_dlm_dec_holders(struct user_lock_res *lockres,
503					int level)
504{
505	switch(level) {
506	case DLM_LOCK_EX:
507		BUG_ON(!lockres->l_ex_holders);
508		lockres->l_ex_holders--;
509		break;
510	case DLM_LOCK_PR:
511		BUG_ON(!lockres->l_ro_holders);
512		lockres->l_ro_holders--;
513		break;
514	default:
515		BUG();
516	}
517}
518
519void user_dlm_cluster_unlock(struct user_lock_res *lockres,
520			     int level)
521{
522	if (level != DLM_LOCK_EX &&
523	    level != DLM_LOCK_PR) {
524		mlog(ML_ERROR, "lockres %.*s: invalid request!\n",
525		     lockres->l_namelen, lockres->l_name);
526		return;
527	}
528
529	spin_lock(&lockres->l_lock);
530	user_dlm_dec_holders(lockres, level);
531	__user_dlm_cond_queue_lockres(lockres);
532	spin_unlock(&lockres->l_lock);
533}
534
535void user_dlm_write_lvb(struct inode *inode,
536			const char *val,
537			unsigned int len)
538{
539	struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
540	char *lvb;
541
542	BUG_ON(len > DLM_LVB_LEN);
543
544	spin_lock(&lockres->l_lock);
545
546	BUG_ON(lockres->l_level < DLM_LOCK_EX);
547	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
548	memcpy(lvb, val, len);
549
550	spin_unlock(&lockres->l_lock);
551}
552
553bool user_dlm_read_lvb(struct inode *inode, char *val)
554{
555	struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
556	char *lvb;
557	bool ret = true;
558
559	spin_lock(&lockres->l_lock);
560
561	BUG_ON(lockres->l_level < DLM_LOCK_PR);
562	if (ocfs2_dlm_lvb_valid(&lockres->l_lksb)) {
563		lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
564		memcpy(val, lvb, DLM_LVB_LEN);
565	} else
566		ret = false;
567
568	spin_unlock(&lockres->l_lock);
569	return ret;
570}
571
572void user_dlm_lock_res_init(struct user_lock_res *lockres,
573			    struct dentry *dentry)
574{
575	memset(lockres, 0, sizeof(*lockres));
576
577	spin_lock_init(&lockres->l_lock);
578	init_waitqueue_head(&lockres->l_event);
579	lockres->l_level = DLM_LOCK_IV;
580	lockres->l_requested = DLM_LOCK_IV;
581	lockres->l_blocking = DLM_LOCK_IV;
582
583	/* should have been checked before getting here. */
584	BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN);
585
586	memcpy(lockres->l_name,
587	       dentry->d_name.name,
588	       dentry->d_name.len);
589	lockres->l_namelen = dentry->d_name.len;
590}
591
592int user_dlm_destroy_lock(struct user_lock_res *lockres)
593{
594	int status = -EBUSY;
595	struct ocfs2_cluster_connection *conn =
596		cluster_connection_from_user_lockres(lockres);
597
598	mlog(ML_BASTS, "lockres %.*s\n", lockres->l_namelen, lockres->l_name);
599
600	spin_lock(&lockres->l_lock);
601	if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
602		spin_unlock(&lockres->l_lock);
603		goto bail;
604	}
605
606	lockres->l_flags |= USER_LOCK_IN_TEARDOWN;
607
608	while (lockres->l_flags & USER_LOCK_BUSY) {
609		spin_unlock(&lockres->l_lock);
610
611		user_wait_on_busy_lock(lockres);
612
613		spin_lock(&lockres->l_lock);
614	}
615
616	if (lockres->l_ro_holders || lockres->l_ex_holders) {
617		lockres->l_flags &= ~USER_LOCK_IN_TEARDOWN;
618		spin_unlock(&lockres->l_lock);
619		goto bail;
620	}
621
622	status = 0;
623	if (!(lockres->l_flags & USER_LOCK_ATTACHED)) {
624		/*
625		 * lock is never requested, leave USER_LOCK_IN_TEARDOWN set
626		 * to avoid new lock request coming in.
627		 */
628		spin_unlock(&lockres->l_lock);
629		goto bail;
630	}
631
632	lockres->l_flags |= USER_LOCK_BUSY;
633	spin_unlock(&lockres->l_lock);
634
635	status = ocfs2_dlm_unlock(conn, &lockres->l_lksb, DLM_LKF_VALBLK);
636	if (status) {
637		spin_lock(&lockres->l_lock);
638		lockres->l_flags &= ~USER_LOCK_IN_TEARDOWN;
639		lockres->l_flags &= ~USER_LOCK_BUSY;
640		spin_unlock(&lockres->l_lock);
641		user_log_dlm_error("ocfs2_dlm_unlock", status, lockres);
642		goto bail;
643	}
644
645	user_wait_on_busy_lock(lockres);
646
647	status = 0;
648bail:
649	return status;
650}
651
/*
 * Recovery callback handed to ocfs2_cluster_connect_agnostic().
 * Recovery events are deliberately ignored, so the body is empty.
 */
static void user_dlm_recovery_handler_noop(int node_num,
					   void *recovery_data)
{
}
658
659void user_dlm_set_locking_protocol(void)
660{
661	ocfs2_stack_glue_set_max_proto_version(&user_dlm_lproto.lp_max_version);
662}
663
664struct ocfs2_cluster_connection *user_dlm_register(const struct qstr *name)
665{
666	int rc;
667	struct ocfs2_cluster_connection *conn;
668
669	rc = ocfs2_cluster_connect_agnostic(name->name, name->len,
670					    &user_dlm_lproto,
671					    user_dlm_recovery_handler_noop,
672					    NULL, &conn);
673	if (rc)
674		mlog_errno(rc);
675
676	return rc ? ERR_PTR(rc) : conn;
677}
678
/*
 * Close a connection obtained from user_dlm_register() by calling
 * ocfs2_cluster_disconnect() with a second argument of 0.
 */
void user_dlm_unregister(struct ocfs2_cluster_connection *conn)
{
	ocfs2_cluster_disconnect(conn, 0);
}
683