1// SPDX-License-Identifier: GPL-2.0-or-later
2/* Handle vlserver selection and rotation.
3 *
4 * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
5 * Written by David Howells (dhowells@redhat.com)
6 */
7
8#include <linux/kernel.h>
9#include <linux/sched.h>
10#include <linux/sched/signal.h>
11#include "internal.h"
12#include "afs_vl.h"
13
14/*
15 * Begin an operation on a volume location server.
16 */
17bool afs_begin_vlserver_operation(struct afs_vl_cursor *vc, struct afs_cell *cell,
18				  struct key *key)
19{
20	static atomic_t debug_ids;
21
22	memset(vc, 0, sizeof(*vc));
23	vc->cell = cell;
24	vc->key = key;
25	vc->cumul_error.error = -EDESTADDRREQ;
26	vc->nr_iterations = -1;
27
28	if (signal_pending(current)) {
29		vc->cumul_error.error = -EINTR;
30		vc->flags |= AFS_VL_CURSOR_STOP;
31		return false;
32	}
33
34	vc->debug_id = atomic_inc_return(&debug_ids);
35	return true;
36}
37
38/*
39 * Begin iteration through a server list, starting with the last used server if
40 * possible, or the last recorded good server if not.
41 */
42static bool afs_start_vl_iteration(struct afs_vl_cursor *vc)
43{
44	struct afs_cell *cell = vc->cell;
45	unsigned int dns_lookup_count;
46
47	if (cell->dns_source == DNS_RECORD_UNAVAILABLE ||
48	    cell->dns_expiry <= ktime_get_real_seconds()) {
49		dns_lookup_count = smp_load_acquire(&cell->dns_lookup_count);
50		set_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags);
51		afs_queue_cell(cell, afs_cell_trace_get_queue_dns);
52
53		if (cell->dns_source == DNS_RECORD_UNAVAILABLE) {
54			if (wait_var_event_interruptible(
55				    &cell->dns_lookup_count,
56				    smp_load_acquire(&cell->dns_lookup_count)
57				    != dns_lookup_count) < 0) {
58				vc->cumul_error.error = -ERESTARTSYS;
59				return false;
60			}
61		}
62
63		/* Status load is ordered after lookup counter load */
64		if (cell->dns_status == DNS_LOOKUP_GOT_NOT_FOUND) {
65			pr_warn("No record of cell %s\n", cell->name);
66			vc->cumul_error.error = -ENOENT;
67			return false;
68		}
69
70		if (cell->dns_source == DNS_RECORD_UNAVAILABLE) {
71			vc->cumul_error.error = -EDESTADDRREQ;
72			return false;
73		}
74	}
75
76	read_lock(&cell->vl_servers_lock);
77	vc->server_list = afs_get_vlserverlist(
78		rcu_dereference_protected(cell->vl_servers,
79					  lockdep_is_held(&cell->vl_servers_lock)));
80	read_unlock(&cell->vl_servers_lock);
81	if (!vc->server_list->nr_servers)
82		return false;
83
84	vc->untried_servers = (1UL << vc->server_list->nr_servers) - 1;
85	vc->server_index = -1;
86	return true;
87}
88
/*
 * Select the vlserver to use.  May be called multiple times to rotate
 * through the vlservers.
 *
 * On the first call for a cursor (iteration 0) the server list is obtained
 * and probes are sent.  On later calls the outcome of the previous attempt,
 * as left in vc->call_error and vc->call_abort_code, is evaluated first to
 * decide whether to stop, try another address on the same server or move on
 * to another server.
 *
 * Returns true if another attempt should be made, in which case vc->server,
 * vc->alist and vc->addr_index describe the target.  Returns false when the
 * operation is over (whether it succeeded or failed); the cursor is then
 * flagged with AFS_VL_CURSOR_STOP and the outcome is in vc->cumul_error.
 */
bool afs_select_vlserver(struct afs_vl_cursor *vc)
{
	struct afs_addr_list *alist = vc->alist;
	struct afs_vlserver *vlserver;
	unsigned long set, failed;
	unsigned int rtt;
	s32 abort_code = vc->call_abort_code;
	int error = vc->call_error, i;

	vc->nr_iterations++;

	_enter("VC=%x+%x,%d{%lx},%d{%lx},%d,%d",
	       vc->debug_id, vc->nr_iterations, vc->server_index, vc->untried_servers,
	       vc->addr_index, vc->addr_tried,
	       error, abort_code);

	if (vc->flags & AFS_VL_CURSOR_STOP) {
		_leave(" = f [stopped]");
		return false;
	}

	/* First time through there's no previous result to evaluate. */
	if (vc->nr_iterations == 0)
		goto start;

	/* Record how the last attempt went against the address it used. */
	WRITE_ONCE(alist->addrs[vc->addr_index].last_error, error);

	/* Evaluate the result of the previous operation, if there was one. */
	switch (error) {
	default:
	case 0:
		/* Success or local failure.  Stop. */
		vc->cumul_error.error = error;
		vc->flags |= AFS_VL_CURSOR_STOP;
		_leave(" = f [okay/local %d]", vc->cumul_error.error);
		return false;

	case -ECONNABORTED:
		/* The far side rejected the operation on some grounds.  This
		 * might involve the server being busy or the volume having been moved.
		 */
		switch (abort_code) {
		case AFSVL_IO:
		case AFSVL_BADVOLOPER:
		case AFSVL_NOMEM:
			/* The server went weird. */
			afs_prioritise_error(&vc->cumul_error, -EREMOTEIO, abort_code);
			//write_lock(&vc->cell->vl_servers_lock);
			//vc->server_list->weird_mask |= 1 << vc->server_index;
			//write_unlock(&vc->cell->vl_servers_lock);
			goto next_server;

		default:
			afs_prioritise_error(&vc->cumul_error, error, abort_code);
			goto failed;
		}

	case -ERFKILL:
	case -EADDRNOTAVAIL:
	case -ENETUNREACH:
	case -EHOSTUNREACH:
	case -EHOSTDOWN:
	case -ECONNREFUSED:
	case -ETIMEDOUT:
	case -ETIME:
		/* Couldn't get through on that address; try another one on the
		 * same server.
		 */
		_debug("no conn %d", error);
		afs_prioritise_error(&vc->cumul_error, error, 0);
		goto iterate_address;

	case -ECONNRESET:
		/* Note that a second pass over the servers is worthwhile. */
		_debug("call reset");
		afs_prioritise_error(&vc->cumul_error, error, 0);
		vc->flags |= AFS_VL_CURSOR_RETRY;
		goto next_server;

	case -EOPNOTSUPP:
		_debug("notsupp");
		goto next_server;
	}

restart_from_beginning:
	_debug("restart");
	/* Every arm of the switch above jumps or returns, so this label is only
	 * reached from no_more_servers, by which time the address list has
	 * already been released and alist is NULL.  Guard the preferred-address
	 * update in the same way as the failed path so that a stale
	 * vc->call_responded can't lead to a NULL pointer dereference.
	 */
	if (alist) {
		if (vc->call_responded &&
		    vc->addr_index != alist->preferred &&
		    test_bit(alist->preferred, &vc->addr_tried))
			WRITE_ONCE(alist->preferred, vc->addr_index);
		afs_put_addrlist(alist, afs_alist_trace_put_vlrotate_restart);
		alist = vc->alist = NULL;
	}

	afs_put_vlserverlist(vc->cell->net, vc->server_list);
	vc->server_list = NULL;
	/* Only one restart is permitted per operation. */
	if (vc->flags & AFS_VL_CURSOR_RETRIED)
		goto failed;
	vc->flags |= AFS_VL_CURSOR_RETRIED;
start:
	_debug("start");
	ASSERTCMP(alist, ==, NULL);

	if (!afs_start_vl_iteration(vc))
		goto failed;

	error = afs_send_vl_probes(vc->cell->net, vc->key, vc->server_list);
	if (error < 0) {
		afs_prioritise_error(&vc->cumul_error, error, 0);
		goto failed;
	}

pick_server:
	_debug("pick [%lx]", vc->untried_servers);
	ASSERTCMP(alist, ==, NULL);

	error = afs_wait_for_vl_probes(vc->server_list, vc->untried_servers);
	if (error < 0) {
		afs_prioritise_error(&vc->cumul_error, error, 0);
		goto failed;
	}

	/* Pick the untried server with the lowest RTT, giving first refusal to
	 * the list's preferred server if that hasn't been tried yet.
	 */
	vc->server_index = vc->server_list->preferred;
	if (test_bit(vc->server_index, &vc->untried_servers))
		goto selected_server;

	vc->server_index = -1;
	rtt = UINT_MAX;
	for (i = 0; i < vc->server_list->nr_servers; i++) {
		struct afs_vlserver *s = vc->server_list->servers[i].server;

		if (!test_bit(i, &vc->untried_servers) ||
		    !test_bit(AFS_VLSERVER_FL_RESPONDING, &s->flags))
			continue;
		/* "<=" so that a responding server whose RTT is still
		 * UINT_MAX can nonetheless be chosen.
		 */
		if (s->probe.rtt <= rtt) {
			vc->server_index = i;
			rtt = s->probe.rtt;
		}
	}

	if (vc->server_index == -1)
		goto no_more_servers;

selected_server:
	_debug("use %d", vc->server_index);
	__clear_bit(vc->server_index, &vc->untried_servers);

	/* We're starting on a different vlserver from the list.  We need to
	 * check it, find its address list and probe its capabilities before we
	 * use it.
	 */
	vlserver = vc->server_list->servers[vc->server_index].server;
	vc->server = vlserver;

	_debug("USING VLSERVER: %s", vlserver->name);

	/* Take our own ref on the server's current address list. */
	read_lock(&vlserver->lock);
	alist = rcu_dereference_protected(vlserver->addresses,
					  lockdep_is_held(&vlserver->lock));
	vc->alist = afs_get_addrlist(alist, afs_alist_trace_get_vlrotate_set);
	read_unlock(&vlserver->lock);

	vc->addr_tried = 0;
	vc->addr_index = -1;

iterate_address:
	/* Iterate over the current server's address list to try and find an
	 * address on which it will respond to us.
	 */
	set = READ_ONCE(alist->responded);
	failed = READ_ONCE(alist->probe_failed);
	vc->addr_index = READ_ONCE(alist->preferred);

	_debug("%lx-%lx-%lx,%d", set, failed, vc->addr_tried, vc->addr_index);

	/* Candidates are addresses that responded to a probe, haven't since
	 * failed one and haven't already been tried in this pass.
	 */
	set &= ~(failed | vc->addr_tried);

	if (!set)
		goto next_server;

	/* Use the preferred address if it's a candidate, else the lowest. */
	if (!test_bit(vc->addr_index, &set))
		vc->addr_index = __ffs(set);

	set_bit(vc->addr_index, &vc->addr_tried);
	vc->alist = alist;

	_debug("VL address %d/%d", vc->addr_index, alist->nr_addrs);

	vc->call_responded = false;
	_leave(" = t %pISpc", rxrpc_kernel_remote_addr(alist->addrs[vc->addr_index].peer));
	return true;

next_server:
	_debug("next");
	ASSERT(alist);
	/* If the last call got a response on an address other than the
	 * preferred one, and the preferred one was also tried, switch the
	 * preference to the address that worked.
	 */
	if (vc->call_responded &&
	    vc->addr_index != alist->preferred &&
	    test_bit(alist->preferred, &vc->addr_tried))
		WRITE_ONCE(alist->preferred, vc->addr_index);
	afs_put_addrlist(alist, afs_alist_trace_put_vlrotate_next);
	alist = vc->alist = NULL;
	goto pick_server;

no_more_servers:
	/* That's all the servers poked to no good effect.  Try again if some
	 * of them were busy.
	 */
	if (vc->flags & AFS_VL_CURSOR_RETRY)
		goto restart_from_beginning;

	/* Fold the probe results into the error that will be reported. */
	for (i = 0; i < vc->server_list->nr_servers; i++) {
		struct afs_vlserver *s = vc->server_list->servers[i].server;

		if (test_bit(AFS_VLSERVER_FL_RESPONDING, &s->flags))
			vc->cumul_error.responded = true;
		afs_prioritise_error(&vc->cumul_error, READ_ONCE(s->probe.error),
				     s->probe.abort_code);
	}

failed:
	if (alist) {
		if (vc->call_responded &&
		    vc->addr_index != alist->preferred &&
		    test_bit(alist->preferred, &vc->addr_tried))
			WRITE_ONCE(alist->preferred, vc->addr_index);
		afs_put_addrlist(alist, afs_alist_trace_put_vlrotate_fail);
		alist = vc->alist = NULL;
	}
	vc->flags |= AFS_VL_CURSOR_STOP;
	_leave(" = f [failed %d]", vc->cumul_error.error);
	return false;
}
320
321/*
322 * Dump cursor state in the case of the error being EDESTADDRREQ.
323 */
324static void afs_vl_dump_edestaddrreq(const struct afs_vl_cursor *vc)
325{
326	struct afs_cell *cell = vc->cell;
327	static int count;
328	int i;
329
330	if (!IS_ENABLED(CONFIG_AFS_DEBUG_CURSOR) || count > 3)
331		return;
332	count++;
333
334	rcu_read_lock();
335	pr_notice("EDESTADDR occurred\n");
336	pr_notice("CELL: %s err=%d\n", cell->name, cell->error);
337	pr_notice("DNS: src=%u st=%u lc=%x\n",
338		  cell->dns_source, cell->dns_status, cell->dns_lookup_count);
339	pr_notice("VC: ut=%lx ix=%u ni=%hu fl=%hx err=%hd\n",
340		  vc->untried_servers, vc->server_index, vc->nr_iterations,
341		  vc->flags, vc->cumul_error.error);
342	pr_notice("VC: call  er=%d ac=%d r=%u\n",
343		  vc->call_error, vc->call_abort_code, vc->call_responded);
344
345	if (vc->server_list) {
346		const struct afs_vlserver_list *sl = vc->server_list;
347		pr_notice("VC: SL nr=%u ix=%u\n",
348			  sl->nr_servers, sl->index);
349		for (i = 0; i < sl->nr_servers; i++) {
350			const struct afs_vlserver *s = sl->servers[i].server;
351			pr_notice("VC: server %s+%hu fl=%lx E=%hd\n",
352				  s->name, s->port, s->flags, s->probe.error);
353			if (s->addresses) {
354				const struct afs_addr_list *a =
355					rcu_dereference(s->addresses);
356				pr_notice("VC:  - nr=%u/%u/%u pf=%u\n",
357					  a->nr_ipv4, a->nr_addrs, a->max_addrs,
358					  a->preferred);
359				pr_notice("VC:  - R=%lx F=%lx\n",
360					  a->responded, a->probe_failed);
361				if (a == vc->alist)
362					pr_notice("VC:  - current\n");
363			}
364		}
365	}
366
367	pr_notice("AC: t=%lx ax=%u\n", vc->addr_tried, vc->addr_index);
368	rcu_read_unlock();
369}
370
371/*
372 * Tidy up a volume location server cursor and unlock the vnode.
373 */
374int afs_end_vlserver_operation(struct afs_vl_cursor *vc)
375{
376	struct afs_net *net = vc->cell->net;
377
378	_enter("VC=%x+%x", vc->debug_id, vc->nr_iterations);
379
380	switch (vc->cumul_error.error) {
381	case -EDESTADDRREQ:
382	case -EADDRNOTAVAIL:
383	case -ENETUNREACH:
384	case -EHOSTUNREACH:
385		afs_vl_dump_edestaddrreq(vc);
386		break;
387	}
388
389	if (vc->alist) {
390		if (vc->call_responded &&
391		    vc->addr_index != vc->alist->preferred &&
392		    test_bit(vc->alist->preferred, &vc->addr_tried))
393			WRITE_ONCE(vc->alist->preferred, vc->addr_index);
394		afs_put_addrlist(vc->alist, afs_alist_trace_put_vlrotate_end);
395		vc->alist = NULL;
396	}
397	afs_put_vlserverlist(net, vc->server_list);
398	return vc->cumul_error.error;
399}
400