1/* AFS volume location management
2 *
3 * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/kernel.h>
13#include <linux/module.h>
14#include <linux/init.h>
15#include <linux/sched.h>
16#include "internal.h"
17
/* seconds an unreferenced record stays in the graveyard before being reaped */
unsigned int afs_vlocation_timeout = 10;
/* seconds between periodic refreshes of a record from the VL servers */
unsigned int afs_vlocation_update_timeout = 10 * 60;

static void afs_vlocation_reaper(struct work_struct *);
static void afs_vlocation_updater(struct work_struct *);

/* periodic update machinery: queue of records, its lock, the work item and
 * the workqueue the work item is run on */
static LIST_HEAD(afs_vlocation_updates);
static DEFINE_SPINLOCK(afs_vlocation_updates_lock);
static DECLARE_DELAYED_WORK(afs_vlocation_update, afs_vlocation_updater);
static struct workqueue_struct *afs_vlocation_update_worker;

/* reaping machinery: queue of unreferenced records, its lock and the work
 * item that destroys expired entries */
static LIST_HEAD(afs_vlocation_graveyard);
static DEFINE_SPINLOCK(afs_vlocation_graveyard_lock);
static DECLARE_DELAYED_WORK(afs_vlocation_reap, afs_vlocation_reaper);
31
/*
 * ask the cell's VL servers, one after another, for the entry of the volume
 * named in vl->vldb.name
 * - starts at the cell's current server index and makes at most
 *   cell->vl_naddrs attempts, holding cell->vl_sem for writing throughout
 * - success, -ENOMEM, -ENONET and -ENOMEDIUM end the search immediately
 * - -ENETUNREACH, -EHOSTUNREACH and -ECONNREFUSED move on to the next server
 * - any other error is converted to -EIO before moving on to the next server
 * - returns 0 with *vldb filled in by the server call, or a negative error
 *   (-ENOMEDIUM if the cell has no VL server addresses at all)
 */
static int afs_vlocation_access_vl_by_name(struct afs_vlocation *vl,
					   struct key *key,
					   struct afs_cache_vlocation *vldb)
{
	struct afs_cell *cell = vl->cell;
	struct in_addr addr;
	int tries_left, ret;

	_enter("%s,%s", cell->name, vl->vldb.name);

	down_write(&cell->vl_sem);

	ret = -ENOMEDIUM;
	tries_left = cell->vl_naddrs;
	while (tries_left > 0) {
		addr = cell->vl_addrs[cell->vl_curr_svix];

		_debug("CellServ[%hu]: %08x", cell->vl_curr_svix, addr.s_addr);

		/* attempt to access the VL server */
		ret = afs_vl_get_entry_by_name(&addr, key, vl->vldb.name, vldb,
					       &afs_sync_call);

		/* results that settle the matter without trying elsewhere */
		if (ret == 0 || ret == -ENOMEM || ret == -ENONET ||
		    ret == -ENOMEDIUM)
			break;

		/* only "server not reachable" errors are passed through
		 * unchanged; everything else is reported as an I/O error */
		if (ret != -ENETUNREACH && ret != -EHOSTUNREACH &&
		    ret != -ECONNREFUSED)
			ret = -EIO;

		/* rotate the server records upon lookup failure */
		cell->vl_curr_svix++;
		cell->vl_curr_svix %= cell->vl_naddrs;
		tries_left--;
	}

	up_write(&cell->vl_sem);
	_leave(" = %d", ret);
	return ret;
}
85
86/*
87 * iterate through the VL servers in a cell until one of them admits knowing
88 * about the volume in question
89 */
90static int afs_vlocation_access_vl_by_id(struct afs_vlocation *vl,
91					 struct key *key,
92					 afs_volid_t volid,
93					 afs_voltype_t voltype,
94					 struct afs_cache_vlocation *vldb)
95{
96	struct afs_cell *cell = vl->cell;
97	struct in_addr addr;
98	int count, ret;
99
100	_enter("%s,%x,%d,", cell->name, volid, voltype);
101
102	down_write(&vl->cell->vl_sem);
103	ret = -ENOMEDIUM;
104	for (count = cell->vl_naddrs; count > 0; count--) {
105		addr = cell->vl_addrs[cell->vl_curr_svix];
106
107		_debug("CellServ[%hu]: %08x", cell->vl_curr_svix, addr.s_addr);
108
109		/* attempt to access the VL server */
110		ret = afs_vl_get_entry_by_id(&addr, key, volid, voltype, vldb,
111					     &afs_sync_call);
112		switch (ret) {
113		case 0:
114			goto out;
115		case -ENOMEM:
116		case -ENONET:
117		case -ENETUNREACH:
118		case -EHOSTUNREACH:
119		case -ECONNREFUSED:
120			if (ret == -ENOMEM || ret == -ENONET)
121				goto out;
122			goto rotate;
123		case -EBUSY:
124			vl->upd_busy_cnt++;
125			if (vl->upd_busy_cnt <= 3) {
126				if (vl->upd_busy_cnt > 1) {
127					/* second+ BUSY - sleep a little bit */
128					set_current_state(TASK_UNINTERRUPTIBLE);
129					schedule_timeout(1);
130					__set_current_state(TASK_RUNNING);
131				}
132				continue;
133			}
134			break;
135		case -ENOMEDIUM:
136			vl->upd_rej_cnt++;
137			goto rotate;
138		default:
139			ret = -EIO;
140			goto rotate;
141		}
142
143		/* rotate the server records upon lookup failure */
144	rotate:
145		cell->vl_curr_svix++;
146		cell->vl_curr_svix %= cell->vl_naddrs;
147		vl->upd_busy_cnt = 0;
148	}
149
150out:
151	if (ret < 0 && vl->upd_rej_cnt > 0) {
152		printk(KERN_NOTICE "kAFS:"
153		       " Active volume no longer valid '%s'\n",
154		       vl->vldb.name);
155		vl->valid = 0;
156		ret = -ENOMEDIUM;
157	}
158
159	up_write(&vl->cell->vl_sem);
160	_leave(" = %d", ret);
161	return ret;
162}
163
164/*
165 * allocate a volume location record
166 */
167static struct afs_vlocation *afs_vlocation_alloc(struct afs_cell *cell,
168						 const char *name,
169						 size_t namesz)
170{
171	struct afs_vlocation *vl;
172
173	vl = kzalloc(sizeof(struct afs_vlocation), GFP_KERNEL);
174	if (vl) {
175		vl->cell = cell;
176		vl->state = AFS_VL_NEW;
177		atomic_set(&vl->usage, 1);
178		INIT_LIST_HEAD(&vl->link);
179		INIT_LIST_HEAD(&vl->grave);
180		INIT_LIST_HEAD(&vl->update);
181		init_waitqueue_head(&vl->waitq);
182		spin_lock_init(&vl->lock);
183		memcpy(vl->vldb.name, name, namesz);
184	}
185
186	_leave(" = %p", vl);
187	return vl;
188}
189
190/*
191 * update record if we found it in the cache
192 */
193static int afs_vlocation_update_record(struct afs_vlocation *vl,
194				       struct key *key,
195				       struct afs_cache_vlocation *vldb)
196{
197	afs_voltype_t voltype;
198	afs_volid_t vid;
199	int ret;
200
201	/* try to look up a cached volume in the cell VL databases by ID */
202	_debug("Locally Cached: %s %02x { %08x(%x) %08x(%x) %08x(%x) }",
203	       vl->vldb.name,
204	       vl->vldb.vidmask,
205	       ntohl(vl->vldb.servers[0].s_addr),
206	       vl->vldb.srvtmask[0],
207	       ntohl(vl->vldb.servers[1].s_addr),
208	       vl->vldb.srvtmask[1],
209	       ntohl(vl->vldb.servers[2].s_addr),
210	       vl->vldb.srvtmask[2]);
211
212	_debug("Vids: %08x %08x %08x",
213	       vl->vldb.vid[0],
214	       vl->vldb.vid[1],
215	       vl->vldb.vid[2]);
216
217	if (vl->vldb.vidmask & AFS_VOL_VTM_RW) {
218		vid = vl->vldb.vid[0];
219		voltype = AFSVL_RWVOL;
220	} else if (vl->vldb.vidmask & AFS_VOL_VTM_RO) {
221		vid = vl->vldb.vid[1];
222		voltype = AFSVL_ROVOL;
223	} else if (vl->vldb.vidmask & AFS_VOL_VTM_BAK) {
224		vid = vl->vldb.vid[2];
225		voltype = AFSVL_BACKVOL;
226	} else {
227		BUG();
228		vid = 0;
229		voltype = 0;
230	}
231
232	/* contact the server to make sure the volume is still available
233	 * - TODO: need to handle disconnected operation here
234	 */
235	ret = afs_vlocation_access_vl_by_id(vl, key, vid, voltype, vldb);
236	switch (ret) {
237		/* net error */
238	default:
239		printk(KERN_WARNING "kAFS:"
240		       " failed to update volume '%s' (%x) up in '%s': %d\n",
241		       vl->vldb.name, vid, vl->cell->name, ret);
242		_leave(" = %d", ret);
243		return ret;
244
245		/* pulled from local cache into memory */
246	case 0:
247		_leave(" = 0");
248		return 0;
249
250		/* uh oh... looks like the volume got deleted */
251	case -ENOMEDIUM:
252		printk(KERN_ERR "kAFS:"
253		       " volume '%s' (%x) does not exist '%s'\n",
254		       vl->vldb.name, vid, vl->cell->name);
255
256		/* TODO: make existing record unavailable */
257		_leave(" = %d", ret);
258		return ret;
259	}
260}
261
262/*
263 * apply the update to a VL record
264 */
265static void afs_vlocation_apply_update(struct afs_vlocation *vl,
266				       struct afs_cache_vlocation *vldb)
267{
268	_debug("Done VL Lookup: %s %02x { %08x(%x) %08x(%x) %08x(%x) }",
269	       vldb->name, vldb->vidmask,
270	       ntohl(vldb->servers[0].s_addr), vldb->srvtmask[0],
271	       ntohl(vldb->servers[1].s_addr), vldb->srvtmask[1],
272	       ntohl(vldb->servers[2].s_addr), vldb->srvtmask[2]);
273
274	_debug("Vids: %08x %08x %08x",
275	       vldb->vid[0], vldb->vid[1], vldb->vid[2]);
276
277	if (strcmp(vldb->name, vl->vldb.name) != 0)
278		printk(KERN_NOTICE "kAFS:"
279		       " name of volume '%s' changed to '%s' on server\n",
280		       vl->vldb.name, vldb->name);
281
282	vl->vldb = *vldb;
283
284#ifdef AFS_CACHING_SUPPORT
285	/* update volume entry in local cache */
286	cachefs_update_cookie(vl->cache);
287#endif
288}
289
290/*
291 * fill in a volume location record, consulting the cache and the VL server
292 * both
293 */
294static int afs_vlocation_fill_in_record(struct afs_vlocation *vl,
295					struct key *key)
296{
297	struct afs_cache_vlocation vldb;
298	int ret;
299
300	_enter("");
301
302	ASSERTCMP(vl->valid, ==, 0);
303
304	memset(&vldb, 0, sizeof(vldb));
305
306	/* see if we have an in-cache copy (will set vl->valid if there is) */
307#ifdef AFS_CACHING_SUPPORT
308	cachefs_acquire_cookie(cell->cache,
309			       &afs_volume_cache_index_def,
310			       vlocation,
311			       &vl->cache);
312#endif
313
314	if (vl->valid) {
315		/* try to update a known volume in the cell VL databases by
316		 * ID as the name may have changed */
317		_debug("found in cache");
318		ret = afs_vlocation_update_record(vl, key, &vldb);
319	} else {
320		/* try to look up an unknown volume in the cell VL databases by
321		 * name */
322		ret = afs_vlocation_access_vl_by_name(vl, key, &vldb);
323		if (ret < 0) {
324			printk("kAFS: failed to locate '%s' in cell '%s'\n",
325			       vl->vldb.name, vl->cell->name);
326			return ret;
327		}
328	}
329
330	afs_vlocation_apply_update(vl, &vldb);
331	_leave(" = 0");
332	return 0;
333}
334
335/*
336 * queue a vlocation record for updates
337 */
338void afs_vlocation_queue_for_updates(struct afs_vlocation *vl)
339{
340	struct afs_vlocation *xvl;
341
342	/* wait at least 10 minutes before updating... */
343	vl->update_at = get_seconds() + afs_vlocation_update_timeout;
344
345	spin_lock(&afs_vlocation_updates_lock);
346
347	if (!list_empty(&afs_vlocation_updates)) {
348		/* ... but wait at least 1 second more than the newest record
349		 * already queued so that we don't spam the VL server suddenly
350		 * with lots of requests
351		 */
352		xvl = list_entry(afs_vlocation_updates.prev,
353				 struct afs_vlocation, update);
354		if (vl->update_at <= xvl->update_at)
355			vl->update_at = xvl->update_at + 1;
356	} else {
357		queue_delayed_work(afs_vlocation_update_worker,
358				   &afs_vlocation_update,
359				   afs_vlocation_update_timeout * HZ);
360	}
361
362	list_add_tail(&vl->update, &afs_vlocation_updates);
363	spin_unlock(&afs_vlocation_updates_lock);
364}
365
366/*
367 * lookup volume location
368 * - iterate through the VL servers in a cell until one of them admits knowing
369 *   about the volume in question
370 * - lookup in the local cache if not able to find on the VL server
371 * - insert/update in the local cache if did get a VL response
372 */
373struct afs_vlocation *afs_vlocation_lookup(struct afs_cell *cell,
374					   struct key *key,
375					   const char *name,
376					   size_t namesz)
377{
378	struct afs_vlocation *vl;
379	int ret;
380
381	_enter("{%s},{%x},%*.*s,%zu",
382	       cell->name, key_serial(key),
383	       (int) namesz, (int) namesz, name, namesz);
384
385	if (namesz > sizeof(vl->vldb.name)) {
386		_leave(" = -ENAMETOOLONG");
387		return ERR_PTR(-ENAMETOOLONG);
388	}
389
390	/* see if we have an in-memory copy first */
391	down_write(&cell->vl_sem);
392	spin_lock(&cell->vl_lock);
393	list_for_each_entry(vl, &cell->vl_list, link) {
394		if (vl->vldb.name[namesz] != '\0')
395			continue;
396		if (memcmp(vl->vldb.name, name, namesz) == 0)
397			goto found_in_memory;
398	}
399	spin_unlock(&cell->vl_lock);
400
401	/* not in the cell's in-memory lists - create a new record */
402	vl = afs_vlocation_alloc(cell, name, namesz);
403	if (!vl) {
404		up_write(&cell->vl_sem);
405		return ERR_PTR(-ENOMEM);
406	}
407
408	afs_get_cell(cell);
409
410	list_add_tail(&vl->link, &cell->vl_list);
411	vl->state = AFS_VL_CREATING;
412	up_write(&cell->vl_sem);
413
414fill_in_record:
415	ret = afs_vlocation_fill_in_record(vl, key);
416	if (ret < 0)
417		goto error_abandon;
418	spin_lock(&vl->lock);
419	vl->state = AFS_VL_VALID;
420	spin_unlock(&vl->lock);
421	wake_up(&vl->waitq);
422
423	/* schedule for regular updates */
424	afs_vlocation_queue_for_updates(vl);
425	goto success;
426
427found_in_memory:
428	/* found in memory */
429	_debug("found in memory");
430	atomic_inc(&vl->usage);
431	spin_unlock(&cell->vl_lock);
432	if (!list_empty(&vl->grave)) {
433		spin_lock(&afs_vlocation_graveyard_lock);
434		list_del_init(&vl->grave);
435		spin_unlock(&afs_vlocation_graveyard_lock);
436	}
437	up_write(&cell->vl_sem);
438
439	/* see if it was an abandoned record that we might try filling in */
440	spin_lock(&vl->lock);
441	while (vl->state != AFS_VL_VALID) {
442		afs_vlocation_state_t state = vl->state;
443
444		_debug("invalid [state %d]", state);
445
446		if (state == AFS_VL_NEW || state == AFS_VL_NO_VOLUME) {
447			vl->state = AFS_VL_CREATING;
448			spin_unlock(&vl->lock);
449			goto fill_in_record;
450		}
451
452		/* must now wait for creation or update by someone else to
453		 * complete */
454		_debug("wait");
455
456		spin_unlock(&vl->lock);
457		ret = wait_event_interruptible(vl->waitq,
458					       vl->state == AFS_VL_NEW ||
459					       vl->state == AFS_VL_VALID ||
460					       vl->state == AFS_VL_NO_VOLUME);
461		if (ret < 0)
462			goto error;
463		spin_lock(&vl->lock);
464	}
465	spin_unlock(&vl->lock);
466
467success:
468	_leave(" = %p",vl);
469	return vl;
470
471error_abandon:
472	spin_lock(&vl->lock);
473	vl->state = AFS_VL_NEW;
474	spin_unlock(&vl->lock);
475	wake_up(&vl->waitq);
476error:
477	ASSERT(vl != NULL);
478	afs_put_vlocation(vl);
479	_leave(" = %d", ret);
480	return ERR_PTR(ret);
481}
482
483/*
484 * finish using a volume location record
485 */
486void afs_put_vlocation(struct afs_vlocation *vl)
487{
488	if (!vl)
489		return;
490
491	_enter("%s", vl->vldb.name);
492
493	ASSERTCMP(atomic_read(&vl->usage), >, 0);
494
495	if (likely(!atomic_dec_and_test(&vl->usage))) {
496		_leave("");
497		return;
498	}
499
500	spin_lock(&afs_vlocation_graveyard_lock);
501	if (atomic_read(&vl->usage) == 0) {
502		_debug("buried");
503		list_move_tail(&vl->grave, &afs_vlocation_graveyard);
504		vl->time_of_death = get_seconds();
505		schedule_delayed_work(&afs_vlocation_reap,
506				      afs_vlocation_timeout * HZ);
507
508		/* suspend updates on this record */
509		if (!list_empty(&vl->update)) {
510			spin_lock(&afs_vlocation_updates_lock);
511			list_del_init(&vl->update);
512			spin_unlock(&afs_vlocation_updates_lock);
513		}
514	}
515	spin_unlock(&afs_vlocation_graveyard_lock);
516	_leave(" [killed?]");
517}
518
519/*
520 * destroy a dead volume location record
521 */
522static void afs_vlocation_destroy(struct afs_vlocation *vl)
523{
524	_enter("%p", vl);
525
526#ifdef AFS_CACHING_SUPPORT
527	cachefs_relinquish_cookie(vl->cache, 0);
528#endif
529
530	afs_put_cell(vl->cell);
531	kfree(vl);
532}
533
534/*
535 * reap dead volume location records
536 */
537static void afs_vlocation_reaper(struct work_struct *work)
538{
539	LIST_HEAD(corpses);
540	struct afs_vlocation *vl;
541	unsigned long delay, expiry;
542	time_t now;
543
544	_enter("");
545
546	now = get_seconds();
547	spin_lock(&afs_vlocation_graveyard_lock);
548
549	while (!list_empty(&afs_vlocation_graveyard)) {
550		vl = list_entry(afs_vlocation_graveyard.next,
551				struct afs_vlocation, grave);
552
553		_debug("check %p", vl);
554
555		/* the queue is ordered most dead first */
556		expiry = vl->time_of_death + afs_vlocation_timeout;
557		if (expiry > now) {
558			delay = (expiry - now) * HZ;
559			_debug("delay %lu", delay);
560			if (!schedule_delayed_work(&afs_vlocation_reap,
561						   delay)) {
562				cancel_delayed_work(&afs_vlocation_reap);
563				schedule_delayed_work(&afs_vlocation_reap,
564						      delay);
565			}
566			break;
567		}
568
569		spin_lock(&vl->cell->vl_lock);
570		if (atomic_read(&vl->usage) > 0) {
571			_debug("no reap");
572			list_del_init(&vl->grave);
573		} else {
574			_debug("reap");
575			list_move_tail(&vl->grave, &corpses);
576			list_del_init(&vl->link);
577		}
578		spin_unlock(&vl->cell->vl_lock);
579	}
580
581	spin_unlock(&afs_vlocation_graveyard_lock);
582
583	/* now reap the corpses we've extracted */
584	while (!list_empty(&corpses)) {
585		vl = list_entry(corpses.next, struct afs_vlocation, grave);
586		list_del(&vl->grave);
587		afs_vlocation_destroy(vl);
588	}
589
590	_leave("");
591}
592
593/*
594 * initialise the VL update process
595 */
596int __init afs_vlocation_update_init(void)
597{
598	afs_vlocation_update_worker =
599		create_singlethread_workqueue("kafs_vlupdated");
600	return afs_vlocation_update_worker ? 0 : -ENOMEM;
601}
602
603/*
604 * discard all the volume location records for rmmod
605 */
606void afs_vlocation_purge(void)
607{
608	afs_vlocation_timeout = 0;
609
610	spin_lock(&afs_vlocation_updates_lock);
611	list_del_init(&afs_vlocation_updates);
612	spin_unlock(&afs_vlocation_updates_lock);
613	cancel_delayed_work(&afs_vlocation_update);
614	queue_delayed_work(afs_vlocation_update_worker,
615			   &afs_vlocation_update, 0);
616	destroy_workqueue(afs_vlocation_update_worker);
617
618	cancel_delayed_work(&afs_vlocation_reap);
619	schedule_delayed_work(&afs_vlocation_reap, 0);
620}
621
622/*
623 * update a volume location
624 */
625static void afs_vlocation_updater(struct work_struct *work)
626{
627	struct afs_cache_vlocation vldb;
628	struct afs_vlocation *vl, *xvl;
629	time_t now;
630	long timeout;
631	int ret;
632
633	_enter("");
634
635	now = get_seconds();
636
637	/* find a record to update */
638	spin_lock(&afs_vlocation_updates_lock);
639	for (;;) {
640		if (list_empty(&afs_vlocation_updates)) {
641			spin_unlock(&afs_vlocation_updates_lock);
642			_leave(" [nothing]");
643			return;
644		}
645
646		vl = list_entry(afs_vlocation_updates.next,
647				struct afs_vlocation, update);
648		if (atomic_read(&vl->usage) > 0)
649			break;
650		list_del_init(&vl->update);
651	}
652
653	timeout = vl->update_at - now;
654	if (timeout > 0) {
655		queue_delayed_work(afs_vlocation_update_worker,
656				   &afs_vlocation_update, timeout * HZ);
657		spin_unlock(&afs_vlocation_updates_lock);
658		_leave(" [nothing]");
659		return;
660	}
661
662	list_del_init(&vl->update);
663	atomic_inc(&vl->usage);
664	spin_unlock(&afs_vlocation_updates_lock);
665
666	/* we can now perform the update */
667	_debug("update %s", vl->vldb.name);
668	vl->state = AFS_VL_UPDATING;
669	vl->upd_rej_cnt = 0;
670	vl->upd_busy_cnt = 0;
671
672	ret = afs_vlocation_update_record(vl, NULL, &vldb);
673	spin_lock(&vl->lock);
674	switch (ret) {
675	case 0:
676		afs_vlocation_apply_update(vl, &vldb);
677		vl->state = AFS_VL_VALID;
678		break;
679	case -ENOMEDIUM:
680		vl->state = AFS_VL_VOLUME_DELETED;
681		break;
682	default:
683		vl->state = AFS_VL_UNCERTAIN;
684		break;
685	}
686	spin_unlock(&vl->lock);
687	wake_up(&vl->waitq);
688
689	/* and then reschedule */
690	_debug("reschedule");
691	vl->update_at = get_seconds() + afs_vlocation_update_timeout;
692
693	spin_lock(&afs_vlocation_updates_lock);
694
695	if (!list_empty(&afs_vlocation_updates)) {
696		/* next update in 10 minutes, but wait at least 1 second more
697		 * than the newest record already queued so that we don't spam
698		 * the VL server suddenly with lots of requests
699		 */
700		xvl = list_entry(afs_vlocation_updates.prev,
701				 struct afs_vlocation, update);
702		if (vl->update_at <= xvl->update_at)
703			vl->update_at = xvl->update_at + 1;
704		xvl = list_entry(afs_vlocation_updates.next,
705				 struct afs_vlocation, update);
706		timeout = xvl->update_at - now;
707		if (timeout < 0)
708			timeout = 0;
709	} else {
710		timeout = afs_vlocation_update_timeout;
711	}
712
713	ASSERT(list_empty(&vl->update));
714
715	list_add_tail(&vl->update, &afs_vlocation_updates);
716
717	_debug("timeout %ld", timeout);
718	queue_delayed_work(afs_vlocation_update_worker,
719			   &afs_vlocation_update, timeout * HZ);
720	spin_unlock(&afs_vlocation_updates_lock);
721	afs_put_vlocation(vl);
722}
723