/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Copyright (c) 2004-2009 Silicon Graphics, Inc.  All Rights Reserved.
 */

/*
 * Cross Partition Communication (XPC) support - standard version.
 *
 *	XPC provides a message passing capability that crosses partition
 *	boundaries. This module is made up of two parts:
 *
 *	    partition	This part detects the presence/absence of other
 *			partitions. It provides a heartbeat and monitors
 *			the heartbeats of other partitions.
 *
 *	    channel	This part manages the channels and sends/receives
 *			messages across them to/from other partitions.
 *
 *	There are a couple of additional functions residing in XP, which
 *	provide an interface to XPC for its users.
 *
 *
 *	Caveats:
 *
 *	  . Currently on sn2, we have no way to determine which nasid an IRQ
 *	    came from. Thus, xpc_send_IRQ_sn2() does a remote amo write
 *	    followed by an IPI. The amo indicates where data is to be pulled
 *	    from, so after the IPI arrives, the remote partition checks the amo
 *	    word. The IPI can actually arrive before the amo however, so other
 *	    code must periodically check for this case. Also, remote amo
 *	    operations do not reliably time out. Thus we do a remote PIO read
 *	    solely to know whether the remote partition is down and whether we
 *	    should stop sending IPIs to it. This remote PIO read operation is
 *	    set up in a special nofault region so SAL knows to ignore (and
 *	    cleanup) any errors due to the remote amo write, PIO read, and/or
 *	    PIO write operations.
 *
 *	    If/when new hardware solves this IPI problem, we should abandon
 *	    the current approach.
 *
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/device.h>
#include <linux/delay.h>
#include <linux/reboot.h>
#include <linux/kdebug.h>
#include <linux/kthread.h>
#include "xpc.h"

/* define two XPC debug device structures to be used with dev_dbg() et al */

struct device_driver xpc_dbg_name = {
	.name = "xpc"
};

struct device xpc_part_dbg_subname = {
	.init_name = "",	/* set to "part" at xpc_init() time */
	.driver = &xpc_dbg_name
};

struct device xpc_chan_dbg_subname = {
	.init_name = "",	/* set to "chan" at xpc_init() time */
	.driver = &xpc_dbg_name
};

struct device *xpc_part = &xpc_part_dbg_subname;
struct device *xpc_chan = &xpc_chan_dbg_subname;

static int xpc_kdebug_ignore;

/* systune related variables for /proc/sys directories */

static int xpc_hb_interval = XPC_HB_DEFAULT_INTERVAL;
static int xpc_hb_min_interval = 1;
static int xpc_hb_max_interval = 10;

static int xpc_hb_check_interval = XPC_HB_CHECK_DEFAULT_INTERVAL;
static int xpc_hb_check_min_interval = 10;
static int xpc_hb_check_max_interval = 120;

int xpc_disengage_timelimit = XPC_DISENGAGE_DEFAULT_TIMELIMIT;
static int xpc_disengage_min_timelimit;	/* = 0 */
static int xpc_disengage_max_timelimit = 120;

static ctl_table xpc_sys_xpc_hb_dir[] = {
	{
	 .procname = "hb_interval",
	 .data = &xpc_hb_interval,
	 .maxlen = sizeof(int),
	 .mode = 0644,
	 .proc_handler = proc_dointvec_minmax,
	 .extra1 = &xpc_hb_min_interval,
	 .extra2 = &xpc_hb_max_interval},
	{
	 .procname = "hb_check_interval",
	 .data = &xpc_hb_check_interval,
	 .maxlen = sizeof(int),
	 .mode = 0644,
	 .proc_handler = proc_dointvec_minmax,
	 .extra1 = &xpc_hb_check_min_interval,
	 .extra2 = &xpc_hb_check_max_interval},
	{}
};
static ctl_table xpc_sys_xpc_dir[] = {
	{
	 .procname = "hb",
	 .mode = 0555,
	 .child = xpc_sys_xpc_hb_dir},
	{
	 .procname = "disengage_timelimit",
	 .data = &xpc_disengage_timelimit,
	 .maxlen = sizeof(int),
	 .mode = 0644,
	 .proc_handler = proc_dointvec_minmax,
	 .extra1 = &xpc_disengage_min_timelimit,
	 .extra2 = &xpc_disengage_max_timelimit},
	{}
};
static ctl_table xpc_sys_dir[] = {
	{
	 .procname = "xpc",
	 .mode = 0555,
	 .child = xpc_sys_xpc_dir},
	{}
};
static struct ctl_table_header *xpc_sysctl;

/* non-zero if any remote partition disengage was timed out */
int xpc_disengage_timedout;

/* #of activate IRQs received and not yet processed */
int xpc_activate_IRQ_rcvd;
DEFINE_SPINLOCK(xpc_activate_IRQ_rcvd_lock);

/* IRQ handler notifies this wait queue on receipt of an IRQ */
DECLARE_WAIT_QUEUE_HEAD(xpc_activate_IRQ_wq);

static unsigned long xpc_hb_check_timeout;
static struct timer_list xpc_hb_timer;

/* notification that the xpc_hb_checker thread has exited */
static DECLARE_COMPLETION(xpc_hb_checker_exited);

/* notification that the xpc_discovery thread has exited */
static DECLARE_COMPLETION(xpc_discovery_exited);

static void xpc_kthread_waitmsgs(struct xpc_partition *, struct xpc_channel *);

static int xpc_system_reboot(struct notifier_block *, unsigned long, void *);
static struct notifier_block xpc_reboot_notifier = {
	.notifier_call = xpc_system_reboot,
};

static int xpc_system_die(struct notifier_block *, unsigned long, void *);
static struct notifier_block xpc_die_notifier = {
	.notifier_call = xpc_system_die,
};

struct xpc_arch_operations xpc_arch_ops;

/*
 * Timer function to enforce the timelimit on the partition disengage.
 */
static void
xpc_timeout_partition_disengage(unsigned long data)
{
	struct xpc_partition *part = (struct xpc_partition *)data;

	DBUG_ON(time_is_after_jiffies(part->disengage_timeout));

	(void)xpc_partition_disengaged(part);

	DBUG_ON(part->disengage_timeout != 0);
	DBUG_ON(xpc_arch_ops.partition_engaged(XPC_PARTID(part)));
}

/*
 * Timer to produce the heartbeat.  The timer structure's function is
 * already set when this is initially called.  A tunable is used to
 * specify when the next timeout should occur.
 */
static void
xpc_hb_beater(unsigned long dummy)
{
	xpc_arch_ops.increment_heartbeat();

	if (time_is_before_eq_jiffies(xpc_hb_check_timeout))
		wake_up_interruptible(&xpc_activate_IRQ_wq);

	xpc_hb_timer.expires = jiffies + (xpc_hb_interval * HZ);
	add_timer(&xpc_hb_timer);
}

static void
xpc_start_hb_beater(void)
{
	xpc_arch_ops.heartbeat_init();
	init_timer(&xpc_hb_timer);
	xpc_hb_timer.function = xpc_hb_beater;
	xpc_hb_beater(0);
}

static void
xpc_stop_hb_beater(void)
{
	del_timer_sync(&xpc_hb_timer);
	xpc_arch_ops.heartbeat_exit();
}

/*
 * At periodic intervals, scan through all active partitions and ensure
 * their heartbeat is still active.  If not, the partition is deactivated.
 */
static void
xpc_check_remote_hb(void)
{
	struct xpc_partition *part;
	short partid;
	enum xp_retval ret;

	for (partid = 0; partid < xp_max_npartitions; partid++) {

		if (xpc_exiting)
			break;

		if (partid == xp_partition_id)
			continue;

		part = &xpc_partitions[partid];

		if (part->act_state == XPC_P_AS_INACTIVE ||
		    part->act_state == XPC_P_AS_DEACTIVATING) {
			continue;
		}

		ret = xpc_arch_ops.get_remote_heartbeat(part);
		if (ret != xpSuccess)
			XPC_DEACTIVATE_PARTITION(part, ret);
	}
}

/*
 * This thread is responsible for nearly all of the partition
 * activation/deactivation.
 */
static int
xpc_hb_checker(void *ignore)
{
	int force_IRQ = 0;

	/* this thread was marked active by xpc_hb_init() */

	set_cpus_allowed_ptr(current, cpumask_of(XPC_HB_CHECK_CPU));

	/* set our heartbeating to other partitions into motion */
	xpc_hb_check_timeout = jiffies + (xpc_hb_check_interval * HZ);
	xpc_start_hb_beater();

	while (!xpc_exiting) {

		dev_dbg(xpc_part, "woke up with %d ticks rem; %d IRQs have "
			"been received\n",
			(int)(xpc_hb_check_timeout - jiffies),
			xpc_activate_IRQ_rcvd);

		/* checking of remote heartbeats is skewed by IRQ handling */
		if (time_is_before_eq_jiffies(xpc_hb_check_timeout)) {
			xpc_hb_check_timeout = jiffies +
			    (xpc_hb_check_interval * HZ);

			dev_dbg(xpc_part, "checking remote heartbeats\n");
			xpc_check_remote_hb();

			/*
			 * On sn2 we need to periodically recheck to ensure no
			 * IRQ/amo pairs have been missed.
			 */
			if (is_shub())
				force_IRQ = 1;
		}

		/* check for outstanding IRQs */
		if (xpc_activate_IRQ_rcvd > 0 || force_IRQ != 0) {
			force_IRQ = 0;
			dev_dbg(xpc_part, "processing activate IRQs "
				"received\n");
			xpc_arch_ops.process_activate_IRQ_rcvd();
		}

		/* wait for IRQ or timeout */
		(void)wait_event_interruptible(xpc_activate_IRQ_wq,
					       (time_is_before_eq_jiffies(
						xpc_hb_check_timeout) ||
						xpc_activate_IRQ_rcvd > 0 ||
						xpc_exiting));
	}

	xpc_stop_hb_beater();

	dev_dbg(xpc_part, "heartbeat checker is exiting\n");

	/* mark this thread as having exited */
	complete(&xpc_hb_checker_exited);
	return 0;
}

/*
 * This thread will attempt to discover other partitions to activate
 * based on info provided by SAL. This new thread is short lived and
 * will exit once discovery is complete.
 */
static int
xpc_initiate_discovery(void *ignore)
{
	xpc_discovery();

	dev_dbg(xpc_part, "discovery thread is exiting\n");

	/* mark this thread as having exited */
	complete(&xpc_discovery_exited);
	return 0;
}

/*
 * The first kthread assigned to a newly activated partition is the one
 * created by XPC HB with which it calls xpc_activating(). XPC hangs on to
 * that kthread until the partition is brought down, at which time that kthread
 * returns back to XPC HB. (The return of that kthread will signify to XPC HB
 * that XPC has dismantled all communication infrastructure for the associated
 * partition.) This kthread becomes the channel manager for that partition.
 *
 * Each active partition has a channel manager, who, besides connecting and
 * disconnecting channels, will ensure that each of the partition's connected
 * channels has the required number of assigned kthreads to get the work done.
 */
static void
xpc_channel_mgr(struct xpc_partition *part)
{
	while (part->act_state != XPC_P_AS_DEACTIVATING ||
	       atomic_read(&part->nchannels_active) > 0 ||
	       !xpc_partition_disengaged(part)) {

		xpc_process_sent_chctl_flags(part);

		/*
		 * Wait until we've been requested to activate kthreads or
		 * all of the channel's message queues have been torn down or
		 * a signal is pending.
		 *
		 * The channel_mgr_requests is set to 1 after being awakened.
		 * This is done to prevent the channel mgr from making one pass
		 * through the loop for each request, since it will
		 * be servicing all the requests in one pass. The reason it's
		 * set to 1 instead of 0 is so that other kthreads will know
		 * that the channel mgr is running and won't bother trying to
		 * wake it up.
		 */
		atomic_dec(&part->channel_mgr_requests);
		(void)wait_event_interruptible(part->channel_mgr_wq,
				(atomic_read(&part->channel_mgr_requests) > 0 ||
				 part->chctl.all_flags != 0 ||
				 (part->act_state == XPC_P_AS_DEACTIVATING &&
				 atomic_read(&part->nchannels_active) == 0 &&
				 xpc_partition_disengaged(part))));
		atomic_set(&part->channel_mgr_requests, 1);
	}
}

/*
 * Guarantee that the kzalloc'd memory is cacheline aligned.
 */
void *
xpc_kzalloc_cacheline_aligned(size_t size, gfp_t flags, void **base)
{
	/* see if kzalloc will give us cacheline aligned memory by default */
	*base = kzalloc(size, flags);
	if (*base == NULL)
		return NULL;

	if ((u64)*base == L1_CACHE_ALIGN((u64)*base))
		return *base;

	kfree(*base);

	/* nope, we'll have to do it ourselves */
	*base = kzalloc(size + L1_CACHE_BYTES, flags);
	if (*base == NULL)
		return NULL;

	return (void *)L1_CACHE_ALIGN((u64)*base);
}

/*
 * Setup the channel structures necessary to support XPartition Communication
 * between the specified remote partition and the local one.
 */
static enum xp_retval
xpc_setup_ch_structures(struct xpc_partition *part)
{
	enum xp_retval ret;
	int ch_number;
	struct xpc_channel *ch;
	short partid = XPC_PARTID(part);

	/*
	 * Allocate all of the channel structures as a contiguous chunk of
	 * memory.
	 */
	DBUG_ON(part->channels != NULL);
	part->channels = kzalloc(sizeof(struct xpc_channel) * XPC_MAX_NCHANNELS,
				 GFP_KERNEL);
	if (part->channels == NULL) {
		dev_err(xpc_chan, "can't get memory for channels\n");
		return xpNoMemory;
	}

	/* allocate the remote open and close args */

	part->remote_openclose_args =
	    xpc_kzalloc_cacheline_aligned(XPC_OPENCLOSE_ARGS_SIZE,
					  GFP_KERNEL, &part->
					  remote_openclose_args_base);
	if (part->remote_openclose_args == NULL) {
		dev_err(xpc_chan, "can't get memory for remote connect args\n");
		ret = xpNoMemory;
		goto out_1;
	}

	part->chctl.all_flags = 0;
	spin_lock_init(&part->chctl_lock);

	atomic_set(&part->channel_mgr_requests, 1);
	init_waitqueue_head(&part->channel_mgr_wq);

	part->nchannels = XPC_MAX_NCHANNELS;

	atomic_set(&part->nchannels_active, 0);
	atomic_set(&part->nchannels_engaged, 0);

	for (ch_number = 0; ch_number < part->nchannels; ch_number++) {
		ch = &part->channels[ch_number];

		ch->partid = partid;
		ch->number = ch_number;
		ch->flags = XPC_C_DISCONNECTED;

		atomic_set(&ch->kthreads_assigned, 0);
		atomic_set(&ch->kthreads_idle, 0);
		atomic_set(&ch->kthreads_active, 0);

		atomic_set(&ch->references, 0);
		atomic_set(&ch->n_to_notify, 0);

		spin_lock_init(&ch->lock);
		init_completion(&ch->wdisconnect_wait);

		atomic_set(&ch->n_on_msg_allocate_wq, 0);
		init_waitqueue_head(&ch->msg_allocate_wq);
		init_waitqueue_head(&ch->idle_wq);
	}

	ret = xpc_arch_ops.setup_ch_structures(part);
	if (ret != xpSuccess)
		goto out_2;

	/*
	 * With the setting of the partition setup_state to XPC_P_SS_SETUP,
	 * we're declaring that this partition is ready to go.
	 */
	part->setup_state = XPC_P_SS_SETUP;

	return xpSuccess;

	/* setup of ch structures failed */
out_2:
	kfree(part->remote_openclose_args_base);
	part->remote_openclose_args = NULL;
out_1:
	kfree(part->channels);
	part->channels = NULL;
	return ret;
}

/*
 * Teardown the channel structures necessary to support XPartition Communication
 * between the specified remote partition and the local one.
 */
static void
xpc_teardown_ch_structures(struct xpc_partition *part)
{
	DBUG_ON(atomic_read(&part->nchannels_engaged) != 0);
	DBUG_ON(atomic_read(&part->nchannels_active) != 0);

	/*
	 * Make this partition inaccessible to local processes by marking it
	 * as no longer setup. Then wait before proceeding with the teardown
	 * until all existing references cease.
	 */
	DBUG_ON(part->setup_state != XPC_P_SS_SETUP);
	part->setup_state = XPC_P_SS_WTEARDOWN;

	wait_event(part->teardown_wq, (atomic_read(&part->references) == 0));

	/* now we can begin tearing down the infrastructure */

	xpc_arch_ops.teardown_ch_structures(part);

	kfree(part->remote_openclose_args_base);
	part->remote_openclose_args = NULL;
	kfree(part->channels);
	part->channels = NULL;

	part->setup_state = XPC_P_SS_TORNDOWN;
}

/*
 * When XPC HB determines that a partition has come up, it will create a new
 * kthread and that kthread will call this function to attempt to set up the
 * basic infrastructure used for Cross Partition Communication with the newly
 * upped partition.
 *
 * The kthread that was created by XPC HB and which set up the XPC
 * infrastructure will remain assigned to the partition, becoming the channel
 * manager for that partition until the partition is deactivating, at which
 * time the kthread will tear down the XPC infrastructure and then exit.
 */
static int
xpc_activating(void *__partid)
{
	short partid = (u64)__partid;
	struct xpc_partition *part = &xpc_partitions[partid];
	unsigned long irq_flags;

	DBUG_ON(partid < 0 || partid >= xp_max_npartitions);

	spin_lock_irqsave(&part->act_lock, irq_flags);

	if (part->act_state == XPC_P_AS_DEACTIVATING) {
		part->act_state = XPC_P_AS_INACTIVE;
		spin_unlock_irqrestore(&part->act_lock, irq_flags);
		part->remote_rp_pa = 0;
		return 0;
	}

	/* indicate the thread is activating */
	DBUG_ON(part->act_state != XPC_P_AS_ACTIVATION_REQ);
	part->act_state = XPC_P_AS_ACTIVATING;

	XPC_SET_REASON(part, 0, 0);
	spin_unlock_irqrestore(&part->act_lock, irq_flags);

	dev_dbg(xpc_part, "activating partition %d\n", partid);

	xpc_arch_ops.allow_hb(partid);

	if (xpc_setup_ch_structures(part) == xpSuccess) {
		(void)xpc_part_ref(part);	/* this will always succeed */

		if (xpc_arch_ops.make_first_contact(part) == xpSuccess) {
			xpc_mark_partition_active(part);
			xpc_channel_mgr(part);
			/* won't return until partition is deactivating */
		}

		xpc_part_deref(part);
		xpc_teardown_ch_structures(part);
	}

	xpc_arch_ops.disallow_hb(partid);
	xpc_mark_partition_inactive(part);

	if (part->reason == xpReactivating) {
		/* interrupting ourselves results in activating partition */
		xpc_arch_ops.request_partition_reactivation(part);
	}

	return 0;
}

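/*
 * Request activation of the specified partition by marking it as having an
 * activation request pending and spawning a kthread to run xpc_activating().
 */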
void
xpc_activate_partition(struct xpc_partition *part)
{
	short partid = XPC_PARTID(part);
	unsigned long irq_flags;
	struct task_struct *kthread;

	spin_lock_irqsave(&part->act_lock, irq_flags);

	DBUG_ON(part->act_state != XPC_P_AS_INACTIVE);

	part->act_state = XPC_P_AS_ACTIVATION_REQ;
	XPC_SET_REASON(part, xpCloneKThread, __LINE__);

	spin_unlock_irqrestore(&part->act_lock, irq_flags);

	kthread = kthread_run(xpc_activating, (void *)((u64)partid), "xpc%02d",
			      partid);
	if (IS_ERR(kthread)) {
		spin_lock_irqsave(&part->act_lock, irq_flags);
		part->act_state = XPC_P_AS_INACTIVE;
		XPC_SET_REASON(part, xpCloneKThreadFailed, __LINE__);
		spin_unlock_irqrestore(&part->act_lock, irq_flags);
	}
}

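/*
 * Wake up any idle kthreads assigned to the channel and, if more are still
 * needed (and the assigned limit permits), create additional ones.
 */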
void
xpc_activate_kthreads(struct xpc_channel *ch, int needed)
{
	int idle = atomic_read(&ch->kthreads_idle);
	int assigned = atomic_read(&ch->kthreads_assigned);
	int wakeup;

	DBUG_ON(needed <= 0);

	if (idle > 0) {
		wakeup = (needed > idle) ? idle : needed;
		needed -= wakeup;

		dev_dbg(xpc_chan, "wakeup %d idle kthreads, partid=%d, "
			"channel=%d\n", wakeup, ch->partid, ch->number);

		/* only wakeup the requested number of kthreads */
		wake_up_nr(&ch->idle_wq, wakeup);
	}

	if (needed <= 0)
		return;

	if (needed + assigned > ch->kthreads_assigned_limit) {
		needed = ch->kthreads_assigned_limit - assigned;
		if (needed <= 0)
			return;
	}

	dev_dbg(xpc_chan, "create %d new kthreads, partid=%d, channel=%d\n",
		needed, ch->partid, ch->number);

	xpc_create_kthreads(ch, needed, 0);
}

/*
 * This function is where XPC's kthreads wait for messages to deliver.
 */
static void
xpc_kthread_waitmsgs(struct xpc_partition *part, struct xpc_channel *ch)
{
	int (*n_of_deliverable_payloads) (struct xpc_channel *) =
		xpc_arch_ops.n_of_deliverable_payloads;

	do {
		/* deliver messages to their intended recipients */

		while (n_of_deliverable_payloads(ch) > 0 &&
		       !(ch->flags & XPC_C_DISCONNECTING)) {
			xpc_deliver_payload(ch);
		}

		if (atomic_inc_return(&ch->kthreads_idle) >
		    ch->kthreads_idle_limit) {
			/* too many idle kthreads on this channel */
			atomic_dec(&ch->kthreads_idle);
			break;
		}

		dev_dbg(xpc_chan, "idle kthread calling "
			"wait_event_interruptible_exclusive()\n");

		(void)wait_event_interruptible_exclusive(ch->idle_wq,
				(n_of_deliverable_payloads(ch) > 0 ||
				 (ch->flags & XPC_C_DISCONNECTING)));

		atomic_dec(&ch->kthreads_idle);

	} while (!(ch->flags & XPC_C_DISCONNECTING));
}

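/*
 * Entry point for the kthreads created to service a channel's messages. Makes
 * the connected callout if needed, waits for and delivers payloads until the
 * channel starts disconnecting, then makes the disconnecting callout and
 * drops this kthread's references before exiting.
 */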
static int
xpc_kthread_start(void *args)
{
	short partid = XPC_UNPACK_ARG1(args);
	u16 ch_number = XPC_UNPACK_ARG2(args);
	struct xpc_partition *part = &xpc_partitions[partid];
	struct xpc_channel *ch;
	int n_needed;
	unsigned long irq_flags;
	int (*n_of_deliverable_payloads) (struct xpc_channel *) =
		xpc_arch_ops.n_of_deliverable_payloads;

	dev_dbg(xpc_chan, "kthread starting, partid=%d, channel=%d\n",
		partid, ch_number);

	ch = &part->channels[ch_number];

	if (!(ch->flags & XPC_C_DISCONNECTING)) {

		/* let registerer know that connection has been established */

		spin_lock_irqsave(&ch->lock, irq_flags);
		if (!(ch->flags & XPC_C_CONNECTEDCALLOUT)) {
			ch->flags |= XPC_C_CONNECTEDCALLOUT;
			spin_unlock_irqrestore(&ch->lock, irq_flags);

			xpc_connected_callout(ch);

			spin_lock_irqsave(&ch->lock, irq_flags);
			ch->flags |= XPC_C_CONNECTEDCALLOUT_MADE;
			spin_unlock_irqrestore(&ch->lock, irq_flags);

			/*
			 * It is possible that while the callout was being
			 * made that the remote partition sent some messages.
			 * If that is the case, we may need to activate
			 * additional kthreads to help deliver them. We only
			 * need one less than total #of messages to deliver.
			 */
			n_needed = n_of_deliverable_payloads(ch) - 1;
			if (n_needed > 0 && !(ch->flags & XPC_C_DISCONNECTING))
				xpc_activate_kthreads(ch, n_needed);

		} else {
			spin_unlock_irqrestore(&ch->lock, irq_flags);
		}

		xpc_kthread_waitmsgs(part, ch);
	}

	/* let registerer know that connection is disconnecting */

	spin_lock_irqsave(&ch->lock, irq_flags);
	if ((ch->flags & XPC_C_CONNECTEDCALLOUT_MADE) &&
	    !(ch->flags & XPC_C_DISCONNECTINGCALLOUT)) {
		ch->flags |= XPC_C_DISCONNECTINGCALLOUT;
		spin_unlock_irqrestore(&ch->lock, irq_flags);

		xpc_disconnect_callout(ch, xpDisconnecting);

		spin_lock_irqsave(&ch->lock, irq_flags);
		ch->flags |= XPC_C_DISCONNECTINGCALLOUT_MADE;
	}
	spin_unlock_irqrestore(&ch->lock, irq_flags);

	if (atomic_dec_return(&ch->kthreads_assigned) == 0 &&
	    atomic_dec_return(&part->nchannels_engaged) == 0) {
		xpc_arch_ops.indicate_partition_disengaged(part);
	}

	xpc_msgqueue_deref(ch);

	dev_dbg(xpc_chan, "kthread exiting, partid=%d, channel=%d\n",
		partid, ch_number);

	xpc_part_deref(part);
	return 0;
}

/*
 * For each partition that XPC has established communications with, there is
 * a minimum of one kernel thread assigned to perform any operation that
 * may potentially sleep or block (basically the callouts to the asynchronous
 * functions registered via xpc_connect()).
 *
 * Additional kthreads are created and destroyed by XPC as the workload
 * demands.
 *
 * A kthread is assigned to one of the active channels that exists for a given
 * partition.
 */
void
xpc_create_kthreads(struct xpc_channel *ch, int needed,
		    int ignore_disconnecting)
{
	unsigned long irq_flags;
	u64 args = XPC_PACK_ARGS(ch->partid, ch->number);
	struct xpc_partition *part = &xpc_partitions[ch->partid];
	struct task_struct *kthread;
	void (*indicate_partition_disengaged) (struct xpc_partition *) =
		xpc_arch_ops.indicate_partition_disengaged;

	while (needed-- > 0) {

		/*
		 * The following is done on behalf of the newly created
		 * kthread. That kthread is responsible for doing the
		 * counterpart to the following before it exits.
		 */
		if (ignore_disconnecting) {
			if (!atomic_inc_not_zero(&ch->kthreads_assigned)) {
				/* kthreads assigned had gone to zero */
				BUG_ON(!(ch->flags &
					 XPC_C_DISCONNECTINGCALLOUT_MADE));
				break;
			}

		} else if (ch->flags & XPC_C_DISCONNECTING) {
			break;

		} else if (atomic_inc_return(&ch->kthreads_assigned) == 1 &&
			   atomic_inc_return(&part->nchannels_engaged) == 1) {
			xpc_arch_ops.indicate_partition_engaged(part);
		}
		(void)xpc_part_ref(part);
		xpc_msgqueue_ref(ch);

		kthread = kthread_run(xpc_kthread_start, (void *)args,
				      "xpc%02dc%d", ch->partid, ch->number);
		if (IS_ERR(kthread)) {
			/* the fork failed */

			/*
			 * NOTE: if (ignore_disconnecting &&
			 * !(ch->flags & XPC_C_DISCONNECTINGCALLOUT)) is true,
			 * then we'll deadlock if all other kthreads assigned
			 * to this channel are blocked in the channel's
			 * registerer, because the only thing that will unblock
			 * them is the xpDisconnecting callout that this
			 * failed kthread_run() would have made.
			 */

			if (atomic_dec_return(&ch->kthreads_assigned) == 0 &&
			    atomic_dec_return(&part->nchannels_engaged) == 0) {
				indicate_partition_disengaged(part);
			}
			xpc_msgqueue_deref(ch);
			xpc_part_deref(part);

			if (atomic_read(&ch->kthreads_assigned) <
			    ch->kthreads_idle_limit) {
				/*
				 * Flag this as an error only if we have an
				 * insufficient #of kthreads for the channel
				 * to function.
				 */
				spin_lock_irqsave(&ch->lock, irq_flags);
				XPC_DISCONNECT_CHANNEL(ch, xpLackOfResources,
						       &irq_flags);
				spin_unlock_irqrestore(&ch->lock, irq_flags);
			}
			break;
		}
	}
}

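/*
 * Wait for disconnection of the specified channel number on every partition
 * to complete (i.e., for all callouts to the registerer's function to cease),
 * propagating any delayed chctl flags to the partition's channel manager.
 */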
void
xpc_disconnect_wait(int ch_number)
{
	unsigned long irq_flags;
	short partid;
	struct xpc_partition *part;
	struct xpc_channel *ch;
	int wakeup_channel_mgr;

	/* now wait for all callouts to the caller's function to cease */
	for (partid = 0; partid < xp_max_npartitions; partid++) {
		part = &xpc_partitions[partid];

		if (!xpc_part_ref(part))
			continue;

		ch = &part->channels[ch_number];

		if (!(ch->flags & XPC_C_WDISCONNECT)) {
			xpc_part_deref(part);
			continue;
		}

		wait_for_completion(&ch->wdisconnect_wait);

		spin_lock_irqsave(&ch->lock, irq_flags);
		DBUG_ON(!(ch->flags & XPC_C_DISCONNECTED));
		wakeup_channel_mgr = 0;

		if (ch->delayed_chctl_flags) {
			if (part->act_state != XPC_P_AS_DEACTIVATING) {
				spin_lock(&part->chctl_lock);
				part->chctl.flags[ch->number] |=
				    ch->delayed_chctl_flags;
				spin_unlock(&part->chctl_lock);
				wakeup_channel_mgr = 1;
			}
			ch->delayed_chctl_flags = 0;
		}

		ch->flags &= ~XPC_C_WDISCONNECT;
		spin_unlock_irqrestore(&ch->lock, irq_flags);

		if (wakeup_channel_mgr)
			xpc_wakeup_channel_mgr(part);

		xpc_part_deref(part);
	}
}

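/*
 * Allocate and initialize the xpc_partitions[] array, then let the
 * architecture-specific code do its part of the partition setup.
 */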
static int
xpc_setup_partitions(void)
{
	short partid;
	struct xpc_partition *part;

	xpc_partitions = kzalloc(sizeof(struct xpc_partition) *
				 xp_max_npartitions, GFP_KERNEL);
	if (xpc_partitions == NULL) {
		dev_err(xpc_part, "can't get memory for partition structure\n");
		return -ENOMEM;
	}

	/*
	 * The first few fields of each entry of xpc_partitions[] need to
	 * be initialized now so that calls to xpc_connect() and
	 * xpc_disconnect() can be made prior to the activation of any remote
	 * partition. NOTE THAT NONE OF THE OTHER FIELDS BELONGING TO THESE
	 * ENTRIES ARE MEANINGFUL UNTIL AFTER AN ENTRY'S CORRESPONDING
	 * PARTITION HAS BEEN ACTIVATED.
	 */
	for (partid = 0; partid < xp_max_npartitions; partid++) {
		part = &xpc_partitions[partid];

		DBUG_ON((u64)part != L1_CACHE_ALIGN((u64)part));

		part->activate_IRQ_rcvd = 0;
		spin_lock_init(&part->act_lock);
		part->act_state = XPC_P_AS_INACTIVE;
		XPC_SET_REASON(part, 0, 0);

		init_timer(&part->disengage_timer);
		part->disengage_timer.function =
		    xpc_timeout_partition_disengage;
		part->disengage_timer.data = (unsigned long)part;

		part->setup_state = XPC_P_SS_UNSET;
		init_waitqueue_head(&part->teardown_wq);
		atomic_set(&part->references, 0);
	}

	return xpc_arch_ops.setup_partitions();
}

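/* undo what was done by xpc_setup_partitions() */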
static void
xpc_teardown_partitions(void)
{
	xpc_arch_ops.teardown_partitions();
	kfree(xpc_partitions);
}

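/*
 * Shut XPC down: stop the heartbeat checker and discovery threads, deactivate
 * all remote partitions and wait for them to disengage, then unregister the
 * notifiers and tear down the remaining infrastructure.
 */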
static void
xpc_do_exit(enum xp_retval reason)
{
	short partid;
	int active_part_count, printed_waiting_msg = 0;
	struct xpc_partition *part;
	unsigned long printmsg_time, disengage_timeout = 0;

	/* a 'rmmod XPC' and a 'reboot' cannot both end up here together */
	DBUG_ON(xpc_exiting == 1);

	/*
	 * Let the heartbeat checker thread and the discovery thread
	 * (if one is running) know that they should exit. Also wake up
	 * the heartbeat checker thread in case it's sleeping.
	 */
	xpc_exiting = 1;
	wake_up_interruptible(&xpc_activate_IRQ_wq);

	/* wait for the discovery thread to exit */
	wait_for_completion(&xpc_discovery_exited);

	/* wait for the heartbeat checker thread to exit */
	wait_for_completion(&xpc_hb_checker_exited);

	/* sleep for a 1/3 of a second or so */
	(void)msleep_interruptible(300);

	/* wait for all partitions to become inactive */

	printmsg_time = jiffies + (XPC_DEACTIVATE_PRINTMSG_INTERVAL * HZ);
	xpc_disengage_timedout = 0;

	do {
		active_part_count = 0;

		for (partid = 0; partid < xp_max_npartitions; partid++) {
			part = &xpc_partitions[partid];

			if (xpc_partition_disengaged(part) &&
			    part->act_state == XPC_P_AS_INACTIVE) {
				continue;
			}

			active_part_count++;

			XPC_DEACTIVATE_PARTITION(part, reason);

			if (part->disengage_timeout > disengage_timeout)
				disengage_timeout = part->disengage_timeout;
		}

		if (xpc_arch_ops.any_partition_engaged()) {
			if (time_is_before_jiffies(printmsg_time)) {
				dev_info(xpc_part, "waiting for remote "
					 "partitions to deactivate, timeout in "
					 "%ld seconds\n", (disengage_timeout -
					 jiffies) / HZ);
				printmsg_time = jiffies +
				    (XPC_DEACTIVATE_PRINTMSG_INTERVAL * HZ);
				printed_waiting_msg = 1;
			}

		} else if (active_part_count > 0) {
			if (printed_waiting_msg) {
				dev_info(xpc_part, "waiting for local partition"
					 " to deactivate\n");
				printed_waiting_msg = 0;
			}

		} else {
			if (!xpc_disengage_timedout) {
				dev_info(xpc_part, "all partitions have "
					 "deactivated\n");
			}
			break;
		}

		/* sleep for a 1/3 of a second or so */
		(void)msleep_interruptible(300);

	} while (1);

	DBUG_ON(xpc_arch_ops.any_partition_engaged());

	xpc_teardown_rsvd_page();

	if (reason == xpUnloading) {
		(void)unregister_die_notifier(&xpc_die_notifier);
		(void)unregister_reboot_notifier(&xpc_reboot_notifier);
	}

	/* clear the interface to XPC's functions */
	xpc_clear_interface();

	if (xpc_sysctl)
		unregister_sysctl_table(xpc_sysctl);

	xpc_teardown_partitions();

	if (is_shub())
		xpc_exit_sn2();
	else if (is_uv())
		xpc_exit_uv();
}

/*
 * This function is called when the system is being rebooted.
 */
static int
xpc_system_reboot(struct notifier_block *nb, unsigned long event, void *unused)
{
	enum xp_retval reason;

	switch (event) {
	case SYS_RESTART:
		reason = xpSystemReboot;
		break;
	case SYS_HALT:
		reason = xpSystemHalt;
		break;
	case SYS_POWER_OFF:
		reason = xpSystemPoweroff;
		break;
	default:
		reason = xpSystemGoingDown;
	}

	xpc_do_exit(reason);
	return NOTIFY_DONE;
}

/*
 * Notify other partitions to deactivate from us by first disengaging from all
 * references to our memory.
 */
static void
xpc_die_deactivate(void)
{
	struct xpc_partition *part;
	short partid;
	int any_engaged;
	long keep_waiting;
	long wait_to_print;

	/* keep xpc_hb_checker thread from doing anything (just in case) */
	xpc_exiting = 1;

	xpc_arch_ops.disallow_all_hbs();	/* indicate we're deactivated */

	for (partid = 0; partid < xp_max_npartitions; partid++) {
		part = &xpc_partitions[partid];

		if (xpc_arch_ops.partition_engaged(partid) ||
		    part->act_state != XPC_P_AS_INACTIVE) {
			xpc_arch_ops.request_partition_deactivation(part);
			xpc_arch_ops.indicate_partition_disengaged(part);
		}
	}

	/*
	 * Though we requested that all other partitions deactivate from us,
	 * we only wait until they've all disengaged or we've reached the
	 * defined timelimit.
	 *
	 * Given that one iteration through the following while-loop takes
	 * approximately 200 microseconds, calculate the #of loops to take
	 * before bailing and the #of loops before printing a waiting message.
	 */
	keep_waiting = xpc_disengage_timelimit * 1000 * 5;
	wait_to_print = XPC_DEACTIVATE_PRINTMSG_INTERVAL * 1000 * 5;

	while (1) {
		any_engaged = xpc_arch_ops.any_partition_engaged();
		if (!any_engaged) {
			dev_info(xpc_part, "all partitions have deactivated\n");
			break;
		}

		if (!keep_waiting--) {
			for (partid = 0; partid < xp_max_npartitions;
			     partid++) {
				if (xpc_arch_ops.partition_engaged(partid)) {
					dev_info(xpc_part, "deactivate from "
						 "remote partition %d timed "
						 "out\n", partid);
				}
			}
			break;
		}

		if (!wait_to_print--) {
			dev_info(xpc_part, "waiting for remote partitions to "
				 "deactivate, timeout in %ld seconds\n",
				 keep_waiting / (1000 * 5));
			wait_to_print = XPC_DEACTIVATE_PRINTMSG_INTERVAL *
			    1000 * 5;
		}

		udelay(200);
	}
}

/*
 * This function is called when the system is being restarted or halted due
 * to some sort of system failure. If this is the case we need to notify the
 * other partitions to disengage from all references to our memory.
 * This function can also be called when our heartbeater could be offlined
 * for a time. In this case we need to notify other partitions to not worry
 * about the lack of a heartbeat.
 */
static int
xpc_system_die(struct notifier_block *nb, unsigned long event, void *unused)
{
#ifdef CONFIG_IA64		/* !!! temporary kludge */
	switch (event) {
	case DIE_MACHINE_RESTART:
	case DIE_MACHINE_HALT:
		xpc_die_deactivate();
		break;

	case DIE_KDEBUG_ENTER:
		/* Should lack of heartbeat be ignored by other partitions? */
		if (!xpc_kdebug_ignore)
			break;

		/* fall through */
	case DIE_MCA_MONARCH_ENTER:
	case DIE_INIT_MONARCH_ENTER:
		xpc_arch_ops.offline_heartbeat();
		break;

	case DIE_KDEBUG_LEAVE:
		/* Is lack of heartbeat being ignored by other partitions? */
		if (!xpc_kdebug_ignore)
			break;

		/* fall through */
	case DIE_MCA_MONARCH_LEAVE:
	case DIE_INIT_MONARCH_LEAVE:
		xpc_arch_ops.online_heartbeat();
		break;
	}
#else
	xpc_die_deactivate();
#endif

	return NOTIFY_DONE;
}

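/*
 * Module initialization: set up the architecture-specific support, the
 * partition structures, sysctl entries and reserved page, register the
 * reboot/die notifiers, start the heartbeat checker and discovery threads,
 * and finally point the XP interface at XPC's functions.
 */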
int __init
xpc_init(void)
{
	int ret;
	struct task_struct *kthread;

	dev_set_name(xpc_part, "part");
	dev_set_name(xpc_chan, "chan");

	if (is_shub()) {
		/*
		 * The ia64-sn2 architecture supports at most 64 partitions.
		 * And the inability to unregister remote amos restricts us
		 * further to only support exactly 64 partitions on this
		 * architecture, no less.
		 */
		if (xp_max_npartitions != 64) {
			dev_err(xpc_part, "max #of partitions not set to 64\n");
			ret = -EINVAL;
		} else {
			ret = xpc_init_sn2();
		}

	} else if (is_uv()) {
		ret = xpc_init_uv();

	} else {
		ret = -ENODEV;
	}

	if (ret != 0)
		return ret;

	ret = xpc_setup_partitions();
	if (ret != 0) {
		dev_err(xpc_part, "can't get memory for partition structure\n");
		goto out_1;
	}

	xpc_sysctl = register_sysctl_table(xpc_sys_dir);

	/*
	 * Fill the partition reserved page with the information needed by
	 * other partitions to discover we are alive and establish initial
	 * communications.
	 */
	ret = xpc_setup_rsvd_page();
	if (ret != 0) {
		dev_err(xpc_part, "can't setup our reserved page\n");
		goto out_2;
	}

	/* add ourselves to the reboot_notifier_list */
	ret = register_reboot_notifier(&xpc_reboot_notifier);
	if (ret != 0)
		dev_warn(xpc_part, "can't register reboot notifier\n");

	/* add ourselves to the die_notifier list */
	ret = register_die_notifier(&xpc_die_notifier);
	if (ret != 0)
		dev_warn(xpc_part, "can't register die notifier\n");

	/*
	 * The real work-horse behind xpc.  This processes incoming
	 * interrupts and monitors remote heartbeats.
	 */
	kthread = kthread_run(xpc_hb_checker, NULL, XPC_HB_CHECK_THREAD_NAME);
	if (IS_ERR(kthread)) {
		dev_err(xpc_part, "failed while forking hb check thread\n");
		ret = -EBUSY;
		goto out_3;
	}

	/*
	 * Startup a thread that will attempt to discover other partitions to
	 * activate based on info provided by SAL. This new thread is short
	 * lived and will exit once discovery is complete.
	 */
	kthread = kthread_run(xpc_initiate_discovery, NULL,
			      XPC_DISCOVERY_THREAD_NAME);
	if (IS_ERR(kthread)) {
		dev_err(xpc_part, "failed while forking discovery thread\n");

		/* mark this new thread as a non-starter */
		complete(&xpc_discovery_exited);

		xpc_do_exit(xpUnloading);
		return -EBUSY;
	}

	/* set the interface to point at XPC's functions */
	xpc_set_interface(xpc_initiate_connect, xpc_initiate_disconnect,
			  xpc_initiate_send, xpc_initiate_send_notify,
			  xpc_initiate_received, xpc_initiate_partid_to_nasids);

	return 0;

	/* initialization was not successful */
out_3:
	xpc_teardown_rsvd_page();

	(void)unregister_die_notifier(&xpc_die_notifier);
	(void)unregister_reboot_notifier(&xpc_reboot_notifier);
out_2:
	if (xpc_sysctl)
		unregister_sysctl_table(xpc_sysctl);

	xpc_teardown_partitions();
out_1:
	if (is_shub())
		xpc_exit_sn2();
	else if (is_uv())
		xpc_exit_uv();
	return ret;
}

module_init(xpc_init);

void __exit
xpc_exit(void)
{
	xpc_do_exit(xpUnloading);
}

module_exit(xpc_exit);

MODULE_AUTHOR("Silicon Graphics, Inc.");
MODULE_DESCRIPTION("Cross Partition Communication (XPC) support");
MODULE_LICENSE("GPL");

module_param(xpc_hb_interval, int, 0);
MODULE_PARM_DESC(xpc_hb_interval, "Number of seconds between "
		 "heartbeat increments.");

module_param(xpc_hb_check_interval, int, 0);
MODULE_PARM_DESC(xpc_hb_check_interval, "Number of seconds between "
		 "heartbeat checks.");

module_param(xpc_disengage_timelimit, int, 0);
MODULE_PARM_DESC(xpc_disengage_timelimit, "Number of seconds to wait "
		 "for disengage to complete.");

module_param(xpc_kdebug_ignore, int, 0);
MODULE_PARM_DESC(xpc_kdebug_ignore, "Should lack of heartbeat be ignored by "
		 "other partitions when dropping into kdebug.");