1/*
2 *	SGI UltraViolet TLB flush routines.
3 *
4 *	(c) 2008-2010 Cliff Wickman <cpw@sgi.com>, SGI.
5 *
6 *	This code is released under the GNU General Public License version 2 or
7 *	later.
8 */
9#include <linux/seq_file.h>
10#include <linux/proc_fs.h>
11#include <linux/debugfs.h>
12#include <linux/kernel.h>
13#include <linux/slab.h>
14
15#include <asm/mmu_context.h>
16#include <asm/uv/uv.h>
17#include <asm/uv/uv_mmrs.h>
18#include <asm/uv/uv_hub.h>
19#include <asm/uv/uv_bau.h>
20#include <asm/apic.h>
21#include <asm/idle.h>
22#include <asm/tsc.h>
23#include <asm/irq_vectors.h>
24#include <asm/timer.h>
25
26/* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */
27static int timeout_base_ns[] = {
28		20,
29		160,
30		1280,
31		10240,
32		81920,
33		655360,
34		5242880,
35		167772160
36};
37static int timeout_us;
38static int nobau;
39static int baudisabled;
40static spinlock_t disable_lock;
41static cycles_t congested_cycles;
42
43/* tunables: */
44static int max_bau_concurrent = MAX_BAU_CONCURRENT;
45static int max_bau_concurrent_constant = MAX_BAU_CONCURRENT;
46static int plugged_delay = PLUGGED_DELAY;
47static int plugsb4reset = PLUGSB4RESET;
48static int timeoutsb4reset = TIMEOUTSB4RESET;
49static int ipi_reset_limit = IPI_RESET_LIMIT;
50static int complete_threshold = COMPLETE_THRESHOLD;
51static int congested_response_us = CONGESTED_RESPONSE_US;
52static int congested_reps = CONGESTED_REPS;
53static int congested_period = CONGESTED_PERIOD;
54static struct dentry *tunables_dir;
55static struct dentry *tunables_file;
56
57static int __init setup_nobau(char *arg)
58{
59	nobau = 1;
60	return 0;
61}
62early_param("nobau", setup_nobau);
63
64/* base pnode in this partition */
65static int uv_partition_base_pnode __read_mostly;
66/* position of pnode (which is nasid>>1): */
67static int uv_nshift __read_mostly;
68static unsigned long uv_mmask __read_mostly;
69
70static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
71static DEFINE_PER_CPU(struct bau_control, bau_control);
72static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
73
74/*
75 * Determine the first node on a uvhub. 'Nodes' are used for kernel
76 * memory allocation.
77 */
78static int __init uvhub_to_first_node(int uvhub)
79{
80	int node, b;
81
82	for_each_online_node(node) {
83		b = uv_node_to_blade_id(node);
84		if (uvhub == b)
85			return node;
86	}
87	return -1;
88}
89
90/*
91 * Determine the apicid of the first cpu on a uvhub.
92 */
93static int __init uvhub_to_first_apicid(int uvhub)
94{
95	int cpu;
96
97	for_each_present_cpu(cpu)
98		if (uvhub == uv_cpu_to_blade_id(cpu))
99			return per_cpu(x86_cpu_to_apicid, cpu);
100	return -1;
101}
102
103/*
104 * Free a software acknowledge hardware resource by clearing its Pending
105 * bit. This will return a reply to the sender.
106 * If the message has timed out, a reply has already been sent by the
107 * hardware but the resource has not been released. In that case our
108 * clear of the Timeout bit (as well) will free the resource. No reply will
109 * be sent (the hardware will only do one reply per message).
110 */
111static inline void uv_reply_to_message(struct msg_desc *mdp,
112				       struct bau_control *bcp)
113{
114	unsigned long dw;
115	struct bau_payload_queue_entry *msg;
116
117	msg = mdp->msg;
118	if (!msg->canceled) {
119		dw = (msg->sw_ack_vector << UV_SW_ACK_NPENDING) |
120						msg->sw_ack_vector;
121		uv_write_local_mmr(
122				UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw);
123	}
124	msg->replied_to = 1;
125	msg->sw_ack_vector = 0;
126}
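
/*
 * Illustrative note (added here, not part of the original source): the
 * ..._ALIAS write above clears both the Pending bit and, UV_SW_ACK_NPENDING
 * positions higher, the matching Timeout bit for every resource named in
 * sw_ack_vector.  Assuming UV_SW_ACK_NPENDING is 8 and sw_ack_vector is
 * 0x04 (resource 2), dw = (0x04 << 8) | 0x04 = 0x404, which frees the
 * resource whether or not it had already timed out.
 */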
127
128/*
129 * Process the receipt of a RETRY message
130 */
131static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
132					    struct bau_control *bcp)
133{
134	int i;
135	int cancel_count = 0;
136	int slot2;
137	unsigned long msg_res;
138	unsigned long mmr = 0;
139	struct bau_payload_queue_entry *msg;
140	struct bau_payload_queue_entry *msg2;
141	struct ptc_stats *stat;
142
143	msg = mdp->msg;
144	stat = bcp->statp;
145	stat->d_retries++;
146	/*
147	 * cancel any message from msg+1 to the retry itself
148	 */
149	for (msg2 = msg+1, i = 0; i < DEST_Q_SIZE; msg2++, i++) {
150		if (msg2 > mdp->va_queue_last)
151			msg2 = mdp->va_queue_first;
152		if (msg2 == msg)
153			break;
154
155		/* same conditions for cancellation as uv_do_reset */
156		if ((msg2->replied_to == 0) && (msg2->canceled == 0) &&
157		    (msg2->sw_ack_vector) && ((msg2->sw_ack_vector &
158			msg->sw_ack_vector) == 0) &&
159		    (msg2->sending_cpu == msg->sending_cpu) &&
160		    (msg2->msg_type != MSG_NOOP)) {
161			slot2 = msg2 - mdp->va_queue_first;
162			mmr = uv_read_local_mmr
163				(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
164			msg_res = msg2->sw_ack_vector;
165			/*
166			 * This is a message retry; clear the resources held
167			 * by the previous message only if they timed out.
168			 * If it has not timed out we have an unexpected
169			 * situation to report.
170			 */
171			if (mmr & (msg_res << UV_SW_ACK_NPENDING)) {
172				/*
173				 * is the resource timed out?
174				 * make everyone ignore the cancelled message.
175				 */
176				msg2->canceled = 1;
177				stat->d_canceled++;
178				cancel_count++;
179				uv_write_local_mmr(
180				    UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
181					(msg_res << UV_SW_ACK_NPENDING) |
182					 msg_res);
183			}
184		}
185	}
186	if (!cancel_count)
187		stat->d_nocanceled++;
188}
189
190/*
191 * Do all the things a cpu should do for a TLB shootdown message.
192 * Other cpu's may come here at the same time for this message.
193 */
194static void uv_bau_process_message(struct msg_desc *mdp,
195				   struct bau_control *bcp)
196{
197	int msg_ack_count;
198	short socket_ack_count = 0;
199	struct ptc_stats *stat;
200	struct bau_payload_queue_entry *msg;
201	struct bau_control *smaster = bcp->socket_master;
202
203	/*
204	 * This must be a normal message, or retry of a normal message
205	 */
206	msg = mdp->msg;
207	stat = bcp->statp;
208	if (msg->address == TLB_FLUSH_ALL) {
209		local_flush_tlb();
210		stat->d_alltlb++;
211	} else {
212		__flush_tlb_one(msg->address);
213		stat->d_onetlb++;
214	}
215	stat->d_requestee++;
216
217	/*
218	 * One cpu on each uvhub has the additional job on a RETRY
219	 * of releasing the resource held by the message that is
220	 * being retried.  That message is identified by sending
221	 * cpu number.
222	 */
223	if (msg->msg_type == MSG_RETRY && bcp == bcp->uvhub_master)
224		uv_bau_process_retry_msg(mdp, bcp);
225
226	/*
227	 * This is a sw_ack message, so we have to reply to it.
228	 * Count each responding cpu on the socket. This avoids
229	 * pinging the count's cache line back and forth between
230	 * the sockets.
231	 */
232	socket_ack_count = atomic_add_short_return(1, (struct atomic_short *)
233			&smaster->socket_acknowledge_count[mdp->msg_slot]);
234	if (socket_ack_count == bcp->cpus_in_socket) {
235		/*
236		 * Both sockets dump their completed count total into
237		 * the message's count.
238		 */
239		smaster->socket_acknowledge_count[mdp->msg_slot] = 0;
240		msg_ack_count = atomic_add_short_return(socket_ack_count,
241				(struct atomic_short *)&msg->acknowledge_count);
242
243		if (msg_ack_count == bcp->cpus_in_uvhub) {
244			/*
245			 * All cpus in uvhub saw it; reply
246			 */
247			uv_reply_to_message(mdp, bcp);
248		}
249	}
250
251	return;
252}
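
/*
 * Worked example of the two-level acknowledge above (illustrative figures,
 * not from the original source): on a uvhub with two sockets of 8 cpus
 * each, socket_acknowledge_count[msg_slot] on each socket_master climbs to
 * 8 as its local cpus process the message; the 8th cpu of each socket then
 * folds that 8 into msg->acknowledge_count.  Only when the total reaches
 * cpus_in_uvhub (16 here) does the last cpu call uv_reply_to_message(), so
 * the reply MMR is written once per message rather than once per cpu.
 */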
253
254/*
255 * Determine the first cpu on a uvhub.
256 */
257static int uvhub_to_first_cpu(int uvhub)
258{
259	int cpu;
260	for_each_present_cpu(cpu)
261		if (uvhub == uv_cpu_to_blade_id(cpu))
262			return cpu;
263	return -1;
264}
265
266/*
267 * Last resort when we get a large number of destination timeouts is
268 * to clear resources held by a given cpu.
269 * Do this with IPI so that all messages in the BAU message queue
270 * can be identified by their nonzero sw_ack_vector field.
271 *
272 * This is entered for a single cpu on the uvhub.
273 * The sender wants this uvhub to free a specific message's
274 * sw_ack resources.
275 */
276static void
277uv_do_reset(void *ptr)
278{
279	int i;
280	int slot;
281	int count = 0;
282	unsigned long mmr;
283	unsigned long msg_res;
284	struct bau_control *bcp;
285	struct reset_args *rap;
286	struct bau_payload_queue_entry *msg;
287	struct ptc_stats *stat;
288
289	bcp = &per_cpu(bau_control, smp_processor_id());
290	rap = (struct reset_args *)ptr;
291	stat = bcp->statp;
292	stat->d_resets++;
293
294	/*
295	 * We're looking for the given sender, and
296	 * will free its sw_ack resource.
297	 * If all cpu's finally responded after the timeout, its
298	 * message 'replied_to' was set.
299	 */
300	for (msg = bcp->va_queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) {
301		/* uv_do_reset: same conditions for cancellation as
302		   uv_bau_process_retry_msg() */
303		if ((msg->replied_to == 0) &&
304		    (msg->canceled == 0) &&
305		    (msg->sending_cpu == rap->sender) &&
306		    (msg->sw_ack_vector) &&
307		    (msg->msg_type != MSG_NOOP)) {
308			/*
309			 * make everyone else ignore this message
310			 */
311			msg->canceled = 1;
312			slot = msg - bcp->va_queue_first;
313			count++;
314			/*
315			 * only reset the resource if it is still pending
316			 */
317			mmr = uv_read_local_mmr
318					(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
319			msg_res = msg->sw_ack_vector;
320			if (mmr & msg_res) {
321				stat->d_rcanceled++;
322				uv_write_local_mmr(
323				    UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
324					(msg_res << UV_SW_ACK_NPENDING) |
325					 msg_res);
326			}
327		}
328	}
329	return;
330}
331
332/*
333 * Use IPI to get all target uvhubs to release resources held by
334 * a given sending cpu number.
335 */
336static void uv_reset_with_ipi(struct bau_target_uvhubmask *distribution,
337			      int sender)
338{
339	int uvhub;
340	int cpu;
341	cpumask_t mask;
342	struct reset_args reset_args;
343
344	reset_args.sender = sender;
345
346	cpus_clear(mask);
347	/* find a single cpu for each uvhub in this distribution mask */
348	for (uvhub = 0;
349		    uvhub < sizeof(struct bau_target_uvhubmask) * BITSPERBYTE;
350		    uvhub++) {
351		if (!bau_uvhub_isset(uvhub, distribution))
352			continue;
353		/* find a cpu for this uvhub */
354		cpu = uvhub_to_first_cpu(uvhub);
355		cpu_set(cpu, mask);
356	}
357	/* IPI all cpus; Preemption is already disabled */
358	smp_call_function_many(&mask, uv_do_reset, (void *)&reset_args, 1);
359	return;
360}
361
362static inline unsigned long
363cycles_2_us(unsigned long long cyc)
364{
365	unsigned long long ns;
366	unsigned long us;
367	ns =  (cyc * per_cpu(cyc2ns, smp_processor_id()))
368						>> CYC2NS_SCALE_FACTOR;
369	us = ns / 1000;
370	return us;
371}
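
/*
 * Illustrative conversion (assumes a 2 GHz TSC, not stated in this file):
 * the per-cpu cyc2ns factor then works out to about 0.5 ns per cycle once
 * the CYC2NS_SCALE_FACTOR shift is applied, so 4000 cycles scale to roughly
 * 2000 ns and cycles_2_us() returns 2.  sec_2_cycles() and
 * microsec_2_cycles() below perform the inverse scaling.
 */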
372
373/*
374 * wait for all cpus on this hub to finish their sends and go quiet;
375 * leaves uvhub_quiesce set so that no new broadcasts are started by
376 * uv_flush_send_and_wait()
377 */
378static inline void
379quiesce_local_uvhub(struct bau_control *hmaster)
380{
381	atomic_add_short_return(1, (struct atomic_short *)
382		 &hmaster->uvhub_quiesce);
383}
384
385/*
386 * mark this quiet-requestor as done
387 */
388static inline void
389end_uvhub_quiesce(struct bau_control *hmaster)
390{
391	atomic_add_short_return(-1, (struct atomic_short *)
392		&hmaster->uvhub_quiesce);
393}
394
395/*
396 * Wait for completion of a broadcast software ack message
397 * return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP
398 */
399static int uv_wait_completion(struct bau_desc *bau_desc,
400	unsigned long mmr_offset, int right_shift, int this_cpu,
401	struct bau_control *bcp, struct bau_control *smaster, long try)
402{
403	unsigned long descriptor_status;
404	cycles_t ttime;
405	struct ptc_stats *stat = bcp->statp;
406	struct bau_control *hmaster;
407
408	hmaster = bcp->uvhub_master;
409
410	/* spin on the status MMR, waiting for it to go idle */
411	while ((descriptor_status = (((unsigned long)
412		uv_read_local_mmr(mmr_offset) >>
413			right_shift) & UV_ACT_STATUS_MASK)) !=
414			DESC_STATUS_IDLE) {
415		/*
416		 * Our software ack messages may be blocked because there are
417		 * no swack resources available.  As long as none of them
418		 * has timed out hardware will NACK our message and its
419		 * state will stay IDLE.
420		 */
421		if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) {
422			stat->s_stimeout++;
423			return FLUSH_GIVEUP;
424		} else if (descriptor_status ==
425					DESC_STATUS_DESTINATION_TIMEOUT) {
426			stat->s_dtimeout++;
427			ttime = get_cycles();
428
429			/*
430			 * Our retries may be blocked by all destination
431			 * swack resources being consumed, and a timeout
432			 * pending.  In that case hardware returns the
433			 * ERROR that looks like a destination timeout.
434			 */
435			if (cycles_2_us(ttime - bcp->send_message) <
436							timeout_us) {
437				bcp->conseccompletes = 0;
438				return FLUSH_RETRY_PLUGGED;
439			}
440
441			bcp->conseccompletes = 0;
442			return FLUSH_RETRY_TIMEOUT;
443		} else {
444			/*
445			 * descriptor_status is still BUSY
446			 */
447			cpu_relax();
448		}
449	}
450	bcp->conseccompletes++;
451	return FLUSH_COMPLETE;
452}
453
454static inline cycles_t
455sec_2_cycles(unsigned long sec)
456{
457	unsigned long ns;
458	cycles_t cyc;
459
460	ns = sec * 1000000000;
461	cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
462	return cyc;
463}
464
465/*
466 * conditionally add 1 to *v, unless *v is >= u
467 * return 0 if we cannot add 1 to *v because it is >= u
468 * return 1 if we can add 1 to *v because it is < u
469 * the add is atomic
470 *
471 * This is close to atomic_add_unless(), but this allows the 'u' value
472 * to be lowered below the current 'v'.  atomic_add_unless can only stop
473 * on equal.
474 */
475static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
476{
477	spin_lock(lock);
478	if (atomic_read(v) >= u) {
479		spin_unlock(lock);
480		return 0;
481	}
482	atomic_inc(v);
483	spin_unlock(lock);
484	return 1;
485}
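
/*
 * Usage sketch (a condensed form of the throttle in
 * uv_flush_send_and_wait() below): a sender spins until it can count
 * itself in under the concurrency cap:
 *
 *	while (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
 *			&hmaster->active_descriptor_count,
 *			hmaster->max_bau_concurrent))
 *		cpu_relax();
 *
 * Because the cap (max_bau_concurrent) can be lowered at any time by
 * destination_timeout(), the ">=" test keeps the check correct even when
 * the limit drops below the current count, which atomic_add_unless()'s
 * equality test would miss.
 */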
486
487/*
488 * Our retries are blocked by all destination swack resources being
489 * in use, and a timeout is pending. In that case hardware immediately
490 * returns the ERROR that looks like a destination timeout.
491 */
492static void
493destination_plugged(struct bau_desc *bau_desc, struct bau_control *bcp,
494			struct bau_control *hmaster, struct ptc_stats *stat)
495{
496	udelay(bcp->plugged_delay);
497	bcp->plugged_tries++;
498	if (bcp->plugged_tries >= bcp->plugsb4reset) {
499		bcp->plugged_tries = 0;
500		quiesce_local_uvhub(hmaster);
501		spin_lock(&hmaster->queue_lock);
502		uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu);
503		spin_unlock(&hmaster->queue_lock);
504		end_uvhub_quiesce(hmaster);
505		bcp->ipi_attempts++;
506		stat->s_resets_plug++;
507	}
508}
509
510static void
511destination_timeout(struct bau_desc *bau_desc, struct bau_control *bcp,
512			struct bau_control *hmaster, struct ptc_stats *stat)
513{
514	hmaster->max_bau_concurrent = 1;
515	bcp->timeout_tries++;
516	if (bcp->timeout_tries >= bcp->timeoutsb4reset) {
517		bcp->timeout_tries = 0;
518		quiesce_local_uvhub(hmaster);
519		spin_lock(&hmaster->queue_lock);
520		uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu);
521		spin_unlock(&hmaster->queue_lock);
522		end_uvhub_quiesce(hmaster);
523		bcp->ipi_attempts++;
524		stat->s_resets_timeout++;
525	}
526}
527
528/*
529 * Completions are taking a very long time due to a congested numalink
530 * network.
531 */
532static void
533disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat)
534{
535	int tcpu;
536	struct bau_control *tbcp;
537
538	/* let only one cpu do this disabling */
539	spin_lock(&disable_lock);
540	if (!baudisabled && bcp->period_requests &&
541	    ((bcp->period_time / bcp->period_requests) > congested_cycles)) {
542		/* it becomes this cpu's job to turn on the use of the
543		   BAU again */
544		baudisabled = 1;
545		bcp->set_bau_off = 1;
546		bcp->set_bau_on_time = get_cycles() +
547			sec_2_cycles(bcp->congested_period);
548		stat->s_bau_disabled++;
549		for_each_present_cpu(tcpu) {
550			tbcp = &per_cpu(bau_control, tcpu);
551			tbcp->baudisabled = 1;
552		}
553	}
554	spin_unlock(&disable_lock);
555}
556
557/**
558 * uv_flush_send_and_wait
559 *
560 * Send a broadcast and wait for it to complete.
561 *
562 * The flush_mask contains the cpus the broadcast is to be sent to including
563 * cpus that are on the local uvhub.
564 *
565 * Returns 0 if all flushing represented in the mask was done.
566 * Returns 1 if it gives up entirely and the original cpu mask is to be
567 * returned to the kernel.
568 */
569int uv_flush_send_and_wait(struct bau_desc *bau_desc,
570			   struct cpumask *flush_mask, struct bau_control *bcp)
571{
572	int right_shift;
573	int completion_status = 0;
574	int seq_number = 0;
575	long try = 0;
576	int cpu = bcp->uvhub_cpu;
577	int this_cpu = bcp->cpu;
578	unsigned long mmr_offset;
579	unsigned long index;
580	cycles_t time1;
581	cycles_t time2;
582	cycles_t elapsed;
583	struct ptc_stats *stat = bcp->statp;
584	struct bau_control *smaster = bcp->socket_master;
585	struct bau_control *hmaster = bcp->uvhub_master;
586
587	if (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
588			&hmaster->active_descriptor_count,
589			hmaster->max_bau_concurrent)) {
590		stat->s_throttles++;
591		do {
592			cpu_relax();
593		} while (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
594			&hmaster->active_descriptor_count,
595			hmaster->max_bau_concurrent));
596	}
597	while (hmaster->uvhub_quiesce)
598		cpu_relax();
599
600	if (cpu < UV_CPUS_PER_ACT_STATUS) {
601		mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
602		right_shift = cpu * UV_ACT_STATUS_SIZE;
603	} else {
604		mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
605		right_shift =
606		    ((cpu - UV_CPUS_PER_ACT_STATUS) * UV_ACT_STATUS_SIZE);
607	}
608	time1 = get_cycles();
609	do {
610		if (try == 0) {
611			bau_desc->header.msg_type = MSG_REGULAR;
612			seq_number = bcp->message_number++;
613		} else {
614			bau_desc->header.msg_type = MSG_RETRY;
615			stat->s_retry_messages++;
616		}
617		bau_desc->header.sequence = seq_number;
618		index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) |
619			bcp->uvhub_cpu;
620		bcp->send_message = get_cycles();
621		uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
622		try++;
623		completion_status = uv_wait_completion(bau_desc, mmr_offset,
624			right_shift, this_cpu, bcp, smaster, try);
625
626		if (completion_status == FLUSH_RETRY_PLUGGED) {
627			destination_plugged(bau_desc, bcp, hmaster, stat);
628		} else if (completion_status == FLUSH_RETRY_TIMEOUT) {
629			destination_timeout(bau_desc, bcp, hmaster, stat);
630		}
631		if (bcp->ipi_attempts >= bcp->ipi_reset_limit) {
632			bcp->ipi_attempts = 0;
633			completion_status = FLUSH_GIVEUP;
634			break;
635		}
636		cpu_relax();
637	} while ((completion_status == FLUSH_RETRY_PLUGGED) ||
638		 (completion_status == FLUSH_RETRY_TIMEOUT));
639	time2 = get_cycles();
640	bcp->plugged_tries = 0;
641	bcp->timeout_tries = 0;
642	if ((completion_status == FLUSH_COMPLETE) &&
643	    (bcp->conseccompletes > bcp->complete_threshold) &&
644	    (hmaster->max_bau_concurrent <
645					hmaster->max_bau_concurrent_constant))
646			hmaster->max_bau_concurrent++;
647	while (hmaster->uvhub_quiesce)
648		cpu_relax();
649	atomic_dec(&hmaster->active_descriptor_count);
650	if (time2 > time1) {
651		elapsed = time2 - time1;
652		stat->s_time += elapsed;
653		if ((completion_status == FLUSH_COMPLETE) && (try == 1)) {
654			bcp->period_requests++;
655			bcp->period_time += elapsed;
656			if ((elapsed > congested_cycles) &&
657			    (bcp->period_requests > bcp->congested_reps)) {
658				disable_for_congestion(bcp, stat);
659			}
660		}
661	} else
662		stat->s_requestor--;
663	if (completion_status == FLUSH_COMPLETE && try > 1)
664		stat->s_retriesok++;
665	else if (completion_status == FLUSH_GIVEUP) {
666		stat->s_giveup++;
667		return 1;
668	}
669	return 0;
670}
671
672/**
673 * uv_flush_tlb_others - globally purge translation cache of a virtual
674 * address or all TLB's
675 * @cpumask: mask of all cpu's in which the address is to be removed
676 * @mm: mm_struct containing virtual address range
677 * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu)
678 * @cpu: the current cpu
679 *
680 * This is the entry point for initiating any UV global TLB shootdown.
681 *
682 * Purges the translation caches of all specified processors of the given
683 * virtual address, or purges all TLB's on specified processors.
684 *
685 * The caller has derived the cpumask from the mm_struct.  This function
686 * is called only if there are bits set in the mask. (e.g. flush_tlb_page())
687 *
688 * The cpumask is converted into a uvhubmask of the uvhubs containing
689 * those cpus.
690 *
691 * Note that this function should be called with preemption disabled.
692 *
693 * Returns NULL if all remote flushing was done.
694 * Returns pointer to cpumask if some remote flushing remains to be
695 * done.  The returned pointer is valid till preemption is re-enabled.
696 */
697const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
698					  struct mm_struct *mm,
699					  unsigned long va, unsigned int cpu)
700{
701	int tcpu;
702	int uvhub;
703	int locals = 0;
704	int remotes = 0;
705	int hubs = 0;
706	struct bau_desc *bau_desc;
707	struct cpumask *flush_mask;
708	struct ptc_stats *stat;
709	struct bau_control *bcp;
710	struct bau_control *tbcp;
711
712	/* kernel was booted 'nobau' */
713	if (nobau)
714		return cpumask;
715
716	bcp = &per_cpu(bau_control, cpu);
717	stat = bcp->statp;
718
719	/* bau was disabled due to slow response */
720	if (bcp->baudisabled) {
721		/* the cpu that disabled it must re-enable it */
722		if (bcp->set_bau_off) {
723			if (get_cycles() >= bcp->set_bau_on_time) {
724				stat->s_bau_reenabled++;
725				baudisabled = 0;
726				for_each_present_cpu(tcpu) {
727					tbcp = &per_cpu(bau_control, tcpu);
728					tbcp->baudisabled = 0;
729					tbcp->period_requests = 0;
730					tbcp->period_time = 0;
731				}
732			}
733		}
734		return cpumask;
735	}
736
737	/*
738	 * Each sending cpu has a per-cpu mask which it fills from the caller's
739	 * cpu mask.  All cpus are converted to uvhubs and copied to the
740	 * activation descriptor.
741	 */
742	flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu);
743	/* don't actually do a shootdown of the local cpu */
744	cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
745	if (cpu_isset(cpu, *cpumask))
746		stat->s_ntargself++;
747
748	bau_desc = bcp->descriptor_base;
749	bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu;
750	bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
751
752	/* cpu statistics */
753	for_each_cpu(tcpu, flush_mask) {
754		uvhub = uv_cpu_to_blade_id(tcpu);
755		bau_uvhub_set(uvhub, &bau_desc->distribution);
756		if (uvhub == bcp->uvhub)
757			locals++;
758		else
759			remotes++;
760	}
761	if ((locals + remotes) == 0)
762		return NULL;
763	stat->s_requestor++;
764	stat->s_ntargcpu += remotes + locals;
765	stat->s_ntargremotes += remotes;
766	stat->s_ntarglocals += locals;
767	remotes = bau_uvhub_weight(&bau_desc->distribution);
768
769	/* uvhub statistics */
770	hubs = bau_uvhub_weight(&bau_desc->distribution);
771	if (locals) {
772		stat->s_ntarglocaluvhub++;
773		stat->s_ntargremoteuvhub += (hubs - 1);
774	} else
775		stat->s_ntargremoteuvhub += hubs;
776	stat->s_ntarguvhub += hubs;
777	if (hubs >= 16)
778		stat->s_ntarguvhub16++;
779	else if (hubs >= 8)
780		stat->s_ntarguvhub8++;
781	else if (hubs >= 4)
782		stat->s_ntarguvhub4++;
783	else if (hubs >= 2)
784		stat->s_ntarguvhub2++;
785	else
786		stat->s_ntarguvhub1++;
787
788	bau_desc->payload.address = va;
789	bau_desc->payload.sending_cpu = cpu;
790
791	/*
792	 * uv_flush_send_and_wait returns 0 if all cpu's were messaged,
793	 * or 1 if it gave up and the original cpumask should be returned.
794	 */
795	if (!uv_flush_send_and_wait(bau_desc, flush_mask, bcp))
796		return NULL;
797	else
798		return cpumask;
799}
800
801/*
802 * The BAU message interrupt comes here. (registered by set_intr_gate)
803 * See entry_64.S
804 *
805 * We received a broadcast assist message.
806 *
807 * Interrupts are disabled; this interrupt could represent
808 * the receipt of several messages.
809 *
810 * All cores/threads on this hub get this interrupt.
811 * The last one to see it does the software ack.
812 * (the resource will not be freed until noninterruptible cpus see this
813 *  interrupt; hardware may timeout the s/w ack and reply ERROR)
814 */
815void uv_bau_message_interrupt(struct pt_regs *regs)
816{
817	int count = 0;
818	cycles_t time_start;
819	struct bau_payload_queue_entry *msg;
820	struct bau_control *bcp;
821	struct ptc_stats *stat;
822	struct msg_desc msgdesc;
823
824	time_start = get_cycles();
825	bcp = &per_cpu(bau_control, smp_processor_id());
826	stat = bcp->statp;
827	msgdesc.va_queue_first = bcp->va_queue_first;
828	msgdesc.va_queue_last = bcp->va_queue_last;
829	msg = bcp->bau_msg_head;
830	while (msg->sw_ack_vector) {
831		count++;
832		msgdesc.msg_slot = msg - msgdesc.va_queue_first;
833		msgdesc.sw_ack_slot = ffs(msg->sw_ack_vector) - 1;
834		msgdesc.msg = msg;
835		uv_bau_process_message(&msgdesc, bcp);
836		msg++;
837		if (msg > msgdesc.va_queue_last)
838			msg = msgdesc.va_queue_first;
839		bcp->bau_msg_head = msg;
840	}
841	stat->d_time += (get_cycles() - time_start);
842	if (!count)
843		stat->d_nomsg++;
844	else if (count > 1)
845		stat->d_multmsg++;
846	ack_APIC_irq();
847}
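
/*
 * Illustrative walk of the loop above (not from the original source): the
 * payload queue is circular, so bau_msg_head advances one slot per
 * processed message and wraps from va_queue_last back to va_queue_first.
 * An interrupt that finds, say, three entries with nonzero sw_ack_vector
 * processes all three, bumps d_multmsg once, and leaves the head at the
 * first slot whose sw_ack_vector is clear, ready for the next interrupt.
 */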
848
849/*
850 * uv_enable_timeouts
851 *
852 * Each target uvhub (i.e. a uvhub that has cpu's) needs to have
853 * shootdown message timeouts enabled.  The timeout does not cause
854 * an interrupt, but causes an error message to be returned to
855 * the sender.
856 */
857static void uv_enable_timeouts(void)
858{
859	int uvhub;
860	int nuvhubs;
861	int pnode;
862	unsigned long mmr_image;
863
864	nuvhubs = uv_num_possible_blades();
865
866	for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
867		if (!uv_blade_nr_possible_cpus(uvhub))
868			continue;
869
870		pnode = uv_blade_to_pnode(uvhub);
871		mmr_image =
872		    uv_read_global_mmr64(pnode, UVH_LB_BAU_MISC_CONTROL);
873		/*
874		 * Set the timeout period and then lock it in, in three
875		 * steps; captures and locks in the period.
876		 *
877		 * To program the period, the SOFT_ACK_MODE must be off.
878		 */
879		mmr_image &= ~((unsigned long)1 <<
880		    UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT);
881		uv_write_global_mmr64
882		    (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
883		/*
884		 * Set the 4-bit period.
885		 */
886		mmr_image &= ~((unsigned long)0xf <<
887		     UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT);
888		mmr_image |= (UV_INTD_SOFT_ACK_TIMEOUT_PERIOD <<
889		     UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT);
890		uv_write_global_mmr64
891		    (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
892		/*
893		 * Subsequent reversals of the timebase bit (3) cause an
894		 * immediate timeout of one or all INTD resources as
895		 * indicated in bits 2:0 (7 causes all of them to timeout).
896		 */
897		mmr_image |= ((unsigned long)1 <<
898		    UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT);
899		uv_write_global_mmr64
900		    (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
901	}
902}
903
904static void *uv_ptc_seq_start(struct seq_file *file, loff_t *offset)
905{
906	if (*offset < num_possible_cpus())
907		return offset;
908	return NULL;
909}
910
911static void *uv_ptc_seq_next(struct seq_file *file, void *data, loff_t *offset)
912{
913	(*offset)++;
914	if (*offset < num_possible_cpus())
915		return offset;
916	return NULL;
917}
918
919static void uv_ptc_seq_stop(struct seq_file *file, void *data)
920{
921}
922
923static inline unsigned long long
924microsec_2_cycles(unsigned long microsec)
925{
926	unsigned long ns;
927	unsigned long long cyc;
928
929	ns = microsec * 1000;
930	cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
931	return cyc;
932}
933
934/*
935 * Display the statistics thru /proc.
936 * 'data' points to the cpu number
937 */
938static int uv_ptc_seq_show(struct seq_file *file, void *data)
939{
940	struct ptc_stats *stat;
941	int cpu;
942
943	cpu = *(loff_t *)data;
944
945	if (!cpu) {
946		seq_printf(file,
947			"# cpu sent stime self locals remotes ncpus localhub ");
948		seq_printf(file,
949			"remotehub numuvhubs numuvhubs16 numuvhubs8 ");
950		seq_printf(file,
951			"numuvhubs4 numuvhubs2 numuvhubs1 dto ");
952		seq_printf(file,
953			"retries rok resetp resett giveup sto bz throt ");
954		seq_printf(file,
955			"sw_ack recv rtime all ");
956		seq_printf(file,
957			"one mult none retry canc nocan reset rcan ");
958		seq_printf(file,
959			"disable enable\n");
960	}
961	if (cpu < num_possible_cpus() && cpu_online(cpu)) {
962		stat = &per_cpu(ptcstats, cpu);
963		/* source side statistics */
964		seq_printf(file,
965			"cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
966			   cpu, stat->s_requestor, cycles_2_us(stat->s_time),
967			   stat->s_ntargself, stat->s_ntarglocals,
968			   stat->s_ntargremotes, stat->s_ntargcpu,
969			   stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub,
970			   stat->s_ntarguvhub, stat->s_ntarguvhub16);
971		seq_printf(file, "%ld %ld %ld %ld %ld ",
972			   stat->s_ntarguvhub8, stat->s_ntarguvhub4,
973			   stat->s_ntarguvhub2, stat->s_ntarguvhub1,
974			   stat->s_dtimeout);
975		seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ",
976			   stat->s_retry_messages, stat->s_retriesok,
977			   stat->s_resets_plug, stat->s_resets_timeout,
978			   stat->s_giveup, stat->s_stimeout,
979			   stat->s_busy, stat->s_throttles);
980
981		/* destination side statistics */
982		seq_printf(file,
983			   "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
984			   uv_read_global_mmr64(uv_cpu_to_pnode(cpu),
985					UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE),
986			   stat->d_requestee, cycles_2_us(stat->d_time),
987			   stat->d_alltlb, stat->d_onetlb, stat->d_multmsg,
988			   stat->d_nomsg, stat->d_retries, stat->d_canceled,
989			   stat->d_nocanceled, stat->d_resets,
990			   stat->d_rcanceled);
991		seq_printf(file, "%ld %ld\n",
992			stat->s_bau_disabled, stat->s_bau_reenabled);
993	}
994
995	return 0;
996}
997
998/*
999 * Display the tunables thru debugfs
1000 */
1001static ssize_t tunables_read(struct file *file, char __user *userbuf,
1002						size_t count, loff_t *ppos)
1003{
1004	char buf[300];
1005	int ret;
1006
1007	ret = snprintf(buf, 300, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n",
1008		"max_bau_concurrent plugged_delay plugsb4reset",
1009		"timeoutsb4reset ipi_reset_limit complete_threshold",
1010		"congested_response_us congested_reps congested_period",
1011		max_bau_concurrent, plugged_delay, plugsb4reset,
1012		timeoutsb4reset, ipi_reset_limit, complete_threshold,
1013		congested_response_us, congested_reps, congested_period);
1014
1015	return simple_read_from_buffer(userbuf, count, ppos, buf, ret);
1016}
1017
1018/*
1019 * -1: reset the statistics
1020 *  0: display meaning of the statistics
1021 */
1022static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
1023				 size_t count, loff_t *data)
1024{
1025	int cpu;
1026	long input_arg;
1027	char optstr[64];
1028	struct ptc_stats *stat;
1029
1030	if (count == 0 || count > sizeof(optstr))
1031		return -EINVAL;
1032	if (copy_from_user(optstr, user, count))
1033		return -EFAULT;
1034	optstr[count - 1] = '\0';
1035	if (strict_strtol(optstr, 10, &input_arg) < 0) {
1036		printk(KERN_DEBUG "%s is invalid\n", optstr);
1037		return -EINVAL;
1038	}
1039
1040	if (input_arg == 0) {
1041		printk(KERN_DEBUG "# cpu:      cpu number\n");
1042		printk(KERN_DEBUG "Sender statistics:\n");
1043		printk(KERN_DEBUG
1044		"sent:     number of shootdown messages sent\n");
1045		printk(KERN_DEBUG
1046		"stime:    time spent sending messages\n");
1047		printk(KERN_DEBUG
1048		"numuvhubs: number of hubs targeted with shootdown\n");
1049		printk(KERN_DEBUG
1050		"numuvhubs16: number times 16 or more hubs targeted\n");
1051		printk(KERN_DEBUG
1052		"numuvhubs8: number times 8 or more hubs targeted\n");
1053		printk(KERN_DEBUG
1054		"numuvhubs4: number times 4 or more hubs targeted\n");
1055		printk(KERN_DEBUG
1056		"numuvhubs2: number times 2 or more hubs targeted\n");
1057		printk(KERN_DEBUG
1058		"numuvhubs1: number times 1 hub targeted\n");
1059		printk(KERN_DEBUG
1060		"numcpus:  number of cpus targeted with shootdown\n");
1061		printk(KERN_DEBUG
1062		"dto:      number of destination timeouts\n");
1063		printk(KERN_DEBUG
1064		"retries:  destination timeout retries sent\n");
1065		printk(KERN_DEBUG
1066		"rok:      destination timeouts successfully retried\n");
1067		printk(KERN_DEBUG
1068		"resetp:   ipi-style resource resets for plugs\n");
1069		printk(KERN_DEBUG
1070		"resett:   ipi-style resource resets for timeouts\n");
1071		printk(KERN_DEBUG
1072		"giveup:   fall-backs to ipi-style shootdowns\n");
1073		printk(KERN_DEBUG
1074		"sto:      number of source timeouts\n");
1075		printk(KERN_DEBUG
1076		"bz:       number of stay-busy's\n");
1077		printk(KERN_DEBUG
1078		"throt:    number times spun in throttle\n");
1079		printk(KERN_DEBUG "Destination side statistics:\n");
1080		printk(KERN_DEBUG
1081		"sw_ack:   image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n");
1082		printk(KERN_DEBUG
1083		"recv:     shootdown messages received\n");
1084		printk(KERN_DEBUG
1085		"rtime:    time spent processing messages\n");
1086		printk(KERN_DEBUG
1087		"all:      shootdown all-tlb messages\n");
1088		printk(KERN_DEBUG
1089		"one:      shootdown one-tlb messages\n");
1090		printk(KERN_DEBUG
1091		"mult:     interrupts that found multiple messages\n");
1092		printk(KERN_DEBUG
1093		"none:     interrupts that found no messages\n");
1094		printk(KERN_DEBUG
1095		"retry:    number of retry messages processed\n");
1096		printk(KERN_DEBUG
1097		"canc:     number messages canceled by retries\n");
1098		printk(KERN_DEBUG
1099		"nocan:    number retries that found nothing to cancel\n");
1100		printk(KERN_DEBUG
1101		"reset:    number of ipi-style reset requests processed\n");
1102		printk(KERN_DEBUG
1103		"rcan:     number messages canceled by reset requests\n");
1104		printk(KERN_DEBUG
1105		"disable:  number times use of the BAU was disabled\n");
1106		printk(KERN_DEBUG
1107		"enable:   number times use of the BAU was re-enabled\n");
1108	} else if (input_arg == -1) {
1109		for_each_present_cpu(cpu) {
1110			stat = &per_cpu(ptcstats, cpu);
1111			memset(stat, 0, sizeof(struct ptc_stats));
1112		}
1113	}
1114
1115	return count;
1116}
1117
1118static int local_atoi(const char *name)
1119{
1120	int val = 0;
1121
1122	for (;; name++) {
1123		switch (*name) {
1124		case '0' ... '9':
1125			val = 10*val+(*name-'0');
1126			break;
1127		default:
1128			return val;
1129		}
1130	}
1131}
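
/*
 * Illustrative behavior (added note): local_atoi("3 16 8") returns 3 and
 * stops at the first non-digit, which is all tunables_write() below needs,
 * since it re-invokes local_atoi() at each whitespace-delimited field.
 * Negative values are not parsed; a leading '-' simply ends the number at
 * value 0.
 */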
1132
1133/*
1134 * set the tunables
1135 * 0 values reset them to defaults
1136 */
1137static ssize_t tunables_write(struct file *file, const char __user *user,
1138				 size_t count, loff_t *data)
1139{
1140	int cpu;
1141	int cnt = 0;
1142	int val;
1143	char *p;
1144	char *q;
1145	char instr[64];
1146	struct bau_control *bcp;
1147
1148	if (count == 0 || count > sizeof(instr)-1)
1149		return -EINVAL;
1150	if (copy_from_user(instr, user, count))
1151		return -EFAULT;
1152
1153	instr[count] = '\0';
1154	/* count the fields */
1155	p = instr + strspn(instr, WHITESPACE);
1156	q = p;
1157	for (; *p; p = q + strspn(q, WHITESPACE)) {
1158		q = p + strcspn(p, WHITESPACE);
1159		cnt++;
1160		if (q == p)
1161			break;
1162	}
1163	if (cnt != 9) {
1164		printk(KERN_INFO "bau tunable error: should be 9 numbers\n");
1165		return -EINVAL;
1166	}
1167
1168	p = instr + strspn(instr, WHITESPACE);
1169	q = p;
1170	for (cnt = 0; *p; p = q + strspn(q, WHITESPACE), cnt++) {
1171		q = p + strcspn(p, WHITESPACE);
1172		val = local_atoi(p);
1173		switch (cnt) {
1174		case 0:
1175			if (val == 0) {
1176				max_bau_concurrent = MAX_BAU_CONCURRENT;
1177				max_bau_concurrent_constant =
1178							MAX_BAU_CONCURRENT;
1179				continue;
1180			}
1181			bcp = &per_cpu(bau_control, smp_processor_id());
1182			if (val < 1 || val > bcp->cpus_in_uvhub) {
1183				printk(KERN_DEBUG
1184				"Error: BAU max concurrent %d is invalid\n",
1185				val);
1186				return -EINVAL;
1187			}
1188			max_bau_concurrent = val;
1189			max_bau_concurrent_constant = val;
1190			continue;
1191		case 1:
1192			if (val == 0)
1193				plugged_delay = PLUGGED_DELAY;
1194			else
1195				plugged_delay = val;
1196			continue;
1197		case 2:
1198			if (val == 0)
1199				plugsb4reset = PLUGSB4RESET;
1200			else
1201				plugsb4reset = val;
1202			continue;
1203		case 3:
1204			if (val == 0)
1205				timeoutsb4reset = TIMEOUTSB4RESET;
1206			else
1207				timeoutsb4reset = val;
1208			continue;
1209		case 4:
1210			if (val == 0)
1211				ipi_reset_limit = IPI_RESET_LIMIT;
1212			else
1213				ipi_reset_limit = val;
1214			continue;
1215		case 5:
1216			if (val == 0)
1217				complete_threshold = COMPLETE_THRESHOLD;
1218			else
1219				complete_threshold = val;
1220			continue;
1221		case 6:
1222			if (val == 0)
1223				congested_response_us = CONGESTED_RESPONSE_US;
1224			else
1225				congested_response_us = val;
1226			continue;
1227		case 7:
1228			if (val == 0)
1229				congested_reps = CONGESTED_REPS;
1230			else
1231				congested_reps = val;
1232			continue;
1233		case 8:
1234			if (val == 0)
1235				congested_period = CONGESTED_PERIOD;
1236			else
1237				congested_period = val;
1238			continue;
1239		}
1240		if (q == p)
1241			break;
1242	}
1243	for_each_present_cpu(cpu) {
1244		bcp = &per_cpu(bau_control, cpu);
1245		bcp->max_bau_concurrent = max_bau_concurrent;
1246		bcp->max_bau_concurrent_constant = max_bau_concurrent;
1247		bcp->plugged_delay = plugged_delay;
1248		bcp->plugsb4reset = plugsb4reset;
1249		bcp->timeoutsb4reset = timeoutsb4reset;
1250		bcp->ipi_reset_limit = ipi_reset_limit;
1251		bcp->complete_threshold = complete_threshold;
1252		bcp->congested_response_us = congested_response_us;
1253		bcp->congested_reps = congested_reps;
1254		bcp->congested_period = congested_period;
1255	}
1256	return count;
1257}
1258
1259static const struct seq_operations uv_ptc_seq_ops = {
1260	.start		= uv_ptc_seq_start,
1261	.next		= uv_ptc_seq_next,
1262	.stop		= uv_ptc_seq_stop,
1263	.show		= uv_ptc_seq_show
1264};
1265
1266static int uv_ptc_proc_open(struct inode *inode, struct file *file)
1267{
1268	return seq_open(file, &uv_ptc_seq_ops);
1269}
1270
1271static int tunables_open(struct inode *inode, struct file *file)
1272{
1273	return 0;
1274}
1275
1276static const struct file_operations proc_uv_ptc_operations = {
1277	.open		= uv_ptc_proc_open,
1278	.read		= seq_read,
1279	.write		= uv_ptc_proc_write,
1280	.llseek		= seq_lseek,
1281	.release	= seq_release,
1282};
1283
1284static const struct file_operations tunables_fops = {
1285	.open		= tunables_open,
1286	.read		= tunables_read,
1287	.write		= tunables_write,
1288};
1289
1290static int __init uv_ptc_init(void)
1291{
1292	struct proc_dir_entry *proc_uv_ptc;
1293
1294	if (!is_uv_system())
1295		return 0;
1296
1297	proc_uv_ptc = proc_create(UV_PTC_BASENAME, 0444, NULL,
1298				  &proc_uv_ptc_operations);
1299	if (!proc_uv_ptc) {
1300		printk(KERN_ERR "unable to create %s proc entry\n",
1301		       UV_PTC_BASENAME);
1302		return -EINVAL;
1303	}
1304
1305	tunables_dir = debugfs_create_dir(UV_BAU_TUNABLES_DIR, NULL);
1306	if (!tunables_dir) {
1307		printk(KERN_ERR "unable to create debugfs directory %s\n",
1308		       UV_BAU_TUNABLES_DIR);
1309		return -EINVAL;
1310	}
1311	tunables_file = debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600,
1312			tunables_dir, NULL, &tunables_fops);
1313	if (!tunables_file) {
1314		printk(KERN_ERR "unable to create debugfs file %s\n",
1315		       UV_BAU_TUNABLES_FILE);
1316		return -EINVAL;
1317	}
1318	return 0;
1319}
1320
1321/*
1322 * initialize the sending side's sending buffers
1323 */
1324static void
1325uv_activation_descriptor_init(int node, int pnode)
1326{
1327	int i;
1328	int cpu;
1329	unsigned long pa;
1330	unsigned long m;
1331	unsigned long n;
1332	struct bau_desc *bau_desc;
1333	struct bau_desc *bd2;
1334	struct bau_control *bcp;
1335
1336	/*
1337	 * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR)
1338	 * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per uvhub
1339	 */
1340	bau_desc = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)*
1341		UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node);
1342	BUG_ON(!bau_desc);
1343
1344	pa = uv_gpa(bau_desc); /* need the real nasid */
1345	n = pa >> uv_nshift;
1346	m = pa & uv_mmask;
1347
1348	uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE,
1349			      (n << UV_DESC_BASE_PNODE_SHIFT | m));
1350
1351	/*
1352	 * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
1353	 * cpu even though we only use the first one; one descriptor can
1354	 * describe a broadcast to 256 uv hubs.
1355	 */
1356	for (i = 0, bd2 = bau_desc; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR);
1357		i++, bd2++) {
1358		memset(bd2, 0, sizeof(struct bau_desc));
1359		bd2->header.sw_ack_flag = 1;
1360		/*
1361		 * base_dest_nodeid is the nasid (pnode<<1) of the first uvhub
1362		 * in the partition. The bit map will indicate uvhub numbers,
1363		 * which are 0-N in a partition. Pnodes are unique system-wide.
1364		 */
1365		bd2->header.base_dest_nodeid = uv_partition_base_pnode << 1;
1366		bd2->header.dest_subnodeid = 0x10; /* the LB */
1367		bd2->header.command = UV_NET_ENDPOINT_INTD;
1368		bd2->header.int_both = 1;
1369		/*
1370		 * all others need to be set to zero:
1371		 *   fairness chaining multilevel count replied_to
1372		 */
1373	}
1374	for_each_present_cpu(cpu) {
1375		if (pnode != uv_blade_to_pnode(uv_cpu_to_blade_id(cpu)))
1376			continue;
1377		bcp = &per_cpu(bau_control, cpu);
1378		bcp->descriptor_base = bau_desc;
1379	}
1380}
1381
1382/*
1383 * initialize the destination side's receiving buffers
1384 * entered for each uvhub in the partition
1385 * - node is first node (kernel memory notion) on the uvhub
1386 * - pnode is the uvhub's physical identifier
1387 */
1388static void
1389uv_payload_queue_init(int node, int pnode)
1390{
1391	int pn;
1392	int cpu;
1393	char *cp;
1394	unsigned long pa;
1395	struct bau_payload_queue_entry *pqp;
1396	struct bau_payload_queue_entry *pqp_malloc;
1397	struct bau_control *bcp;
1398
1399	pqp = (struct bau_payload_queue_entry *) kmalloc_node(
1400		(DEST_Q_SIZE + 1) * sizeof(struct bau_payload_queue_entry),
1401		GFP_KERNEL, node);
1402	BUG_ON(!pqp);
1403	pqp_malloc = pqp;
1404
1405	cp = (char *)pqp + 31;
1406	pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5);
1407
1408	for_each_present_cpu(cpu) {
1409		if (pnode != uv_cpu_to_pnode(cpu))
1410			continue;
1411		/* for every cpu on this pnode: */
1412		bcp = &per_cpu(bau_control, cpu);
1413		bcp->va_queue_first = pqp;
1414		bcp->bau_msg_head = pqp;
1415		bcp->va_queue_last = pqp + (DEST_Q_SIZE - 1);
1416	}
1417	/*
1418	 * need the pnode of where the memory was really allocated
1419	 */
1420	pa = uv_gpa(pqp);
1421	pn = pa >> uv_nshift;
1422	uv_write_global_mmr64(pnode,
1423			      UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST,
1424			      ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) |
1425			      uv_physnodeaddr(pqp));
1426	uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL,
1427			      uv_physnodeaddr(pqp));
1428	uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST,
1429			      (unsigned long)
1430			      uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1)));
1431	/* in effect, all msg_type's are set to MSG_NOOP */
1432	memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE);
1433}
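
/*
 * Note on the alignment above (illustrative, assuming each
 * bau_payload_queue_entry is at least 32 bytes): the queue must begin on a
 * 32-byte boundary, so 31 is added to the kmalloc'd address and the low 5
 * bits are dropped by the shift pair.  An allocation ending in 0x...34
 * gives cp = 0x...53 and a rounded-up queue base of 0x...40; the extra
 * entry in the (DEST_Q_SIZE + 1) allocation covers the bytes skipped.
 */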
1434
1435/*
1436 * Initialization of each UV hub's structures
1437 */
1438static void __init uv_init_uvhub(int uvhub, int vector)
1439{
1440	int node;
1441	int pnode;
1442	unsigned long apicid;
1443
1444	node = uvhub_to_first_node(uvhub);
1445	pnode = uv_blade_to_pnode(uvhub);
1446	uv_activation_descriptor_init(node, pnode);
1447	uv_payload_queue_init(node, pnode);
1448	/*
1449	 * the below initialization can't be in firmware because the
1450	 * messaging IRQ will be determined by the OS
1451	 */
1452	apicid = uvhub_to_first_apicid(uvhub);
1453	uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
1454				      ((apicid << 32) | vector));
1455}
1456
1457/*
1458 * We will set BAU_MISC_CONTROL with a timeout period.
1459 * But the BIOS has set UVH_AGING_PRESCALE_SEL and UVH_TRANSACTION_TIMEOUT.
1460 * So the destination timeout period has to be calculated from them.
1461 */
1462static int
1463calculate_destination_timeout(void)
1464{
1465	unsigned long mmr_image;
1466	int mult1;
1467	int mult2;
1468	int index;
1469	int base;
1470	int ret;
1471	unsigned long ts_ns;
1472
1473	mult1 = UV_INTD_SOFT_ACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK;
1474	mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL);
1475	index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK;
1476	mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT);
1477	mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK;
1478	base = timeout_base_ns[index];
1479	ts_ns = base * mult1 * mult2;
1480	ret = ts_ns / 1000;
1481	return ret;
1482}
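
/*
 * Worked example (illustrative register values): if UVH_AGING_PRESCALE_SEL
 * urgency7 selects index 3 (base 10240 ns), mult1 is 10 and the
 * UVH_TRANSACTION_TIMEOUT multiplier mult2 is 15, then
 * ts_ns = 10240 * 10 * 15 = 1536000 and the routine returns 1536
 * microseconds, the value uv_wait_completion() compares against the
 * elapsed send time via cycles_2_us().
 */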
1483
1484/*
1485 * initialize the bau_control structure for each cpu
1486 */
1487static void __init uv_init_per_cpu(int nuvhubs)
1488{
1489	int i;
1490	int cpu;
1491	int pnode;
1492	int uvhub;
1493	int have_hmaster;
1494	short socket = 0;
1495	unsigned short socket_mask;
1496	unsigned char *uvhub_mask;
1497	struct bau_control *bcp;
1498	struct uvhub_desc *bdp;
1499	struct socket_desc *sdp;
1500	struct bau_control *hmaster = NULL;
1501	struct bau_control *smaster = NULL;
1502	struct socket_desc {
1503		short num_cpus;
1504		short cpu_number[16];
1505	};
1506	struct uvhub_desc {
1507		unsigned short socket_mask;
1508		short num_cpus;
1509		short uvhub;
1510		short pnode;
1511		struct socket_desc socket[2];
1512	};
1513	struct uvhub_desc *uvhub_descs;
1514
1515	timeout_us = calculate_destination_timeout();
1516
1517	uvhub_descs = (struct uvhub_desc *)
1518		kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL);
1519	memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc));
1520	uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL);
1521	for_each_present_cpu(cpu) {
1522		bcp = &per_cpu(bau_control, cpu);
1523		memset(bcp, 0, sizeof(struct bau_control));
1524		pnode = uv_cpu_hub_info(cpu)->pnode;
1525		uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
1526		*(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8));
1527		bdp = &uvhub_descs[uvhub];
1528		bdp->num_cpus++;
1529		bdp->uvhub = uvhub;
1530		bdp->pnode = pnode;
1531		/* kludge: 'assuming' one node per socket, and assuming that
1532		   disabling a socket just leaves a gap in node numbers */
1533		socket = (cpu_to_node(cpu) & 1);
1534		bdp->socket_mask |= (1 << socket);
1535		sdp = &bdp->socket[socket];
1536		sdp->cpu_number[sdp->num_cpus] = cpu;
1537		sdp->num_cpus++;
1538	}
1539	for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
1540		if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8))))
1541			continue;
1542		have_hmaster = 0;
1543		bdp = &uvhub_descs[uvhub];
1544		socket_mask = bdp->socket_mask;
1545		socket = 0;
1546		while (socket_mask) {
1547			if (!(socket_mask & 1))
1548				goto nextsocket;
1549			sdp = &bdp->socket[socket];
1550			for (i = 0; i < sdp->num_cpus; i++) {
1551				cpu = sdp->cpu_number[i];
1552				bcp = &per_cpu(bau_control, cpu);
1553				bcp->cpu = cpu;
1554				if (i == 0) {
1555					smaster = bcp;
1556					if (!have_hmaster) {
1557						have_hmaster++;
1558						hmaster = bcp;
1559					}
1560				}
1561				bcp->cpus_in_uvhub = bdp->num_cpus;
1562				bcp->cpus_in_socket = sdp->num_cpus;
1563				bcp->socket_master = smaster;
1564				bcp->uvhub = bdp->uvhub;
1565				bcp->uvhub_master = hmaster;
1566				bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->
1567						blade_processor_id;
1568			}
1569nextsocket:
1570			socket++;
1571			socket_mask = (socket_mask >> 1);
1572		}
1573	}
1574	kfree(uvhub_descs);
1575	kfree(uvhub_mask);
1576	for_each_present_cpu(cpu) {
1577		bcp = &per_cpu(bau_control, cpu);
1578		bcp->baudisabled = 0;
1579		bcp->statp = &per_cpu(ptcstats, cpu);
1580		/* time interval to catch a hardware stay-busy bug */
1581		bcp->timeout_interval = microsec_2_cycles(2*timeout_us);
1582		bcp->max_bau_concurrent = max_bau_concurrent;
1583		bcp->max_bau_concurrent_constant = max_bau_concurrent;
1584		bcp->plugged_delay = plugged_delay;
1585		bcp->plugsb4reset = plugsb4reset;
1586		bcp->timeoutsb4reset = timeoutsb4reset;
1587		bcp->ipi_reset_limit = ipi_reset_limit;
1588		bcp->complete_threshold = complete_threshold;
1589		bcp->congested_response_us = congested_response_us;
1590		bcp->congested_reps = congested_reps;
1591		bcp->congested_period = congested_period;
1592	}
1593}
1594
1595/*
1596 * Initialization of BAU-related structures
1597 */
1598static int __init uv_bau_init(void)
1599{
1600	int uvhub;
1601	int pnode;
1602	int nuvhubs;
1603	int cur_cpu;
1604	int vector;
1605	unsigned long mmr;
1606
1607	if (!is_uv_system())
1608		return 0;
1609
1610	if (nobau)
1611		return 0;
1612
1613	for_each_possible_cpu(cur_cpu)
1614		zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu),
1615				       GFP_KERNEL, cpu_to_node(cur_cpu));
1616
1617	uv_nshift = uv_hub_info->m_val;
1618	uv_mmask = (1UL << uv_hub_info->m_val) - 1;
1619	nuvhubs = uv_num_possible_blades();
1620	spin_lock_init(&disable_lock);
1621	congested_cycles = microsec_2_cycles(congested_response_us);
1622
1623	uv_init_per_cpu(nuvhubs);
1624
1625	uv_partition_base_pnode = 0x7fffffff;
1626	for (uvhub = 0; uvhub < nuvhubs; uvhub++)
1627		if (uv_blade_nr_possible_cpus(uvhub) &&
1628			(uv_blade_to_pnode(uvhub) < uv_partition_base_pnode))
1629			uv_partition_base_pnode = uv_blade_to_pnode(uvhub);
1630
1631	vector = UV_BAU_MESSAGE;
1632	for_each_possible_blade(uvhub)
1633		if (uv_blade_nr_possible_cpus(uvhub))
1634			uv_init_uvhub(uvhub, vector);
1635
1636	uv_enable_timeouts();
1637	alloc_intr_gate(vector, uv_bau_message_intr1);
1638
1639	for_each_possible_blade(uvhub) {
1640		if (uv_blade_nr_possible_cpus(uvhub)) {
1641			pnode = uv_blade_to_pnode(uvhub);
1642			/* INIT the bau */
1643			uv_write_global_mmr64(pnode,
1644					UVH_LB_BAU_SB_ACTIVATION_CONTROL,
1645					((unsigned long)1 << 63));
1646			mmr = 1; /* should be 1 to broadcast to both sockets */
1647			uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST,
1648						mmr);
1649		}
1650	}
1651
1652	return 0;
1653}
1654core_initcall(uv_bau_init);
1655fs_initcall(uv_ptc_init);
1656