/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * The main CPU-control loops, used to control masters and slaves.
 */

#include <sys/types.h>

#include <kmdb/kaif.h>
#include <kmdb/kaif_start.h>
#include <kmdb/kmdb_asmutil.h>
#include <kmdb/kmdb_dpi_impl.h>
#include <kmdb/kmdb_kdi.h>

#define	KAIF_SLAVE_CMD_SPIN	0
#define	KAIF_SLAVE_CMD_SWITCH	1
#define	KAIF_SLAVE_CMD_RESUME	2
#define	KAIF_SLAVE_CMD_FLUSH	3
#define	KAIF_SLAVE_CMD_REBOOT	4
#if defined(__sparc)
#define	KAIF_SLAVE_CMD_ACK	5
#endif

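/*
 * The command protocol: the master posts one of the KAIF_SLAVE_CMD_*
 * values in kaif_slave_cmd below (for CMD_SWITCH, the target CPU id goes
 * in kaif_slave_tgt), and the slaves poll for it in kaif_slave_loop().
 */
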
/*
 * Used to synchronize attempts to set kaif_master_cpuid.  kaif_master_cpuid
 * may be read without kaif_master_lock, and may be written by the current
 * master CPU.
 */
int kaif_master_cpuid = KAIF_MASTER_CPUID_UNSET;
static uintptr_t kaif_master_lock = 0;

/*
 * Used to ensure that all CPUs leave the debugger together. kaif_loop_lock
 * must be held to write kaif_looping, but need not be held to read it.
 */
static volatile uint_t kaif_looping;
static uintptr_t kaif_loop_lock;

static volatile int kaif_slave_cmd;
static volatile int kaif_slave_tgt;	/* target cpuid for CMD_SWITCH */

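/*
 * A minimal cas()-based spin lock.  kaif_lock_enter() spins until it
 * atomically changes the lock word from 0 to 1; kaif_lock_exit() simply
 * clears the word.  membar_producer() provides store ordering around the
 * lock word.  Illustrative usage, as with the kaif_looping accounting in
 * kaif_main_loop():
 *
 *	kaif_lock_enter(&kaif_loop_lock);
 *	kaif_looping++;
 *	kaif_lock_exit(&kaif_loop_lock);
 */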
static void
kaif_lock_enter(uintptr_t *lock)
{
	while (cas(lock, 0, 1) != 0)
		continue;
	membar_producer();
}

static void
kaif_lock_exit(uintptr_t *lock)
{
	*lock = 0;
	membar_producer();
}

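/*
 * Post a command to the slave CPUs, then nudge them out of
 * kmdb_kdi_slave_wait() via kmdb_kdi_start_slaves().
 */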
static void
kaif_start_slaves(int cmd)
{
	kaif_slave_cmd = cmd;
	kmdb_kdi_start_slaves();
}

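/*
 * The master loop: runs the debugger on behalf of the stopped world and
 * carries out each command returned by kmdb_dpi_reenter() -- switching
 * masters, resuming, flushing slave caches, or rebooting.
 */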
static int
kaif_master_loop(kaif_cpusave_t *cpusave)
{
	int notflushed, i;

#if defined(__sparc)
	kaif_prom_rearm();
#endif
	kaif_trap_set_debugger();

	/*
	 * If we re-entered due to a ::switch, we need to tell the slave CPUs
	 * to sleep again.
	 */
	kmdb_kdi_stop_slaves(cpusave->krs_cpu_id, 0);

master_loop:
	switch (kmdb_dpi_reenter()) {
	case KMDB_DPI_CMD_SWITCH_CPU:
		/*
		 * We assume that the target CPU is a valid slave.  There's no
		 * easy way to complain here, so we'll assume that the caller
		 * has done the proper checking.
		 */
		if (kmdb_dpi_switch_target == cpusave->krs_cpu_id)
			break;

		kaif_slave_tgt = kaif_master_cpuid = kmdb_dpi_switch_target;
		cpusave->krs_cpu_state = KAIF_CPU_STATE_SLAVE;
		membar_producer();

		/*
		 * Switch back to the saved trap table before we switch CPUs --
		 * we need to make sure that only one CPU is on the debugger's
		 * table at a time.
		 */
		kaif_trap_set_saved(cpusave);

		kaif_start_slaves(KAIF_SLAVE_CMD_SWITCH);

		/* The new master is now awake */
		return (KAIF_CPU_CMD_SWITCH);

	case KMDB_DPI_CMD_RESUME_ALL:
	case KMDB_DPI_CMD_RESUME_UNLOAD:
		/*
		 * Resume everyone, clean up for next entry.
		 */
		kaif_master_cpuid = KAIF_MASTER_CPUID_UNSET;
		membar_producer();
		kaif_start_slaves(KAIF_SLAVE_CMD_RESUME);

		if (kmdb_dpi_work_required())
			kmdb_dpi_wrintr_fire();

		kaif_trap_set_saved(cpusave);

		return (KAIF_CPU_CMD_RESUME);

	case KMDB_DPI_CMD_RESUME_MASTER:
		/*
		 * Single-CPU resume, which is performed on the debugger's
		 * trap table (so no need to switch back).
		 */
		return (KAIF_CPU_CMD_RESUME_MASTER);

	case KMDB_DPI_CMD_FLUSH_CACHES:
		kaif_start_slaves(KAIF_SLAVE_CMD_FLUSH);

		/*
		 * Wait for the other cpus to finish flushing their caches.
		 */
		do {
			notflushed = 0;
			for (i = 0; i < kaif_ncpusave; i++) {
				kaif_cpusave_t *save = &kaif_cpusave[i];

				if (save->krs_cpu_state ==
				    KAIF_CPU_STATE_SLAVE &&
				    !save->krs_cpu_flushed) {
					notflushed++;
					break;
				}
			}
		} while (notflushed > 0);

		kaif_slave_cmd = KAIF_SLAVE_CMD_SPIN;
		break;

#if defined(__i386) || defined(__amd64)
	case KMDB_DPI_CMD_REBOOT:
		/*
		 * Reboot must be initiated by CPU 0.  I could ask why, but I'm
		 * afraid that I don't want to know the answer.
		 */
		if (cpusave->krs_cpu_id == 0)
			kmdb_kdi_reboot();

		kaif_start_slaves(KAIF_SLAVE_CMD_REBOOT);

		/*
		 * Spin forever, waiting for CPU 0 (apparently a slave) to
		 * reboot the system.
		 */
		for (;;)
			continue;

		/*NOTREACHED*/
		break;
#endif
	}

	goto master_loop;
}

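/*
 * The slave loop: spin (with the help of kmdb_kdi_slave_wait()) until the
 * master posts a command telling this CPU to resume, flush its caches,
 * reboot, or take over as the new master.
 */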
static int
kaif_slave_loop(kaif_cpusave_t *cpusave)
{
	int slavecmd, rv;

#if defined(__sparc)
	/*
	 * If the user elects to drop to OBP from the debugger, some OBP
	 * implementations will cross-call the slaves.  We have to turn
	 * IE back on so we can receive the cross-calls.  If we don't,
	 * some OBP implementations will wait forever.
	 */
	interrupts_on();
#endif

	/* Wait for duty to call */
	for (;;) {
		slavecmd = kaif_slave_cmd;

		if (slavecmd == KAIF_SLAVE_CMD_SWITCH &&
		    kaif_slave_tgt == cpusave->krs_cpu_id) {
			kaif_slave_cmd = KAIF_SLAVE_CMD_SPIN;
			cpusave->krs_cpu_state = KAIF_CPU_STATE_MASTER;
			rv = KAIF_CPU_CMD_SWITCH;
			break;

		} else if (slavecmd == KAIF_SLAVE_CMD_FLUSH) {
			kmdb_kdi_flush_caches();
			cpusave->krs_cpu_flushed = 1;
			continue;

#if defined(__i386) || defined(__amd64)
		} else if (slavecmd == KAIF_SLAVE_CMD_REBOOT &&
		    cpusave->krs_cpu_id == 0) {
			rv = 0;
			kmdb_kdi_reboot();
			break;
#endif

		} else if (slavecmd == KAIF_SLAVE_CMD_RESUME) {
			rv = KAIF_CPU_CMD_RESUME;
			break;
#if defined(__sparc)
		} else if (slavecmd == KAIF_SLAVE_CMD_ACK) {
			/* Tell the master we're back in the slave loop. */
			cpusave->krs_cpu_acked = 1;
		} else if (cpusave->krs_cpu_acked &&
		    slavecmd == KAIF_SLAVE_CMD_SPIN) {
			/* The barrier is over; reset for the next one. */
			cpusave->krs_cpu_acked = 0;
#endif
		}

		kmdb_kdi_slave_wait();
	}

#if defined(__sparc)
	interrupts_off();
#endif

	return (rv);
}

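/*
 * The first CPU to find kaif_master_cpuid unset while holding
 * kaif_master_lock becomes the master; all later arrivals become slaves.
 */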
static void
kaif_select_master(kaif_cpusave_t *cpusave)
{
	kaif_lock_enter(&kaif_master_lock);

	if (kaif_master_cpuid == KAIF_MASTER_CPUID_UNSET) {
		/* This is the master. */
		kaif_master_cpuid = cpusave->krs_cpu_id;
		cpusave->krs_cpu_state = KAIF_CPU_STATE_MASTER;
		kaif_slave_cmd = KAIF_SLAVE_CMD_SPIN;

		membar_producer();

		kmdb_kdi_stop_slaves(cpusave->krs_cpu_id, 1);
	} else {
		/* The master was already chosen - go be a slave */
		cpusave->krs_cpu_state = KAIF_CPU_STATE_SLAVE;
		membar_producer();
	}

	kaif_lock_exit(&kaif_master_lock);
}

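/*
 * The common entry point for all stopped CPUs.  Each CPU determines its
 * role, enters the appropriate loop, and, on resume, waits at a barrier
 * until every CPU has left its loop.
 */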
int
kaif_main_loop(kaif_cpusave_t *cpusave)
{
	int cmd;

	if (kaif_master_cpuid == KAIF_MASTER_CPUID_UNSET) {
		if (!kmdb_dpi_resume_requested &&
		    kmdb_kdi_get_unload_request()) {
			/*
			 * Special case: Unload requested before first debugger
			 * entry.  Don't stop the world, as there's nothing to
			 * clean up that can't be handled by the running kernel.
			 */
			cpusave->krs_cpu_state = KAIF_CPU_STATE_NONE;
			return (KAIF_CPU_CMD_RESUME);
		}

		kaif_select_master(cpusave);

#ifdef __sparc
		if (kaif_master_cpuid == cpusave->krs_cpu_id) {
			/*
			 * Everyone has arrived, so we can disarm the post-PROM
			 * entry point.
			 */
			*kaif_promexitarmp = 0;
			membar_producer();
		}
#endif
	} else if (kaif_master_cpuid == cpusave->krs_cpu_id) {
		cpusave->krs_cpu_state = KAIF_CPU_STATE_MASTER;
	} else {
		cpusave->krs_cpu_state = KAIF_CPU_STATE_SLAVE;
	}

	cpusave->krs_cpu_flushed = 0;

	kaif_lock_enter(&kaif_loop_lock);
	kaif_looping++;
	kaif_lock_exit(&kaif_loop_lock);

	/*
	 * We know who the master and slaves are, so now they can go off
	 * to their respective loops.
	 */
	do {
		if (kaif_master_cpuid == cpusave->krs_cpu_id)
			cmd = kaif_master_loop(cpusave);
		else
			cmd = kaif_slave_loop(cpusave);
	} while (cmd == KAIF_CPU_CMD_SWITCH);

	kaif_lock_enter(&kaif_loop_lock);
	kaif_looping--;
	kaif_lock_exit(&kaif_loop_lock);

	cpusave->krs_cpu_state = KAIF_CPU_STATE_NONE;

	if (cmd == KAIF_CPU_CMD_RESUME) {
		/*
		 * By this point, the master has directed the slaves to resume,
		 * and everyone is making their way to this point.  We're going
		 * to block here until all CPUs leave the master and slave
		 * loops.  When all have arrived, we'll turn them all loose.
		 * This barrier is required for two reasons:
		 *
		 * 1. There exists a race condition whereby a CPU could reenter
		 *    the debugger while another CPU is still in the slave loop
		 *    from this debugger entry.  This usually happens when the
		 *    current master releases the slaves, and makes it back to
		 *    the world before the slaves notice the release.  The
		 *    former master then triggers a debugger entry, and attempts
		 *    to stop the slaves for this entry before they've even
		 *    resumed from the last one.  When the slaves arrive here,
		 *    they'll have re-disabled interrupts, and will thus ignore
		 *    cross-calls until they finish resuming.
		 *
		 * 2. At the time of this writing, there exists a SPARC bug that
		 *    causes an apparently unsolicited interrupt vector trap
		 *    from OBP to one of the slaves.  This wouldn't normally be
		 *    a problem but for the fact that the cross-called CPU
		 *    encounters some sort of failure while in OBP.  OBP
		 *    recovers by executing the debugger-hook word, which sends
		 *    the slave back into the debugger, triggering a debugger
		 *    fault.  This problem seems to happen only during resume,
		 *    the result being that all CPUs save for the cross-called
		 *    one make it back into the world, while the cross-called
		 *    one is stuck at the debugger fault prompt.  Leave the
		 *    world in that state too long, and you'll get a mondo
		 *    timeout panic.  If we hold everyone here, we can give
		 *    the user a chance to trigger a panic for further analysis.
		 *    To trigger the bug, "pool_unlock:b :c" and "while : ; do
		 *    psrset -p ; done".
		 *
		 * When the second item is fixed, the barrier can move into
		 * kaif_select_master(), immediately prior to the setting of
		 * kaif_master_cpuid.
		 */
		while (kaif_looping != 0)
			continue;
	}

	return (cmd);
}


#if defined(__sparc)

static int slave_loop_barrier_failures = 0;	/* for debug */

/*
 * There exists a race condition, observed on some platforms, where the
 * kmdb master CPU exits to OBP via prom_enter_mon (e.g. the "$q" command)
 * and then later re-enters kmdb (by typing "go") while the slaves are
 * still proceeding from the OBP idle loop back to the kmdb slave loop.
 * The problem arises when the master CPU, now back in kmdb, proceeds to
 * re-enter OBP (e.g. by doing a prom_read() from the kmdb main loop)
 * while the slaves are still trying to get out of (the previous trip
 * into) OBP and into the safety of the kmdb slave loop.  This routine
 * forces the slaves to explicitly acknowledge that they are back in the
 * slave loop.  The master CPU can call it to ensure that all slave CPUs
 * are back in the slave loop before proceeding.
 */
void
kaif_slave_loop_barrier(void)
{
	extern void kdi_usecwait(clock_t);
	int i;
	int not_acked;
	int timeout_count = 0;

	kaif_start_slaves(KAIF_SLAVE_CMD_ACK);

	/*
	 * Wait for the slave CPUs to explicitly acknowledge that they are
	 * spinning in the slave loop.
	 */
	do {
		not_acked = 0;
		for (i = 0; i < kaif_ncpusave; i++) {
			kaif_cpusave_t *save = &kaif_cpusave[i];

			if (save->krs_cpu_state ==
			    KAIF_CPU_STATE_SLAVE &&
			    !save->krs_cpu_acked) {
				not_acked++;
				break;
			}
		}

		if (not_acked == 0)
			break;

		/*
		 * Play it safe and delay between checks.  We will do at
		 * most kaif_ncpusave delays before bailing out of this
		 * barrier.
		 */
		kdi_usecwait(200);

	} while (++timeout_count < kaif_ncpusave);

	if (not_acked > 0)
		/*
		 * We could not establish a barrier with all of the slave
		 * CPUs coming back from OBP.  Record this fact for future
		 * debugging.
		 */
		slave_loop_barrier_failures++;

	kaif_slave_cmd = KAIF_SLAVE_CMD_SPIN;
}
#endif