/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms
 * of the Common Development and Distribution License
 * (the "License").  You may not use this file except
 * in compliance with the License.
 *
 * You can obtain a copy of the license at
 * src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing
 * permissions and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL
 * HEADER in each file and include the License file at
 * usr/src/OPENSOLARIS.LICENSE.  If applicable,
 * add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your
 * own identifying information: Portions Copyright [yyyy]
 * [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * routine to benchmark cache-to-cache transfer times... uses
 * solaris features to find and bind to cpus in the current
 * processor set, so not likely to work elsewhere.
 */

#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>
#include <pthread.h>
#include <thread.h>
#include <sys/processor.h>
#include <sys/procset.h>
#include <sys/pset.h>

#include "libmicro.h"

static long			opts = 1024*512;

typedef struct {
	long			**ts_data;
	long			ts_result;
	pthread_mutex_t		ts_lock;
} tsd_t;

/* cpu count and cpu list for the current processor set, from pset_info() */
static unsigned int		ncpu = 1024;
static processorid_t		cpus[1024];

/* per-thread state, indexed by (pthread_self() - 1) */
static tsd_t			*thread_data[1024];

int traverse_ptrchain(long **, int, int);

int
benchmark_init()
{
	lm_tsdsize = sizeof (tsd_t);

	(void) sprintf(lm_optstr, "s:");

	(void) sprintf(lm_usage,
	    "       [-s size] size of access area in bytes"
	    " (default %ld)\n"
	    "notes: measures cache to cache transfer times on Solaris\n",
	    opts);

	(void) sprintf(lm_header, "%8s", "size");

	return (0);
}

int
benchmark_optswitch(int opt, char *optarg)
{
	switch (opt) {
	case 's':
		opts = sizetoint(optarg);
		break;
	default:
		return (-1);
	}
	return (0);
}

int
benchmark_initrun()
{
	if (pset_info(PS_MYID, NULL, &ncpu, cpus) < 0) {
		perror("pset_info");
		return (1);
	}

	return (0);
}

int
benchmark_initworker(void *tsd)
{
	tsd_t			*ts = (tsd_t *)tsd;
	int			i, j;
	processorid_t		cpu;

	ts->ts_data = malloc(opts);

	if (ts->ts_data == NULL) {
		return (1);
	}

	(void) pthread_mutex_init(&ts->ts_lock, NULL);

	/* bind this worker to its own cpu from the current processor set */
	if (processor_bind(P_LWPID, P_MYID,
	    cpu = cpus[(pthread_self() - 1) % ncpu],
	    NULL) < 0) {
		perror("processor_bind:");
		return (1);
	}

	(void) printf("# thread %d using processor %d\n", pthread_self(), cpu);

	/*
	 * use lmbench style backwards stride
	 */

	for (i = 0; i < opts / sizeof (long); i++) {
		j = i - 128;
		if (j < 0)
			j = j + opts / sizeof (long);
		ts->ts_data[i] = (long *)&(ts->ts_data[j]);
	}

	thread_data[pthread_self() - 1] = ts;

	return (0);
}
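/*
 * Illustrative sketch, not part of the original benchmark and never
 * called by it: a miniature version of the backwards-stride chain that
 * benchmark_initworker() builds above.  Entry i points at entry
 * (i - stride) mod n, so each dereference in traverse_ptrchain() hops
 * backwards through the buffer (128 longs per hop in the real code),
 * landing on a different cache line each time.  The guard macro name,
 * function name, and toy sizes here are hypothetical.
 */
#ifdef CACHETOCACHE_EXAMPLE
static void
example_backwards_chain(void)
{
	long	*buf[16];	/* toy chain: 16 entries, stride 3 */
	long	**p = buf;	/* start with p pointing at buf[0] */
	int	i, j;

	for (i = 0; i < 16; i++) {
		j = i - 3;
		if (j < 0)
			j += 16;
		buf[i] = (long *)&buf[j];	/* buf[i] -> buf[(i-3) mod 16] */
	}

	/* walk the chain: each step lands 3 slots earlier, wrapping */
	for (i = 0; i < 16; i++)
		p = (long **)*p;
	(void) p;	/* gcd(3, 16) == 1, so 16 hops return p to buf[0] */
}
#endif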
/*
 * here we go in order for each thread, causing inherent serialization.
 * this is normally not a good idea, but in this case we're trying to
 * measure cache-to-cache transfer times, and if we run threads in
 * parallel we're likely to see saturation effects rather than
 * cache-to-cache transfers, esp. on wimpy memory platforms like P4.
 */
/*ARGSUSED*/
int
benchmark(void *tsd, result_t *res)
{
	tsd_t			*ts;
	int			i, j;
	/* pointer hops to cover one thread's stride-128 chain once */
	int			count = opts / 128 / sizeof (long);

	for (j = 0; j < lm_optB; j++)
		for (i = 0; i < lm_optT; i++) {
			ts = thread_data[i];
			(void) pthread_mutex_lock(&ts->ts_lock);
			ts->ts_result += traverse_ptrchain(
			    (long **)ts->ts_data, count, 0);
			(void) pthread_mutex_unlock(&ts->ts_lock);
		}
	res->re_count = lm_optB * lm_optT * count;

	return (0);
}

int
traverse_ptrchain(long **ptr, int count, int value)
{
	int			i;

	/* unrolled 10x; with value == 0 the adds leave the chain intact */
	for (i = 0; i < count; i += 10) {
		*ptr = *ptr + value;
		ptr = (long **)*ptr;
		*ptr = *ptr + value;
		ptr = (long **)*ptr;
		*ptr = *ptr + value;
		ptr = (long **)*ptr;
		*ptr = *ptr + value;
		ptr = (long **)*ptr;
		*ptr = *ptr + value;
		ptr = (long **)*ptr;
		*ptr = *ptr + value;
		ptr = (long **)*ptr;
		*ptr = *ptr + value;
		ptr = (long **)*ptr;
		*ptr = *ptr + value;
		ptr = (long **)*ptr;
		*ptr = *ptr + value;
		ptr = (long **)*ptr;
		*ptr = *ptr + value;
		ptr = (long **)*ptr;
		*ptr = *ptr + value;
	}
	return ((int)*ptr); /* bogus return */
}

char *
benchmark_result()
{
	static char		result[256];

	(void) sprintf(result, "%8ld ", opts);

	return (result);
}
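/*
 * Usage sketch (an assumption, not from this file): cachetocache is
 * built as part of the libMicro suite, whose common harness supplies
 * main() and the standard option parsing.  With the stock driver, an
 * invocation might look like
 *
 *	./cachetocache -s 512k -T 4 -B 100
 *
 * where -s sets the per-thread access area (parsed by
 * benchmark_optswitch() above; libMicro's sizetoint() understands
 * k/m/g suffixes), and -T and -B are standard libMicro options that
 * set lm_optT (threads) and lm_optB (batch size).  Exact flag
 * spellings depend on the libMicro version in use.
 */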