machdep.c revision 3434:5142e1d7d0bc
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/kstat.h>
#include <sys/param.h>
#include <sys/stack.h>
#include <sys/regset.h>
#include <sys/thread.h>
#include <sys/proc.h>
#include <sys/procfs_isa.h>
#include <sys/kmem.h>
#include <sys/cpuvar.h>
#include <sys/systm.h>
#include <sys/machpcb.h>
#include <sys/machasi.h>
#include <sys/vis.h>
#include <sys/fpu/fpusystm.h>
#include <sys/cpu_module.h>
#include <sys/privregs.h>
#include <sys/archsystm.h>
#include <sys/atomic.h>
#include <sys/cmn_err.h>
#include <sys/time.h>
#include <sys/clock.h>
#include <sys/cmp.h>
#include <sys/platform_module.h>
#include <sys/bl.h>
#include <sys/nvpair.h>
#include <sys/kdi_impl.h>
#include <sys/machsystm.h>
#include <sys/sysmacros.h>
#include <sys/promif.h>
#include <sys/pool_pset.h>
#include <vm/seg_kmem.h>

int maxphys = MMU_PAGESIZE * 16;	/* 128k */
int klustsize = MMU_PAGESIZE * 16;	/* 128k */

/*
 * Initialize kernel thread's stack.
 */
caddr_t
thread_stk_init(caddr_t stk)
{
	kfpu_t *fp;
	ulong_t align;

	/* allocate extra space for floating point state */
	stk -= SA(sizeof (kfpu_t) + GSR_SIZE);
	align = (uintptr_t)stk & 0x3f;
	stk -= align;		/* force v9_fpu to be 64 byte aligned */
	fp = (kfpu_t *)stk;
	fp->fpu_fprs = 0;

	stk -= SA(MINFRAME);
	return (stk);
}

#define	WIN32_SIZE	(MAXWIN * sizeof (struct rwindow32))
#define	WIN64_SIZE	(MAXWIN * sizeof (struct rwindow64))

kmem_cache_t	*wbuf32_cache;
kmem_cache_t	*wbuf64_cache;

void
lwp_stk_cache_init(void)
{
	/*
	 * Window buffers are allocated from the static arena
	 * because they are accessed at TL>0. We also must use
	 * KMC_NOHASH to prevent them from straddling page
	 * boundaries as they are accessed by physical address.
	 */
	wbuf32_cache = kmem_cache_create("wbuf32_cache", WIN32_SIZE,
	    0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH);
	wbuf64_cache = kmem_cache_create("wbuf64_cache", WIN64_SIZE,
	    0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH);
}

/*
 * Initialize lwp's kernel stack.
 *
 * Note that now that the floating point register save area (kfpu_t)
 * has been broken out from machpcb and aligned on a 64 byte boundary
 * so that we can do block load/stores to/from it, there are a couple
 * of potential optimizations to save stack space:
 *
 * 1. The floating point register save area could be aligned on a
 *    16 byte boundary, and the floating point code changed to
 *    (a) check the alignment and (b) use different save/restore
 *    macros depending upon the alignment.
 *
 * 2. The code below could be changed to calculate whether less space
 *    would be wasted if machpcb was first instead of second. However,
 *    there is a REGOFF macro used in locore, syscall_trap, machdep
 *    and mlsetup that assumes that the saved register area is a fixed
 *    distance from the %sp, and would have to be changed to a pointer
 *    or something...JJ said later.
 */
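/*
 * A rough sketch of the layout built below (higher addresses at the
 * top); the returned pointer becomes the new top of stack:
 *
 *	+-------------------------------+  <- original stk
 *	| kfpu_t + GSR save area        |  <- fp (64 byte aligned)
 *	+-------------------------------+
 *	| struct machpcb                |  <- mpcb, returned stk
 *	+-------------------------------+
 */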
caddr_t
lwp_stk_init(klwp_t *lwp, caddr_t stk)
{
	struct machpcb *mpcb;
	kfpu_t *fp;
	uintptr_t aln;

	stk -= SA(sizeof (kfpu_t) + GSR_SIZE);
	aln = (uintptr_t)stk & 0x3F;
	stk -= aln;
	fp = (kfpu_t *)stk;
	stk -= SA(sizeof (struct machpcb));
	mpcb = (struct machpcb *)stk;
	bzero(mpcb, sizeof (struct machpcb));
	bzero(fp, sizeof (kfpu_t) + GSR_SIZE);
	lwp->lwp_regs = (void *)&mpcb->mpcb_regs;
	lwp->lwp_fpu = (void *)fp;
	mpcb->mpcb_fpu = fp;
	mpcb->mpcb_fpu->fpu_q = mpcb->mpcb_fpu_q;
	mpcb->mpcb_thread = lwp->lwp_thread;
	mpcb->mpcb_wbcnt = 0;
	if (lwp->lwp_procp->p_model == DATAMODEL_ILP32) {
		mpcb->mpcb_wstate = WSTATE_USER32;
		mpcb->mpcb_wbuf = kmem_cache_alloc(wbuf32_cache, KM_SLEEP);
	} else {
		mpcb->mpcb_wstate = WSTATE_USER64;
		mpcb->mpcb_wbuf = kmem_cache_alloc(wbuf64_cache, KM_SLEEP);
	}
	ASSERT(((uintptr_t)mpcb->mpcb_wbuf & 7) == 0);
	mpcb->mpcb_wbuf_pa = va_to_pa(mpcb->mpcb_wbuf);
	mpcb->mpcb_pa = va_to_pa(mpcb);
	return (stk);
}

void
lwp_stk_fini(klwp_t *lwp)
{
	struct machpcb *mpcb = lwptompcb(lwp);

	/*
	 * There might still be windows in the wbuf due to an unmapped
	 * stack, a misaligned stack pointer, etc.  We just discard them
	 * and free the buffer.
	 */
	mpcb->mpcb_wbcnt = 0;
	if (mpcb->mpcb_wstate == WSTATE_USER32)
		kmem_cache_free(wbuf32_cache, mpcb->mpcb_wbuf);
	else
		kmem_cache_free(wbuf64_cache, mpcb->mpcb_wbuf);
	mpcb->mpcb_wbuf = NULL;
	mpcb->mpcb_wbuf_pa = -1;
}


/*
 * Copy regs from parent to child.
 */
void
lwp_forkregs(klwp_t *lwp, klwp_t *clwp)
{
	kthread_t *t, *pt = lwptot(lwp);
	struct machpcb *mpcb = lwptompcb(clwp);
	struct machpcb *pmpcb = lwptompcb(lwp);
	kfpu_t *fp, *pfp = lwptofpu(lwp);
	caddr_t wbuf;
	uint_t wstate;

	t = mpcb->mpcb_thread;
	/*
	 * remember child's fp and wbuf since they will get erased during
	 * the bcopy.
	 */
	fp = mpcb->mpcb_fpu;
	wbuf = mpcb->mpcb_wbuf;
	wstate = mpcb->mpcb_wstate;
	/*
	 * Don't copy mpcb_frame since we hand-crafted it
	 * in thread_load().
	 */
	bcopy(lwp->lwp_regs, clwp->lwp_regs, sizeof (struct machpcb) - REGOFF);
	mpcb->mpcb_thread = t;
	mpcb->mpcb_fpu = fp;
	fp->fpu_q = mpcb->mpcb_fpu_q;

	/*
	 * It is theoretically possible for the lwp's wstate to
	 * differ from the value assigned in lwp_stk_init, since
	 * lwp_stk_init assumed the data model of the process.
	 * Here, we take on the data model of the cloned lwp.
	 */
	if (mpcb->mpcb_wstate != wstate) {
		if (wstate == WSTATE_USER32) {
			kmem_cache_free(wbuf32_cache, wbuf);
			wbuf = kmem_cache_alloc(wbuf64_cache, KM_SLEEP);
			wstate = WSTATE_USER64;
		} else {
			kmem_cache_free(wbuf64_cache, wbuf);
			wbuf = kmem_cache_alloc(wbuf32_cache, KM_SLEEP);
			wstate = WSTATE_USER32;
		}
	}

	mpcb->mpcb_pa = va_to_pa(mpcb);
	mpcb->mpcb_wbuf = wbuf;
	mpcb->mpcb_wbuf_pa = va_to_pa(wbuf);

	ASSERT(mpcb->mpcb_wstate == wstate);

	if (mpcb->mpcb_wbcnt != 0) {
		bcopy(pmpcb->mpcb_wbuf, mpcb->mpcb_wbuf,
		    mpcb->mpcb_wbcnt * ((mpcb->mpcb_wstate == WSTATE_USER32) ?
		    sizeof (struct rwindow32) : sizeof (struct rwindow64)));
	}

	if (pt == curthread)
		pfp->fpu_fprs = _fp_read_fprs();
	if ((pfp->fpu_en) || (pfp->fpu_fprs & FPRS_FEF)) {
		if (pt == curthread && fpu_exists) {
			save_gsr(clwp->lwp_fpu);
		} else {
			uint64_t gsr;
			gsr = get_gsr(lwp->lwp_fpu);
			set_gsr(gsr, clwp->lwp_fpu);
		}
		fp_fork(lwp, clwp);
	}
}

/*
 * Free lwp fpu regs.
 */
void
lwp_freeregs(klwp_t *lwp, int isexec)
{
	kfpu_t *fp = lwptofpu(lwp);

	if (lwptot(lwp) == curthread)
		fp->fpu_fprs = _fp_read_fprs();
	if ((fp->fpu_en) || (fp->fpu_fprs & FPRS_FEF))
		fp_free(fp, isexec);
}

/*
 * This function is currently unused on sparc.
 */
/*ARGSUSED*/
void
lwp_attach_brand_hdlrs(klwp_t *lwp)
{}

/*
 * fill in the specified extra register state area with the lwp's
 * platform-dependent non-floating-point extra register state
 * information
 */
/* ARGSUSED */
void
xregs_getgfiller(klwp_id_t lwp, caddr_t xrp)
{
	/* for sun4u nothing to do here, added for symmetry */
}

/*
 * fill in the specified extra register state area with the lwp's
 * platform-dependent floating-point extra register state information.
 * NOTE:  'lwp' might not correspond to 'curthread' since this is
 * called from code in /proc to get the registers of another lwp.
 */
void
xregs_getfpfiller(klwp_id_t lwp, caddr_t xrp)
{
	prxregset_t *xregs = (prxregset_t *)xrp;
	kfpu_t *fp = lwptofpu(lwp);
	uint32_t fprs = (FPRS_FEF|FPRS_DU|FPRS_DL);
	uint64_t gsr;

	/*
	 * fp_fksave() does not flush the GSR register into
	 * the lwp area, so do it now
	 */
	kpreempt_disable();
	if (ttolwp(curthread) == lwp && fpu_exists) {
		fp->fpu_fprs = _fp_read_fprs();
		if ((fp->fpu_fprs & FPRS_FEF) != FPRS_FEF) {
			_fp_write_fprs(fprs);
			fp->fpu_fprs = (V9_FPU_FPRS_TYPE)fprs;
		}
		save_gsr(fp);
	}
	gsr = get_gsr(fp);
	kpreempt_enable();
	PRXREG_GSR(xregs) = gsr;
}

/*
 * set the specified lwp's platform-dependent non-floating-point
 * extra register state based on the specified input
 */
/* ARGSUSED */
void
xregs_setgfiller(klwp_id_t lwp, caddr_t xrp)
{
	/* for sun4u nothing to do here, added for symmetry */
}

/*
 * set the specified lwp's platform-dependent floating-point
 * extra register state based on the specified input
 */
void
xregs_setfpfiller(klwp_id_t lwp, caddr_t xrp)
{
	prxregset_t *xregs = (prxregset_t *)xrp;
	kfpu_t *fp = lwptofpu(lwp);
	uint32_t fprs = (FPRS_FEF|FPRS_DU|FPRS_DL);
	uint64_t gsr = PRXREG_GSR(xregs);

	kpreempt_disable();
	set_gsr(gsr, lwptofpu(lwp));

	if ((lwp == ttolwp(curthread)) && fpu_exists) {
		fp->fpu_fprs = _fp_read_fprs();
		if ((fp->fpu_fprs & FPRS_FEF) != FPRS_FEF) {
			_fp_write_fprs(fprs);
			fp->fpu_fprs = (V9_FPU_FPRS_TYPE)fprs;
		}
		restore_gsr(lwptofpu(lwp));
	}
	kpreempt_enable();
}

/*
 * fill in the sun4u asrs, i.e., the lwp's platform-dependent
 * non-floating-point extra register state information
 */
/* ARGSUSED */
void
getasrs(klwp_t *lwp, asrset_t asr)
{
	/* for sun4u nothing to do here, added for symmetry */
}

/*
 * fill in the sun4u asrs, i.e., the lwp's platform-dependent
 * floating-point extra register state information
 */
void
getfpasrs(klwp_t *lwp, asrset_t asr)
{
	kfpu_t *fp = lwptofpu(lwp);
	uint32_t fprs = (FPRS_FEF|FPRS_DU|FPRS_DL);

	kpreempt_disable();
	if (ttolwp(curthread) == lwp)
		fp->fpu_fprs = _fp_read_fprs();
	if ((fp->fpu_en) || (fp->fpu_fprs & FPRS_FEF)) {
		if (fpu_exists && ttolwp(curthread) == lwp) {
			if ((fp->fpu_fprs & FPRS_FEF) != FPRS_FEF) {
				_fp_write_fprs(fprs);
				fp->fpu_fprs = (V9_FPU_FPRS_TYPE)fprs;
			}
			save_gsr(fp);
		}
		asr[ASR_GSR] = (int64_t)get_gsr(fp);
	}
	kpreempt_enable();
}

/*
 * set the sun4u asrs, i.e., the lwp's platform-dependent
 * non-floating-point extra register state information
 */
/* ARGSUSED */
void
setasrs(klwp_t *lwp, asrset_t asr)
{
	/* for sun4u nothing to do here, added for symmetry */
}

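/*
 * set the sun4u asrs, i.e., the lwp's platform-dependent
 * floating-point extra register state, from the specified input
 */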
void
setfpasrs(klwp_t *lwp, asrset_t asr)
{
	kfpu_t *fp = lwptofpu(lwp);
	uint32_t fprs = (FPRS_FEF|FPRS_DU|FPRS_DL);

	kpreempt_disable();
	if (ttolwp(curthread) == lwp)
		fp->fpu_fprs = _fp_read_fprs();
	if ((fp->fpu_en) || (fp->fpu_fprs & FPRS_FEF)) {
		set_gsr(asr[ASR_GSR], fp);
		if (fpu_exists && ttolwp(curthread) == lwp) {
			if ((fp->fpu_fprs & FPRS_FEF) != FPRS_FEF) {
				_fp_write_fprs(fprs);
				fp->fpu_fprs = (V9_FPU_FPRS_TYPE)fprs;
			}
			restore_gsr(fp);
		}
	}
	kpreempt_enable();
}

/*
 * Create interrupt kstats for this CPU.
 */
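/*
 * The kstats created below can be examined from userland; an
 * illustrative (not authoritative) invocation would be:
 *
 *	kstat -p cpu:0:intrstat
 *
 * which prints a level-N-time / level-N-count pair for each PIL.
 */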
void
cpu_create_intrstat(cpu_t *cp)
{
	int		i;
	kstat_t		*intr_ksp;
	kstat_named_t	*knp;
	char		name[KSTAT_STRLEN];
	zoneid_t	zoneid;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (pool_pset_enabled())
		zoneid = GLOBAL_ZONEID;
	else
		zoneid = ALL_ZONES;

	intr_ksp = kstat_create_zone("cpu", cp->cpu_id, "intrstat", "misc",
	    KSTAT_TYPE_NAMED, PIL_MAX * 2, NULL, zoneid);

	/*
	 * Initialize each PIL's named kstat
	 */
	if (intr_ksp != NULL) {
		intr_ksp->ks_update = cpu_kstat_intrstat_update;
		knp = (kstat_named_t *)intr_ksp->ks_data;
		intr_ksp->ks_private = cp;
		for (i = 0; i < PIL_MAX; i++) {
			(void) snprintf(name, KSTAT_STRLEN, "level-%d-time",
			    i + 1);
			kstat_named_init(&knp[i * 2], name, KSTAT_DATA_UINT64);
			(void) snprintf(name, KSTAT_STRLEN, "level-%d-count",
			    i + 1);
			kstat_named_init(&knp[(i * 2) + 1], name,
			    KSTAT_DATA_UINT64);
		}
		kstat_install(intr_ksp);
	}
}

/*
 * Delete interrupt kstats for this CPU.
 */
void
cpu_delete_intrstat(cpu_t *cp)
{
	kstat_delete_byname_zone("cpu", cp->cpu_id, "intrstat", ALL_ZONES);
}

/*
 * Convert interrupt statistics from CPU ticks to nanoseconds and
 * update kstat.
 */
int
cpu_kstat_intrstat_update(kstat_t *ksp, int rw)
{
	kstat_named_t	*knp = ksp->ks_data;
	cpu_t		*cpup = (cpu_t *)ksp->ks_private;
	int		i;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	/*
	 * We use separate passes to copy and convert the statistics to
	 * nanoseconds. This assures that the snapshot of the data is as
	 * self-consistent as possible.
	 */

	for (i = 0; i < PIL_MAX; i++) {
		knp[i * 2].value.ui64 = cpup->cpu_m.intrstat[i + 1][0];
		knp[(i * 2) + 1].value.ui64 = cpup->cpu_stats.sys.intr[i];
	}

	for (i = 0; i < PIL_MAX; i++) {
		knp[i * 2].value.ui64 =
		    (uint64_t)tick2ns((hrtime_t)knp[i * 2].value.ui64,
			cpup->cpu_id);
	}

	return (0);
}

/*
 * Called by common/os/cpu.c for psrinfo(1m) kstats
 */
char *
cpu_fru_fmri(cpu_t *cp)
{
	return (cpunodes[cp->cpu_id].fru_fmri);
}

/*
 * An interrupt thread is ending a time slice, so compute the interval it
 * ran for and update the statistic for its PIL.
 */
void
cpu_intr_swtch_enter(kthread_id_t t)
{
	uint64_t	interval;
	uint64_t	start;
	cpu_t		*cpu;

	ASSERT((t->t_flag & T_INTR_THREAD) != 0);
	ASSERT(t->t_pil > 0 && t->t_pil <= LOCK_LEVEL);

	/*
	 * We could be here with a zero timestamp. This can happen if an
	 * interrupt thread that no longer has a pinned thread underneath
	 * it (i.e., it blocked at some point in its past) has finished
	 * running its handler: intr_thread() updated the interrupt
	 * statistic for its PIL and zeroed its timestamp, and since there
	 * was no pinned thread to return to, swtch() was called and we
	 * ended up here.
	 *
	 * It can also happen if an interrupt thread in intr_thread()
	 * calls preempt. It will have already taken care of updating
	 * stats. In this event, the interrupt thread will be runnable.
	 */
	if (t->t_intr_start) {
		do {
			start = t->t_intr_start;
			interval = gettick_counter() - start;
		} while (cas64(&t->t_intr_start, start, 0) != start);
		cpu = CPU;
		if (cpu->cpu_m.divisor > 1)
			interval *= cpu->cpu_m.divisor;
		cpu->cpu_m.intrstat[t->t_pil][0] += interval;

		atomic_add_64((uint64_t *)&cpu->cpu_intracct[cpu->cpu_mstate],
		    interval);
	} else
		ASSERT(t->t_intr == NULL || t->t_state == TS_RUN);
}


/*
 * An interrupt thread is returning from swtch(). Place a starting timestamp
 * in its thread structure.
 */
void
cpu_intr_swtch_exit(kthread_id_t t)
{
	uint64_t ts;

	ASSERT((t->t_flag & T_INTR_THREAD) != 0);
	ASSERT(t->t_pil > 0 && t->t_pil <= LOCK_LEVEL);

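	/*
	 * Atomically install a fresh timestamp, retrying if t_intr_start
	 * changes underneath us between the read and the cas64().
	 */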
	do {
		ts = t->t_intr_start;
	} while (cas64(&t->t_intr_start, ts, gettick_counter()) != ts);
}


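/*
 * plat_blacklist() is a weak symbol, supplied only by platform modules
 * that support component blacklisting; taking its address below tests
 * whether this platform provides an implementation.
 */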
int
blacklist(int cmd, const char *scheme, nvlist_t *fmri, const char *class)
{
	if (&plat_blacklist)
		return (plat_blacklist(cmd, scheme, fmri, class));

	return (ENOTSUP);
}

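/*
 * kdi_pread() and kdi_pwrite() access physical memory on behalf of the
 * kernel debugger, one 32-bit word at a time via ldphys()/stphys(),
 * taking care of byte ranges that do not begin or end on a word
 * boundary.
 */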
int
kdi_pread(caddr_t buf, size_t nbytes, uint64_t addr, size_t *ncopiedp)
{
	extern void kdi_flush_caches(void);
	size_t nread = 0;
	uint32_t word;
	int slop, i;

	kdi_flush_caches();
	membar_enter();

	/* We might not begin on a word boundary. */
	if ((slop = addr & 3) != 0) {
		word = ldphys(addr & ~3);
		for (i = slop; i < 4 && nbytes > 0; i++, nbytes--, nread++)
			*buf++ = ((uchar_t *)&word)[i];
		addr = roundup(addr, 4);
	}

	while (nbytes > 0) {
		word = ldphys(addr);
		for (i = 0; i < 4 && nbytes > 0; i++, nbytes--, nread++, addr++)
			*buf++ = ((uchar_t *)&word)[i];
	}

	kdi_flush_caches();

	*ncopiedp = nread;
	return (0);
}

int
kdi_pwrite(caddr_t buf, size_t nbytes, uint64_t addr, size_t *ncopiedp)
{
	extern void kdi_flush_caches(void);
	size_t nwritten = 0;
	uint32_t word;
	int slop, i;

	kdi_flush_caches();

	/* We might not begin on a word boundary. */
	if ((slop = addr & 3) != 0) {
		word = ldphys(addr & ~3);
		for (i = slop; i < 4 && nbytes > 0; i++, nbytes--, nwritten++)
			((uchar_t *)&word)[i] = *buf++;
		stphys(addr & ~3, word);
		addr = roundup(addr, 4);
	}

	while (nbytes > 3) {
		for (word = 0, i = 0; i < 4; i++, nbytes--, nwritten++)
			((uchar_t *)&word)[i] = *buf++;
		stphys(addr, word);
		addr += 4;
	}

	/* We might not end with a whole word. */
	if (nbytes > 0) {
		word = ldphys(addr);
		for (i = 0; nbytes > 0; i++, nbytes--, nwritten++)
			((uchar_t *)&word)[i] = *buf++;
		stphys(addr, word);
	}

	membar_enter();
	kdi_flush_caches();

	*ncopiedp = nwritten;
	return (0);
}

static void
kdi_kernpanic(struct regs *regs, uint_t tt)
{
	sync_reg_buf = *regs;
	sync_tt = tt;

	sync_handler();
}

static void
kdi_plat_call(void (*platfn)(void))
{
	if (platfn != NULL) {
		prom_suspend_prepost();
		platfn();
		prom_resume_prepost();
	}
}

void
mach_kdi_init(kdi_t *kdi)
{
	kdi->kdi_plat_call = kdi_plat_call;
	kdi->mkdi_cpu_index = kdi_cpu_index;
	kdi->mkdi_trap_vatotte = kdi_trap_vatotte;
	kdi->mkdi_kernpanic = kdi_kernpanic;
}


/*
 * get_cpu_mstate() is passed an array of timestamps, NCMSTATES
 * long, and it fills in the array with the time spent on cpu in
 * each of the mstates, where time is returned in nsec.
 *
 * No guarantee is made that the returned values in times[] will
 * monotonically increase on sequential calls, although this will
 * be true in the long run. Any such guarantee must be handled by
 * the caller, if needed. This can happen if we fail to account
 * for elapsed time due to a generation counter conflict, yet we
 * did account for it on a prior call (see below).
 *
 * The complication is that the cpu in question may be updating
 * its microstate at the same time that we are reading it.
 * Because the microstate is only updated when the CPU's state
 * changes, the values in cpu_intracct[] can be indefinitely out
 * of date. To determine true current values, it is necessary to
 * compare the current time with cpu_mstate_start, and add the
 * difference to times[cpu_mstate].
 *
 * This can be a problem if those values are changing out from
 * under us. Because the code path in new_cpu_mstate() is
 * performance critical, we have not added a lock to it. Instead,
 * we have added a generation counter. Before beginning
 * modifications, the counter is set to 0. After modifications,
 * it is set to the old value plus one.
 *
 * get_cpu_mstate() will not consider the values of cpu_mstate
 * and cpu_mstate_start to be usable unless the value of
 * cpu_mstate_gen is both non-zero and unchanged, both before and
 * after reading the mstate information. Note that we must
 * protect against out-of-order loads around accesses to the
 * generation counter. Also, this is a best effort approach in
 * that we do not retry should the counter be found to have
 * changed.
 *
 * cpu_intracct[] is used to identify time spent in each CPU
 * mstate while handling interrupts. Such time should be reported
 * against system time, and so is subtracted out from its
 * corresponding cpu_acct[] time and added to
 * cpu_acct[CMS_SYSTEM]. Additionally, intracct time is stored in
 * %ticks, but acct time may be stored as %sticks, thus requiring
 * different conversions before they can be compared.
 */

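/*
 * A rough sketch (not the actual new_cpu_mstate() source) of the
 * writer-side protocol described above, which the reader below relies
 * on; the exact field update order between the membars is illustrative
 * only:
 *
 *	gen = cpu->cpu_mstate_gen;
 *	cpu->cpu_mstate_gen = 0;
 *	membar_producer();	(counter is seen as zero first)
 *	cpu->cpu_acct[cpu->cpu_mstate] += now - cpu->cpu_mstate_start;
 *	cpu->cpu_mstate = new_mstate;
 *	cpu->cpu_mstate_start = now;
 *	membar_producer();	(updates visible before new counter)
 *	cpu->cpu_mstate_gen = gen + 1;
 */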
void
get_cpu_mstate(cpu_t *cpu, hrtime_t *times)
{
	int i;
	hrtime_t now, start;
	uint16_t gen;
	uint16_t state;
	hrtime_t intracct[NCMSTATES];

	/*
	 * Load all volatile state under the protection of membar.
	 * cpu_acct[cpu_mstate] must be loaded to avoid double counting
	 * of (now - cpu_mstate_start) by a change in CPU mstate that
	 * arrives after we make our last check of cpu_mstate_gen.
	 */

	now = gethrtime_unscaled();
	gen = cpu->cpu_mstate_gen;

	membar_consumer();	/* guarantee load ordering */
	start = cpu->cpu_mstate_start;
	state = cpu->cpu_mstate;
	for (i = 0; i < NCMSTATES; i++) {
		intracct[i] = cpu->cpu_intracct[i];
		times[i] = cpu->cpu_acct[i];
	}
	membar_consumer();	/* guarantee load ordering */

	if (gen != 0 && gen == cpu->cpu_mstate_gen && now > start)
		times[state] += now - start;

	for (i = 0; i < NCMSTATES; i++) {
		scalehrtime(&times[i]);
		intracct[i] = tick2ns((hrtime_t)intracct[i], cpu->cpu_id);
	}

	for (i = 0; i < NCMSTATES; i++) {
		if (i == CMS_SYSTEM)
			continue;
		times[i] -= intracct[i];
		if (times[i] < 0) {
			intracct[i] += times[i];
			times[i] = 0;
		}
		times[CMS_SYSTEM] += intracct[i];
	}
}
777