/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2009, Intel Corporation.
 * All rights reserved.
 */

#include <sys/atomic.h>
#include <sys/cpuvar.h>
#include <sys/cpu.h>
#include <sys/cpu_event.h>
#include <sys/cmn_err.h>
#include <sys/ddi.h>
#include <sys/kmem.h>
#include <sys/kstat.h>
#include <sys/pci.h>
#include <sys/sunddi.h>
#include <sys/sunndi.h>
#include <sys/synch.h>
#include <sys/sysmacros.h>
#include <sys/fipe.h>
#include <vm/hat.h>

/* Current PM policy, configurable through /etc/system and fipe.conf. */
fipe_pm_policy_t fipe_pm_policy = FIPE_PM_POLICY_BALANCE;
int fipe_pm_throttle_level = 1;

/* Enable kstat support. */
#define	FIPE_KSTAT_SUPPORT		1

/* Enable performance-related statistics. */
#define	FIPE_KSTAT_DETAIL		1

/* Enable builtin IOAT driver if no IOAT driver is available. */
#define	FIPE_IOAT_BUILTIN		0
#if defined(FIPE_IOAT_BUILTIN) && (FIPE_IOAT_BUILTIN == 0)
#undef	FIPE_IOAT_BUILTIN
#endif

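/*
 * Since FIPE_IOAT_BUILTIN is #undef'ed above when defined as 0, the "#else"
 * (dcopy-based) branches below are compiled by default; the built-in IOAT
 * code is used only when the value is changed to non-zero.
 */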
#ifdef	FIPE_IOAT_BUILTIN
/* Use IOAT channel 3 to generate memory transactions. */
#define	FIPE_IOAT_CHAN_CTRL		0x200
#define	FIPE_IOAT_CHAN_STS_LO		0x204
#define	FIPE_IOAT_CHAN_STS_HI		0x208
#define	FIPE_IOAT_CHAN_ADDR_LO		0x20C
#define	FIPE_IOAT_CHAN_ADDR_HI		0x210
#define	FIPE_IOAT_CHAN_CMD		0x214
#define	FIPE_IOAT_CHAN_ERR		0x228
#else	/* FIPE_IOAT_BUILTIN */
#include <sys/dcopy.h>
#endif	/* FIPE_IOAT_BUILTIN */

/* Memory controller related PCI configuration constants. */
#define	FIPE_MC_GBLACT			0x60
#define	FIPE_MC_THRTLOW			0x64
#define	FIPE_MC_THRTCTRL		0x67
#define	FIPE_MC_THRTCTRL_HUNT		0x1
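
/*
 * As programmed in fipe_mc_change() below: GBLACT holds the OLTT
 * (presumably open-loop thermal throttling) activation limit, THRTLOW the
 * S-CLTT (system closed-loop thermal throttling) low throttling level, and
 * the HUNT bit in THRTCTRL selects S-CLTT mode.
 */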

/* Hardware recommended values. */
#define	FIPE_MC_MEMORY_OFFSET		1024
#define	FIPE_MC_MEMORY_SIZE		128

/* Number of IOAT commands posted when entering idle. */
#define	FIPE_IOAT_CMD_NUM		2

/* Resource allocation retry interval, in microseconds. */
#define	FIPE_IOAT_RETRY_INTERVAL	(15 * 1000 * 1000)

/* Statistics update interval, in nanoseconds. */
#define	FIPE_STAT_INTERVAL		(10 * 1000 * 1000)

/* Configuration profile support. */
#define	FIPE_PROFILE_FIELD(field)	(fipe_profile_curr->field)
#define	FIPE_PROF_IDLE_COUNT		FIPE_PROFILE_FIELD(idle_count)
#define	FIPE_PROF_BUSY_THRESHOLD	FIPE_PROFILE_FIELD(busy_threshold)
#define	FIPE_PROF_INTR_THRESHOLD	FIPE_PROFILE_FIELD(intr_threshold)
#define	FIPE_PROF_INTR_BUSY_THRESHOLD	FIPE_PROFILE_FIELD(intr_busy_threshold)
#define	FIPE_PROF_INTR_BUSY_THROTTLE	FIPE_PROFILE_FIELD(intr_busy_throttle)

/* Priority assigned to FIPE memory power management driver on x86. */
#define	CPU_IDLE_CB_PRIO_FIPE		(CPU_IDLE_CB_PRIO_LOW_BASE + 0x4000000)

/* Structure to support power management profile. */
static struct fipe_profile {
	uint32_t			idle_count;
	uint32_t			busy_threshold;
	uint32_t			intr_threshold;
	uint32_t			intr_busy_threshold;
	uint32_t			intr_busy_throttle;
} fipe_profiles[FIPE_PM_POLICY_MAX] = {
	{ 0,	0,	0,	0,	0 },
	{ 5,	30,	20,	50,	5 },
	{ 10,	40,	40,	75,	4 },
	{ 15,	50,	60,	100,	2 },
};
#pragma align CPU_CACHE_COHERENCE_SIZE(fipe_profiles)
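
/*
 * In the table above, row 0 (all zeros) corresponds to
 * FIPE_PM_POLICY_DISABLE; successive rows are increasingly aggressive
 * power-saving policies, indexed by fipe_pm_policy_t.  Columns are
 * idle_count, busy_threshold, intr_threshold, intr_busy_threshold and
 * intr_busy_throttle.
 */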

/* Structure to store memory controller related data. */
static struct fipe_mc_ctrl {
	ddi_acc_handle_t		mc_pci_hdl;
	unsigned char			mc_thrtctrl;
	unsigned char			mc_thrtlow;
	unsigned char			mc_gblact;
	dev_info_t			*mc_dip;
	boolean_t			mc_initialized;
} fipe_mc_ctrl;
#pragma align CPU_CACHE_COHERENCE_SIZE(fipe_mc_ctrl)

/* Structure to store IOAT related information. */
static struct fipe_ioat_control {
	kmutex_t			ioat_lock;
	boolean_t			ioat_ready;
#ifdef	FIPE_IOAT_BUILTIN
	boolean_t			ioat_reg_mapped;
	ddi_acc_handle_t		ioat_reg_handle;
	uint8_t				*ioat_reg_addr;
	uint64_t			ioat_cmd_physaddr;
#else	/* FIPE_IOAT_BUILTIN */
	dcopy_cmd_t			ioat_cmds[FIPE_IOAT_CMD_NUM + 1];
	dcopy_handle_t			ioat_handle;
#endif	/* FIPE_IOAT_BUILTIN */
	dev_info_t			*ioat_dev_info;
	uint64_t			ioat_buf_physaddr;
	char				*ioat_buf_virtaddr;
	char				*ioat_buf_start;
	size_t				ioat_buf_size;
	timeout_id_t			ioat_timerid;
	boolean_t			ioat_failed;
	boolean_t			ioat_cancel;
	boolean_t			ioat_try_alloc;
} fipe_ioat_ctrl;
#pragma align CPU_CACHE_COHERENCE_SIZE(fipe_ioat_ctrl)

static struct fipe_idle_ctrl {
	boolean_t			idle_ready;
	cpu_idle_callback_handle_t	cb_handle;
	cpu_idle_prop_handle_t		prop_enter;
	cpu_idle_prop_handle_t		prop_exit;
	cpu_idle_prop_handle_t		prop_busy;
	cpu_idle_prop_handle_t		prop_idle;
	cpu_idle_prop_handle_t		prop_intr;

	/* Here for cache efficiency; logically part of fipe_global_ctrl. */
	hrtime_t			tick_interval;
} fipe_idle_ctrl;
#pragma align CPU_CACHE_COHERENCE_SIZE(fipe_idle_ctrl)

/*
 * Global control structure.
 * The Solaris idle thread has no reentrance issues, so counting CPUs in the
 * idle state is sufficient; otherwise a cpuset_t bitmap would be needed to
 * track idle CPUs.
 */
static struct fipe_global_ctrl {
	kmutex_t			lock;
	boolean_t			pm_enabled;
	volatile boolean_t		pm_active;
	volatile uint32_t		cpu_count;
	volatile uint64_t		io_waiters;
	hrtime_t			enter_ts;
	hrtime_t			time_in_pm;
	size_t				state_size;
	char				*state_buf;
#ifdef	FIPE_KSTAT_SUPPORT
	kstat_t				*fipe_kstat;
#endif	/* FIPE_KSTAT_SUPPORT */
} fipe_gbl_ctrl;
#pragma align CPU_CACHE_COHERENCE_SIZE(fipe_gbl_ctrl)

#define	FIPE_CPU_STATE_PAD		(128 - \
	2 * sizeof (boolean_t) - 4 * sizeof (hrtime_t) - \
	2 * sizeof (uint64_t) - 2 * sizeof (uint32_t))
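
/*
 * The pad above sizes fipe_cpu_state_t to exactly 128 bytes so that each
 * per-CPU entry occupies its own cache-coherence unit(s) and updates from
 * one CPU do not falsely share cache lines with its neighbors.
 */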

/* Per-CPU status. */
#pragma pack(1)
typedef struct fipe_cpu_state {
	boolean_t			cond_ready;
	boolean_t			state_ready;
	uint32_t			idle_count;
	uint32_t			throttle_cnt;
	hrtime_t			throttle_ts;
	hrtime_t			next_ts;
	hrtime_t			last_busy;
	hrtime_t			last_idle;
	uint64_t			last_intr;
	uint64_t			last_iowait;
	char				pad1[FIPE_CPU_STATE_PAD];
} fipe_cpu_state_t;
#pragma pack()

#ifdef	FIPE_KSTAT_SUPPORT
static struct fipe_kstat_s {
	kstat_named_t		fipe_enabled;
	kstat_named_t		fipe_policy;
	kstat_named_t		fipe_pm_time;
#ifdef	FIPE_KSTAT_DETAIL
	kstat_named_t		ioat_ready;
	kstat_named_t		pm_tryenter_cnt;
	kstat_named_t		pm_success_cnt;
	kstat_named_t		pm_race_cnt;
	kstat_named_t		cpu_loop_cnt;
	kstat_named_t		cpu_busy_cnt;
	kstat_named_t		cpu_idle_cnt;
	kstat_named_t		cpu_intr_busy_cnt;
	kstat_named_t		cpu_intr_throttle_cnt;
	kstat_named_t		bio_busy_cnt;
	kstat_named_t		ioat_start_fail_cnt;
	kstat_named_t		ioat_stop_fail_cnt;
#endif	/* FIPE_KSTAT_DETAIL */
} fipe_kstat = {
	{ "fipe_enabled",	KSTAT_DATA_INT32 },
	{ "fipe_policy",	KSTAT_DATA_INT32 },
	{ "fipe_pm_time",	KSTAT_DATA_UINT64 },
#ifdef	FIPE_KSTAT_DETAIL
	{ "ioat_ready",		KSTAT_DATA_INT32 },
	{ "pm_tryenter_cnt",	KSTAT_DATA_UINT64 },
	{ "pm_success_cnt",	KSTAT_DATA_UINT64 },
	{ "pm_race_cnt",	KSTAT_DATA_UINT64 },
	{ "cpu_loop_cnt",	KSTAT_DATA_UINT64 },
	{ "cpu_busy_cnt",	KSTAT_DATA_UINT64 },
	{ "cpu_idle_cnt",	KSTAT_DATA_UINT64 },
	{ "cpu_intr_busy_cnt",	KSTAT_DATA_UINT64 },
	{ "cpu_intr_thrt_cnt",	KSTAT_DATA_UINT64 },
	{ "bio_busy_cnt",	KSTAT_DATA_UINT64 },
	{ "ioat_start_fail_cnt", KSTAT_DATA_UINT64 },
	{ "ioat_stop_fail_cnt",	KSTAT_DATA_UINT64 }
#endif	/* FIPE_KSTAT_DETAIL */
};
#pragma align CPU_CACHE_COHERENCE_SIZE(fipe_kstat)

#define	FIPE_KSTAT_INC(v)		\
	atomic_inc_64(&fipe_kstat.v.value.ui64)
#ifdef	FIPE_KSTAT_DETAIL
#define	FIPE_KSTAT_DETAIL_INC(v)	\
	atomic_inc_64(&fipe_kstat.v.value.ui64)
#else	/* FIPE_KSTAT_DETAIL */
#define	FIPE_KSTAT_DETAIL_INC(v)
#endif	/* FIPE_KSTAT_DETAIL */

#else	/* FIPE_KSTAT_SUPPORT */

#define	FIPE_KSTAT_INC(v)
#define	FIPE_KSTAT_DETAIL_INC(v)

#endif	/* FIPE_KSTAT_SUPPORT */

/* Save current power management profile during suspend/resume. */
static fipe_pm_policy_t	fipe_pm_policy_saved = FIPE_PM_POLICY_BALANCE;
static fipe_cpu_state_t *fipe_cpu_states = NULL;

/*
 * There is no lock protecting fipe_profile_curr, so it may change while
 * threads are in fipe_idle_enter.  This is not an issue: it always points
 * to a valid profile, and although a thread may make a choice based on a
 * stale profile, that choice is still valid for some profile and the
 * correct operation for the new profile will be taken on the next
 * cpu_idle_enter cycle.  Since every selection is valid for some profile,
 * the overhead of a lock is not justified.
 */
static struct fipe_profile *fipe_profile_curr = NULL;

static void fipe_idle_enter(void *arg, cpu_idle_callback_context_t ctx,
    cpu_idle_check_wakeup_t check_func, void *check_arg);
static void fipe_idle_exit(void *arg, cpu_idle_callback_context_t ctx,
    int flags);
static cpu_idle_callback_t fipe_idle_cb = {
	CPU_IDLE_CALLBACK_VER0,
	fipe_idle_enter,
	fipe_idle_exit,
};

/*
 * Configure memory controller into power saving mode:
 * 1) OLTT activation limit is set to unlimited
 * 2) MC works in S-CLTT mode
 */
static int
fipe_mc_change(int throttle)
{
	/* Enable OLTT/disable S-CLTT mode */
	pci_config_put8(fipe_mc_ctrl.mc_pci_hdl, FIPE_MC_THRTCTRL,
	    fipe_mc_ctrl.mc_thrtctrl & ~FIPE_MC_THRTCTRL_HUNT);
	/* Set OLTT activation limit to unlimited */
	pci_config_put8(fipe_mc_ctrl.mc_pci_hdl, FIPE_MC_GBLACT, 0);
	/*
	 * Set S-CLTT low throttling to the desired value.  The lower the
	 * value, the more power saved and the less memory bandwidth
	 * available.
	 */
	pci_config_put8(fipe_mc_ctrl.mc_pci_hdl, FIPE_MC_THRTLOW, throttle);
	/* Enable S-CLTT/disable OLTT mode */
	pci_config_put8(fipe_mc_ctrl.mc_pci_hdl, FIPE_MC_THRTCTRL,
	    fipe_mc_ctrl.mc_thrtctrl | FIPE_MC_THRTCTRL_HUNT);

	return (0);
}

/*
 * Restore memory controller's original configuration.
 */
static void
fipe_mc_restore(void)
{
	pci_config_put8(fipe_mc_ctrl.mc_pci_hdl, FIPE_MC_THRTCTRL,
	    fipe_mc_ctrl.mc_thrtctrl & ~FIPE_MC_THRTCTRL_HUNT);
	pci_config_put8(fipe_mc_ctrl.mc_pci_hdl, FIPE_MC_GBLACT,
	    fipe_mc_ctrl.mc_gblact);
	pci_config_put8(fipe_mc_ctrl.mc_pci_hdl, FIPE_MC_THRTLOW,
	    fipe_mc_ctrl.mc_thrtlow);
	pci_config_put8(fipe_mc_ctrl.mc_pci_hdl, FIPE_MC_THRTCTRL,
	    fipe_mc_ctrl.mc_thrtctrl);
}

/*
 * Initialize memory controller's data structure and status.
 */
static int
fipe_mc_init(dev_info_t *dip)
{
	ddi_acc_handle_t handle;

	bzero(&fipe_mc_ctrl, sizeof (fipe_mc_ctrl));

	/* Hold a reference on the device; released in fipe_mc_fini. */
	ndi_hold_devi(dip);

	/* Setup pci configuration handler. */
	if (pci_config_setup(dip, &handle) != DDI_SUCCESS) {
		cmn_err(CE_WARN,
		    "!fipe: failed to setup pcicfg handler in mc_init.");
		ndi_rele_devi(dip);
		return (-1);
	}

	/* Save original configuration. */
	fipe_mc_ctrl.mc_thrtctrl = pci_config_get8(handle, FIPE_MC_THRTCTRL);
	fipe_mc_ctrl.mc_thrtlow = pci_config_get8(handle, FIPE_MC_THRTLOW);
	fipe_mc_ctrl.mc_gblact = pci_config_get8(handle, FIPE_MC_GBLACT);
	fipe_mc_ctrl.mc_dip = dip;
	fipe_mc_ctrl.mc_pci_hdl = handle;
	fipe_mc_ctrl.mc_initialized = B_TRUE;

	return (0);
}

/*
 * Restore memory controller's configuration and release resources.
 */
static void
fipe_mc_fini(void)
{
	if (fipe_mc_ctrl.mc_initialized) {
		fipe_mc_restore();
		pci_config_teardown(&fipe_mc_ctrl.mc_pci_hdl);
		ndi_rele_devi(fipe_mc_ctrl.mc_dip);
		fipe_mc_ctrl.mc_initialized = B_FALSE;
	}
	bzero(&fipe_mc_ctrl, sizeof (fipe_mc_ctrl));
}

/* Search for devices with specific PCI IDs. */
struct fipe_pci_ioat_id {
	uint16_t		venid;
	uint16_t		devid;
	uint16_t		subvenid;
	uint16_t		subsysid;
	char			*unitaddr;
};

static struct fipe_pci_ioat_id fipe_pci_ioat_ids[] = {
	{ 0x8086, 0x1a38, 0xffff, 0xffff, NULL },
	{ 0x8086, 0x360b, 0xffff, 0xffff, NULL },
};
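
/*
 * In the table above, vendor 0x8086 is Intel; a value of 0xffff (or a NULL
 * unitaddr) acts as a wildcard when matched in fipe_search_ioat_dev().
 */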

/*ARGSUSED*/
static int
fipe_search_ioat_dev(dev_info_t *dip, void *arg)
{
	char *unit;
	struct fipe_pci_ioat_id *id;
	int i, max, venid, devid, subvenid, subsysid;

	/* Query PCI id properties. */
	venid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
	    "vendor-id", 0xffffffff);
	if (venid == 0xffffffff) {
		return (DDI_WALK_CONTINUE);
	}
	devid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
	    "device-id", 0xffffffff);
	if (devid == 0xffffffff) {
		return (DDI_WALK_CONTINUE);
	}
	subvenid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
	    "subsystem-vendor-id", 0xffffffff);
	if (subvenid == 0xffffffff) {
		return (DDI_WALK_CONTINUE);
	}
	subsysid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
	    "subsystem-id", 0xffffffff);
	if (subsysid == 0xffffffff) {
		return (DDI_WALK_CONTINUE);
	}
	if (ddi_prop_lookup_string(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
	    "unit-address", &unit) != DDI_PROP_SUCCESS) {
		return (DDI_WALK_CONTINUE);
	}

	max = sizeof (fipe_pci_ioat_ids) / sizeof (fipe_pci_ioat_ids[0]);
	for (i = 0; i < max; i++) {
		id = &fipe_pci_ioat_ids[i];
		if ((id->venid == 0xffffu || id->venid == venid) &&
		    (id->devid == 0xffffu || id->devid == devid) &&
		    (id->subvenid == 0xffffu || id->subvenid == subvenid) &&
		    (id->subsysid == 0xffffu || id->subsysid == subsysid) &&
		    (id->unitaddr == NULL || strcmp(id->unitaddr, unit) == 0)) {
			break;
		}
	}
	ddi_prop_free(unit);
	if (i >= max) {
		return (DDI_WALK_CONTINUE);
	}

	/* Found IOAT device, hold one reference count. */
	ndi_hold_devi(dip);
	fipe_ioat_ctrl.ioat_dev_info = dip;

	return (DDI_WALK_TERMINATE);
}

/*
 * To enable the FBDIMM idle power enhancement mechanism, IOAT is used to
 * generate enough memory traffic to trigger the memory controller's thermal
 * throttling circuitry.
 * If the dcopy/ioat driver is available, the dcopy interface is used to
 * communicate with IOAT; otherwise the built-in driver talks to the IOAT
 * hardware directly.
 */
#ifdef	FIPE_IOAT_BUILTIN
static int
fipe_ioat_trigger(void)
{
	uint16_t ctrl;
	uint32_t err;
	uint8_t	*addr = fipe_ioat_ctrl.ioat_reg_addr;
	ddi_acc_handle_t handle = fipe_ioat_ctrl.ioat_reg_handle;

	/* Check channel in use flag. */
	ctrl = ddi_get16(handle, (uint16_t *)(addr + FIPE_IOAT_CHAN_CTRL));
	if (ctrl & 0x100) {
		/*
		 * Channel is in use by somebody else. IOAT driver may have
		 * been loaded, forbid fipe from accessing IOAT hardware
		 * anymore.
		 */
		fipe_ioat_ctrl.ioat_ready = B_FALSE;
		fipe_ioat_ctrl.ioat_failed = B_TRUE;
		FIPE_KSTAT_INC(ioat_start_fail_cnt);
		return (-1);
	} else {
		/* Set channel in use flag. */
		ddi_put16(handle,
		    (uint16_t *)(addr + FIPE_IOAT_CHAN_CTRL), 0x100);
	}

	/* Write command address. */
	ddi_put32(handle,
	    (uint32_t *)(addr + FIPE_IOAT_CHAN_ADDR_LO),
	    (uint32_t)fipe_ioat_ctrl.ioat_cmd_physaddr);
	ddi_put32(handle, (uint32_t *)(addr + FIPE_IOAT_CHAN_ADDR_HI),
	    (uint32_t)(fipe_ioat_ctrl.ioat_cmd_physaddr >> 32));

	/* Check and clear error flags. */
	err = ddi_get32(handle, (uint32_t *)(addr + FIPE_IOAT_CHAN_ERR));
	if (err != 0) {
		ddi_put32(handle, (uint32_t *)(addr + FIPE_IOAT_CHAN_ERR), err);
	}

	/* Start channel. */
	ddi_put8(handle, (uint8_t *)(addr + FIPE_IOAT_CHAN_CMD), 0x1);

	return (0);
}

static void
fipe_ioat_cancel(void)
{
	uint32_t status;
	uint8_t	*addr = fipe_ioat_ctrl.ioat_reg_addr;
	ddi_acc_handle_t handle = fipe_ioat_ctrl.ioat_reg_handle;

	/*
	 * Reset channel. Sometimes reset is not reliable,
	 * so check completion or abort status after reset.
	 */
	/* LINTED: constant in conditional context */
	while (1) {
		/* Issue reset channel command. */
		ddi_put8(handle, (uint8_t *)(addr + FIPE_IOAT_CHAN_CMD), 0x20);

		/* Query command status. */
		status = ddi_get32(handle,
		    (uint32_t *)(addr + FIPE_IOAT_CHAN_STS_LO));
		if (status & 0x1) {
			/* Reset channel completed. */
			break;
		} else {
			SMT_PAUSE();
		}
	}

	/* Put channel into "not in use" state. */
	ddi_put16(handle, (uint16_t *)(addr + FIPE_IOAT_CHAN_CTRL), 0);
}

/*ARGSUSED*/
static void
fipe_ioat_alloc(void *arg)
{
	int rc = 0, nregs;
	dev_info_t *dip;
	ddi_device_acc_attr_t attr;
	boolean_t fatal = B_FALSE;

	mutex_enter(&fipe_ioat_ctrl.ioat_lock);
	/*
	 * When the driver is first loaded, fipe_ioat_alloc() is called in
	 * DEVICE ATTACH context, where ddi_walk_devs() can't be called, so
	 * just schedule a timer and exit.
	 */
	if (fipe_ioat_ctrl.ioat_try_alloc == B_FALSE) {
		fipe_ioat_ctrl.ioat_try_alloc = B_TRUE;
		goto out_error;
	}

	/* Check whether initialized or a permanent error has occurred. */
	if (fipe_ioat_ctrl.ioat_ready || fipe_ioat_ctrl.ioat_failed ||
	    fipe_ioat_ctrl.ioat_cancel) {
		fipe_ioat_ctrl.ioat_timerid = 0;
		mutex_exit(&fipe_ioat_ctrl.ioat_lock);
		return;
	}

	if (fipe_ioat_ctrl.ioat_dev_info == NULL) {
		/* Find dev_info_t for IOAT engine. */
		ddi_walk_devs(ddi_root_node(), fipe_search_ioat_dev, NULL);
		if (fipe_ioat_ctrl.ioat_dev_info == NULL) {
			cmn_err(CE_NOTE,
			    "!fipe: no IOAT hardware found, disable pm.");
			fatal = B_TRUE;
			goto out_error;
		}
	}

	/* Map in IOAT control register window. */
	ASSERT(fipe_ioat_ctrl.ioat_dev_info != NULL);
	ASSERT(fipe_ioat_ctrl.ioat_reg_mapped == B_FALSE);
	dip = fipe_ioat_ctrl.ioat_dev_info;
	if (ddi_dev_nregs(dip, &nregs) != DDI_SUCCESS || nregs < 2) {
		cmn_err(CE_WARN, "!fipe: IOAT doesn't have enough register BARs.");
		fatal = B_TRUE;
		goto out_error;
	}
	attr.devacc_attr_version = DDI_DEVICE_ATTR_V0;
	attr.devacc_attr_endian_flags = DDI_NEVERSWAP_ACC;
	attr.devacc_attr_dataorder = DDI_STRICTORDER_ACC;
	rc = ddi_regs_map_setup(dip, 1,
	    (caddr_t *)&fipe_ioat_ctrl.ioat_reg_addr,
	    0, 0, &attr, &fipe_ioat_ctrl.ioat_reg_handle);
	if (rc != DDI_SUCCESS) {
		cmn_err(CE_WARN, "!fipe: failed to map IOAT registers.");
		fatal = B_TRUE;
		goto out_error;
	}

	/* Mark IOAT status. */
	fipe_ioat_ctrl.ioat_reg_mapped = B_TRUE;
	fipe_ioat_ctrl.ioat_ready = B_TRUE;
	fipe_ioat_ctrl.ioat_failed = B_FALSE;
	fipe_ioat_ctrl.ioat_timerid = 0;
	mutex_exit(&fipe_ioat_ctrl.ioat_lock);

	return;

out_error:
	fipe_ioat_ctrl.ioat_timerid = 0;
	if (!fipe_ioat_ctrl.ioat_ready && !fipe_ioat_ctrl.ioat_cancel) {
		if (fatal) {
			/* Mark permanent error and give up. */
			fipe_ioat_ctrl.ioat_failed = B_TRUE;
			/* Release the hold from fipe_search_ioat_dev(). */
			if (fipe_ioat_ctrl.ioat_dev_info != NULL) {
				ndi_rele_devi(fipe_ioat_ctrl.ioat_dev_info);
				fipe_ioat_ctrl.ioat_dev_info = NULL;
			}
		} else {
			/*
			 * Schedule another timer to keep trying.
			 * timeout() should always succeed, so there is no
			 * need to check its return value.
			 */
			fipe_ioat_ctrl.ioat_timerid = timeout(fipe_ioat_alloc,
			    NULL, drv_usectohz(FIPE_IOAT_RETRY_INTERVAL));
		}
	}
	mutex_exit(&fipe_ioat_ctrl.ioat_lock);
}

static void
fipe_ioat_free(void)
{
	mutex_enter(&fipe_ioat_ctrl.ioat_lock);
	/* Cancel timeout to avoid race condition. */
	if (fipe_ioat_ctrl.ioat_timerid != 0) {
		fipe_ioat_ctrl.ioat_cancel = B_TRUE;
		mutex_exit(&fipe_ioat_ctrl.ioat_lock);
		(void) untimeout(fipe_ioat_ctrl.ioat_timerid);
		mutex_enter(&fipe_ioat_ctrl.ioat_lock);
		fipe_ioat_ctrl.ioat_timerid = 0;
		fipe_ioat_ctrl.ioat_cancel = B_FALSE;
	}

	if (fipe_ioat_ctrl.ioat_reg_mapped) {
		ddi_regs_map_free(&fipe_ioat_ctrl.ioat_reg_handle);
		fipe_ioat_ctrl.ioat_reg_mapped = B_FALSE;
	}

	fipe_ioat_ctrl.ioat_ready = B_FALSE;
	mutex_exit(&fipe_ioat_ctrl.ioat_lock);
}

#else	/* FIPE_IOAT_BUILTIN */

/*
 * Trigger IOAT memory copy operations when entering the power saving state.
 * A group of commands is posted to the IOAT driver, which places them into
 * an IOAT ring buffer.
 */
static int
fipe_ioat_trigger(void)
{
	int idx;
	dcopy_cmd_t *cmds = fipe_ioat_ctrl.ioat_cmds;

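	/*
	 * Post cmds[FIPE_IOAT_CMD_NUM .. 1]; cmds[0] is reserved for
	 * breaking the ring in fipe_ioat_cancel().  cmds[1] was allocated
	 * with DCOPY_CMD_LOOP (see fipe_ioat_alloc()), so the chain loops
	 * back on itself.
	 */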
	for (idx = FIPE_IOAT_CMD_NUM; idx > 0; idx--) {
		if (dcopy_cmd_post(cmds[idx]) == DCOPY_SUCCESS) {
			continue;
		} else {
			/*
			 * Don't roll back on failure; at worst it costs a
			 * few small memory copy operations.
			 */
			FIPE_KSTAT_DETAIL_INC(ioat_start_fail_cnt);
			return (-1);
		}
	}

	return (0);
}

/*
 * Cancel the memory copy operations posted by fipe_ioat_trigger.
 * This is achieved by posting a new command that breaks the ring created
 * by fipe_ioat_trigger.  If that fails, the best way to recover is to just
 * let it go; IOAT will recover when the next command is posted on the same
 * channel.
 */
static void
fipe_ioat_cancel(void)
{
	if (dcopy_cmd_post(fipe_ioat_ctrl.ioat_cmds[0]) != DCOPY_SUCCESS) {
		FIPE_KSTAT_DETAIL_INC(ioat_stop_fail_cnt);
	}
}

/*
 * Allocate IOAT resources.  Allocation may fail for the following reasons:
 * 1) The IOAT driver hasn't been loaded yet.  Keep trying in this case.
 * 2) IOAT resources are temporarily unavailable.  Keep trying in this case.
 * 3) Other unrecoverable errors.  Disable the power management function.
 */
/*ARGSUSED*/
static void
fipe_ioat_alloc(void *arg)
{
	int idx, flags, rc = 0;
	uint64_t physaddr;
	boolean_t fatal = B_FALSE;
	dcopy_query_t info;
	dcopy_handle_t handle;
	dcopy_cmd_t cmds[FIPE_IOAT_CMD_NUM + 1];

	mutex_enter(&fipe_ioat_ctrl.ioat_lock);
	/*
	 * When the driver is first loaded, fipe_ioat_alloc() is called in
	 * DEVICE ATTACH context, where ddi_walk_devs() can't be called, so
	 * just schedule a timer and exit.
	 */
	if (fipe_ioat_ctrl.ioat_try_alloc == B_FALSE) {
		fipe_ioat_ctrl.ioat_try_alloc = B_TRUE;
		mutex_exit(&fipe_ioat_ctrl.ioat_lock);
		goto out_error;
	}

	/*
	 * Check whether device has been initialized or if it encountered
	 * some permanent error.
	 */
	if (fipe_ioat_ctrl.ioat_ready || fipe_ioat_ctrl.ioat_failed ||
	    fipe_ioat_ctrl.ioat_cancel) {
		fipe_ioat_ctrl.ioat_timerid = 0;
		mutex_exit(&fipe_ioat_ctrl.ioat_lock);
		return;
	}

	if (fipe_ioat_ctrl.ioat_dev_info == NULL) {
		/* Find dev_info_t for IOAT engine. */
		ddi_walk_devs(ddi_root_node(), fipe_search_ioat_dev, NULL);
		if (fipe_ioat_ctrl.ioat_dev_info == NULL) {
			cmn_err(CE_NOTE,
			    "!fipe: no IOAT hardware found, disable pm.");
			mutex_exit(&fipe_ioat_ctrl.ioat_lock);
			fatal = B_TRUE;
			goto out_error;
		}
	}
	mutex_exit(&fipe_ioat_ctrl.ioat_lock);

	/* Check, allocate and initialize IOAT resources with lock released. */
	dcopy_query(&info);
	if (info.dq_version < DCOPY_QUERY_V0) {
		/* Permanent error, give up. */
		cmn_err(CE_WARN, "!fipe: IOAT driver version mismatch.");
		fatal = B_TRUE;
		goto out_error;
	} else if (info.dq_num_channels == 0) {
		/* IOAT driver hasn't been loaded, keep trying. */
		goto out_error;
	}

	/* Allocate IOAT channel. */
	rc = dcopy_alloc(DCOPY_NOSLEEP, &handle);
	if (rc == DCOPY_NORESOURCES) {
		/* Resource temporarily not available, keep trying. */
		goto out_error;
	} else if (rc != DCOPY_SUCCESS) {
		/* Permanent error, give up. */
		cmn_err(CE_WARN, "!fipe: failed to allocate IOAT channel.");
		fatal = B_TRUE;
		goto out_error;
	}

	/*
	 * Allocate multiple IOAT commands and organize them into a ring that
	 * loops forever.  The number of commands is determined by the IOAT
	 * descriptor size and the memory interleave pattern.
	 * cmd[0] is used to break the loop and disable IOAT operation.
	 * cmd[1 .. FIPE_IOAT_CMD_NUM] are grouped into a ring and cmd[1] is
	 * the list head.
	 */
	bzero(cmds, sizeof (cmds));
	physaddr = fipe_ioat_ctrl.ioat_buf_physaddr;
	for (idx = FIPE_IOAT_CMD_NUM; idx >= 0; idx--) {
		/* Allocate IOAT commands. */
		if (idx == 0 || idx == FIPE_IOAT_CMD_NUM) {
			flags = DCOPY_NOSLEEP;
		} else {
			/*
			 * To link commands into a list, the initial value of
			 * each cmd must be set to the next cmd on the list.
			 */
			flags = DCOPY_NOSLEEP | DCOPY_ALLOC_LINK;
			cmds[idx] = cmds[idx + 1];
		}
		rc = dcopy_cmd_alloc(handle, flags, &cmds[idx]);
		if (rc == DCOPY_NORESOURCES) {
			goto out_freecmd;
		} else if (rc != DCOPY_SUCCESS) {
			/* Permanent error, give up. */
			cmn_err(CE_WARN,
			    "!fipe: failed to allocate IOAT command.");
			fatal = B_TRUE;
			goto out_freecmd;
		}

		/* Disable src/dst snoop to improve CPU cache efficiency. */
		cmds[idx]->dp_flags = DCOPY_CMD_NOSRCSNP | DCOPY_CMD_NODSTSNP;
		/* Specially handle commands on the list. */
		if (idx != 0) {
			/* Disable IOAT status. */
			cmds[idx]->dp_flags |= DCOPY_CMD_NOSTAT;
			/* Disable waiting for resources. */
			cmds[idx]->dp_flags |= DCOPY_CMD_NOWAIT;
			if (idx == 1) {
				/* The list head, chain command into loop. */
				cmds[idx]->dp_flags |= DCOPY_CMD_LOOP;
			} else {
				/* Queue all other commands except head. */
				cmds[idx]->dp_flags |= DCOPY_CMD_QUEUE;
			}
		}
		cmds[idx]->dp_cmd = DCOPY_CMD_COPY;
		cmds[idx]->dp.copy.cc_source = physaddr;
		cmds[idx]->dp.copy.cc_dest = physaddr + FIPE_MC_MEMORY_OFFSET;
		if (idx == 0) {
			/*
			 * Command 0 is used to cancel memory copy by breaking
			 * the ring created in fipe_ioat_trigger().
			 * For efficiency, use the smallest memory copy size.
			 */
			cmds[idx]->dp.copy.cc_size = 1;
		} else {
			cmds[idx]->dp.copy.cc_size = FIPE_MC_MEMORY_SIZE;
		}
	}

	/* Update IOAT control status if it hasn't been initialized yet. */
	mutex_enter(&fipe_ioat_ctrl.ioat_lock);
	if (!fipe_ioat_ctrl.ioat_ready && !fipe_ioat_ctrl.ioat_cancel) {
		fipe_ioat_ctrl.ioat_handle = handle;
		for (idx = 0; idx <= FIPE_IOAT_CMD_NUM; idx++) {
			fipe_ioat_ctrl.ioat_cmds[idx] = cmds[idx];
		}
		fipe_ioat_ctrl.ioat_ready = B_TRUE;
		fipe_ioat_ctrl.ioat_failed = B_FALSE;
		fipe_ioat_ctrl.ioat_timerid = 0;
		mutex_exit(&fipe_ioat_ctrl.ioat_lock);
		return;
	}
	mutex_exit(&fipe_ioat_ctrl.ioat_lock);
	/* Initialized by another thread, fall through to free resources. */

out_freecmd:
	if (cmds[0] != NULL) {
		dcopy_cmd_free(&cmds[0]);
	}
	/* Only need to free head, dcopy will free all commands on the list. */
	for (idx = 1; idx <= FIPE_IOAT_CMD_NUM; idx++) {
		if (cmds[idx] != NULL) {
			dcopy_cmd_free(&cmds[idx]);
			break;
		}
	}
	dcopy_free(&handle);

out_error:
	mutex_enter(&fipe_ioat_ctrl.ioat_lock);
	fipe_ioat_ctrl.ioat_timerid = 0;
	if (!fipe_ioat_ctrl.ioat_ready && !fipe_ioat_ctrl.ioat_cancel) {
		if (fatal) {
			/* Mark permanent error and give up. */
			fipe_ioat_ctrl.ioat_failed = B_TRUE;
			/* Release the hold from fipe_search_ioat_dev(). */
			if (fipe_ioat_ctrl.ioat_dev_info != NULL) {
				ndi_rele_devi(fipe_ioat_ctrl.ioat_dev_info);
				fipe_ioat_ctrl.ioat_dev_info = NULL;
			}
		} else {
			/*
			 * Schedule another timer to keep trying.
			 * timeout() should always succeed, so there is no
			 * need to check its return value.
			 */
			fipe_ioat_ctrl.ioat_timerid = timeout(fipe_ioat_alloc,
			    NULL, drv_usectohz(FIPE_IOAT_RETRY_INTERVAL));
		}
	}
	mutex_exit(&fipe_ioat_ctrl.ioat_lock);
}

/*
 * Free resources allocated in fipe_ioat_alloc.
 */
static void
fipe_ioat_free(void)
{
	int idx = 0;
	dcopy_cmd_t *cmds = fipe_ioat_ctrl.ioat_cmds;

	mutex_enter(&fipe_ioat_ctrl.ioat_lock);

	/* Cancel timeout to avoid race condition. */
	if (fipe_ioat_ctrl.ioat_timerid != 0) {
		fipe_ioat_ctrl.ioat_cancel = B_TRUE;
		mutex_exit(&fipe_ioat_ctrl.ioat_lock);
		(void) untimeout(fipe_ioat_ctrl.ioat_timerid);
		mutex_enter(&fipe_ioat_ctrl.ioat_lock);
		fipe_ioat_ctrl.ioat_timerid = 0;
		fipe_ioat_ctrl.ioat_cancel = B_FALSE;
	}

	/* Free ioat resources. */
	if (fipe_ioat_ctrl.ioat_ready) {
		if (cmds[0] != NULL) {
			dcopy_cmd_free(&cmds[0]);
		}
		for (idx = 1; idx <= FIPE_IOAT_CMD_NUM; idx++) {
			if (cmds[idx] != NULL) {
				dcopy_cmd_free(&cmds[idx]);
				break;
			}
		}
		bzero(fipe_ioat_ctrl.ioat_cmds,
		    sizeof (fipe_ioat_ctrl.ioat_cmds));
		dcopy_free(&fipe_ioat_ctrl.ioat_handle);
		fipe_ioat_ctrl.ioat_handle = NULL;
		fipe_ioat_ctrl.ioat_ready = B_FALSE;
	}

	/* Release the hold from fipe_search_ioat_dev(). */
	if (fipe_ioat_ctrl.ioat_dev_info != NULL) {
		ndi_rele_devi(fipe_ioat_ctrl.ioat_dev_info);
		fipe_ioat_ctrl.ioat_dev_info = NULL;
	}

	mutex_exit(&fipe_ioat_ctrl.ioat_lock);
}
#endif	/* FIPE_IOAT_BUILTIN */

/*
 * Initialize IOAT related resources.
 */
static int
fipe_ioat_init(void)
{
	char *buf;
	size_t size;

	bzero(&fipe_ioat_ctrl, sizeof (fipe_ioat_ctrl));
	mutex_init(&fipe_ioat_ctrl.ioat_lock, NULL, MUTEX_DRIVER, NULL);

	/*
	 * Allocate memory for the IOAT memory copy operation.
	 * The allocated memory should be page aligned to achieve better
	 * power savings.
	 * Don't use ddi_dma_mem_alloc here to keep things simple.  This
	 * also makes quiesce easier.
	 */
	size = PAGESIZE;
	buf = kmem_zalloc(size, KM_SLEEP);
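	/*
	 * kmem_zalloc() does not guarantee page alignment, so if the first
	 * PAGESIZE allocation is not aligned, reallocate at twice the size;
	 * a double-sized buffer always contains a page-aligned PAGESIZE
	 * region, which P2ROUNDUP() locates below.
	 */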
	if ((intptr_t)buf & PAGEOFFSET) {
		kmem_free(buf, PAGESIZE);
		size <<= 1;
		buf = kmem_zalloc(size, KM_SLEEP);
	}
	fipe_ioat_ctrl.ioat_buf_size = size;
	fipe_ioat_ctrl.ioat_buf_start = buf;
	buf = (char *)P2ROUNDUP((intptr_t)buf, PAGESIZE);
	fipe_ioat_ctrl.ioat_buf_virtaddr = buf;
	fipe_ioat_ctrl.ioat_buf_physaddr = hat_getpfnum(kas.a_hat, buf);
	fipe_ioat_ctrl.ioat_buf_physaddr <<= PAGESHIFT;

#ifdef	FIPE_IOAT_BUILTIN
	{
		uint64_t bufpa;
		/* IOAT descriptor data structure copied from ioat.h. */
		struct fipe_ioat_cmd_desc {
			uint32_t	dd_size;
			uint32_t	dd_ctrl;
			uint64_t	dd_src_paddr;
			uint64_t	dd_dest_paddr;
			uint64_t	dd_next_desc;
			uint64_t	dd_res4;
			uint64_t	dd_res5;
			uint64_t	dd_res6;
			uint64_t	dd_res7;
		} *desc;

		/*
		 * Build two IOAT command descriptors and chain them into ring.
		 * Control flags as below:
		 *	0x2: disable source snoop
		 *	0x4: disable destination snoop
		 *	0x0 << 24: memory copy operation
		 * The layout for command descriptors and memory buffers are
		 * organized for power saving effect, please don't change it.
		 */
		buf = fipe_ioat_ctrl.ioat_buf_virtaddr;
		bufpa = fipe_ioat_ctrl.ioat_buf_physaddr;
		fipe_ioat_ctrl.ioat_cmd_physaddr = bufpa;

		/* First command descriptor. */
		desc = (struct fipe_ioat_cmd_desc *)(buf);
		desc->dd_size = 128;
		desc->dd_ctrl = 0x6;
		desc->dd_src_paddr = bufpa + 2048;
		desc->dd_dest_paddr = bufpa + 3072;
		/* Point to second descriptor. */
		desc->dd_next_desc = bufpa + 64;

		/* Second command descriptor. */
		desc = (struct fipe_ioat_cmd_desc *)(buf + 64);
		desc->dd_size = 128;
		desc->dd_ctrl = 0x6;
		desc->dd_src_paddr = bufpa + 2048;
		desc->dd_dest_paddr = bufpa + 3072;
		/* Point to first descriptor. */
		desc->dd_next_desc = bufpa;
	}
#endif	/* FIPE_IOAT_BUILTIN */

	return (0);
}

static void
fipe_ioat_fini(void)
{
	/* Release the hold from fipe_search_ioat_dev(). */
	if (fipe_ioat_ctrl.ioat_dev_info != NULL) {
		ndi_rele_devi(fipe_ioat_ctrl.ioat_dev_info);
		fipe_ioat_ctrl.ioat_dev_info = NULL;
	}

	if (fipe_ioat_ctrl.ioat_buf_start != NULL) {
		ASSERT(fipe_ioat_ctrl.ioat_buf_size != 0);
		kmem_free(fipe_ioat_ctrl.ioat_buf_start,
		    fipe_ioat_ctrl.ioat_buf_size);
	}

	mutex_destroy(&fipe_ioat_ctrl.ioat_lock);
	bzero(&fipe_ioat_ctrl, sizeof (fipe_ioat_ctrl));
}

static int
fipe_idle_start(void)
{
	int rc;

	if (fipe_idle_ctrl.idle_ready) {
		return (0);
	}

	if (cpu_idle_prop_create_handle(CPU_IDLE_PROP_ENTER_TIMESTAMP,
	    &fipe_idle_ctrl.prop_enter) != 0) {
		cmn_err(CE_WARN, "!fipe: failed to get enter_ts property.");
		return (-1);
	}
	if (cpu_idle_prop_create_handle(CPU_IDLE_PROP_EXIT_TIMESTAMP,
	    &fipe_idle_ctrl.prop_exit) != 0) {
		cmn_err(CE_WARN, "!fipe: failed to get exit_ts property.");
		(void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_enter);
		return (-1);
	}
	if (cpu_idle_prop_create_handle(CPU_IDLE_PROP_TOTAL_IDLE_TIME,
	    &fipe_idle_ctrl.prop_idle) != 0) {
		cmn_err(CE_WARN, "!fipe: failed to get idle_time property.");
		(void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_exit);
		(void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_enter);
		return (-1);
	}
	if (cpu_idle_prop_create_handle(CPU_IDLE_PROP_TOTAL_BUSY_TIME,
	    &fipe_idle_ctrl.prop_busy) != 0) {
		cmn_err(CE_WARN, "!fipe: failed to get busy_time property.");
		(void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_idle);
		(void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_exit);
		(void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_enter);
		return (-1);
	}
	if (cpu_idle_prop_create_handle(CPU_IDLE_PROP_INTERRUPT_COUNT,
	    &fipe_idle_ctrl.prop_intr) != 0) {
		cmn_err(CE_WARN, "!fipe: failed to get intr_count property.");
		(void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_busy);
		(void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_idle);
		(void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_exit);
		(void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_enter);
		return (-1);
	}

	/* Register idle state notification callback. */
	rc = cpu_idle_register_callback(CPU_IDLE_CB_PRIO_FIPE, &fipe_idle_cb,
	    NULL, &fipe_idle_ctrl.cb_handle);
	if (rc != 0) {
		cmn_err(CE_WARN, "!fipe: failed to register cpuidle callback.");
		(void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_intr);
		(void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_busy);
		(void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_idle);
		(void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_exit);
		(void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_enter);
		return (-1);
	}

	fipe_idle_ctrl.idle_ready = B_TRUE;

	return (0);
}

static int
fipe_idle_stop(void)
{
	int rc;

	if (fipe_idle_ctrl.idle_ready == B_FALSE) {
		return (0);
	}

	rc = cpu_idle_unregister_callback(fipe_idle_ctrl.cb_handle);
	if (rc != 0) {
		cmn_err(CE_WARN,
		    "!fipe: failed to unregister cpuidle callback.");
		return (-1);
	}

	(void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_intr);
	(void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_busy);
	(void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_idle);
	(void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_exit);
	(void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_enter);

	fipe_idle_ctrl.idle_ready = B_FALSE;

	return (0);
}

#ifdef	FIPE_KSTAT_SUPPORT
static int
fipe_kstat_update(kstat_t *ksp, int rw)
{
	struct fipe_kstat_s *sp;
	hrtime_t hrt;

	if (rw == KSTAT_WRITE) {
		return (EACCES);
	}

	sp = ksp->ks_data;
	sp->fipe_enabled.value.i32 = fipe_gbl_ctrl.pm_enabled ? 1 : 0;
	sp->fipe_policy.value.i32 = fipe_pm_policy;

	hrt = fipe_gbl_ctrl.time_in_pm;
	scalehrtime(&hrt);
	sp->fipe_pm_time.value.ui64 = (uint64_t)hrt;

#ifdef	FIPE_KSTAT_DETAIL
	sp->ioat_ready.value.i32 = fipe_ioat_ctrl.ioat_ready ? 1 : 0;
#endif	/* FIPE_KSTAT_DETAIL */

	return (0);
}
#endif	/* FIPE_KSTAT_SUPPORT */

/*
 * Initialize memory power management subsystem.
 * Note: This function should only be called from ATTACH.
 * Note: caller must ensure exclusive access to all fipe_xxx interfaces.
 */
int
fipe_init(dev_info_t *dip)
{
	size_t nsize;
	hrtime_t hrt;

	/* Initialize global control structure. */
	bzero(&fipe_gbl_ctrl, sizeof (fipe_gbl_ctrl));
	mutex_init(&fipe_gbl_ctrl.lock, NULL, MUTEX_DRIVER, NULL);

	/* Query power management policy from device property. */
	fipe_pm_policy = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
	    FIPE_PROP_PM_POLICY, fipe_pm_policy);
	if (fipe_pm_policy < 0 || fipe_pm_policy >= FIPE_PM_POLICY_MAX) {
		cmn_err(CE_CONT,
		    "?fipe: invalid power management policy %d.\n",
		    fipe_pm_policy);
		fipe_pm_policy = FIPE_PM_POLICY_BALANCE;
	}
	fipe_profile_curr = &fipe_profiles[fipe_pm_policy];

	/*
	 * Compute the unscaled hrtime (raw tick) value corresponding to
	 * FIPE_STAT_INTERVAL.  scalehrtime() converts raw ticks into
	 * nanoseconds, so scaling (1 << 36) ticks and dividing below inverts
	 * that conversion; (1 << 36) is big enough to keep the integer
	 * arithmetic accurate.
	 */
	hrt = 1ULL << 36;
	scalehrtime(&hrt);
	fipe_idle_ctrl.tick_interval = FIPE_STAT_INTERVAL * (1ULL << 36) / hrt;

	if (fipe_mc_init(dip) != 0) {
		cmn_err(CE_WARN, "!fipe: failed to initialize mc state.");
		goto out_mc_error;
	}
	if (fipe_ioat_init() != 0) {
		cmn_err(CE_NOTE, "!fipe: failed to initialize ioat state.");
		goto out_ioat_error;
	}

	/* Allocate per-CPU structure. */
	nsize = max_ncpus * sizeof (fipe_cpu_state_t);
	nsize += CPU_CACHE_COHERENCE_SIZE;
	fipe_gbl_ctrl.state_buf = kmem_zalloc(nsize, KM_SLEEP);
	fipe_gbl_ctrl.state_size = nsize;
	fipe_cpu_states = (fipe_cpu_state_t *)P2ROUNDUP(
	    (intptr_t)fipe_gbl_ctrl.state_buf, CPU_CACHE_COHERENCE_SIZE);
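	/*
	 * The extra CPU_CACHE_COHERENCE_SIZE bytes added to nsize above
	 * guarantee that the P2ROUNDUP()-aligned array still fits entirely
	 * within state_buf.
	 */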

#ifdef	FIPE_KSTAT_SUPPORT
	fipe_gbl_ctrl.fipe_kstat = kstat_create("fipe", 0, "fipe-pm", "misc",
	    KSTAT_TYPE_NAMED, sizeof (fipe_kstat) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);
	if (fipe_gbl_ctrl.fipe_kstat == NULL) {
		cmn_err(CE_CONT, "?fipe: failed to create kstat object.\n");
	} else {
		fipe_gbl_ctrl.fipe_kstat->ks_lock = &fipe_gbl_ctrl.lock;
		fipe_gbl_ctrl.fipe_kstat->ks_data = &fipe_kstat;
		fipe_gbl_ctrl.fipe_kstat->ks_update = fipe_kstat_update;
		kstat_install(fipe_gbl_ctrl.fipe_kstat);
	}
#endif	/* FIPE_KSTAT_SUPPORT */

	return (0);

out_ioat_error:
	fipe_mc_fini();
out_mc_error:
	mutex_destroy(&fipe_gbl_ctrl.lock);
	bzero(&fipe_gbl_ctrl, sizeof (fipe_gbl_ctrl));

	return (-1);
}

/*
 * Destroy memory power management subsystem.
 * Note: This function should only be called from DETACH.
 * Note: caller must ensure exclusive access to all fipe_xxx interfaces.
 */
int
fipe_fini(void)
{
	if (fipe_gbl_ctrl.pm_enabled) {
		cmn_err(CE_NOTE, "!fipe: call fipe_fini without stopping PM.");
		return (EBUSY);
	}

	ASSERT(!fipe_gbl_ctrl.pm_active);
	fipe_ioat_fini();
	fipe_mc_fini();

#ifdef	FIPE_KSTAT_SUPPORT
	if (fipe_gbl_ctrl.fipe_kstat != NULL) {
		kstat_delete(fipe_gbl_ctrl.fipe_kstat);
		fipe_gbl_ctrl.fipe_kstat = NULL;
	}
#endif	/* FIPE_KSTAT_SUPPORT */

	if (fipe_gbl_ctrl.state_buf != NULL) {
		ASSERT(fipe_gbl_ctrl.state_size != 0);
		kmem_free(fipe_gbl_ctrl.state_buf, fipe_gbl_ctrl.state_size);
		fipe_cpu_states = NULL;
	}

	fipe_profile_curr = NULL;
	mutex_destroy(&fipe_gbl_ctrl.lock);
	bzero(&fipe_gbl_ctrl, sizeof (fipe_gbl_ctrl));

	return (0);
}

/*
 * Start memory power management subsystem.
 * Note: caller must ensure exclusive access to all fipe_xxx interfaces.
 */
int
fipe_start(void)
{
	if (fipe_gbl_ctrl.pm_enabled == B_TRUE) {
		return (0);
	}

	bzero(fipe_cpu_states, max_ncpus * sizeof (fipe_cpu_states[0]));
	fipe_ioat_alloc(NULL);
	if (fipe_idle_start() != 0) {
		cmn_err(CE_NOTE, "!fipe: failed to start PM subsystem.");
		fipe_ioat_free();
		return (-1);
	}

	fipe_gbl_ctrl.pm_enabled = B_TRUE;

	return (0);
}

/*
 * Stop memory power management subsystem.
 * Note: caller must ensure exclusive access to all fipe_xxx interfaces.
 */
int
fipe_stop(void)
{
	if (fipe_gbl_ctrl.pm_enabled) {
		if (fipe_idle_stop() != 0) {
			cmn_err(CE_NOTE,
			    "!fipe: failed to stop PM subsystem.");
			return (-1);
		}
		fipe_ioat_free();
		fipe_gbl_ctrl.pm_enabled = B_FALSE;
	}
	ASSERT(!fipe_gbl_ctrl.pm_active);

	return (0);
}

int
fipe_suspend(void)
{
	/* Save current power management policy. */
	fipe_pm_policy_saved = fipe_pm_policy;
	/* Disable PM by setting profile to FIPE_PM_POLICY_DISABLE. */
	fipe_pm_policy = FIPE_PM_POLICY_DISABLE;
	fipe_profile_curr = &fipe_profiles[fipe_pm_policy];

	return (0);
}

int
fipe_resume(void)
{
	/* Restore saved power management policy. */
	fipe_pm_policy = fipe_pm_policy_saved;
	fipe_profile_curr = &fipe_profiles[fipe_pm_policy];

	return (0);
}

fipe_pm_policy_t
fipe_get_pmpolicy(void)
{
	return (fipe_pm_policy);
}

int
fipe_set_pmpolicy(fipe_pm_policy_t policy)
{
	if (policy < 0 || policy >= FIPE_PM_POLICY_MAX) {
		return (EINVAL);
	}
	fipe_pm_policy = policy;
	fipe_profile_curr = &fipe_profiles[fipe_pm_policy];

	return (0);
}

/*
 * Check the condition (fipe_gbl_ctrl.cpu_count == ncpus) to make sure that
 * no other CPU is trying to wake the system up from the memory power saving
 * state.  If a CPU is waking up the system, fipe_disable() will set
 * fipe_gbl_ctrl.pm_active to false as soon as possible to allow the other
 * CPUs to continue, and that CPU takes responsibility for recovering the
 * system from the memory power saving state.
 */
static void
fipe_enable(int throttle, cpu_idle_check_wakeup_t check_func, void *check_arg)
{
	extern void membar_sync(void);

	FIPE_KSTAT_DETAIL_INC(pm_tryenter_cnt);

	/*
	 * Check CPU wakeup events.
	 */
	if (check_func != NULL) {
		(*check_func)(check_arg);
	}

	/*
	 * Try to acquire the mutex, which also implicitly has the same
	 * effect as calling membar_sync().
	 * If mutex_tryenter fails, another CPU is waking up.
	 */
	if (mutex_tryenter(&fipe_gbl_ctrl.lock) == 0) {
		FIPE_KSTAT_DETAIL_INC(pm_race_cnt);
	/*
	 * Handle a special race condition: a CPU wakes up and then re-enters
	 * the idle state within a short period.  This case can't be reliably
	 * detected by the cpu_count mechanism.
	 */
	} else if (fipe_gbl_ctrl.pm_active) {
		FIPE_KSTAT_DETAIL_INC(pm_race_cnt);
		mutex_exit(&fipe_gbl_ctrl.lock);
	} else {
		fipe_gbl_ctrl.pm_active = B_TRUE;
		membar_sync();
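		/*
		 * pm_active is set and made globally visible before the
		 * cpu_count re-checks below, so a CPU that wakes up from
		 * here on will see pm_active in fipe_disable() and undo the
		 * power saving state; the repeated cpu_count checks catch
		 * CPUs that woke up while the preceding steps were in
		 * progress.
		 */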
		if (fipe_gbl_ctrl.cpu_count != ncpus) {
			FIPE_KSTAT_DETAIL_INC(pm_race_cnt);
			fipe_gbl_ctrl.pm_active = B_FALSE;
		} else if (fipe_ioat_trigger() != 0) {
			fipe_gbl_ctrl.pm_active = B_FALSE;
		} else if (fipe_gbl_ctrl.cpu_count != ncpus ||
		    fipe_mc_change(throttle) != 0) {
			fipe_gbl_ctrl.pm_active = B_FALSE;
			fipe_ioat_cancel();
			if (fipe_gbl_ctrl.cpu_count != ncpus) {
				FIPE_KSTAT_DETAIL_INC(pm_race_cnt);
			}
		} else if (fipe_gbl_ctrl.cpu_count != ncpus) {
			fipe_gbl_ctrl.pm_active = B_FALSE;
			fipe_mc_restore();
			fipe_ioat_cancel();
			FIPE_KSTAT_DETAIL_INC(pm_race_cnt);
		} else {
			FIPE_KSTAT_DETAIL_INC(pm_success_cnt);
		}
		mutex_exit(&fipe_gbl_ctrl.lock);
	}
}

static void
fipe_disable(void)
{
	/*
	 * Try to acquire lock, which also implicitly has the same effect
	 * of calling membar_sync().
	 */
	while (mutex_tryenter(&fipe_gbl_ctrl.lock) == 0) {
		/*
		 * If power saving is inactive, just return; any remaining
		 * housekeeping work will be handled in fipe_enable().
		 */
		if (fipe_gbl_ctrl.pm_active == B_FALSE) {
			return;
		} else {
			(void) SMT_PAUSE();
		}
	}

	/* Disable power saving if it's active. */
	if (fipe_gbl_ctrl.pm_active) {
		/*
		 * Set pm_active to FALSE as soon as possible to prevent
		 * other CPUs from waiting on pm_active flag.
		 */
		fipe_gbl_ctrl.pm_active = B_FALSE;
		membar_producer();
		fipe_mc_restore();
		fipe_ioat_cancel();
	}

	mutex_exit(&fipe_gbl_ctrl.lock);
}

/*ARGSUSED*/
static boolean_t
fipe_check_cpu(struct fipe_cpu_state *sp, cpu_idle_callback_context_t ctx,
    hrtime_t ts)
{
	if (cpu_flagged_offline(CPU->cpu_flags)) {
		/* Treat CPU in offline state as ready. */
		sp->cond_ready = B_TRUE;
		return (B_TRUE);
	} else if (sp->next_ts <= ts) {
		uint64_t intr;
		hrtime_t idle, busy, diff;
		cpu_idle_prop_value_t val;

		/* Set default value. */
		sp->cond_ready = B_TRUE;
		sp->idle_count = 0;

		/* Calculate idle percent. */
		idle = sp->last_idle;
		sp->last_idle = cpu_idle_prop_get_hrtime(
		    fipe_idle_ctrl.prop_idle, ctx);
		idle = sp->last_idle - idle;
		busy = sp->last_busy;
		sp->last_busy = cpu_idle_prop_get_hrtime(
		    fipe_idle_ctrl.prop_busy, ctx);
		busy = sp->last_busy - busy;
		/* Check idle condition. */
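		/*
		 * busy * (100 - T) > idle * T is a division-free form of
		 * busy / (busy + idle) > T / 100: the CPU counts as busy when
		 * its busy-time percentage exceeds the profile's
		 * busy_threshold T.
		 */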
		if (idle > 0 && busy > 0) {
			if (busy * (100 - FIPE_PROF_BUSY_THRESHOLD) >
			    idle * FIPE_PROF_BUSY_THRESHOLD) {
				FIPE_KSTAT_DETAIL_INC(cpu_busy_cnt);
				sp->cond_ready = B_FALSE;
			} else {
				FIPE_KSTAT_DETAIL_INC(cpu_idle_cnt);
			}
		} else {
			FIPE_KSTAT_DETAIL_INC(cpu_busy_cnt);
			sp->cond_ready = B_FALSE;
		}

		/* Calculate interrupt count. */
		diff = sp->next_ts;
		sp->next_ts = ts + fipe_idle_ctrl.tick_interval;
		diff = sp->next_ts - diff;
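		/*
		 * diff is the time actually elapsed since the last update;
		 * the raw interrupt count below is normalized to a
		 * per-tick_interval rate so that it can be compared against
		 * the profile's interrupt thresholds.
		 */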
		intr = sp->last_intr;
		if (cpu_idle_prop_get_value(fipe_idle_ctrl.prop_intr, ctx,
		    &val) == 0) {
			sp->last_intr = val.cipv_uint64;
			intr = sp->last_intr - intr;
			if (diff != 0) {
				intr = intr * fipe_idle_ctrl.tick_interval;
				intr /= diff;
			} else {
				intr = FIPE_PROF_INTR_THRESHOLD;
			}
		} else {
			intr = FIPE_PROF_INTR_THRESHOLD;
		}

		/*
		 * The system is busy with interrupts, so disable all PM
		 * status checks for INTR_BUSY_THROTTLE ticks.  Interrupts
		 * are disabled while FIPE callbacks run, so this
		 * optimization helps reduce interrupt latency.
		 */
		if (intr >= FIPE_PROF_INTR_BUSY_THRESHOLD) {
			FIPE_KSTAT_DETAIL_INC(cpu_intr_busy_cnt);
			sp->throttle_ts = ts + FIPE_PROF_INTR_BUSY_THROTTLE *
			    fipe_idle_ctrl.tick_interval;
			sp->cond_ready = B_FALSE;
		} else if (intr >= FIPE_PROF_INTR_THRESHOLD) {
			FIPE_KSTAT_DETAIL_INC(cpu_intr_throttle_cnt);
			sp->cond_ready = B_FALSE;
		}
	} else if (++sp->idle_count >= FIPE_PROF_IDLE_COUNT) {
		/* Too many idle enter/exit in this tick. */
		FIPE_KSTAT_DETAIL_INC(cpu_loop_cnt);
		sp->throttle_ts = sp->next_ts + fipe_idle_ctrl.tick_interval;
		sp->idle_count = 0;
		sp->cond_ready = B_FALSE;
		return (B_FALSE);
	}

	return (sp->cond_ready);
}

/*ARGSUSED*/
static void
fipe_idle_enter(void *arg, cpu_idle_callback_context_t ctx,
    cpu_idle_check_wakeup_t check_func, void *check_arg)
{
	hrtime_t ts;
	uint32_t cnt;
	uint64_t iowait;
	cpu_t *cp = CPU;
	struct fipe_cpu_state *sp;

	sp = &fipe_cpu_states[cp->cpu_id];
	ts = cpu_idle_prop_get_hrtime(fipe_idle_ctrl.prop_enter, ctx);

	if (fipe_pm_policy != FIPE_PM_POLICY_DISABLE &&
	    fipe_ioat_ctrl.ioat_ready &&
	    sp->state_ready && sp->throttle_ts <= ts) {
		/* Adjust iowait count for local CPU. */
		iowait = CPU_STATS(cp, sys.iowait);
		if (iowait != sp->last_iowait) {
			atomic_add_64(&fipe_gbl_ctrl.io_waiters,
			    iowait - sp->last_iowait);
			sp->last_iowait = iowait;
		}

		/* Check current CPU status. */
		if (fipe_check_cpu(sp, ctx, ts)) {
			/* Increase count of CPU ready for power saving. */
			do {
				cnt = fipe_gbl_ctrl.cpu_count;
				ASSERT(cnt < ncpus);
			} while (atomic_cas_32(&fipe_gbl_ctrl.cpu_count,
			    cnt, cnt + 1) != cnt);

			/*
			 * Enable power saving if all CPUs are idle.
			 */
			if (cnt + 1 == ncpus) {
				if (fipe_gbl_ctrl.io_waiters == 0) {
					fipe_gbl_ctrl.enter_ts = ts;
					fipe_enable(fipe_pm_throttle_level,
					    check_func, check_arg);
				/* There are ongoing block io operations. */
				} else {
					FIPE_KSTAT_DETAIL_INC(bio_busy_cnt);
				}
			}
		}
	} else if (fipe_pm_policy == FIPE_PM_POLICY_DISABLE ||
	    fipe_ioat_ctrl.ioat_ready == B_FALSE) {
		if (sp->cond_ready == B_TRUE) {
			sp->cond_ready = B_FALSE;
		}
	} else if (sp->state_ready == B_FALSE) {
		sp->cond_ready = B_FALSE;
		sp->state_ready = B_TRUE;
		sp->throttle_ts = 0;
		sp->next_ts = ts + fipe_idle_ctrl.tick_interval;
		sp->last_busy = cpu_idle_prop_get_hrtime(
		    fipe_idle_ctrl.prop_busy, ctx);
		sp->last_idle = cpu_idle_prop_get_hrtime(
		    fipe_idle_ctrl.prop_idle, ctx);
		sp->last_intr = cpu_idle_prop_get_hrtime(
		    fipe_idle_ctrl.prop_intr, ctx);
		sp->idle_count = 0;
	}
}

/*ARGSUSED*/
static void
fipe_idle_exit(void *arg, cpu_idle_callback_context_t ctx, int flags)
{
	uint32_t cnt;
	hrtime_t ts;
	struct fipe_cpu_state *sp;

	sp = &fipe_cpu_states[CPU->cpu_id];
	if (sp->cond_ready) {
		do {
			cnt = fipe_gbl_ctrl.cpu_count;
			ASSERT(cnt > 0);
		} while (atomic_cas_32(&fipe_gbl_ctrl.cpu_count,
		    cnt, cnt - 1) != cnt);

		/*
		 * Try to disable the power saving state.
		 * Only the first CPU waking from the idle state tries to
		 * disable the power saving state; all other CPUs just carry
		 * on without waiting for memory to recover from the power
		 * saving state.  So there may be periods during which some
		 * CPUs are active while memory is still in the power saving
		 * state.  This is acceptable: it is an uncommon case, and it
		 * is better for performance to let those CPUs continue,
		 * since their blocking latency is smaller than a mutex and
		 * is only incurred in the uncommon condition.
		 */
		if (cnt == ncpus) {
			fipe_disable();
			ts = cpu_idle_prop_get_hrtime(fipe_idle_ctrl.prop_exit,
			    ctx);
			fipe_gbl_ctrl.time_in_pm += ts - fipe_gbl_ctrl.enter_ts;
		}
	}
}