callo.h revision 8566:65762b7ee3ce
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
22/*	  All Rights Reserved  	*/
23
24
25/*
26 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
27 * Use is subject to license terms.
28 */
29
30#ifndef _SYS_CALLO_H
31#define	_SYS_CALLO_H
32
33#include <sys/t_lock.h>
34#include <sys/taskq.h>
35#include <sys/lgrp.h>
36#include <sys/processor.h>
37#include <sys/cyclic.h>
38#include <sys/kstat.h>
39#include <sys/systm.h>
40
41#ifdef	__cplusplus
42extern "C" {
43#endif
44
45#ifdef	_KERNEL
46
47typedef struct callout_list	callout_list_t;
48
49/*
50 * The callout mechanism provides general-purpose event scheduling:
51 * an arbitrary function is called in a specified amount of time.
52 * The expiration time for a callout is kept in its callout list
53 * structure.
54 */
55typedef struct callout {
56	struct callout	*c_idnext;	/* next in ID hash, or on freelist */
57	struct callout	*c_idprev;	/* prev in ID hash */
58	struct callout	*c_clnext;	/* next in callout list */
59	struct callout	*c_clprev;	/* prev in callout list */
60	callout_id_t	c_xid;		/* extended callout ID; see below */
61	callout_list_t	*c_list;	/* callout list */
62	void		(*c_func)(void *); /* function to call */
63	void		*c_arg;		/* argument to function */
64} callout_t;
65
/*
 * The callout ID (callout_id_t) uniquely identifies a callout. The callout
 * ID is always 64 bits internally. The lower 32 bits contain an ID value.
 * The upper 32 bits contain a generation number and flags. When the ID value
 * wraps, the generation number is incremented during ID generation. This
 * protects callers from ID collisions that can happen as a result of the wrap.
 *
 * The kernel internal interface, timeout_generic(), always returns a
 * callout_id_t. But the legacy interfaces, timeout() and realtime_timeout(),
 * return a timeout_id_t. On a 64-bit system, timeout_id_t is also 64 bits.
 * So, the full 64-bit ID (sans the flags) can be returned. However, on 32-bit
 * systems, timeout_id_t is 32 bits. So, only the lower 32 bits can be
 * returned. In such cases, a default generation number of 0 is assigned to
 * the legacy IDs.
 *
 * The lower 32-bit ID space is partitioned into two spaces - one for 32-bit
 * IDs and the other for 64-bit IDs. The 32-bit ID space is further divided
 * into two spaces - one for short-term callouts and one for long-term.
 *
 * Here is the bit layout for the callout ID:
 *
 *      63    62    61  ...  32   31       30    29 .. X+1  X ... 1   0
 *  -----------------------------------------------------------------------
 *  | Exec | Hres | Generation | Long | Counter | ID bits | Table  | Type |
 *  |      | time | number     | term | High    |         | number |      |
 *  -----------------------------------------------------------------------
 *
 * Exec(uting):
 *    This is the executing bit which is only set in the extended callout
 *    ID. This bit indicates that the callout handler is currently being
 *    executed.
 *
 * Hrestime:
 *    Kernel features like condition variables use hrestime (system date) in
 *    conjunction with callouts. Under normal circumstances, these callouts
 *    are handled in the usual manner. They go off at specified times. But
 *    when the system time is changed abruptly (e.g., via stime()), these
 *    callouts are required to be processed immediately so that they can
 *    wake up their threads immediately. The Hrestime bit is used to mark
 *    such callouts. When the system time is changed, the callout subsystem
 *    is called to process all callouts with this bit set.
 *
 * Generation number:
 *    This is the generation part of the ID.
 *
 * Long term:
 *    This bit indicates whether this is a short-term or a long-term callout.
 *    The long-term bit exists to address the problem of callout ID collision
 *    on 32-bit systems. This is an issue because the system typically
 *    generates a large number of timeout() requests, which means that callout
 *    IDs eventually get recycled. Most timeouts are very short-lived, so that
 *    ID recycling isn't a problem; but there are a handful of timeouts which
 *    are sufficiently long-lived to see their own IDs reused. We use the
 *    long-term bit to partition the ID namespace into pieces; the short-term
 *    space gets all the heavy traffic and can wrap frequently (i.e., on the
 *    order of a day) with no ill effects; the long-term space gets very little
 *    traffic and thus never wraps. That said, we need to future proof callouts
 *    in case 32-bit systems grow in size and are able to consume callout IDs
 *    at faster rates. So, all kernel clients that use callouts should be
 *    converted to the internal interface so that they can use IDs outside
 *    of the legacy space with a proper generation number.
 *
 * Counter High + ID counter bits:
 *    These bits represent the actual ID bits in the callout ID.
 *    The highest bit of the running counter is always set; this ensures that
 *    the callout ID is always non-zero, thus eliminating the need for an
 *    explicit wrap-around test during ID generation.
 *
 * Table number:
 *    These bits carry the table number for the callout table where the callout
 *    is queued. Each CPU has its own callout table. So, the callout tables are
 *    numbered from 0 - (max_ncpus - 1). Because max_ncpus is different on
 *    different systems, the actual number of table number bits will vary
 *    accordingly. And so will the ID counter bits.
 *
 * Type:
 *    This bit represents the callout (table) type. Each CPU has one realtime
 *    and one normal callout table.
 */
#define	CALLOUT_EXECUTING	0x8000000000000000ULL
#define	CALLOUT_HRESTIME	0x4000000000000000ULL
/* Everything except the two flag bits; fully parenthesized for safe reuse. */
#define	CALLOUT_ID_MASK		(~(CALLOUT_EXECUTING | CALLOUT_HRESTIME))
#define	CALLOUT_GENERATION_LOW	0x100000000ULL
#define	CALLOUT_LONGTERM	0x80000000
#define	CALLOUT_COUNTER_HIGH	0x40000000
#define	CALLOUT_TYPE_BITS	1
#define	CALLOUT_NTYPES		(1 << CALLOUT_TYPE_BITS)
#define	CALLOUT_TYPE_MASK	(CALLOUT_NTYPES - 1)
/* callout_table_bits and callout_table are declared outside this header. */
#define	CALLOUT_COUNTER_SHIFT	callout_table_bits
/* Table index from a type (t) and a sequence ID (f): type in the low bit. */
#define	CALLOUT_TABLE(t, f)	(((f) << CALLOUT_TYPE_BITS) | (t))
/* Inverse mappings, starting from a pointer into the callout_table array. */
#define	CALLOUT_TABLE_NUM(ct)	((ct) - callout_table)
#define	CALLOUT_TABLE_TYPE(ct)	(CALLOUT_TABLE_NUM(ct) & CALLOUT_TYPE_MASK)
#define	CALLOUT_TABLE_SEQID(ct)	(CALLOUT_TABLE_NUM(ct) >> CALLOUT_TYPE_BITS)
159
/*
 * We assume that during any period of CALLOUT_LONGTERM_TICKS ticks, at most
 * (CALLOUT_COUNTER_HIGH / callout_counter_low) callouts will be generated.
 */
#define	CALLOUT_LONGTERM_TICKS	0x4000UL
#define	CALLOUT_BUCKET_SHIFT	9
#define	CALLOUT_BUCKETS		(1 << CALLOUT_BUCKET_SHIFT)
#define	CALLOUT_BUCKET_MASK	(CALLOUT_BUCKETS - 1)
/* Reduce a value to a bucket index in [0, CALLOUT_BUCKETS). */
#define	CALLOUT_HASH(x)		((x) & CALLOUT_BUCKET_MASK)
/* Hash a callout ID: drop the low CALLOUT_COUNTER_SHIFT bits first. */
#define	CALLOUT_IDHASH(x)	CALLOUT_HASH((x) >> CALLOUT_COUNTER_SHIFT)

/*
 * Callout list hash: XOR four successive CALLOUT_BUCKET_SHIFT-bit slices of
 * the value together, then reduce to a bucket index.
 *
 * The multiplications by 0 and 1 below are cosmetic; they only align the
 * terms and make them more readable. They are evaluated at compile time.
 */
#define	CALLOUT_CLHASH(x)				\
	CALLOUT_HASH(					\
	    ((x) >> (CALLOUT_BUCKET_SHIFT * 0)) ^	\
	    ((x) >> (CALLOUT_BUCKET_SHIFT * 1)) ^	\
	    ((x) >> (CALLOUT_BUCKET_SHIFT * 2)) ^	\
	    ((x) >> (CALLOUT_BUCKET_SHIFT * 3)))
181
/*
 * Extract the table bits of a callout ID. callout_table_mask is declared
 * outside this header.
 */
#define	CALLOUT_ID_TO_TABLE(id)		((id) & callout_table_mask)

/*
 * Build an ID value for the given table: the table number with the
 * counter-high bit set. The long-term variant also sets the long-term bit.
 */
#define	CALLOUT_SHORT_ID(table)		\
		((callout_id_t)(table) | CALLOUT_COUNTER_HIGH)
#define	CALLOUT_LONG_ID(table)		\
		(CALLOUT_SHORT_ID(table) | CALLOUT_LONGTERM)

#define	CALLOUT_THREADS		2		/* keep it simple for now */

#define	CALLOUT_REALTIME	0		/* realtime callout type */
#define	CALLOUT_NORMAL		1		/* normal callout type */
193
194/*
195 * callout_t's are cache-aligned structures allocated from kmem caches. One kmem
196 * cache is created per lgrp and is shared by all CPUs in that lgrp. Benefits:
197 *	- cache pages are mapped only in the TLBs of the CPUs of the lgrp
198 *	- data in cache pages is present only in those CPU caches
199 *	- memory access performance improves with locality-awareness in kmem
200 *
201 * The following structure is used to manage per-lgroup kmem caches.
202 *
203 * NOTE: Free callout_t's go to a callout table's freelist. CPUs map to callout
204 * tables via their sequence IDs, not CPU IDs. DR operations can cause a
205 * free list to have callouts from multiple lgrp caches. This takes away some
206 * performance, but is no worse than if we did not use lgrp caches at all.
207 */
208typedef struct callout_cache {
209	struct callout_cache	*cc_next;	/* link in the global list */
210	lgrp_handle_t		cc_hand;	/* lgroup handle */
211	kmem_cache_t		*cc_cache;	/* kmem cache pointer */
212	kmem_cache_t		*cc_lcache;	/* kmem cache pointer */
213} callout_cache_t;
214
/*
 * The callout hash structure is used for queueing both callouts and
 * callout lists. That is why the fields are declared as void *.
 */
typedef struct callout_hash {
	void	*ch_head;	/* first element of the queue */
	void	*ch_tail;	/* last element of the queue */
} callout_hash_t;
223
224struct callout_list {
225	callout_list_t	*cl_next;	/* next in clhash */
226	callout_list_t	*cl_prev;	/* prev in clhash */
227	hrtime_t	cl_expiration;	/* expiration for callouts in list */
228	callout_hash_t	cl_callouts;	/* list of callouts */
229	kcondvar_t	cl_done;	/* signal callout completion */
230	ushort_t	cl_waiting;	/* count of waiting untimeouts */
231	kthread_id_t	cl_executor;	/* thread executing callout */
232	ulong_t		cl_pad;		/* cache alignment */
233};
234
/*
 * Per-callout table kstats.
 *
 * CALLOUT_TIMEOUTS
 *	Callouts created since boot.
 * CALLOUT_TIMEOUTS_PENDING
 *	Number of outstanding callouts.
 * CALLOUT_UNTIMEOUTS_UNEXPIRED
 *	Number of cancelled callouts that have not expired.
 * CALLOUT_UNTIMEOUTS_EXECUTING
 *	Number of cancelled callouts that were executing at the time of
 *	cancellation.
 * CALLOUT_UNTIMEOUTS_EXPIRED
 *	Number of cancelled callouts that had already expired at the time
 *	of cancellation.
 * CALLOUT_EXPIRATIONS
 *	Number of callouts that expired.
 * CALLOUT_ALLOCATIONS
 *	Number of callout structures allocated.
 *
 * CALLOUT_NUM_STATS is not a statistic; it is the count of the entries
 * above and must stay last.
 */
typedef enum callout_stat_type {
	CALLOUT_TIMEOUTS,
	CALLOUT_TIMEOUTS_PENDING,
	CALLOUT_UNTIMEOUTS_UNEXPIRED,
	CALLOUT_UNTIMEOUTS_EXECUTING,
	CALLOUT_UNTIMEOUTS_EXPIRED,
	CALLOUT_EXPIRATIONS,
	CALLOUT_ALLOCATIONS,
	CALLOUT_NUM_STATS
} callout_stat_type_t;
265
/*
 * Callout flags:
 *
 * CALLOUT_FLAG_ROUNDUP
 *	Round up the expiration time to the nearest resolution boundary.
 *	If this flag is not specified, the expiration time is rounded down.
 * CALLOUT_FLAG_ABSOLUTE
 *	Normally, the expiration passed to the timeout API functions is an
 *	expiration interval. If this flag is specified, then it is
 *	interpreted as the expiration time itself.
 * CALLOUT_FLAG_HRESTIME
 *	Normally, callouts are not affected by changes to system time
 *	(hrestime). This flag is used to create a callout that is affected
 *	by system time. If system time changes, these timers must expire
 *	at once. These are used by condition variables and LWP timers that
 *	need this behavior.
 * CALLOUT_FLAG_32BIT
 *	Legacy interfaces timeout() and realtime_timeout() pass this flag
 *	to timeout_generic() to indicate that a 32-bit ID should be allocated.
 *
 * Each flag occupies its own bit, so flags can be OR-ed together.
 */
#define	CALLOUT_FLAG_ROUNDUP		0x1
#define	CALLOUT_FLAG_ABSOLUTE		0x2
#define	CALLOUT_FLAG_HRESTIME		0x4
#define	CALLOUT_FLAG_32BIT		0x8

/*
 * On 32-bit systems, the legacy interfaces, timeout() and realtime_timeout(),
 * must pass CALLOUT_FLAG_32BIT to timeout_generic() so that a 32-bit ID
 * can be generated. On 64-bit (_LP64) systems no extra flag is needed.
 */
#ifdef _LP64
#define	CALLOUT_LEGACY		0
#else
#define	CALLOUT_LEGACY		CALLOUT_FLAG_32BIT
#endif
301
302/*
303 * All of the state information associated with a callout table.
304 * The fields are ordered with cache performance in mind.
305 */
306typedef struct callout_table {
307	kmutex_t	ct_mutex;	/* protects all callout state */
308	callout_t	*ct_free;	/* free callout structures */
309	callout_list_t	*ct_lfree;	/* free callout list structures */
310	callout_id_t	ct_short_id;	/* most recently issued short-term ID */
311	callout_id_t	ct_long_id;	/* most recently issued long-term ID */
312	callout_hash_t 	*ct_idhash;	/* ID hash chains */
313	callout_hash_t 	*ct_clhash;	/* callout list hash */
314	kstat_named_t	*ct_kstat_data;	/* callout kstat data */
315
316	uint_t		ct_type;	/* callout table type */
317	uint_t		ct_suspend;	/* suspend count */
318	cyclic_id_t	ct_cyclic;	/* cyclic for this table */
319	hrtime_t	*ct_heap;	/* callout expiration heap */
320	ulong_t		ct_heap_num;	/* occupied slots in the heap */
321	ulong_t		ct_heap_max;	/* end of the heap */
322	kmem_cache_t	*ct_cache;	/* callout kmem cache */
323	kmem_cache_t	*ct_lcache;	/* callout list kmem cache */
324	callout_id_t	ct_gen_id;	/* generation based ID */
325
326	callout_hash_t	ct_expired;	/* list of expired callout lists */
327	taskq_t		*ct_taskq;	/* taskq to execute normal callouts */
328	kstat_t		*ct_kstats;	/* callout kstats */
329#ifdef _LP64
330	ulong_t		ct_pad[4];	/* cache alignment */
331#else
332	ulong_t		ct_pad[7];	/* cache alignment */
333#endif
334} callout_table_t;
335
/*
 * Short hand definitions for the callout kstats.
 *
 * Each name expands to the 64-bit value of one slot of ct_kstat_data, so it
 * is meant to be used as a member of a callout table, e.g.
 * ct->ct_timeouts++.
 */
#define	ct_timeouts							\
		ct_kstat_data[CALLOUT_TIMEOUTS].value.ui64
#define	ct_timeouts_pending						\
		ct_kstat_data[CALLOUT_TIMEOUTS_PENDING].value.ui64
#define	ct_untimeouts_unexpired						\
		ct_kstat_data[CALLOUT_UNTIMEOUTS_UNEXPIRED].value.ui64
#define	ct_untimeouts_executing						\
		ct_kstat_data[CALLOUT_UNTIMEOUTS_EXECUTING].value.ui64
#define	ct_untimeouts_expired						\
		ct_kstat_data[CALLOUT_UNTIMEOUTS_EXPIRED].value.ui64
#define	ct_expirations							\
		ct_kstat_data[CALLOUT_EXPIRATIONS].value.ui64
#define	ct_allocations							\
		ct_kstat_data[CALLOUT_ALLOCATIONS].value.ui64
353
#define	CALLOUT_CHUNK	128

/*
 * Index arithmetic for a binary heap stored in an array with the root at
 * index 0: the children of i are 2i + 1 (left) and 2i + 2 (right).
 * CALLOUT_HEAP_PARENT must not be applied to the root (index 0).
 */
#define	CALLOUT_HEAP_PARENT(index)	(((index) - 1) >> 1)
#define	CALLOUT_HEAP_RIGHT(index)	(((index) + 1) << 1)
#define	CALLOUT_HEAP_LEFT(index)	((((index) + 1) << 1) - 1)

/*
 * Select the handler for a callout table type. The argument is
 * parenthesized so that an expression such as (a & b) is compared as a
 * whole instead of being split by operator precedence.
 */
#define	CALLOUT_CYCLIC_HANDLER(t)					\
	(((t) == CALLOUT_REALTIME) ? callout_realtime : callout_normal)
362
/*
 * We define a blanket minimum resolution for callouts of 1 millisecond.
 * 1 millisecond is a safe value as it is already supported when the clock
 * resolution is set to high.
 *
 * Both values are in nanoseconds: 1000000 ns is 1 millisecond and
 * 10000000 ns is 10 milliseconds.
 * NOTE(review): CALLOUT_TCP_RESOLUTION is presumably the resolution used for
 * TCP timers - confirm against its users.
 */
#define	CALLOUT_MIN_RESOLUTION		1000000ULL
#define	CALLOUT_TCP_RESOLUTION		10000000ULL

#define	CALLOUT_ALIGN	64	/* cache line size */
372
/*
 * Largest tick count, per the macro name (NOTE(review): confirm the exact
 * use in callout.c). The definition deliberately has no trailing semicolon
 * so that the macro can be used inside expressions and comparisons.
 */
#ifdef _LP64
#define	CALLOUT_MAX_TICKS	NSEC_TO_TICK(CY_INFINITY)
#else
#define	CALLOUT_MAX_TICKS	LONG_MAX
#endif
378
379extern void		callout_init(void);
380extern void		membar_sync(void);
381extern void		callout_cpu_online(cpu_t *);
382extern void		callout_cpu_offline(cpu_t *);
383extern void		callout_hrestime(void);
384
385#endif
386
387#ifdef	__cplusplus
388}
389#endif
390
391#endif	/* _SYS_CALLO_H */
392