1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#pragma ident	"%Z%%M%	%I%	%E% SMI"
28
29#include <sys/types.h>
30#include <sys/systm.h>
31#include <sys/cred.h>
32#include <sys/modctl.h>
33#include <sys/vfs.h>
34#include <sys/vfs_opreg.h>
35#include <sys/sysmacros.h>
36#include <sys/cmn_err.h>
37#include <sys/stat.h>
38#include <sys/errno.h>
39#include <sys/kmem.h>
40#include <sys/file.h>
41#include <sys/kstat.h>
42#include <sys/port_impl.h>
43#include <sys/task.h>
44#include <sys/project.h>
45
46/*
47 * Event Ports can be shared across threads or across processes.
 * Every thread/process can use its own event port or a group of them
49 * can use a single port. A major request was also to get the ability
50 * to submit user-defined events to a port. The idea of the
51 * user-defined events is to use the event ports for communication between
52 * threads/processes (like message queues). User defined-events are queued
53 * in a port with the same priority as other event types.
54 *
55 * Events are delivered only once. The thread/process which is waiting
56 * for events with the "highest priority" (priority here is related to the
57 * internal strategy to wakeup waiting threads) will retrieve the event,
58 * all other threads/processes will not be notified. There is also
59 * the requirement to have events which should be submitted immediately
60 * to all "waiting" threads. That is the main task of the alert event.
61 * The alert event is submitted by the application to a port. The port
62 * changes from a standard mode to the alert mode. Now all waiting threads
 * will be awakened immediately and they will return with the alert event.
64 * Threads trying to retrieve events from a port in alert mode will
65 * return immediately with the alert event.
66 *
67 *
 * An event port is like a kernel queue, which accepts events submitted from
69 * user level as well as events submitted from kernel sub-systems. Sub-systems
70 * able to submit events to a port are the so-called "event sources".
71 * Current event sources:
72 * PORT_SOURCE_AIO	 : events submitted per transaction completion from
73 *			   POSIX-I/O framework.
74 * PORT_SOURCE_TIMER	 : events submitted when a timer fires
75 *			   (see timer_create(3RT)).
76 * PORT_SOURCE_FD	 : events submitted per file descriptor (see poll(2)).
77 * PORT_SOURCE_ALERT	 : events submitted from user. This is not really a
78 *			   single event, this is actually a port mode
79 *			   (see port_alert(3c)).
80 * PORT_SOURCE_USER	 : events submitted by applications with
81 *			   port_send(3c) or port_sendn(3c).
82 * PORT_SOURCE_FILE	 : events submitted per file being watched for file
 *			   change events (see port_create(3c)).
84 *
85 * There is a user API implemented in the libc library as well as a
86 * kernel API implemented in port_subr.c in genunix.
87 * The available user API functions are:
88 * port_create() : create a port as a file descriptor of portfs file system
89 *		   The standard close(2) function closes a port.
90 * port_associate() : associate a file descriptor with a port to be able to
91 *		      retrieve events from that file descriptor.
92 * port_dissociate(): remove the association of a file descriptor with a port.
93 * port_alert()	 : set/unset a port in alert mode
94 * port_send()	 : send an event of type PORT_SOURCE_USER to a port
95 * port_sendn()	 : send an event of type PORT_SOURCE_USER to a list of ports
96 * port_get()	 : retrieve a single event from a port
97 * port_getn()	 : retrieve a list of events from a port
98 *
99 * The available kernel API functions are:
 * port_alloc_event()   : allocate an event slot/structure of/from a port
101 * port_init_event()    : set event data in the event structure
102 * port_send_event()    : send event to a port
103 * port_free_event()    : deliver allocated slot/structure back to a port
104 * port_associate_ksource(): associate a kernel event source with a port
105 * port_dissociate_ksource(): dissociate a kernel event source from a port
106 *
107 * The libc implementation consists of small functions which pass the
108 * arguments to the kernel using the "portfs" system call. It means, all the
109 * synchronisation work is being done in the kernel. The "portfs" system
110 * call loads the portfs file system into the kernel.
111 *
112 * PORT CREATION
113 * The first function to be used is port_create() which internally creates
114 * a vnode and a portfs node. The portfs node is represented by the port_t
115 * structure, which again includes all the data necessary to control a port.
116 * port_create() returns a file descriptor, which needs to be used in almost
117 * all other event port functions.
118 * The maximum number of ports per system is controlled by the resource
 * control: project.port-max-ids.
120 *
121 * EVENT GENERATION
122 * The second step is the triggering of events, which could be sent to a port.
 * Every event source implements its own method to generate events for a port:
124 * PORT_SOURCE_AIO:
125 * 	The sigevent structure of the standard POSIX-IO functions
126 * 	was extended by an additional notification type.
127 * 	Standard notification types:
128 * 	SIGEV_NONE, SIGEV_SIGNAL and SIGEV_THREAD
129 * 	Event ports introduced now SIGEV_PORT.
130 * 	The notification type SIGEV_PORT specifies that a structure
131 * 	of type port_notify_t has to be attached to the sigev_value.
132 * 	The port_notify_t structure contains the event port file
133 * 	descriptor and a user-defined pointer.
134 * 	Internally the AIO implementation will use the kernel API
135 * 	functions to allocate an event port slot per transaction (aiocb)
 * 	and send the event to the port as soon as the transaction completes.
137 * 	All the events submitted per transaction are of type
138 * 	PORT_SOURCE_AIO.
139 * PORT_SOURCE_TIMER:
140 * 	The timer_create() function uses the same method as the
141 * 	PORT_SOURCE_AIO event source. It also uses the sigevent structure
142 * 	to deliver the port information.
143 * 	Internally the timer code will allocate a single event slot/struct
144 * 	per timer and it will send the timer event as soon as the timer
145 * 	fires. If the timer-fired event is not delivered to the application
146 * 	before the next period elapsed, then an overrun counter will be
147 * 	incremented. The timer event source uses a callback function to
148 * 	detect the delivery of the event to the application. At that time
149 * 	the timer callback function will update the event overrun counter.
150 * PORT_SOURCE_FD:
151 * 	This event source uses the port_associate() function to allocate
152 * 	an event slot/struct from a port. The application defines in the
153 * 	events argument of port_associate() the type of events which it is
 * 	interested in.
155 * 	The internal pollwakeup() function is used by all the file
 * 	systems -- which are supporting the VOP_POLL() interface -- to notify
157 * 	the upper layer (poll(2), devpoll(7d) and now event ports) about
158 * 	the event triggered (see valid events in poll(2)).
159 * 	The pollwakeup() function forwards the event to the layer registered
160 * 	to receive the current event.
161 * 	The port_dissociate() function can be used to free the allocated
162 * 	event slot from the port. Anyway, file descriptors deliver events
163 * 	only one time and remain deactivated until the application
164 * 	reactivates the association of a file descriptor with port_associate().
165 * 	If an associated file descriptor is closed then the file descriptor
166 * 	will be dissociated automatically from the port.
167 *
168 * PORT_SOURCE_ALERT:
169 * 	This event type is generated when the port was previously set in
170 * 	alert mode using the port_alert() function.
171 * 	A single alert event is delivered to every thread which tries to
172 * 	retrieve events from a port.
173 * PORT_SOURCE_USER:
174 * 	This type of event is generated from user level using the port_send()
175 * 	function to send a user event to a port or the port_sendn() function
176 * 	to send an event to a list of ports.
177 * PORT_SOURCE_FILE:
178 *	This event source uses the port_associate() interface to register
179 *	a file to be monitored for changes. The file name that needs to be
180 *	monitored is specified in the file_obj_t structure, a pointer to which
181 *	is passed as an argument. The event types to be monitored are specified
182 *	in the events argument.
 *	A file events monitor is represented internally per port per object
184 *	address(the file_obj_t pointer). Which means there can be multiple
185 *	watches registered on the same file using different file_obj_t
186 *	structure pointer. With the help of the	FEM(File Event Monitoring)
187 *	hooks, the file's vnode ops are intercepted and relevant events
188 *	delivered. The port_dissociate() function is used to de-register a
189 *	file events monitor on a file. When the specified file is
190 *	removed/renamed, the file events watch/monitor is automatically
191 *	removed.
192 *
193 * EVENT DELIVERY / RETRIEVING EVENTS
194 * Events remain in the port queue until:
195 * - the application uses port_get() or port_getn() to retrieve events,
 * - the event source cancels the event,
197 * - the event port is closed or
198 * - the process exits.
199 * The maximal number of events in a port queue is the maximal number
200 * of event slots/structures which can be allocated by event sources.
201 * The allocation of event slots/structures is controlled by the resource
202 * control: process.port-max-events.
203 * The port_get() function retrieves a single event and the port_getn()
204 * function retrieves a list of events.
205 * Events are classified as shareable and non-shareable events across processes.
206 * Non-shareable events are invisible for the port_get(n)() functions of
207 * processes other than the owner of the event.
208 *    Shareable event types are:
209 *    PORT_SOURCE_USER events
210 * 	This type of event is unconditionally shareable and without
211 * 	limitations. If the parent process sends a user event and closes
212 * 	the port afterwards, the event remains in the port and the child
213 * 	process will still be able to retrieve the user event.
214 *    PORT_SOURCE_ALERT events
215 * 	This type of event is shareable between processes.
216 * 	Limitation:	The alert mode of the port is removed if the owner
217 * 			(process which set the port in alert mode) of the
218 * 			alert event closes the port.
219 *    PORT_SOURCE_FD events
 * 	This type of event is conditionally shareable between processes.
221 * 	After fork(2) all forked file descriptors are shareable between
222 * 	the processes. The child process is allowed to retrieve events
223 * 	from the associated file descriptors and it can also re-associate
224 * 	the fd with the port.
225 * 	Limitations:	The child process is not allowed to dissociate
226 * 			the file descriptor from the port. Only the
227 * 			owner (process) of the association is allowed to
228 * 			dissociate the file descriptor from the port.
229 * 			If the owner of the association closes the port
230 * 			the association will be removed.
231 *    PORT_SOURCE_AIO events
232 * 	This type of event is not shareable between processes.
233 *    PORT_SOURCE_TIMER events
234 * 	This type of event is not shareable between processes.
235 *    PORT_SOURCE_FILE events
236 * 	This type of event is not shareable between processes.
237 *
238 * FORK BEHAVIOUR
239 * On fork(2) the child process inherits all opened file descriptors from
240 * the parent process. This is also valid for port file descriptors.
241 * Associated file descriptors with a port maintain the association across the
242 * fork(2). It means, the child process gets full access to the port and
243 * it can retrieve events from all common associated file descriptors.
244 * Events of file descriptors created and associated with a port after the
245 * fork(2) are non-shareable and can only be retrieved by the same process.
246 *
247 * If the parent or the child process closes an exported port (using fork(2)
248 * or I_SENDFD) all the file descriptors associated with the port by the
249 * process will be dissociated from the port. Events of dissociated file
250 * descriptors as well as all non-shareable events will be discarded.
251 * The other process can continue working with the port as usual.
252 *
253 * CLOSING A PORT
254 * close(2) has to be used to close a port. See FORK BEHAVIOUR for details.
255 *
256 * PORT EVENT STRUCTURES
257 * The global control structure of the event ports framework is port_control_t.
258 * port_control_t keeps track of the number of created ports in the system.
259 * The cache of the port event structures is also located in port_control_t.
260 *
261 * On port_create() the vnode and the portfs node is also created.
262 * The portfs node is represented by the port_t structure.
263 * The port_t structure manages all port specific tasks:
264 * - management of resource control values
265 * - port VOP_POLL interface
266 * - creation time
267 * - uid and gid of the port
268 *
269 * The port_t structure contains the port_queue_t structure.
270 * The port_queue_t structure contains all the data necessary for the
271 * queue management:
272 * - locking
273 * - condition variables
274 * - event counters
275 * - submitted events	(represented by port_kevent_t structures)
276 * - threads waiting for event delivery (check portget_t structure)
277 * - PORT_SOURCE_FD cache	(managed by the port_fdcache_t structure)
278 * - event source management (managed by the port_source_t structure)
279 * - alert mode management	(check port_alert_t structure)
280 *
281 * EVENT MANAGEMENT
282 * The event port file system creates a kmem_cache for internal allocation of
283 * event port structures.
284 *
285 * 1. Event source association with a port:
286 * The first step to do for event sources is to get associated with a port
287 * using the port_associate_ksource() function or adding an entry to the
288 * port_ksource_tab[]. An event source can get dissociated from a port
289 * using the port_dissociate_ksource() function. An entry in the
290 * port_ksource_tab[] implies that the source will be associated
291 * automatically with every new created port.
292 * The event source can deliver a callback function, which is used by the
293 * port to notify the event source about close(2). The idea is that
294 * in such a case the event source should free all allocated resources
295 * and it must return to the port all allocated slots/structures.
296 * The port_close() function will wait until all allocated event
297 * structures/slots are returned to the port.
298 * The callback function is not necessary when the event source does not
299 * maintain local resources, a second condition is that the event source
300 * can guarantee that allocated event slots will be returned without
301 * delay to the port (it will not block and sleep somewhere).
302 *
303 * 2. Reservation of an event slot / event structure
304 * The event port reliability is based on the reservation of an event "slot"
305 * (allocation of an event structure) by the event source as part of the
306 * application call. If the maximal number of event slots is exhausted then
307 * the event source can return a corresponding error code to the application.
308 *
309 * The port_alloc_event() function has to be used by event sources to
310 * allocate an event slot (reserve an event structure). The port_alloc_event()
 * does not block and it will return a 0 value on success or an error code
312 * if it fails.
313 * An argument of port_alloc_event() is a flag which determines the behavior
314 * of the event after it was delivered to the application:
315 * PORT_ALLOC_DEFAULT	: event slot becomes free after delivery to the
316 *			  application.
317 * PORT_ALLOC_PRIVATE	: event slot remains under the control of the event
318 *			  source. This kind of slots can not be used for
319 *			  event delivery and should only be used internally
320 *			  by the event source.
 * PORT_ALLOC_CACHED	: event slot remains under the control of an event
322 *			  port cache. It does not become free after delivery
323 *			  to the application.
324 * PORT_ALLOC_SCACHED	: event slot remains under the control of the event
325 *			  source. The event source takes the control over
326 *			  the slot after the event is delivered to the
327 *			  application.
328 *
329 * 3. Delivery of events to the event port
330 * Earlier allocated event structure/slot has to be used to deliver
331 * event data to the port. Event source has to use the function
332 * port_send_event(). The single argument is a pointer to the previously
333 * reserved event structure/slot.
334 * The portkev_events field of the port_kevent_t structure can be updated/set
335 * in two ways:
336 * 1. using the port_set_event() function, or
337 * 2. updating the portkev_events field out of the callback function:
338 *    The event source can deliver a callback function to the port as an
339 *    argument of port_init_event().
340 *    One of the arguments of the callback function is a pointer to the
341 *    events field, which will be delivered to the application.
342 *    (see Delivery of events to the application).
343 * Event structures/slots can be delivered to the event port only one time,
344 * they remain blocked until the data is delivered to the application and the
345 * slot becomes free or it is delivered back to the event source
346 * (PORT_ALLOC_SCACHED). The activation of the callback function mentioned above
347 * is at the same time the indicator for the event source that the event
348 * structure/slot is free for reuse.
349 *
350 * 4. Delivery of events to the application
351 * The events structures/slots delivered by event sources remain in the
352 * port queue until they are retrieved by the application or the port
 * is closed (exit(2) also closes all opened file descriptors).
354 * The application uses port_get() or port_getn() to retrieve events from
355 * a port. port_get() retrieves a single event structure/slot and port_getn()
356 * retrieves a list of event structures/slots.
357 * Both functions are able to poll for events and return immediately or they
358 * can specify a timeout value.
359 * Before the events are delivered to the application they are moved to a
360 * second temporary internal queue. The idea is to avoid lock collisions or
361 * contentions of the global queue lock.
362 * The global queue lock is used every time when an event source delivers
363 * new events to the port.
364 * The port_get() and port_getn() functions
365 * a) retrieve single events from the temporary queue,
366 * b) prepare the data to be passed to the application memory,
367 * c) activate the callback function of the event sources:
368 *    - to get the latest event data,
369 *    - the event source can free all allocated resources associated with the
370 *      current event,
371 *    - the event source can re-use the current event slot/structure
372 *    - the event source can deny the delivery of the event to the application
373 *      (e.g. because of the wrong process).
374 * d) put the event back to the temporary queue if the event delivery was denied
375 * e) repeat a) until d) as long as there are events in the queue and
376 *    there is enough user space available.
377 *
 * The loop described above could block the global mutex for a very long time.
 * To avoid that, a second mutex was introduced to synchronize concurrent
 * threads accessing the temporary queue.
381 */
382
/* Forward declaration of the native system call handler defined below. */
static int64_t portfs(int, uintptr_t, uintptr_t, uintptr_t, uintptr_t,
    uintptr_t);

/*
 * System call entry describing the native portfs() handler.
 * The leading 6 matches the number of parameters portfs() takes
 * (presumably the argument count field of struct sysent -- confirm against
 * the struct sysent declaration).
 */
static struct sysent port_sysent = {
	6,
	SE_ARGC | SE_64RVAL | SE_NOUNLOAD,
	(int (*)())portfs,
};
391
/*
 * Module linkage element for the native system call; it ties the
 * "event ports" description string to port_sysent.
 */
static struct modlsys modlsys = {
	&mod_syscallops, "event ports", &port_sysent
};
395
/*
 * 32-bit system call support; only built when _SYSCALL32_IMPL is defined.
 */
#ifdef _SYSCALL32_IMPL

/* Forward declaration of the wrapper used by 32-bit programs. */
static int64_t
portfs32(uint32_t arg1, int32_t arg2, uint32_t arg3, uint32_t arg4,
    uint32_t arg5, uint32_t arg6);

/* System call entry for 32-bit callers; same layout as port_sysent. */
static struct sysent port_sysent32 = {
	6,
	SE_ARGC | SE_64RVAL | SE_NOUNLOAD,
	(int (*)())portfs32,
};

/* Module linkage element for the 32-bit system call entry. */
static struct modlsys modlsys32 = {
	&mod_syscallops32,
	"32-bit event ports syscalls",
	&port_sysent32
};
#endif	/* _SYSCALL32_IMPL */
414
/*
 * Module linkage handed to mod_install() in _init() and to mod_info()
 * in _info(). It lists the native system call element and, when
 * _SYSCALL32_IMPL is defined, the 32-bit one; the list is NULL terminated.
 */
static struct modlinkage modlinkage = {
	MODREV_1,
	&modlsys,
#ifdef _SYSCALL32_IMPL
	&modlsys32,
#endif
	NULL
};
423
/*
 * Named kstat data of the event ports framework.
 * The "ports" counter is incremented by port_create() for every port
 * that is successfully created.
 */
port_kstat_t port_kstat = {
	{ "ports",	KSTAT_DATA_UINT32 }
};
427
/*
 * Global portfs state, all set up in _init():
 * portdev	 : device number built from the major returned by getudev()
 * port_vnodeops : vnode operations created by vn_make_ops("portfs", ...)
 * port_vfs	 : dummy vfs every port vnode points to (see port_create())
 */
dev_t	portdev;
struct	vnodeops *port_vnodeops;
struct	vfs port_vfs;

/*
 * Symbols defined outside of this file: the resource control handles
 * consulted in port_create() and the AIO close callback referenced from
 * port_ksource_tab[].
 */
extern	rctl_hndl_t rc_process_portev;
extern	rctl_hndl_t rc_project_portids;
extern	void aio_close_port(void *, int, pid_t, int);
435
/*
 * This table contains the list of event sources which need a static
 * association with a port (every port). port_init() walks the table for
 * each new port and hands every entry to port_add_ksource_local().
 * The last entry is required to detect "end of table": the walk stops at
 * the first entry whose source field is 0.
 */
struct port_ksource port_ksource_tab[] = {
	{PORT_SOURCE_AIO, aio_close_port, NULL, NULL},
	{0, NULL, NULL, NULL}
};
445
/*
 * Prototypes of the functions local to this file.
 * portfs() dispatches to port_getn(), port_sendn(), port_alert(),
 * port_dispatch_event(), port_send(), port_create() and
 * port_remove_alert(); port_create() in turn calls port_init().
 */
static int port_getn(port_t *, port_event_t *, uint_t, uint_t *,
    port_gettimer_t *);
static int port_sendn(int [], int [], uint_t, int, void *, uint_t *);
static int port_alert(port_t *, int, int, void *);
static int port_dispatch_event(port_t *, int, int, int, uintptr_t, void *);
static int port_send(port_t *, int, int, void *);
static int port_create(int *);
static int port_get_alert(port_alert_t *, port_event_t *);
static int port_copy_event(port_event_t *, port_kevent_t *, list_t *);
static int *port_errorn(int *, int, int, int);
static int port_noshare(void *, int *, pid_t, int, void *);
static int port_get_timeout(timespec_t *, timespec_t *, timespec_t **, int *,
    int);
static void port_init(port_t *);
static void port_remove_alert(port_queue_t *);
static void port_add_ksource_local(port_t *, port_ksource_t *);
static void port_check_return_cond(port_queue_t *);
static void port_dequeue_thread(port_queue_t *, portget_t *);
static portget_t *port_queue_thread(port_queue_t *, uint_t);
static void port_kstat_init(void);

/* Event copy routine taking the 32-bit event layout (port_event32_t). */
#ifdef	_SYSCALL32_IMPL
static int port_copy_event32(port_event32_t *, port_kevent_t *, list_t *);
#endif
471
472int
473_init(void)
474{
475	static const fs_operation_def_t port_vfsops_template[] = {
476		NULL, NULL
477	};
478	extern const	fs_operation_def_t port_vnodeops_template[];
479	vfsops_t	*port_vfsops;
480	int		error;
481	major_t 	major;
482
483	if ((major = getudev()) == (major_t)-1)
484		return (ENXIO);
485	portdev = makedevice(major, 0);
486
487	/* Create a dummy vfs */
488	error = vfs_makefsops(port_vfsops_template, &port_vfsops);
489	if (error) {
490		cmn_err(CE_WARN, "port init: bad vfs ops");
491		return (error);
492	}
493	vfs_setops(&port_vfs, port_vfsops);
494	port_vfs.vfs_flag = VFS_RDONLY;
495	port_vfs.vfs_dev = portdev;
496	vfs_make_fsid(&(port_vfs.vfs_fsid), portdev, 0);
497
498	error = vn_make_ops("portfs", port_vnodeops_template, &port_vnodeops);
499	if (error) {
500		vfs_freevfsops(port_vfsops);
501		cmn_err(CE_WARN, "port init: bad vnode ops");
502		return (error);
503	}
504
505	mutex_init(&port_control.pc_mutex, NULL, MUTEX_DEFAULT, NULL);
506	port_control.pc_nents = 0;	/* number of active ports */
507
508	/* create kmem_cache for port event structures */
509	port_control.pc_cache = kmem_cache_create("port_cache",
510	    sizeof (port_kevent_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
511
512	port_kstat_init();		/* init port kstats */
513	return (mod_install(&modlinkage));
514}
515
516int
517_info(struct modinfo *modinfop)
518{
519	return (mod_info(&modlinkage, modinfop));
520}
521
/*
 * System call wrapper for all port related system calls from 32-bit programs.
 *
 * The 32-bit arguments are widened and handed to portfs(). How an argument
 * is widened depends on the operation selected by (opcode & PORT_CODE_MASK):
 * PORT_GET	: a2 and a3 (timeout seconds/nanoseconds in portfs()) are
 *		  converted through int; portfs() is called with the plain
 *		  PORT_GET code.
 * PORT_SENDN	: a0 is converted through uint32_t (portfs() uses it as an
 *		  address for this operation).
 * all others	: the arguments are passed on as they are.
 *
 * Returns whatever portfs() returns.
 */
#ifdef _SYSCALL32_IMPL
static int64_t
portfs32(uint32_t opcode, int32_t a0, uint32_t a1, uint32_t a2, uint32_t a3,
    uint32_t a4)
{
	uint32_t	code = opcode & PORT_CODE_MASK;

	if (code == PORT_GET)
		return (portfs(PORT_GET, a0, a1, (int)a2, (int)a3, a4));

	if (code == PORT_SENDN)
		return (portfs(opcode, (uint32_t)a0, a1, a2, a3, a4));

	return (portfs(opcode, a0, a1, a2, a3, a4));
}
#endif	/* _SYSCALL32_IMPL */
546
547/*
548 * System entry point for port functions.
549 * a0 is a port file descriptor (except for PORT_SENDN and PORT_CREATE).
550 * The libc uses PORT_SYS_NOPORT in functions which do not deliver a
551 * port file descriptor as first argument.
552 */
553static int64_t
554portfs(int opcode, uintptr_t a0, uintptr_t a1, uintptr_t a2, uintptr_t a3,
555    uintptr_t a4)
556{
557	rval_t		r;
558	port_t		*pp;
559	int 		error = 0;
560	uint_t		nget;
561	file_t		*fp;
562	port_gettimer_t	port_timer;
563
564	r.r_vals = 0;
565	if (opcode & PORT_SYS_NOPORT) {
566		opcode &= PORT_CODE_MASK;
567		if (opcode == PORT_SENDN) {
568			error = port_sendn((int *)a0, (int *)a1, (uint_t)a2,
569			    (int)a3, (void *)a4, (uint_t *)&r.r_val1);
570			if (error && (error != EIO))
571				return ((int64_t)set_errno(error));
572			return (r.r_vals);
573		}
574
575		if (opcode == PORT_CREATE) {
576			error = port_create(&r.r_val1);
577			if (error)
578				return ((int64_t)set_errno(error));
579			return (r.r_vals);
580		}
581	}
582
583	/* opcodes using port as first argument (a0) */
584
585	if ((fp = getf((int)a0)) == NULL)
586		return ((uintptr_t)set_errno(EBADF));
587
588	if (fp->f_vnode->v_type != VPORT) {
589		releasef((int)a0);
590		return ((uintptr_t)set_errno(EBADFD));
591	}
592
593	pp = VTOEP(fp->f_vnode);
594
595	switch (opcode & PORT_CODE_MASK) {
596	case	PORT_GET:
597	{
598		/* see PORT_GETN description */
599		struct	timespec timeout;
600
601		port_timer.pgt_flags = PORTGET_ONE;
602		port_timer.pgt_loop = 0;
603		port_timer.pgt_rqtp = NULL;
604		if (a4 != NULL) {
605			port_timer.pgt_timeout = &timeout;
606			timeout.tv_sec = (time_t)a2;
607			timeout.tv_nsec = (long)a3;
608		} else {
609			port_timer.pgt_timeout = NULL;
610		}
611		do {
612			nget = 1;
613			error = port_getn(pp, (port_event_t *)a1, 1,
614			    (uint_t *)&nget, &port_timer);
615		} while (nget == 0 && error == 0 && port_timer.pgt_loop);
616		break;
617	}
618	case	PORT_GETN:
619	{
620		/*
621		 * port_getn() can only retrieve own or shareable events from
622		 * other processes. The port_getn() function remains in the
623		 * kernel until own or shareable events are available or the
624		 * timeout elapses.
625		 */
626		port_timer.pgt_flags = 0;
627		port_timer.pgt_loop = 0;
628		port_timer.pgt_rqtp = NULL;
629		port_timer.pgt_timeout = (struct timespec *)a4;
630		do {
631			nget = a3;
632			error = port_getn(pp, (port_event_t *)a1, (uint_t)a2,
633			    (uint_t *)&nget, &port_timer);
634		} while (nget == 0 && error == 0 && port_timer.pgt_loop);
635		r.r_val1 = nget;
636		r.r_val2 = error;
637		releasef((int)a0);
638		if (error && error != ETIME)
639			return ((int64_t)set_errno(error));
640		return (r.r_vals);
641	}
642	case	PORT_ASSOCIATE:
643	{
644		switch ((int)a1) {
645		case PORT_SOURCE_FD:
646			error = port_associate_fd(pp, (int)a1, (uintptr_t)a2,
647			    (int)a3, (void *)a4);
648			break;
649		case PORT_SOURCE_FILE:
650			error = port_associate_fop(pp, (int)a1, (uintptr_t)a2,
651			    (int)a3, (void *)a4);
652			break;
653		default:
654			error = EINVAL;
655			break;
656		}
657		break;
658	}
659	case	PORT_SEND:
660	{
661		/* user-defined events */
662		error = port_send(pp, PORT_SOURCE_USER, (int)a1, (void *)a2);
663		break;
664	}
665	case	PORT_DISPATCH:
666	{
667		/*
668		 * library events, blocking
669		 * Only events of type PORT_SOURCE_AIO or PORT_SOURCE_MQ
670		 * are currently allowed.
671		 */
672		if ((int)a1 != PORT_SOURCE_AIO && (int)a1 != PORT_SOURCE_MQ) {
673			error = EINVAL;
674			break;
675		}
676		error = port_dispatch_event(pp, (int)opcode, (int)a1, (int)a2,
677		    (uintptr_t)a3, (void *)a4);
678		break;
679	}
680	case	PORT_DISSOCIATE:
681	{
682		switch ((int)a1) {
683		case PORT_SOURCE_FD:
684			error = port_dissociate_fd(pp, (uintptr_t)a2);
685			break;
686		case PORT_SOURCE_FILE:
687			error = port_dissociate_fop(pp, (uintptr_t)a2);
688			break;
689		default:
690			error = EINVAL;
691			break;
692		}
693		break;
694	}
695	case	PORT_ALERT:
696	{
697		if ((int)a2)	/* a2 = events */
698			error = port_alert(pp, (int)a1, (int)a2, (void *)a3);
699		else
700			port_remove_alert(&pp->port_queue);
701		break;
702	}
703	default:
704		error = EINVAL;
705		break;
706	}
707
708	releasef((int)a0);
709	if (error)
710		return ((int64_t)set_errno(error));
711	return (r.r_vals);
712}
713
714/*
715 * System call to create a port.
716 *
717 * The port_create() function creates a vnode of type VPORT per port.
718 * The port control data is associated with the vnode as vnode private data.
719 * The port_create() function returns an event port file descriptor.
720 */
721static int
722port_create(int *fdp)
723{
724	port_t		*pp;
725	vnode_t		*vp;
726	struct file	*fp;
727	proc_t		*p = curproc;
728
729	/* initialize vnode and port private data */
730	pp = kmem_zalloc(sizeof (port_t), KM_SLEEP);
731
732	pp->port_vnode = vn_alloc(KM_SLEEP);
733	vp = EPTOV(pp);
734	vn_setops(vp, port_vnodeops);
735	vp->v_type = VPORT;
736	vp->v_vfsp = &port_vfs;
737	vp->v_data = (caddr_t)pp;
738
739	mutex_enter(&port_control.pc_mutex);
740	/*
741	 * Retrieve the maximal number of event ports allowed per system from
742	 * the resource control: project.port-max-ids.
743	 */
744	mutex_enter(&p->p_lock);
745	if (rctl_test(rc_project_portids, p->p_task->tk_proj->kpj_rctls, p,
746	    port_control.pc_nents + 1, RCA_SAFE) & RCT_DENY) {
747		mutex_exit(&p->p_lock);
748		vn_free(vp);
749		kmem_free(pp, sizeof (port_t));
750		mutex_exit(&port_control.pc_mutex);
751		return (EAGAIN);
752	}
753
754	/*
755	 * Retrieve the maximal number of events allowed per port from
756	 * the resource control: process.port-max-events.
757	 */
758	pp->port_max_events = rctl_enforced_value(rc_process_portev,
759	    p->p_rctls, p);
760	mutex_exit(&p->p_lock);
761
762	/* allocate a new user file descriptor and a file structure */
763	if (falloc(vp, 0, &fp, fdp)) {
764		/*
765		 * If the file table is full, free allocated resources.
766		 */
767		vn_free(vp);
768		kmem_free(pp, sizeof (port_t));
769		mutex_exit(&port_control.pc_mutex);
770		return (EMFILE);
771	}
772
773	mutex_exit(&fp->f_tlock);
774
775	pp->port_fd = *fdp;
776	port_control.pc_nents++;
777	p->p_portcnt++;
778	port_kstat.pks_ports.value.ui32++;
779	mutex_exit(&port_control.pc_mutex);
780
781	/* initializes port private data */
782	port_init(pp);
783	/* set user file pointer */
784	setf(*fdp, fp);
785	return (0);
786}
787
788/*
789 * port_init() initializes event port specific data
790 */
791static void
792port_init(port_t *pp)
793{
794	port_queue_t	*portq;
795	port_ksource_t	*pks;
796
797	mutex_init(&pp->port_mutex, NULL, MUTEX_DEFAULT, NULL);
798	portq = &pp->port_queue;
799	mutex_init(&portq->portq_mutex, NULL, MUTEX_DEFAULT, NULL);
800	pp->port_flags |= PORT_INIT;
801
802	/*
803	 * If it is not enough memory available to satisfy a user
804	 * request using a single port_getn() call then port_getn()
805	 * will reduce the size of the list to PORT_MAX_LIST.
806	 */
807	pp->port_max_list = port_max_list;
808
809	/* Set timestamp entries required for fstat(2) requests */
810	gethrestime(&pp->port_ctime);
811	pp->port_uid = crgetuid(curproc->p_cred);
812	pp->port_gid = crgetgid(curproc->p_cred);
813
814	/* initialize port queue structs */
815	list_create(&portq->portq_list, sizeof (port_kevent_t),
816	    offsetof(port_kevent_t, portkev_node));
817	list_create(&portq->portq_get_list, sizeof (port_kevent_t),
818	    offsetof(port_kevent_t, portkev_node));
819	portq->portq_flags = 0;
820	pp->port_pid = curproc->p_pid;
821
822	/* Allocate cache skeleton for PORT_SOURCE_FD events */
823	portq->portq_pcp = kmem_zalloc(sizeof (port_fdcache_t), KM_SLEEP);
824	mutex_init(&portq->portq_pcp->pc_lock, NULL, MUTEX_DEFAULT, NULL);
825
826	/*
827	 * Allocate cache skeleton for association of event sources.
828	 */
829	mutex_init(&portq->portq_source_mutex, NULL, MUTEX_DEFAULT, NULL);
830	portq->portq_scache = kmem_zalloc(
831	    PORT_SCACHE_SIZE * sizeof (port_source_t *), KM_SLEEP);
832
833	/*
834	 * pre-associate some kernel sources with this port.
835	 * The pre-association is required to create port_source_t
836	 * structures for object association.
837	 * Some sources can not get associated with a port before the first
838	 * object association is requested. Another reason to pre_associate
839	 * a particular source with a port is because of performance.
840	 */
841
842	for (pks = port_ksource_tab; pks->pks_source != 0; pks++)
843		port_add_ksource_local(pp, pks);
844}
845
846/*
847 * The port_add_ksource_local() function is being used to associate
848 * event sources with every new port.
849 * The event sources need to be added to port_ksource_tab[].
850 */
851static void
852port_add_ksource_local(port_t *pp, port_ksource_t *pks)
853{
854	port_source_t	*pse;
855	port_source_t	**ps;
856
857	mutex_enter(&pp->port_queue.portq_source_mutex);
858	ps = &pp->port_queue.portq_scache[PORT_SHASH(pks->pks_source)];
859	for (pse = *ps; pse != NULL; pse = pse->portsrc_next) {
860		if (pse->portsrc_source == pks->pks_source)
861			break;
862	}
863
864	if (pse == NULL) {
865		/* associate new source with the port */
866		pse = kmem_zalloc(sizeof (port_source_t), KM_SLEEP);
867		pse->portsrc_source = pks->pks_source;
868		pse->portsrc_close = pks->pks_close;
869		pse->portsrc_closearg = pks->pks_closearg;
870		pse->portsrc_cnt = 1;
871
872		pks->pks_portsrc = pse;
873		if (*ps != NULL)
874			pse->portsrc_next = (*ps)->portsrc_next;
875		*ps = pse;
876	}
877	mutex_exit(&pp->port_queue.portq_source_mutex);
878}
879
880/*
881 * The port_send() function sends an event of type "source" to a
882 * port. This function is non-blocking. An event can be sent to
883 * a port as long as the number of events per port does not achieve the
884 * maximal allowed number of events. The max. number of events per port is
885 * defined by the resource control process.max-port-events.
886 * This function is used by the port library function port_send()
887 * and port_dispatch(). The port_send(3c) function is part of the
888 * event ports API and submits events of type PORT_SOURCE_USER. The
889 * port_dispatch() function is project private and it is used by library
890 * functions to submit events of other types than PORT_SOURCE_USER
891 * (e.g. PORT_SOURCE_AIO).
892 */
893static int
894port_send(port_t *pp, int source, int events, void *user)
895{
896	port_kevent_t	*pev;
897	int		error;
898
899	error = port_alloc_event_local(pp, source, PORT_ALLOC_DEFAULT, &pev);
900	if (error)
901		return (error);
902
903	pev->portkev_object = 0;
904	pev->portkev_events = events;
905	pev->portkev_user = user;
906	pev->portkev_callback = NULL;
907	pev->portkev_arg = NULL;
908	pev->portkev_flags = 0;
909
910	port_send_event(pev);
911	return (0);
912}
913
914/*
915 * The port_noshare() function returns 0 if the current event was generated
916 * by the same process. Otherwise is returns a value other than 0 and the
917 * event should not be delivered to the current processe.
918 * The port_noshare() function is normally used by the port_dispatch()
919 * function. The port_dispatch() function is project private and can only be
920 * used within the event port project.
921 * Currently the libaio uses the port_dispatch() function to deliver events
922 * of types PORT_SOURCE_AIO.
923 */
924/* ARGSUSED */
925static int
926port_noshare(void *arg, int *events, pid_t pid, int flag, void *evp)
927{
928	if (flag == PORT_CALLBACK_DEFAULT && curproc->p_pid != pid)
929		return (1);
930	return (0);
931}
932
933/*
934 * The port_dispatch_event() function is project private and it is used by
935 * libraries involved in the project to deliver events to the port.
936 * port_dispatch will sleep and wait for enough resources to satisfy the
937 * request, if necessary.
938 * The library can specify if the delivered event is shareable with other
939 * processes (see PORT_SYS_NOSHARE flag).
940 */
941static int
942port_dispatch_event(port_t *pp, int opcode, int source, int events,
943    uintptr_t object, void *user)
944{
945	port_kevent_t	*pev;
946	int		error;
947
948	error = port_alloc_event_block(pp, source, PORT_ALLOC_DEFAULT, &pev);
949	if (error)
950		return (error);
951
952	pev->portkev_object = object;
953	pev->portkev_events = events;
954	pev->portkev_user = user;
955	pev->portkev_arg = NULL;
956	if (opcode & PORT_SYS_NOSHARE) {
957		pev->portkev_flags = PORT_KEV_NOSHARE;
958		pev->portkev_callback = port_noshare;
959	} else {
960		pev->portkev_flags = 0;
961		pev->portkev_callback = NULL;
962	}
963
964	port_send_event(pev);
965	return (0);
966}
967
968
969/*
970 * The port_sendn() function is the kernel implementation of the event
971 * port API function port_sendn(3c).
972 * This function is able to send an event to a list of event ports.
973 */
974static int
975port_sendn(int ports[], int errors[], uint_t nent, int events, void *user,
976    uint_t *nget)
977{
978	port_kevent_t	*pev;
979	int		errorcnt = 0;
980	int		error = 0;
981	int		count;
982	int		port;
983	int		*plist;
984	int		*elist = NULL;
985	file_t		*fp;
986	port_t		*pp;
987
988	if (nent == 0 || nent > port_max_list)
989		return (EINVAL);
990
991	plist = kmem_alloc(nent * sizeof (int), KM_SLEEP);
992	if (copyin((void *)ports, plist, nent * sizeof (int))) {
993		kmem_free(plist, nent * sizeof (int));
994		return (EFAULT);
995	}
996
997	/*
998	 * Scan the list for event port file descriptors and send the
999	 * attached user event data embedded in a event of type
1000	 * PORT_SOURCE_USER to every event port in the list.
1001	 * If a list entry is not a valid event port then the corresponding
1002	 * error code will be stored in the errors[] list with the same
1003	 * list offset as in the ports[] list.
1004	 */
1005
1006	for (count = 0; count < nent; count++) {
1007		port = plist[count];
1008		if ((fp = getf(port)) == NULL) {
1009			elist = port_errorn(elist, nent, EBADF, count);
1010			errorcnt++;
1011			continue;
1012		}
1013
1014		pp = VTOEP(fp->f_vnode);
1015		if (fp->f_vnode->v_type != VPORT) {
1016			releasef(port);
1017			elist = port_errorn(elist, nent, EBADFD, count);
1018			errorcnt++;
1019			continue;
1020		}
1021
1022		error = port_alloc_event_local(pp, PORT_SOURCE_USER,
1023		    PORT_ALLOC_DEFAULT, &pev);
1024		if (error) {
1025			releasef(port);
1026			elist = port_errorn(elist, nent, error, count);
1027			errorcnt++;
1028			continue;
1029		}
1030
1031		pev->portkev_object = 0;
1032		pev->portkev_events = events;
1033		pev->portkev_user = user;
1034		pev->portkev_callback = NULL;
1035		pev->portkev_arg = NULL;
1036		pev->portkev_flags = 0;
1037
1038		port_send_event(pev);
1039		releasef(port);
1040	}
1041	if (errorcnt) {
1042		error = EIO;
1043		if (copyout(elist, (void *)errors, nent * sizeof (int)))
1044			error = EFAULT;
1045		kmem_free(elist, nent * sizeof (int));
1046	}
1047	*nget = nent - errorcnt;
1048	kmem_free(plist, nent * sizeof (int));
1049	return (error);
1050}
1051
1052static int *
1053port_errorn(int *elist, int nent, int error, int index)
1054{
1055	if (elist == NULL)
1056		elist = kmem_zalloc(nent * sizeof (int), KM_SLEEP);
1057	elist[index] = error;
1058	return (elist);
1059}
1060
1061/*
1062 * port_alert()
1063 * The port_alert() funcion is a high priority event and it is always set
1064 * on top of the queue. It is also delivered as single event.
1065 * flags:
1066 *	- SET	:overwrite current alert data
1067 *	- UPDATE:set alert data or return EBUSY if alert mode is already set
1068 *
1069 * - set the ALERT flag
1070 * - wakeup all sleeping threads
1071 */
1072static int
1073port_alert(port_t *pp, int flags, int events, void *user)
1074{
1075	port_queue_t	*portq;
1076	portget_t	*pgetp;
1077	port_alert_t	*pa;
1078
1079	if ((flags & PORT_ALERT_INVALID) == PORT_ALERT_INVALID)
1080		return (EINVAL);
1081
1082	portq = &pp->port_queue;
1083	pa = &portq->portq_alert;
1084	mutex_enter(&portq->portq_mutex);
1085
1086	/* check alert conditions */
1087	if (flags == PORT_ALERT_UPDATE) {
1088		if (portq->portq_flags & PORTQ_ALERT) {
1089			mutex_exit(&portq->portq_mutex);
1090			return (EBUSY);
1091		}
1092	}
1093
1094	/*
1095	 * Store alert data in the port to be delivered to threads
1096	 * which are using port_get(n) to retrieve events.
1097	 */
1098
1099	portq->portq_flags |= PORTQ_ALERT;
1100	pa->portal_events = events;		/* alert info */
1101	pa->portal_pid = curproc->p_pid;	/* process owner */
1102	pa->portal_object = 0;			/* no object */
1103	pa->portal_user = user;			/* user alert data */
1104
1105	/* alert and deliver alert data to waiting threads */
1106	pgetp = portq->portq_thread;
1107	if (pgetp == NULL) {
1108		/* no threads waiting for events */
1109		mutex_exit(&portq->portq_mutex);
1110		return (0);
1111	}
1112
1113	/*
1114	 * Set waiting threads in alert mode (PORTGET_ALERT)..
1115	 * Every thread waiting for events already allocated a portget_t
1116	 * structure to sleep on.
1117	 * The port alert arguments are stored in the portget_t structure.
1118	 * The PORTGET_ALERT flag is set to indicate the thread to return
1119	 * immediately with the alert event.
1120	 */
1121	do {
1122		if ((pgetp->portget_state & PORTGET_ALERT) == 0) {
1123			pa = &pgetp->portget_alert;
1124			pa->portal_events = events;
1125			pa->portal_object = 0;
1126			pa->portal_user = user;
1127			pgetp->portget_state |= PORTGET_ALERT;
1128			cv_signal(&pgetp->portget_cv);
1129		}
1130	} while ((pgetp = pgetp->portget_next) != portq->portq_thread);
1131	mutex_exit(&portq->portq_mutex);
1132	return (0);
1133}
1134
1135/*
1136 * Clear alert state of the port
1137 */
1138static void
1139port_remove_alert(port_queue_t *portq)
1140{
1141	mutex_enter(&portq->portq_mutex);
1142	portq->portq_flags &= ~PORTQ_ALERT;
1143	mutex_exit(&portq->portq_mutex);
1144}
1145
1146/*
1147 * The port_getn() function is used to retrieve events from a port.
1148 *
1149 * The port_getn() function returns immediately if there are enough events
1150 * available in the port to satisfy the request or if the port is in alert
1151 * mode (see port_alert(3c)).
1152 * The timeout argument of port_getn(3c) -which is embedded in the
1153 * port_gettimer_t structure- specifies if the system call should block or if it
1154 * should return immediately depending on the number of events available.
1155 * This function is internally used by port_getn(3c) as well as by
1156 * port_get(3c).
1157 */
1158static int
1159port_getn(port_t *pp, port_event_t *uevp, uint_t max, uint_t *nget,
1160    port_gettimer_t *pgt)
1161{
1162	port_queue_t	*portq;
1163	port_kevent_t 	*pev;
1164	port_kevent_t 	*lev;
1165	int		error = 0;
1166	uint_t		nmax;
1167	uint_t		nevents;
1168	uint_t		eventsz;
1169	port_event_t	*kevp;
1170	list_t		*glist;
1171	uint_t		tnent;
1172	int		rval;
1173	int		blocking = -1;
1174	int		timecheck;
1175	int		flag;
1176	timespec_t	rqtime;
1177	timespec_t	*rqtp = NULL;
1178	portget_t	*pgetp;
1179	void		*results;
1180	model_t		model = get_udatamodel();
1181
1182	flag = pgt->pgt_flags;
1183
1184	if (*nget > max && max > 0)
1185		return (EINVAL);
1186
1187	portq = &pp->port_queue;
1188	mutex_enter(&portq->portq_mutex);
1189	if (max == 0) {
1190		/*
1191		 * Return number of objects with events.
1192		 * The port_block() call is required to synchronize this
1193		 * thread with another possible thread, which could be
1194		 * retrieving events from the port queue.
1195		 */
1196		port_block(portq);
1197		/*
1198		 * Check if a second thread is currently retrieving events
1199		 * and it is using the temporary event queue.
1200		 */
1201		if (portq->portq_tnent) {
1202			/* put remaining events back to the port queue */
1203			port_push_eventq(portq);
1204		}
1205		*nget = portq->portq_nent;
1206		port_unblock(portq);
1207		mutex_exit(&portq->portq_mutex);
1208		return (0);
1209	}
1210
1211	if (uevp == NULL) {
1212		mutex_exit(&portq->portq_mutex);
1213		return (EFAULT);
1214	}
1215	if (*nget == 0) {		/* no events required */
1216		mutex_exit(&portq->portq_mutex);
1217		return (0);
1218	}
1219
1220	/* port is being closed ... */
1221	if (portq->portq_flags & PORTQ_CLOSE) {
1222		mutex_exit(&portq->portq_mutex);
1223		return (EBADFD);
1224	}
1225
1226	/* return immediately if port in alert mode */
1227	if (portq->portq_flags & PORTQ_ALERT) {
1228		error = port_get_alert(&portq->portq_alert, uevp);
1229		if (error == 0)
1230			*nget = 1;
1231		mutex_exit(&portq->portq_mutex);
1232		return (error);
1233	}
1234
1235	portq->portq_thrcnt++;
1236
1237	/*
1238	 * Now check if the completed events satisfy the
1239	 * "wait" requirements of the current thread:
1240	 */
1241
1242	if (pgt->pgt_loop) {
1243		/*
1244		 * loop entry of same thread
1245		 * pgt_loop is set when the current thread returns
1246		 * prematurely from this function. That could happen
1247		 * when a port is being shared between processes and
1248		 * this thread could not find events to return.
1249		 * It is not allowed to a thread to retrieve non-shareable
1250		 * events generated in other processes.
1251		 * PORTQ_WAIT_EVENTS is set when a thread already
1252		 * checked the current event queue and no new events
1253		 * are added to the queue.
1254		 */
1255		if (((portq->portq_flags & PORTQ_WAIT_EVENTS) == 0) &&
1256		    (portq->portq_nent >= *nget)) {
1257			/* some new events arrived ...check them */
1258			goto portnowait;
1259		}
1260		rqtp = pgt->pgt_rqtp;
1261		timecheck = pgt->pgt_timecheck;
1262		pgt->pgt_flags |= PORTGET_WAIT_EVENTS;
1263	} else {
1264		/* check if enough events are available ... */
1265		if (portq->portq_nent >= *nget)
1266			goto portnowait;
1267		/*
1268		 * There are not enough events available to satisfy
1269		 * the request, check timeout value and wait for
1270		 * incoming events.
1271		 */
1272		error = port_get_timeout(pgt->pgt_timeout, &rqtime, &rqtp,
1273		    &blocking, flag);
1274		if (error) {
1275			port_check_return_cond(portq);
1276			mutex_exit(&portq->portq_mutex);
1277			return (error);
1278		}
1279
1280		if (blocking == 0) /* don't block, check fired events */
1281			goto portnowait;
1282
1283		if (rqtp != NULL) {
1284			timespec_t	now;
1285			timecheck = timechanged;
1286			gethrestime(&now);
1287			timespecadd(rqtp, &now);
1288		}
1289	}
1290
1291	/* enqueue thread in the list of waiting threads */
1292	pgetp = port_queue_thread(portq, *nget);
1293
1294
1295	/* Wait here until return conditions met */
1296	for (;;) {
1297		if (pgetp->portget_state & PORTGET_ALERT) {
1298			/* reap alert event and return */
1299			error = port_get_alert(&pgetp->portget_alert, uevp);
1300			if (error)
1301				*nget = 0;
1302			else
1303				*nget = 1;
1304			port_dequeue_thread(&pp->port_queue, pgetp);
1305			portq->portq_thrcnt--;
1306			mutex_exit(&portq->portq_mutex);
1307			return (error);
1308		}
1309
1310		/*
1311		 * Check if some other thread is already retrieving
1312		 * events (portq_getn > 0).
1313		 */
1314
1315		if ((portq->portq_getn  == 0) &&
1316		    ((portq)->portq_nent >= *nget) &&
1317		    (!((pgt)->pgt_flags & PORTGET_WAIT_EVENTS) ||
1318		    !((portq)->portq_flags & PORTQ_WAIT_EVENTS)))
1319			break;
1320
1321		if (portq->portq_flags & PORTQ_CLOSE) {
1322			error = EBADFD;
1323			break;
1324		}
1325
1326		rval = cv_waituntil_sig(&pgetp->portget_cv, &portq->portq_mutex,
1327		    rqtp, timecheck);
1328
1329		if (rval <= 0) {
1330			error = (rval == 0) ? EINTR : ETIME;
1331			break;
1332		}
1333	}
1334
1335	/* take thread out of the wait queue */
1336	port_dequeue_thread(portq, pgetp);
1337
1338	if (error != 0 && (error == EINTR || error == EBADFD ||
1339	    (error == ETIME && flag))) {
1340		/* return without events */
1341		port_check_return_cond(portq);
1342		mutex_exit(&portq->portq_mutex);
1343		return (error);
1344	}
1345
1346portnowait:
1347	/*
1348	 * Move port event queue to a temporary event queue .
1349	 * New incoming events will be continue be posted to the event queue
1350	 * and they will not be considered by the current thread.
1351	 * The idea is to avoid lock contentions or an often locking/unlocking
1352	 * of the port queue mutex. The contention and performance degradation
1353	 * could happen because:
1354	 * a) incoming events use the port queue mutex to enqueue new events and
1355	 * b) before the event can be delivered to the application it is
1356	 *    necessary to notify the event sources about the event delivery.
1357	 *    Sometimes the event sources can require a long time to return and
1358	 *    the queue mutex would block incoming events.
1359	 * During this time incoming events (port_send_event()) do not need
1360	 * to awake threads waiting for events. Before the current thread
1361	 * returns it will check the conditions to awake other waiting threads.
1362	 */
1363	portq->portq_getn++;	/* number of threads retrieving events */
1364	port_block(portq);	/* block other threads here */
1365	nmax = max < portq->portq_nent ? max : portq->portq_nent;
1366
1367	if (portq->portq_tnent) {
1368		/*
1369		 * Move remaining events from previous thread back to the
1370		 * port event queue.
1371		 */
1372		port_push_eventq(portq);
1373	}
1374	/* move port event queue to a temporary queue */
1375	list_move_tail(&portq->portq_get_list, &portq->portq_list);
1376	glist = &portq->portq_get_list;	/* use temporary event queue */
1377	tnent = portq->portq_nent;	/* get current number of events */
1378	portq->portq_nent = 0;		/* no events in the port event queue */
1379	portq->portq_flags |= PORTQ_WAIT_EVENTS; /* detect incoming events */
1380	mutex_exit(&portq->portq_mutex);    /* event queue can be reused now */
1381
1382	if (model == DATAMODEL_NATIVE) {
1383		eventsz = sizeof (port_event_t);
1384		kevp = kmem_alloc(eventsz * nmax, KM_NOSLEEP);
1385		if (kevp == NULL) {
1386			if (nmax > pp->port_max_list)
1387				nmax = pp->port_max_list;
1388			kevp = kmem_alloc(eventsz * nmax, KM_SLEEP);
1389		}
1390		results = kevp;
1391		lev = NULL;	/* start with first event in the queue */
1392		for (nevents = 0; nevents < nmax; ) {
1393			pev = port_get_kevent(glist, lev);
1394			if (pev == NULL)	/* no more events available */
1395				break;
1396			if (pev->portkev_flags & PORT_KEV_FREE) {
1397				/* Just discard event */
1398				list_remove(glist, pev);
1399				pev->portkev_flags &= ~(PORT_CLEANUP_DONE);
1400				if (PORT_FREE_EVENT(pev))
1401					port_free_event_local(pev, 0);
1402				tnent--;
1403				continue;
1404			}
1405
1406			/* move event data to copyout list */
1407			if (port_copy_event(&kevp[nevents], pev, glist)) {
1408				/*
1409				 * Event can not be delivered to the
1410				 * current process.
1411				 */
1412				if (lev != NULL)
1413					list_insert_after(glist, lev, pev);
1414				else
1415					list_insert_head(glist, pev);
1416				lev = pev;  /* last checked event */
1417			} else {
1418				nevents++;	/* # of events ready */
1419			}
1420		}
1421#ifdef	_SYSCALL32_IMPL
1422	} else {
1423		port_event32_t	*kevp32;
1424
1425		eventsz = sizeof (port_event32_t);
1426		kevp32 = kmem_alloc(eventsz * nmax, KM_NOSLEEP);
1427		if (kevp32 == NULL) {
1428			if (nmax > pp->port_max_list)
1429				nmax = pp->port_max_list;
1430			kevp32 = kmem_alloc(eventsz * nmax, KM_SLEEP);
1431		}
1432		results = kevp32;
1433		lev = NULL;	/* start with first event in the queue */
1434		for (nevents = 0; nevents < nmax; ) {
1435			pev = port_get_kevent(glist, lev);
1436			if (pev == NULL)	/* no more events available */
1437				break;
1438			if (pev->portkev_flags & PORT_KEV_FREE) {
1439				/* Just discard event */
1440				list_remove(glist, pev);
1441				pev->portkev_flags &= ~(PORT_CLEANUP_DONE);
1442				if (PORT_FREE_EVENT(pev))
1443					port_free_event_local(pev, 0);
1444				tnent--;
1445				continue;
1446			}
1447
1448			/* move event data to copyout list */
1449			if (port_copy_event32(&kevp32[nevents], pev, glist)) {
1450				/*
1451				 * Event can not be delivered to the
1452				 * current process.
1453				 */
1454				if (lev != NULL)
1455					list_insert_after(glist, lev, pev);
1456				else
1457					list_insert_head(glist, pev);
1458				lev = pev;  /* last checked event */
1459			} else {
1460				nevents++;	/* # of events ready */
1461			}
1462		}
1463#endif	/* _SYSCALL32_IMPL */
1464	}
1465
1466	/*
1467	 *  Remember number of remaining events in the temporary event queue.
1468	 */
1469	portq->portq_tnent = tnent - nevents;
1470
1471	/*
1472	 * Work to do before return :
1473	 * - push list of remaining events back to the top of the standard
1474	 *   port queue.
1475	 * - if this is the last thread calling port_get(n) then wakeup the
1476	 *   thread waiting on close(2).
1477	 * - check for a deferred cv_signal from port_send_event() and wakeup
1478	 *   the sleeping thread.
1479	 */
1480
1481	mutex_enter(&portq->portq_mutex);
1482	port_unblock(portq);
1483	if (portq->portq_tnent) {
1484		/*
1485		 * move remaining events in the temporary event queue back
1486		 * to the port event queue
1487		 */
1488		port_push_eventq(portq);
1489	}
1490	portq->portq_getn--;	/* update # of threads retrieving events */
1491	if (--portq->portq_thrcnt == 0) { /* # of threads waiting ... */
1492		/* Last thread => check close(2) conditions ... */
1493		if (portq->portq_flags & PORTQ_CLOSE) {
1494			cv_signal(&portq->portq_closecv);
1495			mutex_exit(&portq->portq_mutex);
1496			kmem_free(results, eventsz * nmax);
1497			/* do not copyout events */
1498			*nget = 0;
1499			return (EBADFD);
1500		}
1501	} else if (portq->portq_getn == 0) {
1502		/*
1503		 * no other threads retrieving events ...
1504		 * check wakeup conditions of sleeping threads
1505		 */
1506		if ((portq->portq_thread != NULL) &&
1507		    (portq->portq_nent >= portq->portq_nget))
1508			cv_signal(&portq->portq_thread->portget_cv);
1509	}
1510
1511	/*
1512	 * Check PORTQ_POLLIN here because the current thread set temporarily
1513	 * the number of events in the queue to zero.
1514	 */
1515	if (portq->portq_flags & PORTQ_POLLIN) {
1516		portq->portq_flags &= ~PORTQ_POLLIN;
1517		mutex_exit(&portq->portq_mutex);
1518		pollwakeup(&pp->port_pollhd, POLLIN);
1519	} else {
1520		mutex_exit(&portq->portq_mutex);
1521	}
1522
1523	/* now copyout list of user event structures to user space */
1524	if (nevents) {
1525		if (copyout(results, uevp, nevents * eventsz))
1526			error = EFAULT;
1527	}
1528	kmem_free(results, eventsz * nmax);
1529
1530	if (nevents == 0 && error == 0 && pgt->pgt_loop == 0 && blocking != 0) {
1531		/* no events retrieved: check loop conditions */
1532		if (blocking == -1) {
1533			/* no timeout checked */
1534			error = port_get_timeout(pgt->pgt_timeout,
1535			    &pgt->pgt_rqtime, &rqtp, &blocking, flag);
1536			if (error) {
1537				*nget = nevents;
1538				return (error);
1539			}
1540			if (rqtp != NULL) {
1541				timespec_t	now;
1542				pgt->pgt_timecheck = timechanged;
1543				gethrestime(&now);
1544				timespecadd(&pgt->pgt_rqtime, &now);
1545			}
1546			pgt->pgt_rqtp = rqtp;
1547		} else {
1548			/* timeout already checked -> remember values */
1549			pgt->pgt_rqtp = rqtp;
1550			if (rqtp != NULL) {
1551				pgt->pgt_timecheck = timecheck;
1552				pgt->pgt_rqtime = *rqtp;
1553			}
1554		}
1555		if (blocking)
1556			/* timeout remaining */
1557			pgt->pgt_loop = 1;
1558	}
1559
1560	/* set number of user event structures completed */
1561	*nget = nevents;
1562	return (error);
1563}
1564
1565/*
1566 * 1. copy kernel event structure to user event structure.
1567 * 2. PORT_KEV_WIRED event structures will be reused by the "source"
1568 * 3. Remove PORT_KEV_DONEQ flag (event removed from the event queue)
1569 * 4. Other types of event structures can be delivered back to the port cache
1570 *    (port_free_event_local()).
1571 * 5. The event source callback function is the last opportunity for the
1572 *    event source to update events, to free local resources associated with
1573 *    the event or to deny the delivery of the event.
1574 */
1575static int
1576port_copy_event(port_event_t *puevp, port_kevent_t *pkevp, list_t *list)
1577{
1578	int	free_event = 0;
1579	int	flags;
1580	int	error;
1581
1582	puevp->portev_source = pkevp->portkev_source;
1583	puevp->portev_object = pkevp->portkev_object;
1584	puevp->portev_user = pkevp->portkev_user;
1585	puevp->portev_events = pkevp->portkev_events;
1586
1587	/* remove event from the queue */
1588	list_remove(list, pkevp);
1589
1590	/*
1591	 * Events of type PORT_KEV_WIRED remain allocated by the
1592	 * event source.
1593	 */
1594	flags = pkevp->portkev_flags;
1595	if (pkevp->portkev_flags & PORT_KEV_WIRED)
1596		pkevp->portkev_flags &= ~PORT_KEV_DONEQ;
1597	else
1598		free_event = 1;
1599
1600	if (pkevp->portkev_callback) {
1601		error = (*pkevp->portkev_callback)(pkevp->portkev_arg,
1602		    &puevp->portev_events, pkevp->portkev_pid,
1603		    PORT_CALLBACK_DEFAULT, pkevp);
1604
1605		if (error) {
1606			/*
1607			 * Event can not be delivered.
1608			 * Caller must reinsert the event into the queue.
1609			 */
1610			pkevp->portkev_flags = flags;
1611			return (error);
1612		}
1613	}
1614	if (free_event)
1615		port_free_event_local(pkevp, 0);
1616	return (0);
1617}
1618
1619#ifdef	_SYSCALL32_IMPL
1620/*
1621 * 1. copy kernel event structure to user event structure.
1622 * 2. PORT_KEV_WIRED event structures will be reused by the "source"
1623 * 3. Remove PORT_KEV_DONEQ flag (event removed from the event queue)
1624 * 4. Other types of event structures can be delivered back to the port cache
1625 *    (port_free_event_local()).
1626 * 5. The event source callback function is the last opportunity for the
1627 *    event source to update events, to free local resources associated with
1628 *    the event or to deny the delivery of the event.
1629 */
1630static int
1631port_copy_event32(port_event32_t *puevp, port_kevent_t *pkevp, list_t *list)
1632{
1633	int	free_event = 0;
1634	int	error;
1635	int	flags;
1636
1637	puevp->portev_source = pkevp->portkev_source;
1638	puevp->portev_object = (daddr32_t)pkevp->portkev_object;
1639	puevp->portev_user = (caddr32_t)(uintptr_t)pkevp->portkev_user;
1640	puevp->portev_events = pkevp->portkev_events;
1641
1642	/* remove event from the queue */
1643	list_remove(list, pkevp);
1644
1645	/*
1646	 * Events if type PORT_KEV_WIRED remain allocated by the
1647	 * sub-system (source).
1648	 */
1649
1650	flags = pkevp->portkev_flags;
1651	if (pkevp->portkev_flags & PORT_KEV_WIRED)
1652		pkevp->portkev_flags &= ~PORT_KEV_DONEQ;
1653	else
1654		free_event = 1;
1655
1656	if (pkevp->portkev_callback != NULL) {
1657		error = (*pkevp->portkev_callback)(pkevp->portkev_arg,
1658		    &puevp->portev_events, pkevp->portkev_pid,
1659		    PORT_CALLBACK_DEFAULT, pkevp);
1660		if (error) {
1661			/*
1662			 * Event can not be delivered.
1663			 * Caller must reinsert the event into the queue.
1664			 */
1665			pkevp->portkev_flags = flags;
1666			return (error);
1667		}
1668	}
1669	if (free_event)
1670		port_free_event_local(pkevp, 0);
1671	return (0);
1672}
1673#endif	/* _SYSCALL32_IMPL */
1674
1675/*
1676 * copyout alert event.
1677 */
1678static int
1679port_get_alert(port_alert_t *pa, port_event_t *uevp)
1680{
1681	model_t	model = get_udatamodel();
1682
1683	/* copyout alert event structures to user space */
1684	if (model == DATAMODEL_NATIVE) {
1685		port_event_t	uev;
1686		uev.portev_source = PORT_SOURCE_ALERT;
1687		uev.portev_object = pa->portal_object;
1688		uev.portev_events = pa->portal_events;
1689		uev.portev_user = pa->portal_user;
1690		if (copyout(&uev, uevp, sizeof (port_event_t)))
1691			return (EFAULT);
1692#ifdef	_SYSCALL32_IMPL
1693	} else {
1694		port_event32_t	uev32;
1695		uev32.portev_source = PORT_SOURCE_ALERT;
1696		uev32.portev_object = (daddr32_t)pa->portal_object;
1697		uev32.portev_events = pa->portal_events;
1698		uev32.portev_user = (daddr32_t)(uintptr_t)pa->portal_user;
1699		if (copyout(&uev32, uevp, sizeof (port_event32_t)))
1700			return (EFAULT);
1701#endif	/* _SYSCALL32_IMPL */
1702	}
1703	return (0);
1704}
1705
1706/*
1707 * Check return conditions :
1708 * - pending port close(2)
1709 * - threads waiting for events
1710 */
1711static void
1712port_check_return_cond(port_queue_t *portq)
1713{
1714	ASSERT(MUTEX_HELD(&portq->portq_mutex));
1715	portq->portq_thrcnt--;
1716	if (portq->portq_flags & PORTQ_CLOSE) {
1717		if (portq->portq_thrcnt == 0)
1718			cv_signal(&portq->portq_closecv);
1719		else
1720			cv_signal(&portq->portq_thread->portget_cv);
1721	}
1722}
1723
1724/*
1725 * The port_get_kevent() function returns
1726 * - the event located at the head of the queue if 'last' pointer is NULL
1727 * - the next event after the event pointed by 'last'
1728 * The caller of this function is responsible for the integrity of the queue
1729 * in use:
1730 * - port_getn() is using a temporary queue protected with port_block().
1731 * - port_close_events() is working on the global event queue and protects
1732 *   the queue with portq->portq_mutex.
1733 */
1734port_kevent_t *
1735port_get_kevent(list_t *list, port_kevent_t *last)
1736{
1737	if (last == NULL)
1738		return (list_head(list));
1739	else
1740		return (list_next(list, last));
1741}
1742
1743/*
1744 * The port_get_timeout() function gets the timeout data from user space
1745 * and converts that info into a corresponding internal representation.
1746 * The kerneldata flag means that the timeout data is already loaded.
1747 */
1748static int
1749port_get_timeout(timespec_t *timeout, timespec_t *rqtime, timespec_t **rqtp,
1750    int *blocking, int kerneldata)
1751{
1752	model_t	model = get_udatamodel();
1753
1754	*rqtp = NULL;
1755	if (timeout == NULL) {
1756		*blocking = 1;
1757		return (0);
1758	}
1759
1760	if (kerneldata) {
1761		*rqtime = *timeout;
1762	} else {
1763		if (model == DATAMODEL_NATIVE) {
1764			if (copyin(timeout, rqtime, sizeof (*rqtime)))
1765				return (EFAULT);
1766#ifdef	_SYSCALL32_IMPL
1767		} else {
1768			timespec32_t 	wait_time_32;
1769			if (copyin(timeout, &wait_time_32,
1770			    sizeof (wait_time_32)))
1771				return (EFAULT);
1772			TIMESPEC32_TO_TIMESPEC(rqtime, &wait_time_32);
1773#endif  /* _SYSCALL32_IMPL */
1774		}
1775	}
1776
1777	if (rqtime->tv_sec == 0 && rqtime->tv_nsec == 0) {
1778		*blocking = 0;
1779		return (0);
1780	}
1781
1782	if (rqtime->tv_sec < 0 ||
1783	    rqtime->tv_nsec < 0 || rqtime->tv_nsec >= NANOSEC)
1784		return (EINVAL);
1785
1786	*rqtp = rqtime;
1787	*blocking = 1;
1788	return (0);
1789}
1790
1791/*
1792 * port_queue_thread()
1793 * Threads requiring more events than available will be put in a wait queue.
1794 * There is a "thread wait queue" per port.
1795 * Threads requiring less events get a higher priority than others and they
1796 * will be awoken first.
1797 */
1798static portget_t *
1799port_queue_thread(port_queue_t *portq, uint_t nget)
1800{
1801	portget_t	*pgetp;
1802	portget_t	*ttp;
1803	portget_t	*htp;
1804
1805	pgetp = kmem_zalloc(sizeof (portget_t), KM_SLEEP);
1806	pgetp->portget_nget = nget;
1807	pgetp->portget_pid = curproc->p_pid;
1808	if (portq->portq_thread == NULL) {
1809		/* first waiting thread */
1810		portq->portq_thread = pgetp;
1811		portq->portq_nget = nget;
1812		pgetp->portget_prev = pgetp;
1813		pgetp->portget_next = pgetp;
1814		return (pgetp);
1815	}
1816
1817	/*
1818	 * thread waiting for less events will be set on top of the queue.
1819	 */
1820	ttp = portq->portq_thread;
1821	htp = ttp;
1822	for (;;) {
1823		if (nget <= ttp->portget_nget)
1824			break;
1825		if (htp == ttp->portget_next)
1826			break;	/* last event */
1827		ttp = ttp->portget_next;
1828	}
1829
1830	/* add thread to the queue */
1831	pgetp->portget_next = ttp;
1832	pgetp->portget_prev = ttp->portget_prev;
1833	ttp->portget_prev->portget_next = pgetp;
1834	ttp->portget_prev = pgetp;
1835	if (portq->portq_thread == ttp)
1836		portq->portq_thread = pgetp;
1837	portq->portq_nget = portq->portq_thread->portget_nget;
1838	return (pgetp);
1839}
1840
1841/*
1842 * Take thread out of the queue.
1843 */
1844static void
1845port_dequeue_thread(port_queue_t *portq, portget_t *pgetp)
1846{
1847	if (pgetp->portget_next == pgetp) {
1848		/* last (single) waiting thread */
1849		portq->portq_thread = NULL;
1850		portq->portq_nget = 0;
1851	} else {
1852		pgetp->portget_prev->portget_next = pgetp->portget_next;
1853		pgetp->portget_next->portget_prev = pgetp->portget_prev;
1854		if (portq->portq_thread == pgetp)
1855			portq->portq_thread = pgetp->portget_next;
1856		portq->portq_nget = portq->portq_thread->portget_nget;
1857	}
1858	kmem_free(pgetp, sizeof (portget_t));
1859}
1860
1861/*
1862 * Set up event port kstats.
1863 */
1864static void
1865port_kstat_init()
1866{
1867	kstat_t	*ksp;
1868	uint_t	ndata;
1869
1870	ndata = sizeof (port_kstat) / sizeof (kstat_named_t);
1871	ksp = kstat_create("portfs", 0, "Event Ports", "misc",
1872	    KSTAT_TYPE_NAMED, ndata, KSTAT_FLAG_VIRTUAL);
1873	if (ksp) {
1874		ksp->ks_data = &port_kstat;
1875		kstat_install(ksp);
1876	}
1877}
1878