/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */


/*
 * Overview of the RSM Kernel Agent:
 * ---------------------------------
 *
 * rsm.c constitutes the implementation of the RSM kernel agent. The RSM
 * kernel agent is a pseudo device driver which makes use of the RSMPI
 * interface on behalf of the RSMAPI user library.
 *
 * The kernel agent functionality can be categorized into the following
 * components:
 * 1. Driver Infrastructure
 * 2. Export/Import Segment Management
 * 3. Internal resource allocation/deallocation
 *
 * The driver infrastructure includes the basic module loading entry points
 * like _init, _info, _fini to load, unload and report information about
 * the driver module. The driver infrastructure also includes the
 * autoconfiguration entry points, namely attach, detach and getinfo, for
 * the device autoconfiguration.
 *
 * The kernel agent is a pseudo character device driver and exports
 * a cb_ops structure which defines the driver entry points for character
 * device access. This includes the open and close entry points. The
 * other entry points provided include ioctl, devmap, segmap and chpoll.
 * The read and write entry points are not used since the device is memory
 * mapped. Also, ddi_prop_op is used for the prop_op entry point.
 *
 * The ioctl entry point supports a number of commands, which are used by
 * the RSMAPI library in order to export and import segments. These
 * include commands for binding and rebinding the physical pages
 * allocated to the virtual address range, publishing the export segment,
 * unpublishing and republishing an export segment, creating an
 * import segment and a virtual connection from this import segment to
 * an export segment, performing scatter-gather data transfers, and
 * barrier operations.
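 *
 * As an illustrative sketch of how the library drives these commands
 * (the command name below is an assumption; the actual RSM_IOCTL_*
 * values are defined in rsm_in.h), a publish request reduces to:
 *
 *	rsm_ioctlmsg_t msg;
 *	... fill in segment id, length and permissions ...
 *	if (ioctl(fd, RSM_IOCTL_PUBLISH, &msg) < 0)
 *		... map errno to an RSMERR_* code ...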
 *
 *
 * Export and Import segments:
 * ---------------------------
 *
 * In order to create an RSM export segment a process allocates a range in its
 * virtual address space for the segment using standard Solaris interfaces.
 * The process then calls RSMAPI, which in turn makes an ioctl call to the
 * RSM kernel agent for an allocation of physical memory pages and for
 * creation of the export segment by binding these pages to the virtual
 * address range. These pages are locked in memory so that remote accesses
 * are always applied to the correct page. Then the RSM segment is published,
 * again via RSMAPI making an ioctl to the RSM kernel agent, and a segment id
 * is assigned to it.
 *
 * In order to import a published RSM segment, RSMAPI creates an import
 * segment and forms a virtual connection across the interconnect to the
 * export segment, via an ioctl into the kernel agent with the connect
 * command. The import segment setup is completed by mapping the
 * local device memory into the importer's virtual address space. The
 * mapping of the import segment is handled by the segmap/devmap
 * infrastructure described as follows.
 *
 * Segmap and Devmap interfaces:
 *
 * The RSM kernel agent allows device memory to be directly accessed by user
 * threads via memory mapping. In order to do so, the RSM kernel agent
 * supports the devmap and segmap entry points.
 *
 * The segmap entry point (rsm_segmap) is responsible for setting up a memory
 * mapping as requested by mmap. The devmap entry point (rsm_devmap) is
 * responsible for exporting the device memory to the user applications.
 * rsm_segmap calls RSMPI rsm_map to allocate device memory. Then the
 * control is transferred to the devmap_setup call which calls rsm_devmap.
 *
 * rsm_devmap validates the user mapping to the device or kernel memory
 * and passes the information to the system for setting up the mapping. The
 * actual setting up of the mapping is done by devmap_devmem_setup (for
 * device memory) or devmap_umem_setup (for kernel memory). Callbacks are
 * registered for device context management via the devmap_devmem_setup
 * or devmap_umem_setup calls. The callbacks are rsmmap_map, rsmmap_unmap,
 * rsmmap_access and rsmmap_dup. The callbacks are called when a new mapping
 * is created, a mapping is freed, a mapping is accessed or an existing
 * mapping is duplicated, respectively. These callbacks allow the RSM kernel
 * agent to maintain state information associated with the mappings.
 * The state information is mainly in the form of a cookie list for the import
 * segment for which mapping has been done.
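 *
 * A minimal sketch of the kernel-memory case (the structure name
 * rsmmap_ops is assumed here for illustration; the driver wires the
 * rsmmap_* callbacks into a devmap_callback_ctl and hands it to the
 * DDI setup routine):
 *
 *	static struct devmap_callback_ctl rsmmap_ops = {
 *		DEVMAP_OPS_REV,
 *		rsmmap_map, rsmmap_access, rsmmap_dup, rsmmap_unmap
 *	};
 *	(void) devmap_umem_setup(dhp, rsm_dip, &rsmmap_ops, cookie,
 *	    off, len, maxprot, 0, NULL);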
 *
 * Forced disconnect of import segments:
 *
 * When an exported segment is unpublished, the exporter sends a forced
 * disconnect message to all its importers. The importer segments are
 * unloaded and disconnected. This involves unloading the original
 * mappings and remapping to a preallocated kernel trash page. This is
 * done by devmap_umem_remap. The trash/dummy page is a kernel page,
 * preallocated by the kernel agent during attach using ddi_umem_alloc with
 * the DDI_UMEM_TRASH flag set. This avoids a core dump in the application
 * due to unloading of the original mappings.
 *
 * Additionally every segment has a mapping generation number associated
 * with it. This is an entry in the barrier generation page, created
 * during attach time. This mapping generation number for the import
 * segments is incremented on a force disconnect to notify the application
 * of the force disconnect. On this notification, the application needs
 * to reconnect the segment to establish a new legitimate mapping.
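 *
 * Sketched from the application's point of view (illustrative only;
 * the variable names are assumptions), the notification check amounts
 * to comparing generation numbers:
 *
 *	rsm_gnum_t saved = *gnum_va;	sampled when the mapping is made
 *	...
 *	if (*gnum_va != saved)
 *		the mapping was force disconnected; reconnect and remap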
 *
 *
 * Locks used in the kernel agent:
 * -------------------------------
 *
 * The kernel agent uses a variety of mutexes and condition variables for
 * mutual exclusion of the shared data structures and for synchronization
 * between the various threads. Some of the locks are described as follows.
 *
 * Each resource structure, which represents either an export/import segment
 * has a lock associated with it. The lock is the resource mutex, rsmrc_lock.
 * This is used directly by RSMRC_LOCK and RSMRC_UNLOCK macros and in the
 * rsmseglock_acquire and rsmseglock_release macros. An additional
 * lock called the rsmsi_lock is used for the shared import data structure
 * that is relevant for resources representing import segments. There is
 * also a condition variable associated with the resource called s_cv. This
 * is used to wait for events like the segment state change etc.
 *
 * The resource structures are allocated from a pool of resource structures,
 * called rsm_resource. This pool is protected via a reader-writer lock,
 * called rsmrc_lock.
 *
 * There are two separate hash tables, one for the export segments and
 * one for the import segments. The export segments are inserted into the
 * export segment hash table only after they have been published and the
 * import segments are inserted in the import segments list only after they
 * have successfully connected to an exported segment. These tables are
 * protected via reader-writer locks.
 *
 * Debug Support in the kernel agent:
 * ----------------------------------
 *
 * Debugging support in the kernel agent is provided by the following
 * macros.
 *
 * DBG_PRINTF((category, level, message)) is a macro which logs a debug
 * message to the kernel agent's debug buffer, rsmka_dbg. This debug buffer
 * can be viewed in kmdb as *rsmka_dbg/s. The message is logged based
 * on the definition of the category and level. All messages that belong to
 * the specified category (rsmdbg_category) and are of an equal or greater
 * severity than the specified level (rsmdbg_level) are logged. The message
 * is a string which uses the same formatting rules as the strings used in
 * printf.
 *
 * The category defines which component of the kernel agent has logged this
 * message. There are a number of categories that have been defined such as
 * RSM_KERNEL_AGENT, RSM_OPS, RSM_IMPORT, RSM_EXPORT etc. A macro,
 * DBG_ADDCATEGORY is used to add in another category to the currently
 * specified category value so that the component using this new category
 * can also effectively log debug messages. Thus, the category of a specific
 * message is some combination of the available categories and we can define
 * sub-categories if we want a finer level of granularity.
 *
 * The level defines the severity of the message. Different level values are
 * defined, with RSM_ERR being the most severe and RSM_DEBUG_VERBOSE being
 * the least severe (debug level is 0).
 *
 * DBG_DEFINE and DBG_DEFINE_STR are macros provided to declare a debug
 * variable or a string respectively.
 
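 *
 * For example, mirroring how the functions below use these macros:
 *
 *	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_IMPORT);
 *	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
 *	    "connect: segid=%d node=%d\n", segid, node));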
 *
 *
 * NOTES:
 *
 * Special Fork and Exec Handling:
 * -------------------------------
 *
 * The backing physical pages of an exported segment are always locked down.
 * Thus, there are two cases in which a process having exported segments
 * will cause a cpu to hang: (1) the process invokes exec; (2) a process
 * forks and invokes exit before the duped file descriptors for the export
 * segments are closed in the child process. The hang is caused because the
 * address space release algorithm in the Solaris VM subsystem is based on a
 * non-blocking loop which does not terminate while segments are locked
 * down. In addition to this, the Solaris VM subsystem lacks a callback
 * mechanism to the rsm kernel agent to allow unlocking these export
 * segment pages.
 *
 * In order to circumvent this problem, the kernel agent does the following.
 * The Solaris VM subsystem keeps memory segments in increasing order of
 * virtual addresses. Thus a special page (special_exit_offset) is allocated
 * by the kernel agent and is mmapped into the heap area of the process address
 * space (the mmap is done by the RSMAPI library). During the mmap processing
 * of this special page by the devmap infrastructure, a callback (the same
 * devmap context management callbacks discussed above) is registered for an
 * unmap.
 *
 * As discussed above, this page is processed by the Solaris address space
 * release code before any of the exported segments' pages (which are allocated
 * from high memory). It is during this processing that the unmap callback gets
 * called and this callback is responsible for force destroying the exported
 * segments and thus eliminating the problem of locked pages.
 *
 * Flow-control:
 * ------------
 *
 * A credit based flow control algorithm is used for messages whose
 * processing cannot be done in the interrupt context because it might
 * involve invoking rsmpi calls, or might take a long time to complete,
 * or might need to allocate resources. The algorithm operates on a per
 * path basis. To send a message the pathend needs to have a credit and
 * it consumes one for every message that is flow controlled. On the
 * receiving pathend the message is put on a msgbuf_queue and a task is
 * dispatched on the worker thread - recv_taskq where it is processed.
 * After processing the message, the receiving pathend dequeues the message,
 * and if it has processed more than RSMIPC_LOTSFREE_MSGBUFS messages, it
 * sends credits to the sender pathend.
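 *
 * In pseudocode (names here are illustrative, not the actual structure
 * layout):
 *
 *	sender:
 *		if (credits == 0)
 *			wait until the receiver returns credits;
 *		credits--;
 *		send the message;
 *	receiver:
 *		enqueue the message on msgbuf_queue;
 *		dispatch a task on recv_taskq to process it;
 *		after dequeueing, if more than RSMIPC_LOTSFREE_MSGBUFS
 *		messages have been processed, send credits to the sender;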
 *
 * RSM_DRTEST:
 * -----------
 *
 * This is used to enable DR testing using a test driver on test
 * platforms which do not support DR.
 *
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/vm.h>
#include <sys/uio.h>
#include <vm/seg.h>
#include <vm/page.h>
#include <sys/stat.h>

#include <sys/time.h>
#include <sys/errno.h>

#include <sys/file.h>
#include <sys/uio.h>
#include <sys/proc.h>
#include <sys/mman.h>
#include <sys/open.h>
#include <sys/atomic.h>
#include <sys/mem_config.h>


#include <sys/ddi.h>
#include <sys/devops.h>
#include <sys/ddidevmap.h>
#include <sys/sunddi.h>
#include <sys/esunddi.h>
#include <sys/ddi_impldefs.h>

#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/ddi_impldefs.h>

#include <sys/modctl.h>

#include <sys/policy.h>
#include <sys/types.h>
#include <sys/conf.h>
#include <sys/param.h>

#include <sys/taskq.h>

#include <sys/rsm/rsm_common.h>
#include <sys/rsm/rsmapi_common.h>
#include <sys/rsm/rsm.h>
#include <rsm_in.h>
#include <sys/rsm/rsmka_path_int.h>
#include <sys/rsm/rsmpi.h>

#include <sys/modctl.h>
#include <sys/debug.h>

#include <sys/tuneable.h>

#ifdef	RSM_DRTEST
extern int rsm_kphysm_setup_func_register(kphysm_setup_vector_t *vec,
		void *arg);
extern void rsm_kphysm_setup_func_unregister(kphysm_setup_vector_t *vec,
		void *arg);
#endif

extern void dbg_printf(int category, int level, char *fmt, ...);
extern void rsmka_pathmanager_init();
extern void rsmka_pathmanager_cleanup();
extern void rele_sendq_token();
extern rsm_addr_t get_remote_hwaddr(adapter_t *, rsm_node_id_t);
extern rsm_node_id_t get_remote_nodeid(adapter_t *, rsm_addr_t);
extern int rsmka_topology_ioctl(caddr_t, int, int);

extern pri_t maxclsyspri;
extern work_queue_t work_queue;
extern kmutex_t ipc_info_lock;
extern kmutex_t ipc_info_cvlock;
extern kcondvar_t ipc_info_cv;
extern kmutex_t path_hold_cvlock;
extern kcondvar_t path_hold_cv;

extern kmutex_t rsmka_buf_lock;

extern path_t *rsm_find_path(char *, int, rsm_addr_t);
extern adapter_t *rsmka_lookup_adapter(char *, int);
extern sendq_token_t *rsmka_get_sendq_token(rsm_node_id_t, sendq_token_t *);
extern boolean_t rsmka_do_path_active(path_t *, int);
extern boolean_t rsmka_check_node_alive(rsm_node_id_t);
extern void rsmka_release_adapter(adapter_t *);
extern void rsmka_enqueue_msgbuf(path_t *path, void *data);
extern void rsmka_dequeue_msgbuf(path_t *path);
extern msgbuf_elem_t *rsmka_gethead_msgbuf(path_t *path);
/* lint -w2 */

static int rsm_open(dev_t *, int, int, cred_t *);
static int rsm_close(dev_t, int, int, cred_t *);
static int rsm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode,
    cred_t *credp, int *rvalp);
static int rsm_devmap(dev_t, devmap_cookie_t, offset_t, size_t, size_t *,
    uint_t);
static int rsm_segmap(dev_t, off_t, struct as *, caddr_t *, off_t, uint_t,
    uint_t, uint_t, cred_t *);
static int rsm_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp);

static int rsm_info(dev_info_t *, ddi_info_cmd_t, void *, void **);
static int rsm_attach(dev_info_t *, ddi_attach_cmd_t);
static int rsm_detach(dev_info_t *, ddi_detach_cmd_t);

static int rsmipc_send(rsm_node_id_t, rsmipc_request_t *, rsmipc_reply_t *);
static void rsm_force_unload(rsm_node_id_t, rsm_memseg_id_t, boolean_t);
static void rsm_send_importer_disconnects(rsm_memseg_id_t, rsm_node_id_t);
static void rsm_send_republish(rsm_memseg_id_t, rsmapi_access_entry_t *, int,
				rsm_permission_t);
static void rsm_export_force_destroy(ddi_umem_cookie_t *);
static void rsmacl_free(rsmapi_access_entry_t *, int);
static void rsmpiacl_free(rsm_access_entry_t *, int);

static int rsm_inc_pgcnt(pgcnt_t);
static void rsm_dec_pgcnt(pgcnt_t);
static void rsm_free_mapinfo(rsm_mapinfo_t *mapinfop);
static rsm_mapinfo_t *rsm_get_mapinfo(rsmseg_t *, off_t, size_t, off_t *,
					size_t *);
static void exporter_quiesce();
static void rsmseg_suspend(rsmseg_t *, int *);
static void rsmsegshare_suspend(rsmseg_t *);
static int rsmseg_resume(rsmseg_t *, void **);
static int rsmsegshare_resume(rsmseg_t *);

static struct cb_ops rsm_cb_ops = {
	rsm_open,		/* open */
	rsm_close,		/* close */
	nodev,			/* strategy */
	nodev,			/* print */
	nodev,			/* dump */
	nodev,			/* read */
	nodev,			/* write */
	rsm_ioctl,		/* ioctl */
	rsm_devmap,		/* devmap */
	NULL,			/* mmap */
	rsm_segmap,		/* segmap */
	rsm_chpoll,		/* poll */
	ddi_prop_op,		/* cb_prop_op */
	0,			/* streamtab  */
	D_NEW|D_MP|D_DEVMAP,	/* Driver compatibility flag */
	0,
	0,
	0
};

static struct dev_ops rsm_ops = {
	DEVO_REV,		/* devo_rev, */
	0,			/* refcnt  */
	rsm_info,		/* get_dev_info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	rsm_attach,		/* attach */
	rsm_detach,		/* detach */
	nodev,			/* reset */
	&rsm_cb_ops,		/* driver operations */
	(struct bus_ops *)0,	/* bus operations */
	0,
	ddi_quiesce_not_needed,		/* quiesce */
};

/*
 * Module linkage information for the kernel.
 */

static struct modldrv modldrv = {
	&mod_driverops, /* Type of module.  This one is a pseudo driver */
	"Remote Shared Memory Driver",
	&rsm_ops,	/* driver ops */
};

static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modldrv,
	0,
	0,
	0
};

static void rsm_dr_callback_post_add(void *arg, pgcnt_t delta);
static int rsm_dr_callback_pre_del(void *arg, pgcnt_t delta);
static void rsm_dr_callback_post_del(void *arg, pgcnt_t delta, int cancelled);

static kphysm_setup_vector_t rsm_dr_callback_vec = {
	KPHYSM_SETUP_VECTOR_VERSION,
	rsm_dr_callback_post_add,
	rsm_dr_callback_pre_del,
	rsm_dr_callback_post_del
};

/* This flag can be changed to 0 to help with PIT testing */
int rsmka_modunloadok = 1;
int no_reply_cnt = 0;

uint64_t rsm_ctrlmsg_errcnt = 0;
uint64_t rsm_ipcsend_errcnt = 0;

#define	MAX_NODES 64

static struct rsm_driver_data rsm_drv_data;
static struct rsmresource_table rsm_resource;

static void rsmresource_insert(minor_t, rsmresource_t *, rsm_resource_type_t);
static void rsmresource_destroy(void);
static int rsmresource_alloc(minor_t *);
static rsmresource_t *rsmresource_free(minor_t rnum);
static int rsm_closeconnection(rsmseg_t *seg, void **cookie);
static int rsm_unpublish(rsmseg_t *seg, int mode);
static int rsm_unbind(rsmseg_t *seg);
static uint_t rsmhash(rsm_memseg_id_t key);
static void rsmhash_alloc(rsmhash_table_t *rhash, int size);
static void rsmhash_free(rsmhash_table_t *rhash, int size);
static void *rsmhash_getbkt(rsmhash_table_t *rhash, uint_t hashval);
static void **rsmhash_bktaddr(rsmhash_table_t *rhash, uint_t hashval);
static int rsm_send_notimporting(rsm_node_id_t dest, rsm_memseg_id_t segid,
					void *cookie);
int rsm_disconnect(rsmseg_t *seg);
void rsmseg_unload(rsmseg_t *);
void rsm_suspend_complete(rsm_node_id_t src_node, int flag);

rsm_intr_hand_ret_t rsm_srv_func(rsm_controller_object_t *chd,
    rsm_intr_q_op_t opcode, rsm_addr_t src,
    void *data, size_t size, rsm_intr_hand_arg_t arg);

static void rsm_intr_callback(void *, rsm_addr_t, rsm_intr_hand_arg_t);

rsm_node_id_t my_nodeid;

/* cookie, va, offsets and length for the barrier */
static rsm_gnum_t		*bar_va;
static ddi_umem_cookie_t	bar_cookie;
static off_t			barrier_offset;
static size_t			barrier_size;
static int			max_segs;

/* cookie for the trash memory */
static ddi_umem_cookie_t	remap_cookie;

static rsm_memseg_id_t	rsm_nextavail_segmentid;

extern taskq_t *work_taskq;
extern char *taskq_name;

static dev_info_t *rsm_dip;	/* private copy of devinfo pointer */

static rsmhash_table_t rsm_export_segs;		/* list of exported segs */
rsmhash_table_t rsm_import_segs;		/* list of imported segs */
static rsmhash_table_t rsm_event_queues;	/* list of event queues */

static	rsm_ipc_t	rsm_ipc;		/* ipc info */

/* list of nodes to which RSMIPC_MSG_SUSPEND has been sent */
static list_head_t	rsm_suspend_list;

/* list of descriptors for remote importers */
static importers_table_t importer_list;

kmutex_t rsm_suspend_cvlock;
kcondvar_t rsm_suspend_cv;

static kmutex_t rsm_lock;

adapter_t loopback_adapter;
rsm_controller_attr_t loopback_attr;

int rsmipc_send_controlmsg(path_t *path, int msgtype);

void rsmka_init_loopback();

int rsmka_null_seg_create(
    rsm_controller_handle_t,
    rsm_memseg_export_handle_t *,
    size_t,
    uint_t,
    rsm_memory_local_t *,
    rsm_resource_callback_t,
    rsm_resource_callback_arg_t);

int rsmka_null_seg_destroy(
    rsm_memseg_export_handle_t);

int rsmka_null_bind(
    rsm_memseg_export_handle_t,
    off_t,
    rsm_memory_local_t *,
    rsm_resource_callback_t,
    rsm_resource_callback_arg_t);

int rsmka_null_unbind(
    rsm_memseg_export_handle_t,
    off_t,
    size_t);

int rsmka_null_rebind(
    rsm_memseg_export_handle_t,
    off_t,
    rsm_memory_local_t *,
    rsm_resource_callback_t,
    rsm_resource_callback_arg_t);

int rsmka_null_publish(
    rsm_memseg_export_handle_t,
    rsm_access_entry_t [],
    uint_t,
    rsm_memseg_id_t,
    rsm_resource_callback_t,
    rsm_resource_callback_arg_t);


int rsmka_null_republish(
    rsm_memseg_export_handle_t,
    rsm_access_entry_t [],
    uint_t,
    rsm_resource_callback_t,
    rsm_resource_callback_arg_t);

int rsmka_null_unpublish(
    rsm_memseg_export_handle_t);

rsm_ops_t null_rsmpi_ops;

/*
 * data and locks to keep track of total amount of exported memory
 */
static	pgcnt_t		rsm_pgcnt;
static	pgcnt_t		rsm_pgcnt_max;	/* max allowed */
static	kmutex_t	rsm_pgcnt_lock;

static	int		rsm_enable_dr;

static	char		loopback_str[] = "loopback";

int		rsm_hash_size;

/*
 * The locking model is as follows:
 *
 * Local operations:
 *		find resource - grab reader lock on resource list
 *		insert rc     - grab writer lock
 *		delete rc     - grab writer lock and resource mutex
 *		read/write    - no lock
 *
 * Remote invocations:
 *		find resource - grab read lock and resource mutex
 *
 * State:
 *		resource state - grab resource mutex
 */
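
/*
 * As a sketch of the local find-resource pattern above (this mirrors
 * rsmresource_lookup below; RSM_LOCK is the caller's request to have the
 * resource mutex held on return):
 *
 *	rw_enter(&rsm_resource.rsmrc_lock, RW_READER);
 *	p = <look up the resource blk/slot for the minor number>;
 *	if ((p != NULL) && (lock == RSM_LOCK))
 *		mutex_enter(&p->rsmrc_lock);
 *	rw_exit(&rsm_resource.rsmrc_lock);
 */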

int
_init(void)
{
	int e;

	e = mod_install(&modlinkage);
	if (e != 0) {
		return (e);
	}

	mutex_init(&rsm_lock, NULL, MUTEX_DRIVER, NULL);

	mutex_init(&rsmka_buf_lock, NULL, MUTEX_DEFAULT, NULL);


	rw_init(&rsm_resource.rsmrc_lock, NULL, RW_DRIVER, NULL);

	rsm_hash_size = RSM_HASHSZ;

	rw_init(&rsm_export_segs.rsmhash_rw, NULL, RW_DRIVER, NULL);

	rw_init(&rsm_import_segs.rsmhash_rw, NULL, RW_DRIVER, NULL);

	mutex_init(&importer_list.lock, NULL, MUTEX_DRIVER, NULL);

	mutex_init(&rsm_ipc.lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&rsm_ipc.cv, NULL, CV_DRIVER, 0);

	mutex_init(&rsm_suspend_cvlock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&rsm_suspend_cv, NULL, CV_DRIVER, 0);

	mutex_init(&rsm_drv_data.drv_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&rsm_drv_data.drv_cv, NULL, CV_DRIVER, 0);

	rsm_ipc.count = RSMIPC_SZ;
	rsm_ipc.wanted = 0;
	rsm_ipc.sequence = 0;

	(void) mutex_init(&rsm_pgcnt_lock, NULL, MUTEX_DRIVER, NULL);

	for (e = 0; e < RSMIPC_SZ; e++) {
		rsmipc_slot_t *slot = &rsm_ipc.slots[e];

		RSMIPC_SET(slot, RSMIPC_FREE);
		mutex_init(&slot->rsmipc_lock, NULL, MUTEX_DRIVER, NULL);
		cv_init(&slot->rsmipc_cv, NULL, CV_DRIVER, 0);
	}

	/*
	 * Initialize the suspend message list
	 */
	rsm_suspend_list.list_head = NULL;
	mutex_init(&rsm_suspend_list.list_lock, NULL, MUTEX_DRIVER, NULL);

	/*
	 * It is assumed here that configuration data is available
	 * during system boot since _init may be called at that time.
	 */

	rsmka_pathmanager_init();

	DBG_PRINTF((RSM_KERNEL_AGENT, RSM_DEBUG_VERBOSE,
	    "rsm: _init done\n"));

	return (DDI_SUCCESS);

}

int
_info(struct modinfo *modinfop)
{

	return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
	int e;

	DBG_PRINTF((RSM_KERNEL_AGENT, RSM_DEBUG_VERBOSE,
	    "rsm: _fini enter\n"));

	/*
	 * The rsmka_modunloadok flag is simply used to help with
	 * the PIT testing. Make this flag 0 to disallow modunload.
	 */
	if (rsmka_modunloadok == 0)
		return (EBUSY);

	/* rsm_detach will be called as a result of mod_remove */
	e = mod_remove(&modlinkage);
	if (e) {
		DBG_PRINTF((RSM_KERNEL_AGENT, RSM_ERR,
		    "Unable to fini RSM %x\n", e));
		return (e);
	}

	rsmka_pathmanager_cleanup();

	rw_destroy(&rsm_resource.rsmrc_lock);

	rw_destroy(&rsm_export_segs.rsmhash_rw);
	rw_destroy(&rsm_import_segs.rsmhash_rw);
	rw_destroy(&rsm_event_queues.rsmhash_rw);

	mutex_destroy(&importer_list.lock);

	mutex_destroy(&rsm_ipc.lock);
	cv_destroy(&rsm_ipc.cv);

	(void) mutex_destroy(&rsm_suspend_list.list_lock);

	(void) mutex_destroy(&rsm_pgcnt_lock);

	DBG_PRINTF((RSM_KERNEL_AGENT, RSM_DEBUG_VERBOSE, "_fini done\n"));

	return (DDI_SUCCESS);

}

/*ARGSUSED1*/
static int
rsm_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
	minor_t	rnum;
	int	percent;
	int	ret;
	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_DDI);

	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_attach enter\n"));

	switch (cmd) {
	case DDI_ATTACH:
		break;
	case DDI_RESUME:
	default:
		DBG_PRINTF((category, RSM_ERR,
		    "rsm:rsm_attach - cmd not supported\n"));
		return (DDI_FAILURE);
	}

	if (rsm_dip != NULL) {
		DBG_PRINTF((category, RSM_ERR,
		    "rsm:rsm_attach - supports only "
		    "one instance\n"));
		return (DDI_FAILURE);
	}

	rsm_enable_dr = ddi_prop_get_int(DDI_DEV_T_ANY, devi,
	    DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
	    "enable-dynamic-reconfiguration", 1);

	mutex_enter(&rsm_drv_data.drv_lock);
	rsm_drv_data.drv_state = RSM_DRV_REG_PROCESSING;
	mutex_exit(&rsm_drv_data.drv_lock);

	if (rsm_enable_dr) {
#ifdef	RSM_DRTEST
		ret = rsm_kphysm_setup_func_register(&rsm_dr_callback_vec,
		    (void *)NULL);
#else
		ret = kphysm_setup_func_register(&rsm_dr_callback_vec,
		    (void *)NULL);
#endif
		if (ret != 0) {
			/*
			 * drv_lock is not held here; it was dropped
			 * after setting RSM_DRV_REG_PROCESSING above.
			 */
			cmn_err(CE_CONT, "rsm:rsm_attach - Dynamic "
			    "reconfiguration setup failed\n");
			return (DDI_FAILURE);
		}
	}

	mutex_enter(&rsm_drv_data.drv_lock);
	ASSERT(rsm_drv_data.drv_state == RSM_DRV_REG_PROCESSING);
	rsm_drv_data.drv_state = RSM_DRV_OK;
	cv_broadcast(&rsm_drv_data.drv_cv);
	mutex_exit(&rsm_drv_data.drv_lock);

	/*
	 * page_list_read_lock();
	 * xx_setup();
	 * page_list_read_unlock();
	 */

	rsm_hash_size = ddi_prop_get_int(DDI_DEV_T_ANY, devi,
	    DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
	    "segment-hashtable-size", RSM_HASHSZ);
	if (rsm_hash_size == 0) {
		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
		    "rsm: segment-hashtable-size in rsm.conf "
		    "must be greater than 0, defaulting to 128\n"));
		rsm_hash_size = RSM_HASHSZ;
	}

	DBG_PRINTF((category, RSM_DEBUG, "rsm_attach rsm_hash_size: %d\n",
	    rsm_hash_size));

	rsm_pgcnt = 0;

	percent = ddi_prop_get_int(DDI_DEV_T_ANY, devi,
	    DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
	    "max-exported-memory", 0);
	if (percent < 0) {
		DBG_PRINTF((category, RSM_ERR,
		    "rsm:rsm_attach not enough memory available to "
		    "export, or max-exported-memory set incorrectly.\n"));
		return (DDI_FAILURE);
	}
	/* 0 indicates no fixed upper limit. maxmem is the max	*/
	/* available pageable physical mem			*/
	rsm_pgcnt_max = (percent*maxmem)/100;

	if (rsm_pgcnt_max > 0) {
		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
		    "rsm: Available physical memory = %lu pages, "
		    "Max exportable memory = %lu pages",
		    maxmem, rsm_pgcnt_max));
	}

	/*
	 * Create minor number
	 */
	if (rsmresource_alloc(&rnum) != RSM_SUCCESS) {
		DBG_PRINTF((category, RSM_ERR,
		    "rsm: rsm_attach - Unable to get "
		    "minor number\n"));
		return (DDI_FAILURE);
	}

	ASSERT(rnum == RSM_DRIVER_MINOR);

	if (ddi_create_minor_node(devi, DRIVER_NAME, S_IFCHR,
	    rnum, DDI_PSEUDO, NULL) == DDI_FAILURE) {
		DBG_PRINTF((category, RSM_ERR,
		    "rsm: rsm_attach - unable to allocate "
		    "minor #\n"));
		return (DDI_FAILURE);
	}

	rsm_dip = devi;
	/*
	 * Allocate the hashtables
	 */
	rsmhash_alloc(&rsm_export_segs, rsm_hash_size);
	rsmhash_alloc(&rsm_import_segs, rsm_hash_size);

	importer_list.bucket = (importing_token_t **)
	    kmem_zalloc(rsm_hash_size * sizeof (importing_token_t *), KM_SLEEP);

	/*
	 * Allocate a resource struct
	 */
	{
		rsmresource_t *p;

		p = (rsmresource_t *)kmem_zalloc(sizeof (*p), KM_SLEEP);

		mutex_init(&p->rsmrc_lock, NULL, MUTEX_DRIVER, (void *) NULL);

		rsmresource_insert(rnum, p, RSM_RESOURCE_BAR);
	}

	/*
	 * Based on the rsm.conf property max-segments, determine the maximum
	 * number of segments that can be exported/imported. This is then used
	 * to determine the size for barrier failure pages.
	 */

	/* First get the max number of segments from the rsm.conf file */
	max_segs = ddi_prop_get_int(DDI_DEV_T_ANY, devi,
	    DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
	    "max-segments", 0);
	if (max_segs == 0) {
		/* Use default number of segments */
		max_segs = RSM_MAX_NUM_SEG;
	}

	/*
	 * Based on the max number of segments allowed, determine the barrier
	 * page size. Add 1 to max_segs since the barrier page itself uses
	 * a slot.
	 */
	barrier_size = roundup((max_segs + 1) * sizeof (rsm_gnum_t),
	    PAGESIZE);

	/*
	 * allocation of the barrier failure page
	 */
	bar_va = (rsm_gnum_t *)ddi_umem_alloc(barrier_size,
	    DDI_UMEM_SLEEP, &bar_cookie);

	/*
	 * Set the barrier_offset
	 */
	barrier_offset = 0;

	/*
	 * Allocate trash memory and get a cookie for it. This will be used
	 * when remapping segments during force disconnects. Allocate the
	 * trash memory with a large size which is page aligned.
	 */
	(void) ddi_umem_alloc((size_t)TRASHSIZE,
	    DDI_UMEM_TRASH, &remap_cookie);

	/* initialize user segment id allocation variable */
	rsm_nextavail_segmentid = (rsm_memseg_id_t)RSM_USER_APP_ID_BASE;

	/*
	 * initialize the null_rsmpi_ops vector and the loopback adapter
	 */
	rsmka_init_loopback();


	ddi_report_dev(devi);

	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_attach done\n"));

	return (DDI_SUCCESS);
}

/*
 * The call to mod_remove in the _fini routine will cause the system
 * to call rsm_detach
 */
/*ARGSUSED*/
static int
rsm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_DDI);

	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_detach enter\n"));

	switch (cmd) {
	case DDI_DETACH:
		break;
	default:
		DBG_PRINTF((category, RSM_ERR,
		    "rsm:rsm_detach - cmd %x not supported\n",
		    cmd));
		return (DDI_FAILURE);
	}

	mutex_enter(&rsm_drv_data.drv_lock);
	while (rsm_drv_data.drv_state != RSM_DRV_OK)
		cv_wait(&rsm_drv_data.drv_cv, &rsm_drv_data.drv_lock);
	rsm_drv_data.drv_state = RSM_DRV_UNREG_PROCESSING;
	mutex_exit(&rsm_drv_data.drv_lock);

	/*
	 * Unregister the DR callback functions
	 */
	if (rsm_enable_dr) {
#ifdef	RSM_DRTEST
		rsm_kphysm_setup_func_unregister(&rsm_dr_callback_vec,
		    (void *)NULL);
#else
		kphysm_setup_func_unregister(&rsm_dr_callback_vec,
		    (void *)NULL);
#endif
	}

	mutex_enter(&rsm_drv_data.drv_lock);
	ASSERT(rsm_drv_data.drv_state == RSM_DRV_UNREG_PROCESSING);
	rsm_drv_data.drv_state = RSM_DRV_NEW;
	mutex_exit(&rsm_drv_data.drv_lock);

	ASSERT(rsm_suspend_list.list_head == NULL);

	/*
	 * Release all resources, seglist, controller, ...
	 */

	/* remove intersend queues */
	/* remove registered services */


	ddi_remove_minor_node(dip, DRIVER_NAME);
	rsm_dip = NULL;

	/*
	 * Free minor zero resource
	 */
	{
		rsmresource_t *p;

		p = rsmresource_free(RSM_DRIVER_MINOR);
		if (p) {
			mutex_destroy(&p->rsmrc_lock);
			kmem_free((void *)p, sizeof (*p));
		}
	}

	/*
	 * Free resource table
	 */

	rsmresource_destroy();

	/*
	 * Free the hash tables
	 */
	rsmhash_free(&rsm_export_segs, rsm_hash_size);
	rsmhash_free(&rsm_import_segs, rsm_hash_size);

	kmem_free((void *)importer_list.bucket,
	    rsm_hash_size * sizeof (importing_token_t *));
	importer_list.bucket = NULL;


	/* free barrier page */
	if (bar_cookie != NULL) {
		ddi_umem_free(bar_cookie);
	}
	bar_va = NULL;
	bar_cookie = NULL;

	/*
	 * Free the memory allocated for the trash
	 */
	if (remap_cookie != NULL) {
		ddi_umem_free(remap_cookie);
	}
	remap_cookie = NULL;

	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_detach done\n"));

	return (DDI_SUCCESS);
}

/*ARGSUSED*/
static int
rsm_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	register int error;
	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_DDI);

	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_info enter\n"));

	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		if (rsm_dip == NULL)
			error = DDI_FAILURE;
		else {
			*result = (void *)rsm_dip;
			error = DDI_SUCCESS;
		}
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
	}

	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_info done\n"));
	return (error);
}

adapter_t *
rsm_getadapter(rsm_ioctlmsg_t *msg, int mode)
{
	adapter_t *adapter;
	char adapter_devname[MAXNAMELEN];
	int instance;
	DBG_DEFINE(category,
	    RSM_KERNEL_AGENT | RSM_IMPORT | RSM_EXPORT | RSM_IOCTL);

	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_getadapter enter\n"));

	instance = msg->cnum;

	if ((msg->cname_len <= 0) || (msg->cname_len > MAXNAMELEN)) {
		return (NULL);
	}

	if (ddi_copyin(msg->cname, adapter_devname, msg->cname_len, mode))
		return (NULL);

	if (strcmp(adapter_devname, "loopback") == 0)
		return (&loopback_adapter);

	adapter = rsmka_lookup_adapter(adapter_devname, instance);

	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_getadapter done\n"));

	return (adapter);
}


/*
 * *********************** Resource Number Management ********************
 * All resources are stored in a simple hash table. The table is an array
 * of pointers to resource blks. Each blk contains:
 *	base	- base number of this blk
 *	used	- number of used slots in this blk.
 *	blks    - array of pointers to resource items.
 * An entry in a resource blk is empty if it's NULL.
 *
 * We start with no resource array. Each time we run out of slots, we
 * reallocate a new larger array, copy the existing pointers into it, and
 * allocate a new resource blk which is added to the hash table.
 *
 * The resource control block contains:
 *      root    - array of pointer of resource blks
 *      sz      - current size of array.
 *      len     - last valid entry in array.
 *
 * A search operation based on a resource number is as follows:
 *      index = rnum / RESOURCE_BLKSZ;
 *      ASSERT(index < resource_block.len);
 *      ASSERT(index < resource_block.sz);
 *	offset = rnum % RESOURCE_BLKSZ;
 *      ASSERT(offset >= resource_block.root[index]->base);
 *	ASSERT(offset < resource_block.root[index]->base + RESOURCE_BLKSZ);
 *	return resource_block.root[index]->blks[offset];
 *
 * A resource blk is freed when its used count reaches zero.
 */
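
/*
 * Freeing reverses the search: the slot is NULLed, the blk's avail count
 * is incremented, and the whole blk is released once every slot is free
 * again (a sketch of rsmresource_free below):
 *
 *	blk->rsmrcblk_blks[offset] = NULL;
 *	blk->rsmrcblk_avail++;
 *	if (blk->rsmrcblk_avail == RSMRC_BLKSZ) {
 *		kmem_free(blk, sizeof (*blk));
 *		rsm_resource.rsmrc_root[index] = NULL;
 *	}
 */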
static int
rsmresource_alloc(minor_t *rnum)
{

	/* search for available resource slot */
	int i, j, empty = -1;
	rsmresource_blk_t *blk;

	DBG_PRINTF((RSM_KERNEL_ALL, RSM_DEBUG_VERBOSE,
	    "rsmresource_alloc enter\n"));

	rw_enter(&rsm_resource.rsmrc_lock, RW_WRITER);

	/* Try to find an empty slot */
	for (i = 0; i < rsm_resource.rsmrc_len; i++) {
		blk = rsm_resource.rsmrc_root[i];
		if (blk != NULL && blk->rsmrcblk_avail > 0) {
			/* found an empty slot in this blk */
			for (j = 0; j < RSMRC_BLKSZ; j++) {
				if (blk->rsmrcblk_blks[j] == NULL) {
					*rnum = (minor_t)
					    (j + (i * RSMRC_BLKSZ));
					/*
					 * obey gen page limits
					 */
					if (*rnum >= max_segs + 1) {
						if (empty < 0) {
							rw_exit(&rsm_resource.
							    rsmrc_lock);
							DBG_PRINTF((
							    RSM_KERNEL_ALL,
							    RSM_ERR,
							    "rsmresource"
							    "_alloc failed:"
							    "not enough res"
							    "%d\n", *rnum));
					return (RSMERR_INSUFFICIENT_RESOURCES);
						} else {
							/* use empty slot */
							break;
						}

					}

					blk->rsmrcblk_blks[j] = RSMRC_RESERVED;
					blk->rsmrcblk_avail--;
					rw_exit(&rsm_resource.rsmrc_lock);
					DBG_PRINTF((RSM_KERNEL_ALL,
					    RSM_DEBUG_VERBOSE,
					    "rsmresource_alloc done\n"));
					return (RSM_SUCCESS);
				}
			}
		} else if (blk == NULL && empty < 0) {
			/* remember first empty slot */
			empty = i;
		}
	}

	/* Couldn't find anything, allocate a new blk */
	/*
	 * Do we need to reallocate the root array
	 */
	if (empty < 0) {
		if (rsm_resource.rsmrc_len == rsm_resource.rsmrc_sz) {
			/*
			 * Allocate new array and copy current stuff into it
			 */
			rsmresource_blk_t	**p;
			uint_t newsz = (uint_t)rsm_resource.rsmrc_sz +
			    RSMRC_BLKSZ;
			/*
			 * Don't allocate more than the max valid rnum
			 */
			if (rsm_resource.rsmrc_len*RSMRC_BLKSZ >=
			    max_segs + 1) {
				rw_exit(&rsm_resource.rsmrc_lock);
				return (RSMERR_INSUFFICIENT_RESOURCES);
			}

			p = (rsmresource_blk_t **)kmem_zalloc(
			    newsz * sizeof (*p),
			    KM_SLEEP);

			if (rsm_resource.rsmrc_root) {
				uint_t oldsz;

				oldsz = (uint_t)(rsm_resource.rsmrc_sz *
				    (int)sizeof (*p));

				/*
				 * Copy old data into new space and
				 * free old stuff
				 */
				bcopy(rsm_resource.rsmrc_root, p, oldsz);
				kmem_free(rsm_resource.rsmrc_root, oldsz);
			}

			rsm_resource.rsmrc_root = p;
			rsm_resource.rsmrc_sz = (int)newsz;
		}

		empty = rsm_resource.rsmrc_len;
		rsm_resource.rsmrc_len++;
	}

	/*
	 * Allocate a new blk
	 */
	blk = (rsmresource_blk_t *)kmem_zalloc(sizeof (*blk), KM_SLEEP);
	ASSERT(rsm_resource.rsmrc_root[empty] == NULL);
	rsm_resource.rsmrc_root[empty] = blk;
	blk->rsmrcblk_avail = RSMRC_BLKSZ - 1;

	/*
	 * Allocate slot
	 */

	*rnum = (minor_t)(empty * RSMRC_BLKSZ);

	/*
	 * watch out not to exceed bounds of barrier page
	 */
	if (*rnum >= max_segs + 1) {
		rw_exit(&rsm_resource.rsmrc_lock);
		DBG_PRINTF((RSM_KERNEL_ALL, RSM_ERR,
		    "rsmresource_alloc failed %d\n", *rnum));

		return (RSMERR_INSUFFICIENT_RESOURCES);
	}
	blk->rsmrcblk_blks[0] = RSMRC_RESERVED;


	rw_exit(&rsm_resource.rsmrc_lock);

	DBG_PRINTF((RSM_KERNEL_ALL, RSM_DEBUG_VERBOSE,
	    "rsmresource_alloc done\n"));

	return (RSM_SUCCESS);
}

static rsmresource_t *
rsmresource_free(minor_t rnum)
{

	/* search for available resource slot */
	int i, j;
	rsmresource_blk_t *blk;
	rsmresource_t *p;

	DBG_PRINTF((RSM_KERNEL_ALL, RSM_DEBUG_VERBOSE,
	    "rsmresource_free enter\n"));

	i = (int)(rnum / RSMRC_BLKSZ);
	j = (int)(rnum % RSMRC_BLKSZ);

	if (i >= rsm_resource.rsmrc_len) {
		DBG_PRINTF((RSM_KERNEL_ALL, RSM_DEBUG_VERBOSE,
		    "rsmresource_free done\n"));
		return (NULL);
	}

	rw_enter(&rsm_resource.rsmrc_lock, RW_WRITER);

	ASSERT(rsm_resource.rsmrc_root);
	ASSERT(i < rsm_resource.rsmrc_len);
	ASSERT(i < rsm_resource.rsmrc_sz);
	blk = rsm_resource.rsmrc_root[i];
	if (blk == NULL) {
		rw_exit(&rsm_resource.rsmrc_lock);
		DBG_PRINTF((RSM_KERNEL_ALL, RSM_DEBUG_VERBOSE,
		    "rsmresource_free done\n"));
		return (NULL);
	}

	ASSERT(blk->rsmrcblk_blks[j]); /* reserved or full */

	p = blk->rsmrcblk_blks[j];
	if (p == RSMRC_RESERVED) {
		p = NULL;
	}

	blk->rsmrcblk_blks[j] = NULL;
	blk->rsmrcblk_avail++;
	if (blk->rsmrcblk_avail == RSMRC_BLKSZ) {
		/* free this blk */
		kmem_free(blk, sizeof (*blk));
		rsm_resource.rsmrc_root[i] = NULL;
	}

	rw_exit(&rsm_resource.rsmrc_lock);

	DBG_PRINTF((RSM_KERNEL_ALL, RSM_DEBUG_VERBOSE,
	    "rsmresource_free done\n"));

	return (p);
}

static rsmresource_t *
rsmresource_lookup(minor_t rnum, int lock)
{
	int i, j;
	rsmresource_blk_t *blk;
	rsmresource_t *p;

	DBG_PRINTF((RSM_KERNEL_ALL, RSM_DEBUG_VERBOSE,
	    "rsmresource_lookup enter\n"));

	/* Find resource and lock it in READER mode */
	/* search for available resource slot */

	i = (int)(rnum / RSMRC_BLKSZ);
	j = (int)(rnum % RSMRC_BLKSZ);

	if (i >= rsm_resource.rsmrc_len) {
		DBG_PRINTF((RSM_KERNEL_ALL, RSM_DEBUG_VERBOSE,
		    "rsmresource_lookup done\n"));
		return (NULL);
	}

	rw_enter(&rsm_resource.rsmrc_lock, RW_READER);

	blk = rsm_resource.rsmrc_root[i];
	if (blk != NULL) {
		ASSERT(i < rsm_resource.rsmrc_len);
		ASSERT(i < rsm_resource.rsmrc_sz);

		p = blk->rsmrcblk_blks[j];
		if (lock == RSM_LOCK) {
			if (p != RSMRC_RESERVED) {
				mutex_enter(&p->rsmrc_lock);
			} else {
				p = NULL;
			}
		}
	} else {
		p = NULL;
	}
	rw_exit(&rsm_resource.rsmrc_lock);

	DBG_PRINTF((RSM_KERNEL_ALL, RSM_DEBUG_VERBOSE,
	    "rsmresource_lookup done\n"));

	return (p);
}

static void
rsmresource_insert(minor_t rnum, rsmresource_t *p, rsm_resource_type_t type)
{
	/* Find resource and lock it in READER mode */
	/* Caller can upgrade if need be */
	/* search for available resource slot */
	int i, j;
	rsmresource_blk_t *blk;

	DBG_PRINTF((RSM_KERNEL_ALL, RSM_DEBUG_VERBOSE,
	    "rsmresource_insert enter\n"));

	i = (int)(rnum / RSMRC_BLKSZ);
	j = (int)(rnum % RSMRC_BLKSZ);

	p->rsmrc_type = type;
	p->rsmrc_num = rnum;

	rw_enter(&rsm_resource.rsmrc_lock, RW_READER);

	ASSERT(rsm_resource.rsmrc_root);
	ASSERT(i < rsm_resource.rsmrc_len);
	ASSERT(i < rsm_resource.rsmrc_sz);

	blk = rsm_resource.rsmrc_root[i];
	ASSERT(blk);

	ASSERT(blk->rsmrcblk_blks[j] == RSMRC_RESERVED);

	blk->rsmrcblk_blks[j] = p;

	DBG_PRINTF((RSM_KERNEL_ALL, RSM_DEBUG_VERBOSE,
	    "rsmresource_insert done\n"));

	rw_exit(&rsm_resource.rsmrc_lock);
}

static void
rsmresource_destroy()
{
	int i, j;

	DBG_PRINTF((RSM_KERNEL_ALL, RSM_DEBUG_VERBOSE,
	    "rsmresource_destroy enter\n"));

	rw_enter(&rsm_resource.rsmrc_lock, RW_WRITER);

	for (i = 0; i < rsm_resource.rsmrc_len; i++) {
		rsmresource_blk_t	*blk;

		blk = rsm_resource.rsmrc_root[i];
		if (blk == NULL) {
			continue;
		}
		for (j = 0; j < RSMRC_BLKSZ; j++) {
			if (blk->rsmrcblk_blks[j] != NULL) {
				DBG_PRINTF((RSM_KERNEL_ALL, RSM_DEBUG_VERBOSE,
				    "Not null slot %d, %lx\n", j,
				    (size_t)blk->rsmrcblk_blks[j]));
			}
		}
		kmem_free(blk, sizeof (*blk));
		rsm_resource.rsmrc_root[i] = NULL;
	}
	if (rsm_resource.rsmrc_root) {
		i = rsm_resource.rsmrc_sz * (int)sizeof (rsmresource_blk_t *);
		kmem_free(rsm_resource.rsmrc_root, (uint_t)i);
		rsm_resource.rsmrc_root = NULL;
		rsm_resource.rsmrc_len = 0;
		rsm_resource.rsmrc_sz = 0;
	}

	DBG_PRINTF((RSM_KERNEL_ALL, RSM_DEBUG_VERBOSE,
	    "rsmresource_destroy done\n"));

	rw_exit(&rsm_resource.rsmrc_lock);
}


/* ******************** Generic Key Hash Table Management ********* */
static rsmresource_t *
rsmhash_lookup(rsmhash_table_t *rhash, rsm_memseg_id_t key,
    rsm_resource_state_t state)
{
	rsmresource_t	*p;
	uint_t		hashval;
	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);

	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmhash_lookup enter\n"));

	hashval = rsmhash(key);

	DBG_PRINTF((category, RSM_DEBUG_LVL2, "rsmhash_lookup %u=%d\n",
	    key, hashval));

	rw_enter(&rhash->rsmhash_rw, RW_READER);

	p = (rsmresource_t *)rsmhash_getbkt(rhash, hashval);

	for (; p; p = p->rsmrc_next) {
		if (p->rsmrc_key == key) {
			/* acquire resource lock */
			RSMRC_LOCK(p);
			break;
		}
	}

	rw_exit(&rhash->rsmhash_rw);

	if (p != NULL && p->rsmrc_state != state) {
		/* state changed, release lock and return null */
		RSMRC_UNLOCK(p);
		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
		    "rsmhash_lookup done: state changed\n"));
		return (NULL);
	}

	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmhash_lookup done\n"));

	return (p);
}

static void
rsmhash_rm(rsmhash_table_t *rhash, rsmresource_t *rcelm)
{
	rsmresource_t		*p, **back;
	uint_t			hashval;
	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);

	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmhash_rm enter\n"));

	hashval = rsmhash(rcelm->rsmrc_key);

	DBG_PRINTF((category, RSM_DEBUG_LVL2, "rsmhash_rm %u=%d\n",
	    rcelm->rsmrc_key, hashval));

	/*
	 * It's ok not to find the segment.
	 */
	rw_enter(&rhash->rsmhash_rw, RW_WRITER);

	back = (rsmresource_t **)rsmhash_bktaddr(rhash, hashval);

	for (; (p = *back) != NULL;  back = &p->rsmrc_next) {
		if (p == rcelm) {
			*back = rcelm->rsmrc_next;
			break;
		}
	}

	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmhash_rm done\n"));

	rw_exit(&rhash->rsmhash_rw);
}

static int
rsmhash_add(rsmhash_table_t *rhash, rsmresource_t *new, rsm_memseg_id_t key,
    int dup_check, rsm_resource_state_t state)
{
	rsmresource_t	*p = NULL, **bktp;
	uint_t		hashval;
	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);

	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmhash_add enter\n"));

	/* lock table */
	rw_enter(&rhash->rsmhash_rw, RW_WRITER);

	/*
	 * If the current resource state is other than the state passed in
	 * then the resource is (probably) already on the list. E.g. for an
	 * import segment, if the state is not RSM_STATE_NEW then it's on
	 * the list already.
	 */
	RSMRC_LOCK(new);
	if (new->rsmrc_state != state) {
		RSMRC_UNLOCK(new);
		rw_exit(&rhash->rsmhash_rw);
		return (RSMERR_BAD_SEG_HNDL);
	}

	hashval = rsmhash(key);
	DBG_PRINTF((category, RSM_DEBUG_LVL2, "rsmhash_add %d\n", hashval));

	if (dup_check) {
		/*
		 * Used for checking export segments; don't want to have
		 * the same key used for multiple segments.
		 */

		p = (rsmresource_t *)rsmhash_getbkt(rhash, hashval);

		for (; p; p = p->rsmrc_next) {
			if (p->rsmrc_key == key) {
				RSMRC_UNLOCK(new);
				break;
			}
		}
	}

	if (p == NULL) {
		/* Key doesn't exist, add it */

		bktp = (rsmresource_t **)rsmhash_bktaddr(rhash, hashval);

		new->rsmrc_key = key;
		new->rsmrc_next = *bktp;
		*bktp = new;
	}

	rw_exit(&rhash->rsmhash_rw);

	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmhash_add done\n"));

	return (p == NULL ? RSM_SUCCESS : RSMERR_SEGID_IN_USE);
}

/*
 * XOR each byte of the key.
 */
static uint_t
rsmhash(rsm_memseg_id_t key)
{
	uint_t	hash = key;

	hash ^=  (key >> 8);
	hash ^=  (key >> 16);
	hash ^=  (key >> 24);

	return (hash % rsm_hash_size);

}
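
/*
 * Worked example (illustrative, with the default rsm_hash_size of 128):
 * for key 0x11223344, the folds give 0x11223344 ^ 0x00112233 ^ 0x00001122
 * ^ 0x00000011 = 0x11330044, and 0x11330044 % 128 = 0x44, i.e. bucket 68.
 */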

/*
 * generic function to get a specific bucket
 */
static void *
rsmhash_getbkt(rsmhash_table_t *rhash, uint_t hashval)
{

	if (rhash->bucket == NULL)
		return (NULL);
	else
		return ((void *)rhash->bucket[hashval]);
}

/*
 * generic function to get a specific bucket's address
 */
static void **
rsmhash_bktaddr(rsmhash_table_t *rhash, uint_t hashval)
{
	if (rhash->bucket == NULL)
		return (NULL);
	else
		return ((void **)&(rhash->bucket[hashval]));
}

/*
 * generic function to alloc a hash table
 */
static void
rsmhash_alloc(rsmhash_table_t *rhash, int size)
{
	rhash->bucket = (rsmresource_t **)
	    kmem_zalloc(size * sizeof (rsmresource_t *), KM_SLEEP);
}

/*
 * generic function to free a hash table
 */
static void
rsmhash_free(rsmhash_table_t *rhash, int size)
{

	kmem_free((void *)rhash->bucket, size * sizeof (caddr_t));
	rhash->bucket = NULL;

}
/* *********************** Exported Segment Key Management ************ */

#define	rsmexport_add(new, key)		\
	rsmhash_add(&rsm_export_segs, (rsmresource_t *)new, key, 1, \
	    RSM_STATE_BIND)

#define	rsmexport_rm(arg)	\
	rsmhash_rm(&rsm_export_segs, (rsmresource_t *)(arg))

#define	rsmexport_lookup(key)	\
	(rsmseg_t *)rsmhash_lookup(&rsm_export_segs, key, RSM_STATE_EXPORT)

/* ************************** Import Segment List Management ********** */

/*
 *  Add segment to import list. This will be useful for paging and loopback
 * segment unloading.
 */
#define	rsmimport_add(arg, key)	\
	rsmhash_add(&rsm_import_segs, (rsmresource_t *)(arg), (key), 0, \
	    RSM_STATE_NEW)

#define	rsmimport_rm(arg)	\
	rsmhash_rm(&rsm_import_segs, (rsmresource_t *)(arg))

/*
 *	#define	rsmimport_lookup(key)	\
 *	(rsmseg_t *)rsmhash_lookup(&rsm_import_segs, (key), RSM_STATE_CONNECT)
 */

/*
 * Increase the ref count and make the import segment point to the
 * shared data structure. Returns a pointer to the shared data structure,
 * which is locked upon return.
 */
static rsm_import_share_t *
rsmshare_get(rsm_memseg_id_t key, rsm_node_id_t node, adapter_t *adapter,
    rsmseg_t *segp)
{
	uint_t		hash;
	rsmresource_t		*p;
	rsm_import_share_t	*shdatap;
	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);

	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmshare_get enter\n"));

	hash = rsmhash(key);
	/* lock table */
	rw_enter(&rsm_import_segs.rsmhash_rw, RW_WRITER);
	DBG_PRINTF((category, RSM_DEBUG_LVL2, "rsmshare_get:key=%u, hash=%d\n",
	    key, hash));

	p = (rsmresource_t *)rsmhash_getbkt(&rsm_import_segs, hash);

	for (; p; p = p->rsmrc_next) {
		/*
		 * Look for an entry that is importing the same exporter
		 * with the share data structure allocated.
		 */
		if ((p->rsmrc_key == key) &&
		    (p->rsmrc_node == node) &&
		    (p->rsmrc_adapter == adapter) &&
		    (((rsmseg_t *)p)->s_share != NULL)) {
			shdatap = ((rsmseg_t *)p)->s_share;
			break;
		}
	}

	if (p == NULL) {
		/* we are the first importer, create the shared data struct */
		shdatap = kmem_zalloc(sizeof (rsm_import_share_t), KM_SLEEP);
		shdatap->rsmsi_state = RSMSI_STATE_NEW;
		shdatap->rsmsi_segid = key;
		shdatap->rsmsi_node = node;
		mutex_init(&shdatap->rsmsi_lock, NULL, MUTEX_DRIVER, NULL);
		cv_init(&shdatap->rsmsi_cv, NULL, CV_DRIVER, 0);
	}

	rsmseglock_acquire(segp);

	/* we grab the shared lock before returning from this function */
	mutex_enter(&shdatap->rsmsi_lock);

	shdatap->rsmsi_refcnt++;
	segp->s_share = shdatap;

	rsmseglock_release(segp);

	rw_exit(&rsm_import_segs.rsmhash_rw);

	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmshare_get done\n"));

	return (shdatap);
}

/*
 * the shared data structure should be locked before calling
 * rsmsharecv_signal().
 * Change the state and signal any waiting segments.
 */
void
rsmsharecv_signal(rsmseg_t *seg, int oldstate, int newstate)
{
	ASSERT(rsmsharelock_held(seg));

	if (seg->s_share->rsmsi_state == oldstate) {
		seg->s_share->rsmsi_state = newstate;
		cv_broadcast(&seg->s_share->rsmsi_cv);
	}
}

/*
 * Add to the hash table
 */
static void
importer_list_add(rsm_node_id_t node, rsm_memseg_id_t key, rsm_addr_t hwaddr,
    void *cookie)
{

	importing_token_t	*head;
	importing_token_t	*new_token;
	int			index;

	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);

	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "importer_list_add enter\n"));

	new_token = kmem_zalloc(sizeof (importing_token_t), KM_SLEEP);
	new_token->importing_node = node;
	new_token->key = key;
	new_token->import_segment_cookie = cookie;
	new_token->importing_adapter_hwaddr = hwaddr;

	index = rsmhash(key);

	mutex_enter(&importer_list.lock);

	head = importer_list.bucket[index];
	importer_list.bucket[index] = new_token;
	new_token->next = head;
	mutex_exit(&importer_list.lock);

	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "importer_list_add done\n"));
}

static void
importer_list_rm(rsm_node_id_t node,  rsm_memseg_id_t key, void *cookie)
{

	importing_token_t	*prev, *token = NULL;
	int			index;
	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);

	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "importer_list_rm enter\n"));

	index = rsmhash(key);

	mutex_enter(&importer_list.lock);

	token = importer_list.bucket[index];

	prev = token;
	while (token != NULL) {
		if (token->importing_node == node &&
		    token->import_segment_cookie == cookie) {
			if (prev == token)
				importer_list.bucket[index] = token->next;
			else
				prev->next = token->next;
			kmem_free((void *)token, sizeof (*token));
			break;
		} else {
			prev = token;
			token = token->next;
		}
	}

	mutex_exit(&importer_list.lock);

	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "importer_list_rm done\n"));


}

1824/* **************************Segment Structure Management ************* */
1825
1826/*
1827 * Free segment structure
1828 */
1829static void
1830rsmseg_free(rsmseg_t *seg)
1831{
1832
1833	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);
1834
1835	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmseg_free enter\n"));
1836
1837	/* need to take seglock here to avoid race with rsmmap_unmap() */
1838	rsmseglock_acquire(seg);
1839	if (seg->s_ckl != NULL) {
1840		/* Segment is still busy */
1841		seg->s_state = RSM_STATE_END;
1842		rsmseglock_release(seg);
1843		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
1844		    "rsmseg_free done\n"));
1845		return;
1846	}
1847
1848	rsmseglock_release(seg);
1849
1850	ASSERT(seg->s_state == RSM_STATE_END || seg->s_state == RSM_STATE_NEW);
1851
1852	/*
1853	 * If it's an importer, decrement the refcount and, if it drops
1854	 * to zero, free the shared data structure.
1855	 * This is also where failures during rsm_connect() are unrefcounted.
1856	 */
1857	if (seg->s_share != NULL) {
1858
1859		ASSERT(seg->s_type == RSM_RESOURCE_IMPORT_SEGMENT);
1860
1861		rsmsharelock_acquire(seg);
1862
1863		ASSERT(seg->s_share->rsmsi_refcnt > 0);
1864
1865		seg->s_share->rsmsi_refcnt--;
1866
1867		if (seg->s_share->rsmsi_refcnt == 0) {
1868			rsmsharelock_release(seg);
1869			mutex_destroy(&seg->s_share->rsmsi_lock);
1870			cv_destroy(&seg->s_share->rsmsi_cv);
1871			kmem_free((void *)(seg->s_share),
1872			    sizeof (rsm_import_share_t));
1873		} else {
1874			rsmsharelock_release(seg);
1875		}
1876		/*
1877		 * The following needs to be done after any
1878		 * rsmsharelock calls which use seg->s_share.
1879		 */
1880		seg->s_share = NULL;
1881	}
1882
1883	cv_destroy(&seg->s_cv);
1884	mutex_destroy(&seg->s_lock);
1885	rsmacl_free(seg->s_acl, seg->s_acl_len);
1886	rsmpiacl_free(seg->s_acl_in, seg->s_acl_len);
1887	if (seg->s_adapter)
1888		rsmka_release_adapter(seg->s_adapter);
1889
1890	kmem_free((void *)seg, sizeof (*seg));
1891
1892	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmseg_free done\n"));
1893
1894}
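/*
 * Refcounting sketch for s_share (illustrative, restating the code in
 * rsmshare_get() and rsmseg_free() above): each importing segment of
 * the same (key, node, adapter) triple holds one reference on a single
 * shared rsm_import_share_t; the rsmseg_free() call that drops
 * rsmsi_refcnt to zero destroys rsmsi_lock/rsmsi_cv and frees the
 * structure, so two importers of the same triple share exactly one.
 */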
1895
1896
1897static rsmseg_t *
1898rsmseg_alloc(minor_t num, struct cred *cred)
1899{
1900	rsmseg_t	*new;
1901	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);
1902
1903	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmseg_alloc enter\n"));
1904	/*
1905	 * allocate memory for new segment. This should be a segkmem cache.
1906	 */
1907	new = (rsmseg_t *)kmem_zalloc(sizeof (*new), KM_SLEEP);
1908
1909	new->s_state = RSM_STATE_NEW;
1910	new->s_minor	= num;
1911	new->s_acl_len	= 0;
1912	new->s_cookie = NULL;
1913	new->s_adapter = NULL;
1914
1915	new->s_mode = 0777 & ~PTOU((ttoproc(curthread)))->u_cmask;
1916	/* we don't have a key yet, will set at export/connect */
1917	new->s_uid  = crgetuid(cred);
1918	new->s_gid  = crgetgid(cred);
1919
1920	mutex_init(&new->s_lock, NULL, MUTEX_DRIVER, (void *)NULL);
1921	cv_init(&new->s_cv, NULL, CV_DRIVER, 0);
1922
1923	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmseg_alloc done\n"));
1924
1925	return (new);
1926}
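/*
 * Note on the s_mode computation in rsmseg_alloc() (worked example):
 * the default access mode is derived from the process file-creation
 * mask, so with a umask of 022 the segment gets
 * s_mode = 0777 & ~022 = 0755.
 */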
1927
1928/* ******************************** Driver Open/Close/Poll *************** */
1929
1930/*ARGSUSED1*/
1931static int
1932rsm_open(dev_t *devp, int flag, int otyp, struct cred *cred)
1933{
1934	minor_t rnum;
1935	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL| RSM_DDI);
1936
1937	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_open enter\n"));
1938	/*
1939	 * Char only
1940	 */
1941	if (otyp != OTYP_CHR) {
1942		DBG_PRINTF((category, RSM_ERR, "rsm_open: bad otyp\n"));
1943		return (EINVAL);
1944	}
1945
1946	/*
1947	 * Only zero can be opened, clones are used for resources.
1948	 */
1949	if (getminor(*devp) != RSM_DRIVER_MINOR) {
1950		DBG_PRINTF((category, RSM_ERR,
1951		    "rsm_open: bad minor %d\n", getminor(*devp)));
1952		return (ENODEV);
1953	}
1954
1955	if ((flag & FEXCL) != 0 && secpolicy_excl_open(cred) != 0) {
1956		DBG_PRINTF((category, RSM_ERR, "rsm_open: bad perm\n"));
1957		return (EPERM);
1958	}
1959
1960	if (!(flag & FWRITE)) {
1961		/*
1962		 * The library function _rsm_librsm_init calls open for
1963		 * /dev/rsm with flag set to O_RDONLY.  We want a valid
1964		 * file descriptor to be returned for minor device zero.
1965		 */
1966
1967		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
1968		    "rsm_open RDONLY done\n"));
1969		return (DDI_SUCCESS);
1970	}
1971
1972	/*
1973	 * - allocate new minor number and segment.
1974	 * - add segment to list of all segments.
1975	 * - set minordev data to segment
1976	 * - update devp argument to new device
1977	 * - update s_cred to cred; make sure you do crhold(cred);
1978	 */
1979
1980	/* allocate a new resource number */
1981	if (rsmresource_alloc(&rnum) == RSM_SUCCESS) {
1982		/*
1983		 * We will bind this minor to a specific resource in first
1984		 * ioctl
1985		 */
1986		*devp = makedevice(getmajor(*devp), rnum);
1987	} else {
1988		return (EAGAIN);
1989	}
1990
1991	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_open done\n"));
1992	return (DDI_SUCCESS);
1993}
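/*
 * Illustrative user-level view of the open semantics above (a sketch,
 * not library source): an O_RDONLY open of /dev/rsm simply returns a
 * descriptor on minor zero, while an O_RDWR open is cloned onto a
 * fresh minor that the first ioctl binds to a resource:
 *
 *	int fd = open("/dev/rsm", O_RDWR);
 *
 * where fd now refers to the newly allocated resource number (rnum).
 */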
1994
1995static void
1996rsmseg_close(rsmseg_t *seg, int force_flag)
1997{
1998	int e = RSM_SUCCESS;
1999
2000	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL| RSM_DDI);
2001
2002	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmseg_close enter\n"));
2003
2004	rsmseglock_acquire(seg);
2005	if (!force_flag && (seg->s_hdr.rsmrc_type ==
2006	    RSM_RESOURCE_EXPORT_SEGMENT)) {
2007		/*
2008		 * If we are processing rsm_close, wait for force_destroy
2009		 * processing to complete, since it needs to finish before
2010		 * we can free the segment.
2011		 * force_destroy applies only to export segments.
2012		 */
2013		while (seg->s_flags & RSM_FORCE_DESTROY_WAIT) {
2014			cv_wait(&seg->s_cv, &seg->s_lock);
2015		}
2016	}
2017	rsmseglock_release(seg);
2018
2019	/* It's ok to read the state without a lock */
2020	switch (seg->s_state) {
2021	case RSM_STATE_EXPORT:
2022	case RSM_STATE_EXPORT_QUIESCING:
2023	case RSM_STATE_EXPORT_QUIESCED:
2024		e = rsm_unpublish(seg, 1);
2025		/* FALLTHRU */
2026	case RSM_STATE_BIND_QUIESCED:
2027		/* FALLTHRU */
2028	case RSM_STATE_BIND:
2029		e = rsm_unbind(seg);
2030		if (e != RSM_SUCCESS && force_flag == 1)
2031			return;
2032		ASSERT(seg->s_hdr.rsmrc_type == RSM_RESOURCE_EXPORT_SEGMENT);
2033		/* FALLTHRU */
2034	case RSM_STATE_NEW_QUIESCED:
2035		rsmseglock_acquire(seg);
2036		seg->s_state = RSM_STATE_NEW;
2037		cv_broadcast(&seg->s_cv);
2038		rsmseglock_release(seg);
2039		break;
2040	case RSM_STATE_NEW:
2041		break;
2042	case RSM_STATE_ZOMBIE:
2043		/*
2044		 * Segments in this state have been removed from the
2045		 * exported segments list and have been unpublished
2046		 * and unbound. These segments were removed during
2047		 * a callback to rsm_export_force_destroy, which
2048		 * is called for the purpose of unlocking these
2049		 * exported memory segments when a process exits but
2050		 * leaves the segments locked down, since rsm_close
2051		 * is not called for the segments. This can happen
2052		 * when a process calls fork or exec and then exits.
2053		 * Once the segments are in the ZOMBIE state, all that
2054		 * remains is to destroy them when rsm_close is called.
2055		 * This is done here. Thus, for such segments the
2056		 * state is changed to NEW so that later in this
2057		 * function rsmseg_free is called.
2058		 */
2059		rsmseglock_acquire(seg);
2060		seg->s_state = RSM_STATE_NEW;
2061		rsmseglock_release(seg);
2062		break;
2063	case RSM_STATE_MAP_QUIESCE:
2064	case RSM_STATE_ACTIVE:
2065		/* Disconnect will handle the unmap */
2066	case RSM_STATE_CONN_QUIESCE:
2067	case RSM_STATE_CONNECT:
2068	case RSM_STATE_DISCONNECT:
2069		ASSERT(seg->s_hdr.rsmrc_type == RSM_RESOURCE_IMPORT_SEGMENT);
2070		(void) rsm_disconnect(seg);
2071		break;
2072	case RSM_STATE_MAPPING:
2073		/*FALLTHRU*/
2074	case RSM_STATE_END:
2075		DBG_PRINTF((category, RSM_ERR,
2076		    "Invalid segment state %d in rsm_close\n", seg->s_state));
2077		break;
2078	default:
2079		DBG_PRINTF((category, RSM_ERR,
2080		    "Invalid segment state %d in rsm_close\n", seg->s_state));
2081		break;
2082	}
2083
2084	/*
2085	 * check state.
2086	 * - make sure you do crfree(s_cred);
2087	 * release segment and minor number
2088	 */
2089	ASSERT(seg->s_state == RSM_STATE_NEW);
2090
2091	/*
2092	 * The export_force_destroy callback exists to unlock the exported
2093	 * segments of a process when the process does a fork or exec and
2094	 * then exits. It calls this function with the force flag set to 1,
2095	 * which indicates that the segment state must be converted to
2096	 * ZOMBIE. This state means that the segments still exist, have
2097	 * been unlocked, and, most importantly, that the only operation
2098	 * allowed on them is destruction on an rsm_close.
2099	 */
2100	if (force_flag) {
2101		rsmseglock_acquire(seg);
2102		seg->s_state = RSM_STATE_ZOMBIE;
2103		rsmseglock_release(seg);
2104	} else {
2105		rsmseg_free(seg);
2106	}
2107
2108	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmseg_close done\n"));
2109}
2110
2111static int
2112rsm_close(dev_t dev, int flag, int otyp, cred_t *cred)
2113{
2114	minor_t	rnum = getminor(dev);
2115	rsmresource_t *res;
2116	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL| RSM_DDI);
2117
2118	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_close enter\n"));
2119
2120	flag = flag; cred = cred;
2121
2122	if (otyp != OTYP_CHR)
2123		return (EINVAL);
2124
2125	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rnum = %d\n", rnum));
2126
2127	/*
2128	 * At this point we are the last reference to the resource.
2129	 * Free resource number from resource table.
2130	 * It's ok to remove number before we free the segment.
2131	 * We need to lock the resource to protect against remote calls.
2132	 */
2133	if (rnum == RSM_DRIVER_MINOR ||
2134	    (res = rsmresource_free(rnum)) == NULL) {
2135		DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_close done\n"));
2136		return (DDI_SUCCESS);
2137	}
2138
2139	switch (res->rsmrc_type) {
2140	case RSM_RESOURCE_EXPORT_SEGMENT:
2141	case RSM_RESOURCE_IMPORT_SEGMENT:
2142		rsmseg_close((rsmseg_t *)res, 0);
2143		break;
2144	case RSM_RESOURCE_BAR:
2145		DBG_PRINTF((category, RSM_ERR, "bad resource in rsm_close\n"));
2146		break;
2147	default:
2148		break;
2149	}
2150
2151	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_close done\n"));
2152
2153	return (DDI_SUCCESS);
2154}
2155
2156/*
2157 * rsm_inc_pgcnt
2158 *
2159 * Description: increment rsm page counter.
2160 *
2161 * Parameters:	pgcnt_t	pnum;	number of pages to be used
2162 *
2163 *	Returns:	RSM_SUCCESS	if memory limit not exceeded
2164 *		RSMERR_INSUFFICIENT_MEM	if memory limit exceeded. In
2165 *				this case, the page counter remains unchanged.
2166 *
2167 */
2168static int
2169rsm_inc_pgcnt(pgcnt_t pnum)
2170{
2171	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT);
2172	if (rsm_pgcnt_max == 0) { /* no upper limit has been set */
2173		return (RSM_SUCCESS);
2174	}
2175
2176	mutex_enter(&rsm_pgcnt_lock);
2177
2178	if (rsm_pgcnt + pnum > rsm_pgcnt_max) {
2179		/* ensure that limits have not been exceeded */
2180		mutex_exit(&rsm_pgcnt_lock);
2181		return (RSMERR_INSUFFICIENT_MEM);
2182	}
2183
2184	rsm_pgcnt += pnum;
2185	DBG_PRINTF((category, RSM_DEBUG, "rsm_pgcnt incr to %d.\n",
2186	    rsm_pgcnt));
2187	mutex_exit(&rsm_pgcnt_lock);
2188
2189	return (RSM_SUCCESS);
2190}
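/*
 * Usage sketch for the page-count accounting (mirroring rsm_bind_pages
 * and rsm_unbind_pages below): every successful rsm_inc_pgcnt must be
 * balanced by an rsm_dec_pgcnt with the same page count.
 *
 *	pgcnt_t pnum = btopr(len);
 *	if (rsm_inc_pgcnt(pnum) != RSM_SUCCESS)
 *		return (RSMERR_INSUFFICIENT_MEM);
 *	...
 *	rsm_dec_pgcnt(pnum);	(on unbind, or if the memory lock fails)
 */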
2191
2192/*
2193 * rsm_dec_pgcnt
2194 *
2195 * Description:	decrement rsm page counter.
2196 *
2197 * Parameters:	pgcnt_t	pnum;	number of pages freed
2198 *
2199 */
2200static void
2201rsm_dec_pgcnt(pgcnt_t pnum)
2202{
2203	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT);
2204
2205	if (rsm_pgcnt_max == 0) { /* no upper limit has been set */
2206		return;
2207	}
2208
2209	mutex_enter(&rsm_pgcnt_lock);
2210	ASSERT(rsm_pgcnt >= pnum);
2211	rsm_pgcnt -= pnum;
2212	DBG_PRINTF((category, RSM_DEBUG, "rsm_pgcnt decr to %d.\n",
2213	    rsm_pgcnt));
2214	mutex_exit(&rsm_pgcnt_lock);
2215}
2216
2217static struct umem_callback_ops rsm_as_ops = {
2218	UMEM_CALLBACK_VERSION, /* version number */
2219	rsm_export_force_destroy,
2220};
2221
2222static int
2223rsm_bind_pages(ddi_umem_cookie_t *cookie, caddr_t vaddr, size_t len,
2224    proc_t *procp)
2225{
2226	int error = RSM_SUCCESS;
2227	ulong_t pnum;
2228	struct umem_callback_ops *callbackops = &rsm_as_ops;
2229
2230	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT);
2231
2232	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_bind_pages enter\n"));
2233
2234	/*
2235	 * Make sure vaddr and len are aligned on a page boundary
2236	 */
2237	if ((uintptr_t)vaddr & (PAGESIZE - 1)) {
2238		return (RSMERR_BAD_ADDR);
2239	}
2240
2241	if (len & (PAGESIZE - 1)) {
2242		return (RSMERR_BAD_LENGTH);
2243	}
2244
2245	/*
2246	 * Find number of pages
2247	 */
2248	pnum = btopr(len);
2249	error = rsm_inc_pgcnt(pnum);
2250	if (error != RSM_SUCCESS) {
2251		DBG_PRINTF((category, RSM_ERR,
2252		    "rsm_bind_pages:mem limit exceeded\n"));
2253		return (RSMERR_INSUFFICIENT_MEM);
2254	}
2255
2256	error = umem_lockmemory(vaddr, len,
2257	    DDI_UMEMLOCK_WRITE|DDI_UMEMLOCK_READ|DDI_UMEMLOCK_LONGTERM,
2258	    cookie,
2259	    callbackops, procp);
2260
2261	if (error) {
2262		rsm_dec_pgcnt(pnum);
2263		DBG_PRINTF((category, RSM_ERR,
2264		    "rsm_bind_pages:ddi_umem_lock failed\n"));
2265		/*
2266		 * ddi_umem_lock, in the case of failure, returns one of
2267		 * the following three errors. These are translated into
2268		 * the RSMERR namespace and returned.
2269		 */
2270		if (error == EFAULT)
2271			return (RSMERR_BAD_ADDR);
2272		else if (error == EACCES)
2273			return (RSMERR_PERM_DENIED);
2274		else
2275			return (RSMERR_INSUFFICIENT_MEM);
2276	}
2277
2278	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_bind_pages done\n"));
2279
2280	return (error);
2281
2282}
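/*
 * Worked example of the alignment checks above (assuming an 8K
 * PAGESIZE, e.g. on sun4u): vaddr 0x10002000 with len 0x4000 passes
 * and btopr(0x4000) accounts 2 pages, while vaddr 0x10001000 fails
 * with RSMERR_BAD_ADDR and len 0x3000 fails with RSMERR_BAD_LENGTH.
 */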
2283
2284static int
2285rsm_unbind_pages(rsmseg_t *seg)
2286{
2287	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT);
2288
2289	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_unbind_pages enter\n"));
2290
2291	ASSERT(rsmseglock_held(seg));
2292
2293	if (seg->s_cookie != NULL) {
2294		/* unlock address range */
2295		ddi_umem_unlock(seg->s_cookie);
2296		rsm_dec_pgcnt(btopr(seg->s_len));
2297		seg->s_cookie = NULL;
2298	}
2299
2300	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_unbind_pages done\n"));
2301
2302	return (RSM_SUCCESS);
2303}
2304
2305
2306static int
2307rsm_bind(rsmseg_t *seg, rsm_ioctlmsg_t *msg, intptr_t dataptr, int mode)
2308{
2309	int e;
2310	adapter_t *adapter;
2311	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT);
2312
2313	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_bind enter\n"));
2314
2315	adapter = rsm_getadapter(msg, mode);
2316	if (adapter == NULL) {
2317		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
2318		    "rsm_bind done:no adapter\n"));
2319		return (RSMERR_CTLR_NOT_PRESENT);
2320	}
2321
2322	/* lock address range */
2323	if (msg->vaddr == NULL) {
2324		rsmka_release_adapter(adapter);
2325		DBG_PRINTF((category, RSM_ERR,
2326		    "rsm: rsm_bind done: invalid vaddr\n"));
2327		return (RSMERR_BAD_ADDR);
2328	}
2329	if (msg->len <= 0) {
2330		rsmka_release_adapter(adapter);
2331		DBG_PRINTF((category, RSM_ERR,
2332		    "rsm_bind: invalid length\n"));
2333		return (RSMERR_BAD_LENGTH);
2334	}
2335
2336	/* Lock segment */
2337	rsmseglock_acquire(seg);
2338
2339	while (seg->s_state == RSM_STATE_NEW_QUIESCED) {
2340		if (cv_wait_sig(&seg->s_cv, &seg->s_lock) == 0) {
2341			DBG_PRINTF((category, RSM_DEBUG,
2342			    "rsm_bind done: cv_wait INTERRUPTED"));
2343			rsmka_release_adapter(adapter);
2344			rsmseglock_release(seg);
2345			return (RSMERR_INTERRUPTED);
2346		}
2347	}
2348
2349	ASSERT(seg->s_state == RSM_STATE_NEW);
2350
2351	ASSERT(seg->s_cookie == NULL);
2352
2353	e = rsm_bind_pages(&seg->s_cookie, msg->vaddr, msg->len, curproc);
2354	if (e == RSM_SUCCESS) {
2355		seg->s_flags |= RSM_USER_MEMORY;
2356		if (msg->perm & RSM_ALLOW_REBIND) {
2357			seg->s_flags |= RSMKA_ALLOW_UNBIND_REBIND;
2358		}
2359		if (msg->perm & RSM_CREATE_SEG_DONTWAIT) {
2360			seg->s_flags |= RSMKA_SET_RESOURCE_DONTWAIT;
2361		}
2362		seg->s_region.r_vaddr = msg->vaddr;
2363		/*
2364		 * Set the s_pid value in the segment structure. This is used
2365		 * to identify exported segments belonging to a particular
2366		 * process so that when the process exits, these segments can
2367		 * be unlocked forcefully even if rsm_close is not called on
2368		 * process exit since there may be other processes referencing
2369		 * them (for example on a fork or exec).
2370		 * The s_pid value is also used to authenticate the process
2371		 * doing a publish or unpublish on the export segment. Only
2372		 * the creator of the export segment has a right to do a
2373		 * publish or unpublish and unbind on the segment.
2374		 */
2375		seg->s_pid = ddi_get_pid();
2376		seg->s_len = msg->len;
2377		seg->s_state = RSM_STATE_BIND;
2378		seg->s_adapter = adapter;
2379		seg->s_proc = curproc;
2380	} else {
2381		rsmka_release_adapter(adapter);
2382		DBG_PRINTF((category, RSM_WARNING,
2383		    "unable to lock down pages\n"));
2384	}
2385
2386	msg->rnum = seg->s_minor;
2387	/* Unlock segment */
2388	rsmseglock_release(seg);
2389
2390	if (e == RSM_SUCCESS) {
2391		/* copyout the resource number */
2392#ifdef _MULTI_DATAMODEL
2393		if ((mode & DATAMODEL_MASK) == DATAMODEL_ILP32) {
2394			rsm_ioctlmsg32_t msg32;
2395
2396			msg32.rnum = msg->rnum;
2397			if (ddi_copyout((caddr_t)&msg32.rnum,
2398			    (caddr_t)&((rsm_ioctlmsg32_t *)dataptr)->rnum,
2399			    sizeof (minor_t), mode)) {
2400				rsmka_release_adapter(adapter);
2401				e = RSMERR_BAD_ADDR;
2402			}
2403		} else
2404#endif
2405		if (ddi_copyout((caddr_t)&msg->rnum,
2406		    (caddr_t)&((rsm_ioctlmsg_t *)dataptr)->rnum,
2407		    sizeof (minor_t), mode)) {
2408			rsmka_release_adapter(adapter);
2409			e = RSMERR_BAD_ADDR;
2410		}
2411	}
2412
2413	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_bind done\n"));
2414
2415	return (e);
2416}
2417
2418static void
2419rsm_remap_local_importers(rsm_node_id_t src_nodeid,
2420    rsm_memseg_id_t ex_segid,
2421    ddi_umem_cookie_t cookie)
2422
2423{
2424	rsmresource_t	*p = NULL;
2425	rsmhash_table_t *rhash = &rsm_import_segs;
2426	uint_t		index;
2427
2428	DBG_PRINTF((RSM_KERNEL_AGENT | RSM_FUNC_ALL, RSM_DEBUG_VERBOSE,
2429	    "rsm_remap_local_importers enter\n"));
2430
2431	index = rsmhash(ex_segid);
2432
2433	rw_enter(&rhash->rsmhash_rw, RW_READER);
2434
2435	p = rsmhash_getbkt(rhash, index);
2436
2437	for (; p; p = p->rsmrc_next) {
2438		rsmseg_t *seg = (rsmseg_t *)p;
2439		rsmseglock_acquire(seg);
2440		/*
2441		 * Change the s_cookie value of only the local importers
2442		 * which have been mapped (in state RSM_STATE_ACTIVE).
2443		 * Note that there is no need to change the s_cookie value
2444		 * if the imported segment is in RSM_STATE_MAPPING since
2445		 * eventually the s_cookie will be updated via the mapping
2446		 * functionality.
2447		 */
2448		if ((seg->s_segid == ex_segid) && (seg->s_node == src_nodeid) &&
2449		    (seg->s_state == RSM_STATE_ACTIVE)) {
2450			seg->s_cookie = cookie;
2451		}
2452		rsmseglock_release(seg);
2453	}
2454	rw_exit(&rhash->rsmhash_rw);
2455
2456	DBG_PRINTF((RSM_KERNEL_AGENT | RSM_FUNC_ALL, RSM_DEBUG_VERBOSE,
2457	    "rsm_remap_local_importers done\n"));
2458}
2459
2460static int
2461rsm_rebind(rsmseg_t *seg, rsm_ioctlmsg_t *msg)
2462{
2463	int e;
2464	adapter_t *adapter;
2465	ddi_umem_cookie_t cookie;
2466	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT);
2467
2468	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_rebind enter\n"));
2469
2470	/* Check for permissions to rebind */
2471	if (!(seg->s_flags & RSMKA_ALLOW_UNBIND_REBIND)) {
2472		return (RSMERR_REBIND_NOT_ALLOWED);
2473	}
2474
2475	if (seg->s_pid != ddi_get_pid() &&
2476	    ddi_get_pid() != 0) {
2477		DBG_PRINTF((category, RSM_ERR, "rsm_rebind: Not owner\n"));
2478		return (RSMERR_NOT_CREATOR);
2479	}
2480
2481	/*
2482	 * Partial rebind is not allowed; hence the length passed in
2483	 * must be the same as the segment length.
2484	 */
2485	if (msg->vaddr == NULL) {
2486		DBG_PRINTF((category, RSM_ERR,
2487		    "rsm_rebind done: null msg->vaddr\n"));
2488		return (RSMERR_BAD_ADDR);
2489	}
2490	if (msg->len != seg->s_len) {
2491		DBG_PRINTF((category, RSM_ERR,
2492		    "rsm_rebind: invalid length\n"));
2493		return (RSMERR_BAD_LENGTH);
2494	}
2495
2496	/* Lock segment */
2497	rsmseglock_acquire(seg);
2498
2499	while ((seg->s_state == RSM_STATE_BIND_QUIESCED) ||
2500	    (seg->s_state == RSM_STATE_EXPORT_QUIESCING) ||
2501	    (seg->s_state == RSM_STATE_EXPORT_QUIESCED)) {
2502		if (cv_wait_sig(&seg->s_cv, &seg->s_lock) == 0) {
2503			rsmseglock_release(seg);
2504			DBG_PRINTF((category, RSM_DEBUG,
2505			    "rsm_rebind done: cv_wait INTERRUPTED"));
2506			return (RSMERR_INTERRUPTED);
2507		}
2508	}
2509
2510	/* verify segment state */
2511	if ((seg->s_state != RSM_STATE_BIND) &&
2512	    (seg->s_state != RSM_STATE_EXPORT)) {
2513		/* Unlock segment */
2514		rsmseglock_release(seg);
2515		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
2516		    "rsm_rebind done: invalid state\n"));
2517		return (RSMERR_BAD_SEG_HNDL);
2518	}
2519
2520	ASSERT(seg->s_cookie != NULL);
2521
2522	if (msg->vaddr == seg->s_region.r_vaddr) {
2523		rsmseglock_release(seg);
2524		DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_rebind done\n"));
2525		return (RSM_SUCCESS);
2526	}
2527
2528	e = rsm_bind_pages(&cookie, msg->vaddr, msg->len, curproc);
2529	if (e == RSM_SUCCESS) {
2530		struct buf *xbuf;
2531		dev_t sdev = 0;
2532		rsm_memory_local_t mem;
2533
2534		xbuf = ddi_umem_iosetup(cookie, 0, msg->len, B_WRITE,
2535		    sdev, 0, NULL, DDI_UMEM_SLEEP);
2536		ASSERT(xbuf != NULL);
2537
2538		mem.ms_type = RSM_MEM_BUF;
2539		mem.ms_bp = xbuf;
2540
2541		adapter = seg->s_adapter;
2542		e = adapter->rsmpi_ops->rsm_rebind(
2543		    seg->s_handle.out, 0, &mem,
2544		    RSM_RESOURCE_DONTWAIT, NULL);
2545
2546		if (e == RSM_SUCCESS) {
2547			/*
2548			 * unbind the older pages, and unload local importers;
2549			 * but don't disconnect importers
2550			 */
2551			(void) rsm_unbind_pages(seg);
2552			seg->s_cookie = cookie;
2553			seg->s_region.r_vaddr = msg->vaddr;
2554			rsm_remap_local_importers(my_nodeid, seg->s_segid,
2555			    cookie);
2556		} else {
2557			/*
2558			 * Unbind the pages associated with "cookie" by the
2559			 * rsm_bind_pages call prior to this. This is
2560			 * similar to what is done in the rsm_unbind_pages
2561			 * routine for the seg->s_cookie.
2562			 */
2563			ddi_umem_unlock(cookie);
2564			rsm_dec_pgcnt(btopr(msg->len));
2565			DBG_PRINTF((category, RSM_ERR,
2566			    "rsm_rebind failed with %d\n", e));
2567		}
2568		/*
2569		 * At present there is no dependency on the existence of xbuf.
2570		 * So we can free it here. If in the future this changes, it can
2571		 * be freed sometime during the segment destroy.
2572		 */
2573		freerbuf(xbuf);
2574	}
2575
2576	/* Unlock segment */
2577	rsmseglock_release(seg);
2578
2579	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_rebind done\n"));
2580
2581	return (e);
2582}
2583
2584static int
2585rsm_unbind(rsmseg_t *seg)
2586{
2587	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT);
2588
2589	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_unbind enter\n"));
2590
2591	rsmseglock_acquire(seg);
2592
2593	/* verify segment state */
2594	if ((seg->s_state != RSM_STATE_BIND) &&
2595	    (seg->s_state != RSM_STATE_BIND_QUIESCED)) {
2596		rsmseglock_release(seg);
2597		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
2598		    "rsm_unbind: invalid state\n"));
2599		return (RSMERR_BAD_SEG_HNDL);
2600	}
2601
2602	/* unlock current range */
2603	(void) rsm_unbind_pages(seg);
2604
2605	if (seg->s_state == RSM_STATE_BIND) {
2606		seg->s_state = RSM_STATE_NEW;
2607	} else if (seg->s_state == RSM_STATE_BIND_QUIESCED) {
2608		seg->s_state = RSM_STATE_NEW_QUIESCED;
2609	}
2610
2611	rsmseglock_release(seg);
2612
2613	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_unbind done\n"));
2614
2615	return (RSM_SUCCESS);
2616}
2617
2618/* **************************** Exporter Access List Management ******* */
2619static void
2620rsmacl_free(rsmapi_access_entry_t *acl, int acl_len)
2621{
2622	int	acl_sz;
2623	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT);
2624
2625	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmacl_free enter\n"));
2626
2627	/* acl could be NULL */
2628
2629	if (acl != NULL && acl_len > 0) {
2630		acl_sz = acl_len * sizeof (rsmapi_access_entry_t);
2631		kmem_free((void *)acl, acl_sz);
2632	}
2633
2634	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmacl_free done\n"));
2635}
2636
2637static void
2638rsmpiacl_free(rsm_access_entry_t *acl, int acl_len)
2639{
2640	int	acl_sz;
2641	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT);
2642
2643	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmpiacl_free enter\n"));
2644
2645	if (acl != NULL && acl_len > 0) {
2646		acl_sz = acl_len * sizeof (rsm_access_entry_t);
2647		kmem_free((void *)acl, acl_sz);
2648	}
2649
2650	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmpiacl_free done\n"));
2651
2652}
2653
2654static int
2655rsmacl_build(rsm_ioctlmsg_t *msg, int mode,
2656    rsmapi_access_entry_t **list, int *len, int loopback)
2657{
2658	rsmapi_access_entry_t *acl;
2659	int	acl_len;
2660	int i;
2661	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT);
2662
2663	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmacl_build enter\n"));
2664
2665	*len = 0;
2666	*list = NULL;
2667
2668	acl_len = msg->acl_len;
2669	if ((loopback && acl_len > 1) || (acl_len < 0) ||
2670	    (acl_len > MAX_NODES)) {
2671		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
2672		    "rsmacl_build done: acl invalid\n"));
2673		return (RSMERR_BAD_ACL);
2674	}
2675
2676	if (acl_len > 0 && acl_len <= MAX_NODES) {
2677		size_t acl_size = acl_len * sizeof (rsmapi_access_entry_t);
2678
2679		acl = kmem_alloc(acl_size, KM_SLEEP);
2680
2681		if (ddi_copyin((caddr_t)msg->acl, (caddr_t)acl,
2682		    acl_size, mode)) {
2683			kmem_free((void *) acl, acl_size);
2684			DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
2685			    "rsmacl_build done: BAD_ADDR\n"));
2686			return (RSMERR_BAD_ADDR);
2687		}
2688
2689		/*
2690		 * Verify access list
2691		 */
2692		for (i = 0; i < acl_len; i++) {
2693			if (acl[i].ae_node > MAX_NODES ||
2694			    (loopback && (acl[i].ae_node != my_nodeid)) ||
2695			    acl[i].ae_permission > RSM_ACCESS_TRUSTED) {
2696				/* invalid entry */
2697				kmem_free((void *) acl, acl_size);
2698				DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
2699				    "rsmacl_build done: EINVAL\n"));
2700				return (RSMERR_BAD_ACL);
2701			}
2702		}
2703
2704		*len = acl_len;
2705		*list = acl;
2706	}
2707
2708	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmacl_build done\n"));
2709
2710	return (DDI_SUCCESS);
2711}
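/*
 * Illustrative ACL entry for the validation loop above (the node
 * number is an example): an entry such as { ae_node = 3,
 * ae_permission = RSM_PERM_RDWR } would be expected to pass, while an
 * ae_permission above RSM_ACCESS_TRUSTED, an ae_node beyond MAX_NODES,
 * or (on a loopback adapter) an ae_node other than my_nodeid is
 * rejected with RSMERR_BAD_ACL.
 */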
2712
2713static int
2714rsmpiacl_create(rsmapi_access_entry_t *src, rsm_access_entry_t **dest,
2715    int acl_len, adapter_t *adapter)
2716{
2717	rsm_access_entry_t *acl;
2718	rsm_addr_t hwaddr;
2719	int i;
2720	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT);
2721
2722	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmpiacl_create enter\n"));
2723
2724	if (src != NULL) {
2725		size_t acl_size = acl_len * sizeof (rsm_access_entry_t);
2726		acl = kmem_alloc(acl_size, KM_SLEEP);
2727
2728		/*
2729		 * translate access list
2730		 */
2731		for (i = 0; i < acl_len; i++) {
2732			if (src[i].ae_node == my_nodeid) {
2733				acl[i].ae_addr = adapter->hwaddr;
2734			} else {
2735				hwaddr = get_remote_hwaddr(adapter,
2736				    src[i].ae_node);
2737				if ((int64_t)hwaddr < 0) {
2738					/* invalid hwaddr */
2739					kmem_free((void *) acl, acl_size);
2740					DBG_PRINTF((category,
2741					    RSM_DEBUG_VERBOSE,
2742					    "rsmpiacl_create done:"
2743					    "EINVAL hwaddr\n"));
2744					return (RSMERR_INTERNAL_ERROR);
2745				}
2746				acl[i].ae_addr = hwaddr;
2747			}
2748			/* rsmpi understands only RSM_PERM_XXXX */
2749			acl[i].ae_permission =
2750			    src[i].ae_permission & RSM_PERM_RDWR;
2751		}
2752		*dest = acl;
2753	} else {
2754		*dest = NULL;
2755	}
2756
2757	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmpiacl_create done\n"));
2758
2759	return (RSM_SUCCESS);
2760}
2761
2762static int
2763rsmsegacl_validate(rsmipc_request_t *req, rsm_node_id_t rnode,
2764    rsmipc_reply_t *reply)
2765{
2766
2767	int		i;
2768	rsmseg_t	*seg;
2769	rsm_memseg_id_t key = req->rsmipc_key;
2770	rsm_permission_t perm = req->rsmipc_perm;
2771	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT);
2772
2773	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
2774	    "rsmsegacl_validate enter\n"));
2775
2776	/*
2777	 * Find the segment and grab its lock. The reason we grab the
2778	 * segment lock inside the search is to avoid a race in which the
2779	 * segment is deleted while we already hold a pointer to it.
2780	 */
2781	seg = rsmexport_lookup(key);
2782	if (!seg) {
2783		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
2784		    "rsmsegacl_validate done: %u ENXIO\n", key));
2785		return (RSMERR_SEG_NOT_PUBLISHED);
2786	}
2787
2788	ASSERT(rsmseglock_held(seg));
2789	ASSERT(seg->s_state == RSM_STATE_EXPORT);
2790
2791	/*
2792	 * We implement a 2-level protection scheme.
2793	 * First, we check if local/remote host has access rights.
2794	 * Second, we check if the user has access rights.
2795	 *
2796	 * This routine only validates the rnode access_list
2797	 */
2798	if (seg->s_acl_len > 0) {
2799		/*
2800		 * Check host access list
2801		 */
2802		ASSERT(seg->s_acl != NULL);
2803		for (i = 0; i < seg->s_acl_len; i++) {
2804			if (seg->s_acl[i].ae_node == rnode) {
2805				perm &= seg->s_acl[i].ae_permission;
2806				goto found;
2807			}
2808		}
2809		/* rnode is not found in the list */
2810		rsmseglock_release(seg);
2811		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
2812		    "rsmsegacl_validate done: EPERM\n"));
2813		return (RSMERR_SEG_NOT_PUBLISHED_TO_NODE);
2814	} else {
2815		/* use default owner creation umask */
2816		perm &= seg->s_mode;
2817	}
2818
2819found:
2820	/* update perm for this node */
2821	reply->rsmipc_mode = perm;
2822	reply->rsmipc_uid = seg->s_uid;
2823	reply->rsmipc_gid = seg->s_gid;
2824	reply->rsmipc_segid = seg->s_segid;
2825	reply->rsmipc_seglen = seg->s_len;
2826
2827	/*
2828	 * Perm of requesting node is valid; source will validate user
2829	 */
2830	rsmseglock_release(seg);
2831
2832	/*
2833	 * Add the importer to the list right away; if the connect fails,
2834	 * the importer will ask the exporter to remove it.
2835	 */
2836	importer_list_add(rnode, key, req->rsmipc_adapter_hwaddr,
2837	    req->rsmipc_segment_cookie);
2838
2839	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmsegacl_validate done\n"));
2840
2841	return (RSM_SUCCESS);
2842}
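/*
 * Worked example of the node-level check above (values illustrative):
 * if the segment was published with an ACL entry { ae_node = rnode,
 * ae_permission = RSM_PERM_READ } and the importer requested
 * RSM_PERM_RDWR, the reply carries perm = RSM_PERM_RDWR &
 * RSM_PERM_READ, i.e. read-only; the user-level (uid/gid) check is
 * left to the importing side, per the 2-level scheme.
 */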
2843
2844
2845/* ************************** Exporter Calls ************************* */
2846
2847static int
2848rsm_publish(rsmseg_t *seg, rsm_ioctlmsg_t *msg, intptr_t dataptr, int mode)
2849{
2850	int			e;
2851	int			acl_len;
2852	rsmapi_access_entry_t	*acl;
2853	rsm_access_entry_t	*rsmpi_acl;
2854	rsm_memory_local_t	mem;
2855	struct buf		*xbuf;
2856	dev_t 			sdev = 0;
2857	adapter_t		*adapter;
2858	rsm_memseg_id_t		segment_id = 0;
2859	int			loopback_flag = 0;
2860	int			create_flags = 0;
2861	rsm_resource_callback_t	callback_flag;
2862	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT);
2863
2864	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_publish enter\n"));
2865
2866	if (seg->s_adapter == &loopback_adapter)
2867		loopback_flag = 1;
2868
2869	if (seg->s_pid != ddi_get_pid() &&
2870	    ddi_get_pid() != 0) {
2871		DBG_PRINTF((category, RSM_ERR,
2872		    "rsm_publish: Not creator\n"));
2873		return (RSMERR_NOT_CREATOR);
2874	}
2875
2876	/*
2877	 * Get per node access list
2878	 */
2879	e = rsmacl_build(msg, mode, &acl, &acl_len, loopback_flag);
2880	if (e != DDI_SUCCESS) {
2881		DBG_PRINTF((category, RSM_ERR,
2882		    "rsm_publish done: rsmacl_build failed\n"));
2883		return (e);
2884	}
2885
2886	/*
2887	 * The application provided msg->key is used for resolving a
2888	 * segment id according to the following:
2889	 *    key = 0   		Kernel Agent selects the segment id
2890	 *    key <= RSM_DLPI_ID_END	Reserved for system usage except
2891	 *				RSMLIB range
2892	 *    key < RSM_USER_APP_ID_BASE segment id = key
2893	 *    key >= RSM_USER_APP_ID_BASE Reserved for KA selections
2894	 *
2895	 * rsm_nextavail_segmentid is initialized to 0x80000000 and
2896	 * overflows to zero after 0x80000000 allocations.
2897	 * An algorithm is needed which allows reinitialization and provides
2898	 * for reallocation after overflow.  For now, ENOMEM is returned
2899	 * once the overflow condition has occurred.
2900	 */
2901	if (msg->key == 0) {
2902		mutex_enter(&rsm_lock);
2903		segment_id = rsm_nextavail_segmentid;
2904		if (segment_id != 0) {
2905			rsm_nextavail_segmentid++;
2906			mutex_exit(&rsm_lock);
2907		} else {
2908			mutex_exit(&rsm_lock);
2909			DBG_PRINTF((category, RSM_ERR,
2910			    "rsm_publish done: no more keys avlbl\n"));
2911			return (RSMERR_INSUFFICIENT_RESOURCES);
2912		}
2913	} else	if BETWEEN(msg->key, RSM_RSMLIB_ID_BASE, RSM_RSMLIB_ID_END)
2914		/* range reserved for internal use by base/ndi libraries */
2915		segment_id = msg->key;
2916	else	if (msg->key <= RSM_DLPI_ID_END)
2917		return (RSMERR_RESERVED_SEGID);
2918	else if (msg->key <= (uint_t)RSM_USER_APP_ID_BASE -1)
2919		segment_id = msg->key;
2920	else {
2921		DBG_PRINTF((category, RSM_ERR,
2922		    "rsm_publish done: invalid key %u\n", msg->key));
2923		return (RSMERR_RESERVED_SEGID);
2924	}
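	/*
	 * Worked examples of the key resolution above (illustrative):
	 * key == 0 takes the next KA-selected id (starting at
	 * 0x80000000); a key within [RSM_RSMLIB_ID_BASE,
	 * RSM_RSMLIB_ID_END] is used as-is; any other key at or below
	 * RSM_DLPI_ID_END, or at or above RSM_USER_APP_ID_BASE, fails
	 * with RSMERR_RESERVED_SEGID.
	 */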
2925
2926	/* Add key to exportlist; The segment lock is held on success */
2927	e = rsmexport_add(seg, segment_id);
2928	if (e) {
2929		rsmacl_free(acl, acl_len);
2930		DBG_PRINTF((category, RSM_ERR,
2931		    "rsm_publish done: export_add failed: %d\n", e));
2932		return (e);
2933	}
2934
2935	seg->s_segid = segment_id;
2936
2937	if ((seg->s_state != RSM_STATE_BIND) &&
2938	    (seg->s_state != RSM_STATE_BIND_QUIESCED)) {
2939		/* state changed since then, free acl and return */
2940		rsmseglock_release(seg);
2941		rsmexport_rm(seg);
2942		rsmacl_free(acl, acl_len);
2943		DBG_PRINTF((category, RSM_ERR,
2944		    "rsm_publish done: segment in wrong state: %d\n",
2945		    seg->s_state));
2946		return (RSMERR_BAD_SEG_HNDL);
2947	}
2948
2949	/*
2950	 * If this is for a local memory handle and permissions are zero,
2951	 * then the surrogate segment is very large and we want to skip
2952	 * allocation of DVMA space.
2953	 *
2954	 * Careful!  If the user didn't use an ACL list, acl will be a NULL
2955	 * pointer.  Check that before dereferencing it.
2956	 */
2957	if (acl != (rsmapi_access_entry_t *)NULL) {
2958		if (acl[0].ae_node == my_nodeid && acl[0].ae_permission == 0)
2959			goto skipdriver;
2960	}
2961
2962	/* create segment  */
2963	xbuf = ddi_umem_iosetup(seg->s_cookie, 0, seg->s_len, B_WRITE,
2964	    sdev, 0, NULL, DDI_UMEM_SLEEP);
2965	ASSERT(xbuf != NULL);
2966
2967	mem.ms_type = RSM_MEM_BUF;
2968	mem.ms_bp = xbuf;
2969
2970	/* This call includes a bind operation */
2971
2972	adapter = seg->s_adapter;
2973	/*
2974	 * create an acl list with hwaddr for RSMPI publish
2975	 */
2976	e = rsmpiacl_create(acl, &rsmpi_acl, acl_len, adapter);
2977
2978	if (e != RSM_SUCCESS) {
2979		rsmseglock_release(seg);
2980		rsmexport_rm(seg);
2981		rsmacl_free(acl, acl_len);
2982		freerbuf(xbuf);
2983		DBG_PRINTF((category, RSM_ERR,
2984		    "rsm_publish done: rsmpiacl_create failed: %d\n", e));
2985		return (e);
2986	}
2987
2988	if (seg->s_state == RSM_STATE_BIND) {
2989		/* create segment  */
2990
2991		/* This call includes a bind operation */
2992
2993		if (seg->s_flags & RSMKA_ALLOW_UNBIND_REBIND) {
2994			create_flags = RSM_ALLOW_UNBIND_REBIND;
2995		}
2996
2997		if (seg->s_flags & RSMKA_SET_RESOURCE_DONTWAIT) {
2998			callback_flag  = RSM_RESOURCE_DONTWAIT;
2999		} else {
3000			callback_flag  = RSM_RESOURCE_SLEEP;
3001		}
3002
3003		e = adapter->rsmpi_ops->rsm_seg_create(
3004		    adapter->rsmpi_handle,
3005		    &seg->s_handle.out, seg->s_len,
3006		    create_flags, &mem,
3007		    callback_flag, NULL);
3008		/*
3009		 * At present there is no dependency on the existence of xbuf.
3010		 * So we can free it here. If in the future this changes, it can
3011		 * be freed sometime during the segment destroy.
3012		 */
3013		freerbuf(xbuf);
3014
3015		if (e != RSM_SUCCESS) {
3016			rsmseglock_release(seg);
3017			rsmexport_rm(seg);
3018			rsmacl_free(acl, acl_len);
3019			rsmpiacl_free(rsmpi_acl, acl_len);
3020			DBG_PRINTF((category, RSM_ERR,
3021			    "rsm_publish done: export_create failed: %d\n", e));
3022			/*
3023			 * The following assertion ensures that the two errors
3024			 * related to the length and its alignment do not occur
3025			 * since they have been checked during export_create
3026			 */
3027			ASSERT(e != RSMERR_BAD_MEM_ALIGNMENT &&
3028			    e != RSMERR_BAD_LENGTH);
3029			if (e == RSMERR_NOT_MEM)
3030				e = RSMERR_INSUFFICIENT_MEM;
3031
3032			return (e);
3033		}
3034		/* export segment, this should create an IMMU mapping */
3035		e = adapter->rsmpi_ops->rsm_publish(
3036		    seg->s_handle.out,
3037		    rsmpi_acl, acl_len,
3038		    seg->s_segid,
3039		    RSM_RESOURCE_DONTWAIT, NULL);
3040
3041		if (e != RSM_SUCCESS) {
3042			adapter->rsmpi_ops->rsm_seg_destroy(seg->s_handle.out);
3043			rsmseglock_release(seg);
3044			rsmexport_rm(seg);
3045			rsmacl_free(acl, acl_len);
3046			rsmpiacl_free(rsmpi_acl, acl_len);
3047			DBG_PRINTF((category, RSM_ERR,
3048			    "rsm_publish done: export_publish failed: %d\n",
3049			    e));
3050			return (e);
3051		}
3052	}
3053
3054	seg->s_acl_in = rsmpi_acl;
3055
3056skipdriver:
3057	/* set s_acl/s_acl_len only now, to avoid a crash in rsmseg_free */
3058	seg->s_acl_len	= acl_len;
3059	seg->s_acl	= acl;
3060
3061	if (seg->s_state == RSM_STATE_BIND) {
3062		seg->s_state = RSM_STATE_EXPORT;
3063	} else if (seg->s_state == RSM_STATE_BIND_QUIESCED) {
3064		seg->s_state = RSM_STATE_EXPORT_QUIESCED;
3065		cv_broadcast(&seg->s_cv);
3066	}
3067
3068	rsmseglock_release(seg);
3069
3070	/*
3071	 * If the segment id was solicited, then return it in
3072	 * the original incoming message.
3073	 */
3074	if (msg->key == 0) {
3075		msg->key = segment_id;
3076#ifdef _MULTI_DATAMODEL
3077		if ((mode & DATAMODEL_MASK) == DATAMODEL_ILP32) {
3078			rsm_ioctlmsg32_t msg32;
3079
3080			msg32.key = msg->key;
3081			DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
3082			    "rsm_publish done\n"));
3083			return (ddi_copyout((caddr_t)&msg32,
3084			    (caddr_t)dataptr, sizeof (msg32), mode));
3085		}
3086#endif
3087		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
3088		    "rsm_publish done\n"));
3089		return (ddi_copyout((caddr_t)msg,
3090		    (caddr_t)dataptr, sizeof (*msg), mode));
3091	}
3092
3093	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_publish done\n"));
3094	return (DDI_SUCCESS);
3095}
3096
3097/*
3098 * This function modifies the access control list of an already published
3099 * segment.  There is no effect on import segments which are already
3100 * connected.
3101 */
3102static int
3103rsm_republish(rsmseg_t *seg, rsm_ioctlmsg_t *msg, int mode)
3104{
3105	rsmapi_access_entry_t	*new_acl, *old_acl, *tmp_acl;
3106	rsm_access_entry_t	*rsmpi_new_acl, *rsmpi_old_acl;
3107	int			new_acl_len, old_acl_len, tmp_acl_len;
3108	int			e, i;
3109	adapter_t		*adapter;
3110	int			loopback_flag = 0;
3111	rsm_memseg_id_t		key;
3112	rsm_permission_t	permission;
3113	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT);
3114
3115	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_republish enter\n"));
3116
3117	if ((seg->s_state != RSM_STATE_EXPORT) &&
3118	    (seg->s_state != RSM_STATE_EXPORT_QUIESCED) &&
3119	    (seg->s_state != RSM_STATE_EXPORT_QUIESCING))
3120		return (RSMERR_SEG_NOT_PUBLISHED);
3121
3122	if (seg->s_pid != ddi_get_pid() &&
3123	    ddi_get_pid() != 0) {
3124		DBG_PRINTF((category, RSM_ERR,
3125		    "rsm_republish: Not owner\n"));
3126		return (RSMERR_NOT_CREATOR);
3127	}
3128
3129	if (seg->s_adapter == &loopback_adapter)
3130		loopback_flag = 1;
3131
3132	/*
3133	 * Build new list first
3134	 */
3135	e = rsmacl_build(msg, mode, &new_acl, &new_acl_len, loopback_flag);
3136	if (e) {
3137		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
3138		    "rsm_republish done: rsmacl_build failed %d", e));
3139		return (e);
3140	}
3141
3142	/* Lock segment */
3143	rsmseglock_acquire(seg);
3144	/*
3145	 * a republish is in progress - a REPUBLISH message is being
3146	 * sent to the importers, so wait for it to complete, OR
3147	 * wait till DR completes
3148	 */
3149	while (((seg->s_state == RSM_STATE_EXPORT) &&
3150	    (seg->s_flags & RSM_REPUBLISH_WAIT)) ||
3151	    (seg->s_state == RSM_STATE_EXPORT_QUIESCED) ||
3152	    (seg->s_state == RSM_STATE_EXPORT_QUIESCING)) {
3153		if (cv_wait_sig(&seg->s_cv, &seg->s_lock) == 0) {
3154			DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
3155			    "rsm_republish done: cv_wait  INTERRUPTED"));
3156			rsmseglock_release(seg);
3157			rsmacl_free(new_acl, new_acl_len);
3158			return (RSMERR_INTERRUPTED);
3159		}
3160	}
3161
3162	/* recheck if state is valid */
3163	if (seg->s_state != RSM_STATE_EXPORT) {
3164		rsmseglock_release(seg);
3165		rsmacl_free(new_acl, new_acl_len);
3166		return (RSMERR_SEG_NOT_PUBLISHED);
3167	}
3168
3169	key = seg->s_key;
3170	old_acl = seg->s_acl;
3171	old_acl_len = seg->s_acl_len;
3172
3173	seg->s_acl = new_acl;
3174	seg->s_acl_len = new_acl_len;
3175
3176	/*
3177	 * This call will only be meaningful if and when the interconnect
3178	 * layer makes use of the access list
3179	 */
3180	adapter = seg->s_adapter;
3181	/*
3182	 * create a acl list with hwaddr for RSMPI publish
3183	 */
3184	e = rsmpiacl_create(new_acl, &rsmpi_new_acl, new_acl_len, adapter);
3185
3186	if (e != RSM_SUCCESS) {
3187		seg->s_acl = old_acl;
3188		seg->s_acl_len = old_acl_len;
3189		rsmseglock_release(seg);
3190		rsmacl_free(new_acl, new_acl_len);
3191		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
3192		    "rsm_republish done: rsmpiacl_create failed %d", e));
3193		return (e);
3194	}
3195	rsmpi_old_acl = seg->s_acl_in;
3196	seg->s_acl_in = rsmpi_new_acl;
3197
3198	e = adapter->rsmpi_ops->rsm_republish(seg->s_handle.out,
3199	    seg->s_acl_in, seg->s_acl_len,
3200	    RSM_RESOURCE_DONTWAIT, NULL);
3201
3202	if (e != RSM_SUCCESS) {
3203		seg->s_acl = old_acl;
3204		seg->s_acl_in = rsmpi_old_acl;
3205		seg->s_acl_len = old_acl_len;
3206		rsmseglock_release(seg);
3207		rsmacl_free(new_acl, new_acl_len);
3208		rsmpiacl_free(rsmpi_new_acl, new_acl_len);
3209
3210		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
3211		    "rsm_republish done: rsmpi republish failed %d\n", e));
3212		return (e);
3213	}
3214
3215	/* create a tmp copy of the new acl */
3216	tmp_acl_len = new_acl_len;
3217	if (tmp_acl_len > 0) {
3218		tmp_acl = kmem_zalloc(new_acl_len*sizeof (*tmp_acl), KM_SLEEP);
3219		for (i = 0; i < tmp_acl_len; i++) {
3220			tmp_acl[i].ae_node = new_acl[i].ae_node;
3221			tmp_acl[i].ae_permission = new_acl[i].ae_permission;
3222		}
3223		/*
3224		 * The default permission of a node which was in the old
3225		 * ACL but not in the new ACL is 0 ie no access.
3226		 */
3227		permission = 0;
3228	} else {
3229		/*
3230		 * NULL acl means all importers can connect and
3231		 * default permission will be owner creation umask
3232		 */
3233		tmp_acl = NULL;
3234		permission = seg->s_mode;
3235	}
3236
3237	/* make other republishers wait for the republish to complete */
3238	seg->s_flags |= RSM_REPUBLISH_WAIT;
3239
3240	rsmseglock_release(seg);
3241
3242	/* send the new perms to the importing nodes */
3243	rsm_send_republish(key, tmp_acl, tmp_acl_len, permission);
3244
3245	rsmseglock_acquire(seg);
3246	seg->s_flags &= ~RSM_REPUBLISH_WAIT;
3247	/* wake up any one waiting for republish to complete */
3248	cv_broadcast(&seg->s_cv);
3249	rsmseglock_release(seg);
3250
3251	rsmacl_free(tmp_acl, tmp_acl_len);
3252	rsmacl_free(old_acl, old_acl_len);
3253	rsmpiacl_free(rsmpi_old_acl, old_acl_len);
3254
3255	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_republish done\n"));
3256	return (DDI_SUCCESS);
3257}
3258
3259static int
3260rsm_unpublish(rsmseg_t *seg, int mode)
3261{
3262	rsmapi_access_entry_t	*acl;
3263	rsm_access_entry_t	*rsmpi_acl;
3264	int			acl_len;
3265	int			e;
3266	clock_t			ticks;
3267	adapter_t *adapter;
3268	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT);
3269
3270	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_unpublish enter\n"));
3271
3272	if (seg->s_pid != ddi_get_pid() &&
3273	    ddi_get_pid() != 0) {
3274		DBG_PRINTF((category, RSM_ERR,
3275		    "rsm_unpublish: Not creator\n"));
3276		return (RSMERR_NOT_CREATOR);
3277	}
3278
3279	rsmseglock_acquire(seg);
3280	/*
3281	 * wait for QUIESCING to complete here before rsmexport_rm
3282	 * is called, because the SUSPEND_COMPLETE message - which changes
3283	 * the seg state from EXPORT_QUIESCING to EXPORT_QUIESCED and
3284	 * signals the cv_wait - needs to find the segment in the hashtable.
3285	 */
3286	while ((seg->s_state == RSM_STATE_EXPORT_QUIESCING) ||
3287	    ((seg->s_state == RSM_STATE_EXPORT) && (seg->s_rdmacnt > 0))) {
3288		if (cv_wait_sig(&seg->s_cv, &seg->s_lock) == 0) {
3289			rsmseglock_release(seg);
3290			DBG_PRINTF((category, RSM_ERR,
3291			    "rsm_unpublish done: cv_wait INTR qscing"
3292			    "getv/putv in progress"));
3293			return (RSMERR_INTERRUPTED);
3294		}
3295	}
3296
3297	/* verify segment state */
3298	if ((seg->s_state != RSM_STATE_EXPORT) &&
3299	    (seg->s_state != RSM_STATE_EXPORT_QUIESCED)) {
3300		rsmseglock_release(seg);
3301		DBG_PRINTF((category, RSM_ERR,
3302		    "rsm_unpublish done: bad state %x\n", seg->s_state));
3303		return (RSMERR_SEG_NOT_PUBLISHED);
3304	}
3305
3306	rsmseglock_release(seg);
3307
3308	rsmexport_rm(seg);
3309
3310	rsm_send_importer_disconnects(seg->s_segid, my_nodeid);
3311
3312	rsmseglock_acquire(seg);
3313	/*
3314	 * wait for republish to complete
3315	 */
3316	while ((seg->s_state == RSM_STATE_EXPORT) &&
3317	    (seg->s_flags & RSM_REPUBLISH_WAIT)) {
3318		if (cv_wait_sig(&seg->s_cv, &seg->s_lock) == 0) {
3319			DBG_PRINTF((category, RSM_ERR,
3320			    "rsm_unpublish done: cv_wait INTR repubing"));
3321			rsmseglock_release(seg);
3322			return (RSMERR_INTERRUPTED);
3323		}
3324	}
3325
3326	if ((seg->s_state != RSM_STATE_EXPORT) &&
3327	    (seg->s_state != RSM_STATE_EXPORT_QUIESCED)) {
3328		DBG_PRINTF((category, RSM_ERR,
3329		    "rsm_unpublish done: invalid state"));
3330		rsmseglock_release(seg);
3331		return (RSMERR_SEG_NOT_PUBLISHED);
3332	}
3333
3334	/*
3335	 * check for a putv/getv surrogate segment which was not published
3336	 * to the driver.
3337	 *
3338	 * Be certain to see if there is an ACL first!  If this segment was
3339	 * not published with an ACL, acl will be a null pointer.  Check
3340	 * that before dereferencing it.
3341	 */
3342	acl = seg->s_acl;
3343	if (acl != (rsmapi_access_entry_t *)NULL) {
3344		if (acl[0].ae_node == my_nodeid && acl[0].ae_permission == 0)
3345			goto bypass;
3346	}
3347
3348	/* The RSMPI unpublish/destroy has been done if seg is QUIESCED */
3349	if (seg->s_state == RSM_STATE_EXPORT_QUIESCED)
3350		goto bypass;
3351
3352	adapter = seg->s_adapter;
3353	for (;;) {
3354		if (seg->s_state != RSM_STATE_EXPORT) {
3355			rsmseglock_release(seg);
3356			DBG_PRINTF((category, RSM_ERR,
3357			    "rsm_unpublish done: bad state %x\n",
3358			    seg->s_state));
3359			return (RSMERR_SEG_NOT_PUBLISHED);
3360		}
3361
3362		/* unpublish from adapter */
3363		e = adapter->rsmpi_ops->rsm_unpublish(seg->s_handle.out);
3364
3365		if (e == RSM_SUCCESS) {
3366			break;
3367		}
3368
3369		if (e == RSMERR_SEG_IN_USE && mode == 1) {
3370			/*
3371			 * wait for unpublish to succeed, it's busy.
3372			 */
3373			seg->s_flags |= RSM_EXPORT_WAIT;
3374
3375			/* wait for a max of 1 ms - this is an empirical */
3376			/* value that was found by some minimal testing  */
3377			/* can be fine tuned when we have better numbers */
3378			/* A long term fix would be to send cv_signal	 */
3379			/* from the intr callback routine		 */
3380			(void) drv_getparm(LBOLT, &ticks);
3381			ticks += drv_usectohz(1000);
3382			/* currently nobody signals this wait		*/
3383			(void) cv_timedwait(&seg->s_cv, &seg->s_lock, ticks);
3384
3385			DBG_PRINTF((category, RSM_ERR,
3386			    "rsm_unpublish: SEG_IN_USE\n"));
3387
3388			seg->s_flags &= ~RSM_EXPORT_WAIT;
3389		} else {
3390			if (mode == 1) {
3391				DBG_PRINTF((category, RSM_ERR,
3392				    "rsm:rsmpi unpublish err %x\n", e));
3393				seg->s_state = RSM_STATE_BIND;
3394			}
3395			rsmseglock_release(seg);
3396			return (e);
3397		}
3398	}
3399
3400	/* Free segment */
3401	e = adapter->rsmpi_ops->rsm_seg_destroy(seg->s_handle.out);
3402
3403	if (e != RSM_SUCCESS) {
3404		DBG_PRINTF((category, RSM_ERR,
3405		    "rsm_unpublish: rsmpi destroy key=%x failed %x\n",
3406		    seg->s_key, e));
3407	}
3408
3409bypass:
3410	acl = seg->s_acl;
3411	rsmpi_acl = seg->s_acl_in;
3412	acl_len = seg->s_acl_len;
3413
3414	seg->s_acl = NULL;
3415	seg->s_acl_in = NULL;
3416	seg->s_acl_len = 0;
3417
3418	if (seg->s_state == RSM_STATE_EXPORT) {
3419		seg->s_state = RSM_STATE_BIND;
3420	} else if (seg->s_state == RSM_STATE_EXPORT_QUIESCED) {
3421		seg->s_state = RSM_STATE_BIND_QUIESCED;
3422		cv_broadcast(&seg->s_cv);
3423	}
3424
3425	rsmseglock_release(seg);
3426
3427	rsmacl_free(acl, acl_len);
3428	rsmpiacl_free(rsmpi_acl, acl_len);
3429
3430	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_unpublish done\n"));
3431
3432	return (DDI_SUCCESS);
3433}
3434
3435/*
3436 * Called from rsm_unpublish to force an unload and disconnection of all
3437 * importers of the unpublished segment.
3438 *
3439 * First build the list of segments requiring a force disconnect, then
3440 * send a request for each.
3441 */
3442static void
3443rsm_send_importer_disconnects(rsm_memseg_id_t ex_segid,
3444    rsm_node_id_t ex_nodeid)
3445{
3446	rsmipc_request_t 	request;
3447	importing_token_t	*prev_token, *token, *tmp_token, *tokp;
3448	importing_token_t	*force_disconnect_list = NULL;
3449	int			index;
3450
3451	DBG_PRINTF((RSM_KERNEL_AGENT | RSM_EXPORT, RSM_DEBUG_VERBOSE,
3452	    "rsm_send_importer_disconnects enter\n"));
3453
3454	index = rsmhash(ex_segid);
3455
3456	mutex_enter(&importer_list.lock);
3457
3458	prev_token = NULL;
3459	token = importer_list.bucket[index];
3460
3461	while (token != NULL) {
3462		if (token->key == ex_segid) {
3463			/*
3464			 * take it off the importer list and add it
3465			 * to the force disconnect list.
3466			 */
3467			if (prev_token == NULL)
3468				importer_list.bucket[index] = token->next;
3469			else
3470				prev_token->next = token->next;
3471			tmp_token = token;
3472			token = token->next;
3473			if (force_disconnect_list == NULL) {
3474				force_disconnect_list = tmp_token;
3475				tmp_token->next = NULL;
3476			} else {
3477				tokp = force_disconnect_list;
3478				/*
3479				 * make sure that the tmp_token's node
3480				 * is not already on the force disconnect
3481				 * list.
3482				 */
3483				while (tokp != NULL) {
3484					if (tokp->importing_node ==
3485					    tmp_token->importing_node) {
3486						break;
3487					}
3488					tokp = tokp->next;
3489				}
3490				if (tokp == NULL) {
3491					tmp_token->next =
3492					    force_disconnect_list;
3493					force_disconnect_list = tmp_token;
3494				} else {
3495					kmem_free((void *)tmp_token,
3496					    sizeof (*token));
3497				}
3498			}
3499
3500		} else {
3501			prev_token = token;
3502			token = token->next;
3503		}
3504	}
3505	mutex_exit(&importer_list.lock);
3506
3507	token = force_disconnect_list;
3508	while (token != NULL) {
3509		if (token->importing_node == my_nodeid) {
3510			rsm_force_unload(ex_nodeid, ex_segid,
3511			    DISCONNECT);
3512		} else {
3513			request.rsmipc_hdr.rsmipc_type =
3514			    RSMIPC_MSG_DISCONNECT;
3515			request.rsmipc_key = token->key;
3516			for (;;) {
3517				if (rsmipc_send(token->importing_node,
3518				    &request,
3519				    RSM_NO_REPLY) == RSM_SUCCESS) {
3520					break;
3521				} else {
3522					delay(drv_usectohz(10000));
3523				}
3524			}
3525		}
3526		tmp_token = token;
3527		token = token->next;
3528		kmem_free((void *)tmp_token, sizeof (*token));
3529	}
3530
3531	DBG_PRINTF((RSM_KERNEL_AGENT | RSM_EXPORT, RSM_DEBUG_VERBOSE,
3532	    "rsm_send_importer_disconnects done\n"));
3533}
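/*
 * Note on the list construction above: entries are deduplicated by
 * importing_node, so each remote node receives at most one
 * RSMIPC_MSG_DISCONNECT (and the local node at most one
 * rsm_force_unload) per unpublished segment, however many segments it
 * had imported.
 */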
3534
3535/*
3536 * This function is used as a callback for unlocking the pages locked
3537 * down by a process which then does a fork or an exec.
3538 * It marks the export segments corresponding to the umem cookie given
3539 * by *arg as being in a ZOMBIE state (by calling rsmseg_close), to be
3540 * destroyed later when an rsm_close occurs.
3541 */
3542static void
3543rsm_export_force_destroy(ddi_umem_cookie_t *ck)
3544{
3545	rsmresource_blk_t *blk;
3546	rsmresource_t *p;
3547	rsmseg_t *eseg = NULL;
3548	int i, j;
3549	int found = 0;
3550
3551	DBG_PRINTF((RSM_KERNEL_AGENT | RSM_EXPORT, RSM_DEBUG_VERBOSE,
3552	    "rsm_export_force_destroy enter\n"));
3553
3554	/*
3555	 * Walk the resource list and locate the export segment (either
3556	 * in the BIND or the EXPORT state) which corresponds to the
3557	 * ddi_umem_cookie_t being freed up, and call rsmseg_close.
3558	 * Change the state to ZOMBIE by calling rsmseg_close with the
3559	 * force_flag argument (the second argument) set to 1. Also,
3560	 * unpublish and unbind the segment, but don't free it. Free it
3561	 * only on a rsm_close call for the segment.
3562	 */
3563	rw_enter(&rsm_resource.rsmrc_lock, RW_READER);
3564
3565	for (i = 0; i < rsm_resource.rsmrc_len; i++) {
3566		blk = rsm_resource.rsmrc_root[i];
3567		if (blk == NULL) {
3568			continue;
3569		}
3570
3571		for (j = 0; j < RSMRC_BLKSZ; j++) {
3572			p = blk->rsmrcblk_blks[j];
3573			if ((p != NULL) && (p != RSMRC_RESERVED) &&
3574			    (p->rsmrc_type == RSM_RESOURCE_EXPORT_SEGMENT)) {
3575				eseg = (rsmseg_t *)p;
3576				if (eseg->s_cookie != ck)
3577					continue; /* continue searching */
3578				/*
3579				 * Found the segment, set flag to indicate
3580				 * force destroy processing is in progress
3581				 */
3582				rsmseglock_acquire(eseg);
3583				eseg->s_flags |= RSM_FORCE_DESTROY_WAIT;
3584				rsmseglock_release(eseg);
3585				found = 1;
3586				break;
3587			}
3588		}
3589
3590		if (found)
3591			break;
3592	}
3593
3594	rw_exit(&rsm_resource.rsmrc_lock);
3595
3596	if (found) {
3597		ASSERT(eseg != NULL);
3598		/* call rsmseg_close with force flag set to 1 */
3599		rsmseg_close(eseg, 1);
3600		/*
3601		 * force destroy processing done, clear flag and signal any
3602		 * thread waiting in rsmseg_close.
3603		 */
3604		rsmseglock_acquire(eseg);
3605		eseg->s_flags &= ~RSM_FORCE_DESTROY_WAIT;
3606		cv_broadcast(&eseg->s_cv);
3607		rsmseglock_release(eseg);
3608	}
3609
3610	DBG_PRINTF((RSM_KERNEL_AGENT | RSM_EXPORT, RSM_DEBUG_VERBOSE,
3611	    "rsm_export_force_destroy done\n"));
3612}
3613
3614/* ******************************* Remote Calls *********************** */
3615static void
3616rsm_intr_segconnect(rsm_node_id_t src, rsmipc_request_t *req)
3617{
3618	rsmipc_reply_t reply;
3619	DBG_DEFINE(category,
3620	    RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
3621
3622	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
3623	    "rsm_intr_segconnect enter\n"));
3624
3625	reply.rsmipc_status = (short)rsmsegacl_validate(req, src, &reply);
3626
3627	reply.rsmipc_hdr.rsmipc_type = RSMIPC_MSG_REPLY;
3628	reply.rsmipc_hdr.rsmipc_cookie = req->rsmipc_hdr.rsmipc_cookie;
3629
3630	(void) rsmipc_send(src, NULL, &reply);
3631
3632	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
3633	    "rsm_intr_segconnect done\n"));
3634}
3635
3636
3637/*
3638 * When an exported segment is unpublished the exporter sends an ipc
3639 * message (RSMIPC_MSG_DISCONNECT) to all importers.  The recv ipc dispatcher
3640 * calls this function.  The import list is scanned; segments which match the
3641 * exported segment id are unloaded and disconnected.
3642 *
3643 * Will also be called from rsm_rebind with disconnect_flag FALSE.
3644 *
3645 */
3646static void
3647rsm_force_unload(rsm_node_id_t src_nodeid,
3648    rsm_memseg_id_t ex_segid,
3649    boolean_t disconnect_flag)
3650
3651{
3652	rsmresource_t	*p = NULL;
3653	rsmhash_table_t *rhash = &rsm_import_segs;
3654	uint_t		index;
3655	DBG_DEFINE(category,
3656	    RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
3657
3658	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_force_unload enter\n"));
3659
3660	index = rsmhash(ex_segid);
3661
3662	rw_enter(&rhash->rsmhash_rw, RW_READER);
3663
3664	p = rsmhash_getbkt(rhash, index);
3665
3666	for (; p; p = p->rsmrc_next) {
3667		rsmseg_t *seg = (rsmseg_t *)p;
3668		if ((seg->s_segid == ex_segid) && (seg->s_node == src_nodeid)) {
3669			/*
3670			 * In order to make rsmseg_unload and rsm_force_unload
3671			 * thread safe, acquire the segment lock here.
3672			 * rsmseg_unload is responsible for releasing the lock.
3673			 * rsmseg_unload releases the lock just before a call
3674			 * to rsmipc_send or in case of an early exit which
3675			 * occurs if the segment was in the state
3676			 * RSM_STATE_CONNECTING or RSM_STATE_NEW.
3677			 */
3678			rsmseglock_acquire(seg);
3679			if (disconnect_flag)
3680				seg->s_flags |= RSM_FORCE_DISCONNECT;
3681			rsmseg_unload(seg);
3682		}
3683	}
3684	rw_exit(&rhash->rsmhash_rw);
3685
3686	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_force_unload done\n"));
3687}
3688
3689static void
3690rsm_intr_reply(rsmipc_msghdr_t *msg)
3691{
3692	/*
3693	 * Find the slot for the cookie in the reply.
3694	 * Match the sequence with the sequence in the cookie;
3695	 * if there is no match, return.
3696	 * Grab the lock of the slot,
3697	 * copy the data into the reply slot area and
3698	 * signal the waiter.
3699	 */
3700	rsmipc_slot_t 	*slot;
3701	rsmipc_cookie_t	*cookie;
3702	void *data = (void *) msg;
3703	size_t size = sizeof (rsmipc_reply_t);
3704	DBG_DEFINE(category,
3705	    RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
3706
3707	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_intr_reply enter\n"));
3708
3709	cookie = &msg->rsmipc_cookie;
3710	if (cookie->ic.index >= RSMIPC_SZ) {
3711		DBG_PRINTF((category, RSM_ERR,
3712		    "rsm: rsm_intr_reply bad cookie %d\n", cookie->ic.index));
3713		return;
3714	}
3715
3716	ASSERT(cookie->ic.index < RSMIPC_SZ);
3717	slot = &rsm_ipc.slots[cookie->ic.index];
3718	mutex_enter(&slot->rsmipc_lock);
3719	if (slot->rsmipc_cookie.value == cookie->value) {
3720		/* found a match */
3721		if (RSMIPC_GET(slot, RSMIPC_PENDING)) {
3722			bcopy(data, slot->rsmipc_data, size);
3723			RSMIPC_CLEAR(slot, RSMIPC_PENDING);
3724			cv_signal(&slot->rsmipc_cv);
3725		}
3726	} else {
3727		DBG_PRINTF((category, RSM_DEBUG,
3728		    "rsm: rsm_intr_reply mismatched reply %d\n",
3729		    cookie->ic.index));
3730	}
3731	mutex_exit(&slot->rsmipc_lock);
3732	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_intr_reply done\n"));
3733}
3734
3735/*
3736 * This function gets dispatched on the worker thread when we receive
3737 * the SQREADY message.  It sends the SQREADY_ACK message.
3738 */
3739static void
3740rsm_sqready_ack_deferred(void *arg)
3741{
3742	path_t	*path = (path_t *)arg;
3743	DBG_DEFINE(category,
3744	    RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
3745
3746	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
3747	    "rsm_sqready_ack_deferred enter\n"));
3748
3749	mutex_enter(&path->mutex);
3750
3751	/*
3752	 * If the path is not active there is no point in sending the ACK
3753	 * because the whole SQREADY protocol will start again
3754	 * when the path becomes active.
3755	 */
3756	if (path->state != RSMKA_PATH_ACTIVE) {
3757		/*
3758		 * decrement the path refcnt incremented in rsm_proc_sqready
3759		 */
3760		PATH_RELE_NOLOCK(path);
3761		mutex_exit(&path->mutex);
3762		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
3763		    "rsm_sqready_ack_deferred done:!ACTIVE\n"));
3764		return;
3765	}
3766
3767	/* send an SQREADY_ACK message */
3768	(void) rsmipc_send_controlmsg(path, RSMIPC_MSG_SQREADY_ACK);
3769
3770	/* initialize credits to the max level */
3771	path->sendq_token.msgbuf_avail = RSMIPC_MAX_MESSAGES;
3772
3773	/* wake up any send that is waiting for credits */
3774	cv_broadcast(&path->sendq_token.sendq_cv);
3775
3776	/*
3777	 * decrement the path refcnt since we incremented it in
3778	 * rsm_proc_sqready
3779	 */
3780	PATH_RELE_NOLOCK(path);
3781
3782	mutex_exit(&path->mutex);
3783
3784	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
3785	    "rsm_sqready_ack_deferred done\n"));
3786}
3787
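/*
 * Send queue flow control overview:
 *
 * A node announces a newly created send queue by sending an SQREADY
 * message carrying its local incarnation number.  The receiver records
 * that incarnation in remote_incn, drains any tasks left over from the
 * previous incarnation, resets its send credits and dispatches
 * rsm_sqready_ack_deferred on the recv_taskq (RSMPI calls cannot be
 * made in interrupt context) to send an SQREADY_ACK and initialize its
 * credits to RSMIPC_MAX_MESSAGES.  On receipt of the SQREADY_ACK the
 * originator likewise sets msgbuf_avail to RSMIPC_MAX_MESSAGES.  From
 * then on, credits are replenished by RSMIPC_MSG_CREDIT messages sent
 * as the receiver drains its message buffers (see
 * rsm_intr_proc_deferred and rsm_add_credits).
 */
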
3788/*
3789 * Process the SQREADY message
3790 */
3791static void
3792rsm_proc_sqready(rsmipc_controlmsg_t *msg, rsm_addr_t src_hwaddr,
3793    rsm_intr_hand_arg_t arg)
3794{
3795	rsmipc_msghdr_t		*msghdr = (rsmipc_msghdr_t *)msg;
3796	srv_handler_arg_t	*hdlr_argp = (srv_handler_arg_t *)arg;
3797	path_t			*path;
3798	DBG_DEFINE(category,
3799	    RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
3800
3801	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_proc_sqready enter\n"));
3802
3803	/* look up the path - incr the path refcnt */
3804	path = rsm_find_path(hdlr_argp->adapter_name,
3805	    hdlr_argp->adapter_instance, src_hwaddr);
3806
3807	/*
3808	 * No path exists - drop the message
3809	 */
3810	if (path == NULL) {
3811		DBG_PRINTF((category, RSM_DEBUG,
3812		    "rsm_proc_sqready done: msg dropped no path\n"));
3813		return;
3814	}
3815
3816	mutex_exit(&path->mutex);
3817
3818	/* drain any tasks from the previous incarnation */
3819	taskq_wait(path->recv_taskq);
3820
3821	mutex_enter(&path->mutex);
3822	/*
3823	 * If we had sent an SQREADY message and were waiting for an
3824	 * SQREADY_ACK but received an SQREADY message in the meanwhile,
3825	 * blindly reset the WAIT_FOR_SQACK flag: we'll just send an
3826	 * SQREADY_ACK and forget about the SQREADY that we sent.
3827	 */
3828	path->flags &= ~RSMKA_WAIT_FOR_SQACK;
3829
3830	if (path->state != RSMKA_PATH_ACTIVE) {
3831		/* decr refcnt and drop the mutex */
3832		PATH_RELE_NOLOCK(path);
3833		mutex_exit(&path->mutex);
3834		DBG_PRINTF((category, RSM_DEBUG,
3835		    "rsm_proc_sqready done: msg dropped path !ACTIVE\n"));
3836		return;
3837	}
3838
3839	DBG_PRINTF((category, RSM_DEBUG, "rsm_proc_sqready:path=%lx "
3840	    " src=%lx:%llx\n", path, msghdr->rsmipc_src, src_hwaddr));
3841
3842	/*
3843	 * The sender's local incarnation number is our remote incarnation
3844	 * number; save it in the path data structure.
3845	 */
3846	path->remote_incn = msg->rsmipc_local_incn;
3847	path->sendq_token.msgbuf_avail = 0;
3848	path->procmsg_cnt = 0;
3849
3850	/*
3851	 * path is active - dispatch task to send SQREADY_ACK - remember
3852	 * RSMPI calls can't be done in interrupt context
3853	 *
3854	 * We can use the recv_taskq to send because the remote endpoint
3855	 * cannot start sending messages till it receives SQREADY_ACK hence
3856	 * at this point there are no tasks on recv_taskq.
3857	 *
3858	 * The path refcnt will be decremented in rsm_sqready_ack_deferred.
3859	 */
3860	(void) taskq_dispatch(path->recv_taskq,
3861	    rsm_sqready_ack_deferred, path, KM_NOSLEEP);
3862
3863	mutex_exit(&path->mutex);
3864
3866	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_proc_sqready done\n"));
3867}
3868
3869/*
3870 * Process the SQREADY_ACK message
3871 */
3872static void
3873rsm_proc_sqready_ack(rsmipc_controlmsg_t *msg, rsm_addr_t src_hwaddr,
3874    rsm_intr_hand_arg_t arg)
3875{
3876	rsmipc_msghdr_t		*msghdr = (rsmipc_msghdr_t *)msg;
3877	srv_handler_arg_t	*hdlr_argp = (srv_handler_arg_t *)arg;
3878	path_t			*path;
3879	DBG_DEFINE(category,
3880	    RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
3881
3882	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
3883	    "rsm_proc_sqready_ack enter\n"));
3884
3885	/* look up the path - incr the path refcnt */
3886	path = rsm_find_path(hdlr_argp->adapter_name,
3887	    hdlr_argp->adapter_instance, src_hwaddr);
3888
3889	/*
3890	 * drop the message if no path exists, the path is not active,
3891	 * or it is not waiting for an SQREADY_ACK message
3892	 */
3893	if (path == NULL) {
3894		DBG_PRINTF((category, RSM_DEBUG,
3895		    "rsm_proc_sqready_ack done: msg dropped no path\n"));
3896		return;
3897	}
3898
3899	if ((path->state != RSMKA_PATH_ACTIVE) ||
3900	    !(path->flags & RSMKA_WAIT_FOR_SQACK)) {
3901		/* decrement the refcnt */
3902		PATH_RELE_NOLOCK(path);
3903		mutex_exit(&path->mutex);
3904		DBG_PRINTF((category, RSM_DEBUG,
3905		    "rsm_proc_sqready_ack done: msg dropped\n"));
3906		return;
3907	}
3908
3909	/*
3910	 * Check if this message is in response to the last RSMIPC_MSG_SQREADY
3911	 * sent, if not drop it.
3912	 */
3913	if (path->local_incn != msghdr->rsmipc_incn) {
3914		/* decrement the refcnt */
3915		PATH_RELE_NOLOCK(path);
3916		mutex_exit(&path->mutex);
3917		DBG_PRINTF((category, RSM_DEBUG,
3918		    "rsm_proc_sqready_ack done: msg old incn %lld\n",
3919		    msghdr->rsmipc_incn));
3920		return;
3921	}
3922
3923	DBG_PRINTF((category, RSM_DEBUG, "rsm_proc_sqready_ack:path=%lx "
3924	    " src=%lx:%llx\n", path, msghdr->rsmipc_src, src_hwaddr));
3925
3926	/*
3927	 * clear the WAIT_FOR_SQACK flag since we have recvd the ack
3928	 */
3929	path->flags &= ~RSMKA_WAIT_FOR_SQACK;
3930
3931	/* save the remote sendq incn number */
3932	path->remote_incn = msg->rsmipc_local_incn;
3933
3934	/* initialize credits to the max level */
3935	path->sendq_token.msgbuf_avail = RSMIPC_MAX_MESSAGES;
3936
3937	/* wake up any send that is waiting for credits */
3938	cv_broadcast(&path->sendq_token.sendq_cv);
3939
3940	/* decrement the refcnt */
3941	PATH_RELE_NOLOCK(path);
3942
3943	mutex_exit(&path->mutex);
3944
3945	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
3946	    "rsm_proc_sqready_ack done\n"));
3947}
3948
3949/*
3950 * process the RSMIPC_MSG_CREDIT message
3951 */
3952static void
3953rsm_add_credits(rsmipc_controlmsg_t *msg, rsm_addr_t src_hwaddr,
3954    rsm_intr_hand_arg_t arg)
3955{
3956	rsmipc_msghdr_t		*msghdr = (rsmipc_msghdr_t *)msg;
3957	srv_handler_arg_t	*hdlr_argp = (srv_handler_arg_t *)arg;
3958	path_t			*path;
3959	DBG_DEFINE(category,
3960	    RSM_KERNEL_AGENT | RSM_FUNC_ALL |
3961	    RSM_INTR_CALLBACK | RSM_FLOWCONTROL);
3962
3963	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_add_credits enter\n"));
3964
3965	/* look up the path - incr the path refcnt */
3966	path = rsm_find_path(hdlr_argp->adapter_name,
3967	    hdlr_argp->adapter_instance, src_hwaddr);
3968
3969	if (path == NULL) {
3970		DBG_PRINTF((category, RSM_DEBUG,
3971		    "rsm_add_credits enter: path not found\n"));
3972		return;
3973	}
3974
3975	/* the path is not active - discard credits */
3976	if (path->state != RSMKA_PATH_ACTIVE) {
3977		PATH_RELE_NOLOCK(path);
3978		mutex_exit(&path->mutex);
3979		DBG_PRINTF((category, RSM_DEBUG,
3980		    "rsm_add_credits enter:path=%lx !ACTIVE\n", path));
3981		return;
3982	}
3983
3984	/*
3985	 * Check if these credits are for current incarnation of the path.
3986	 */
3987	if (path->local_incn != msghdr->rsmipc_incn) {
3988		/* decrement the refcnt */
3989		PATH_RELE_NOLOCK(path);
3990		mutex_exit(&path->mutex);
3991		DBG_PRINTF((category, RSM_DEBUG,
3992		    "rsm_add_credits enter: old incn %lld\n",
3993		    msghdr->rsmipc_incn));
3994		return;
3995	}
3996
3997	DBG_PRINTF((category, RSM_DEBUG,
3998	    "rsm_add_credits:path=%lx new-creds=%d "
3999	    "curr credits=%d src=%lx:%llx\n", path, msg->rsmipc_credits,
4000	    path->sendq_token.msgbuf_avail, msghdr->rsmipc_src,
4001	    src_hwaddr));
4002
4004	/* add credits to the path's sendq */
4005	path->sendq_token.msgbuf_avail += msg->rsmipc_credits;
4006
4007	ASSERT(path->sendq_token.msgbuf_avail <= RSMIPC_MAX_MESSAGES);
4008
4009	/* wake up any send that is waiting for credits */
4010	cv_broadcast(&path->sendq_token.sendq_cv);
4011
4012	/* decrement the refcnt */
4013	PATH_RELE_NOLOCK(path);
4014
4015	mutex_exit(&path->mutex);
4016
4017	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_add_credits done\n"));
4018}
4019
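/*
 * Handle an RSMIPC_MSG_BELL event.  A non-NULL segment cookie in the
 * message means the event is for import segments: every import segment
 * matching the <key, src node> pair has its s_pollevent count bumped
 * and, if it is being polled, gets a pollwakeup.  Otherwise the event
 * is for the export segment looked up by key.
 */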
4020static void
4021rsm_intr_event(rsmipc_request_t *msg)
4022{
4023	rsmseg_t	*seg;
4024	rsmresource_t	*p;
4025	rsm_node_id_t	src_node;
4026	DBG_DEFINE(category,
4027	    RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
4028
4029	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_intr_event enter\n"));
4030
4031	src_node = msg->rsmipc_hdr.rsmipc_src;
4032
4033	if ((seg = msg->rsmipc_segment_cookie) != NULL) {
4034		/* This is for an import segment */
4035		uint_t hashval = rsmhash(msg->rsmipc_key);
4036
4037		rw_enter(&rsm_import_segs.rsmhash_rw, RW_READER);
4038
4039		p = (rsmresource_t *)rsmhash_getbkt(&rsm_import_segs, hashval);
4040
4041		for (; p; p = p->rsmrc_next) {
4042			if ((p->rsmrc_key == msg->rsmipc_key) &&
4043			    (p->rsmrc_node == src_node)) {
4044				seg = (rsmseg_t *)p;
4045				rsmseglock_acquire(seg);
4046
4047				atomic_add_32(&seg->s_pollevent, 1);
4048
4049				if (seg->s_pollflag & RSM_SEGMENT_POLL)
4050					pollwakeup(&seg->s_poll, POLLRDNORM);
4051
4052				rsmseglock_release(seg);
4053			}
4054		}
4055
4056		rw_exit(&rsm_import_segs.rsmhash_rw);
4057	} else {
4058		/* This is for an export segment */
4059		seg = rsmexport_lookup(msg->rsmipc_key);
4060		if (!seg) {
4061			DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
4062			    "rsm_intr_event done: exp seg not found\n"));
4063			return;
4064		}
4065
4066		ASSERT(rsmseglock_held(seg));
4067
4068		atomic_add_32(&seg->s_pollevent, 1);
4069
4070		/*
4071		 * We must hold the segment lock here, or else the segment
4072		 * can be freed while pollwakeup is using it. This implies
4073		 * that we MUST NOT grab the segment lock during rsm_chpoll,
4074		 * as outlined in the chpoll(2) man page.
4075		 */
4076		if (seg->s_pollflag & RSM_SEGMENT_POLL)
4077			pollwakeup(&seg->s_poll, POLLRDNORM);
4078
4079		rsmseglock_release(seg);
4080	}
4081
4082	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_intr_event done\n"));
4083}
4084
4085/*
4086 * The exporter did a republish and changed the ACL - this change is only
4087 * visible to new importers.
4088 */
4089static void
4090importer_update(rsm_node_id_t src_node, rsm_memseg_id_t key,
4091    rsm_permission_t perm)
4092{
4093
4094	rsmresource_t	*p;
4095	rsmseg_t	*seg;
4096	uint_t		hashval = rsmhash(key);
4097	DBG_DEFINE(category,
4098	    RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
4099
4100	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "importer_update enter\n"));
4101
4102	rw_enter(&rsm_import_segs.rsmhash_rw, RW_READER);
4103
4104	p = (rsmresource_t *)rsmhash_getbkt(&rsm_import_segs, hashval);
4105
4106	for (; p; p = p->rsmrc_next) {
4107		/*
4108		 * find the importer and update the permission in the shared
4109		 * data structure. Any new importers will use the new perms
4110		 */
4111		if ((p->rsmrc_key == key) && (p->rsmrc_node == src_node)) {
4112			seg = (rsmseg_t *)p;
4113
4114			rsmseglock_acquire(seg);
4115			rsmsharelock_acquire(seg);
4116			seg->s_share->rsmsi_mode = perm;
4117			rsmsharelock_release(seg);
4118			rsmseglock_release(seg);
4119
4120			break;
4121		}
4122	}
4123
4124	rw_exit(&rsm_import_segs.rsmhash_rw);
4125
4126	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "importer_update done\n"));
4127}
4128
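/*
 * Called when a SUSPEND_DONE ack arrives from src_node.  Clears the
 * ACK-pending flag for that node on the suspend list; once every node
 * on the list has acked, the local exporters are quiesced via
 * exporter_quiesce.
 */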
4129void
4130rsm_suspend_complete(rsm_node_id_t src_node, int flag)
4131{
4132	int		done = 1; /* indicate all SUSPENDS have been acked */
4133	list_element_t	*elem;
4134	DBG_DEFINE(category,
4135	    RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
4136
4137	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
4138	    "rsm_suspend_complete enter\n"));
4139
4140	mutex_enter(&rsm_suspend_list.list_lock);
4141
4142	if (rsm_suspend_list.list_head == NULL) {
4143		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
4144		    "rsm_suspend_complete done: suspend_list is empty\n"));
4145		mutex_exit(&rsm_suspend_list.list_lock);
4146		return;
4147	}
4148
4149	elem = rsm_suspend_list.list_head;
4150	while (elem != NULL) {
4151		if (elem->nodeid == src_node) {
4152			/* clear the pending flag for the node */
4153			elem->flags &= ~RSM_SUSPEND_ACKPENDING;
4154			elem->flags |= flag;
4155		}
4156
4157		if (done && (elem->flags & RSM_SUSPEND_ACKPENDING))
4158			done = 0; /* still some nodes have not yet ACKED */
4159
4160		elem = elem->next;
4161	}
4162
4163	mutex_exit(&rsm_suspend_list.list_lock);
4164
4165	if (!done) {
4166		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
4167		    "rsm_suspend_complete done: acks pending\n"));
4168		return;
4169	}
4170	/*
4171	 * Now that we are done with suspending all the remote importers,
4172	 * it is time to quiesce the local exporters.
4173	 */
4174	exporter_quiesce();
4175
4176	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
4177	    "rsm_suspend_complete done\n"));
4178}
4179
4180static void
4181exporter_quiesce()
4182{
4183	int		i, e;
4184	rsmresource_t	*current;
4185	rsmseg_t	*seg;
4186	adapter_t	*adapter;
4187	DBG_DEFINE(category,
4188	    RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
4189
4190	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "exporter_quiesce enter\n"));
4191	/*
4192	 * The importers have sent a SUSPEND_COMPLETE to this exporter node.
4193	 *	Unpublish and unbind each export segment and
4194	 *	move it to the EXPORT_QUIESCED state.
4195	 */
4196
4197	rw_enter(&rsm_export_segs.rsmhash_rw, RW_READER);
4198
4199	for (i = 0; i < rsm_hash_size; i++) {
4200		current = rsm_export_segs.bucket[i];
4201		while (current != NULL) {
4202			seg = (rsmseg_t *)current;
4203			rsmseglock_acquire(seg);
4204			if (current->rsmrc_state ==
4205			    RSM_STATE_EXPORT_QUIESCING) {
4206				adapter = seg->s_adapter;
4207				/*
4208				 * some local memory handles are not published;
4209				 * check if this one was published
4210				 */
4211				if ((seg->s_acl == NULL) ||
4212				    (seg->s_acl[0].ae_node != my_nodeid) ||
4213				    (seg->s_acl[0].ae_permission != 0)) {
4214
4215					e = adapter->rsmpi_ops->rsm_unpublish(
4216					    seg->s_handle.out);
4217					DBG_PRINTF((category, RSM_DEBUG,
4218					    "exporter_quiesce:unpub %d\n", e));
4219
4220					e = adapter->rsmpi_ops->rsm_seg_destroy(
4221					    seg->s_handle.out);
4222
4223					DBG_PRINTF((category, RSM_DEBUG,
4224					    "exporter_quiesce:destroy %d\n",
4225					    e));
4226				}
4227
4228				(void) rsm_unbind_pages(seg);
4229				seg->s_state = RSM_STATE_EXPORT_QUIESCED;
4230				cv_broadcast(&seg->s_cv);
4231			}
4232			rsmseglock_release(seg);
4233			current = current->rsmrc_next;
4234		}
4235	}
4236	rw_exit(&rsm_export_segs.rsmhash_rw);
4237
4238	/*
4239	 * We are done with the pre-del processing of all the local
4240	 * segments - time to move to PREDEL_COMPLETED.
4241	 */
4242
4243	mutex_enter(&rsm_drv_data.drv_lock);
4244
4245	ASSERT(rsm_drv_data.drv_state == RSM_DRV_PREDEL_STARTED);
4246
4247	rsm_drv_data.drv_state = RSM_DRV_PREDEL_COMPLETED;
4248
4249	cv_broadcast(&rsm_drv_data.drv_cv);
4250
4251	mutex_exit(&rsm_drv_data.drv_lock);
4252
4253	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "exporter_quiesce done\n"));
4254}
4255
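/*
 * Suspend all import segments connected to src_node (the node whose
 * memory is undergoing DR).  Importers sharing the same <node, key>
 * pair are suspended individually first, then the shared
 * mapping/connection is suspended once.  Finally an ack is sent back
 * to src_node with a SUSPEND_DONE message.
 */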
4256static void
4257importer_suspend(rsm_node_id_t src_node)
4258{
4259	int		i;
4260	int		susp_flg; /* true means already suspended */
4261	int		num_importers;
4262	rsmresource_t	*p = NULL, *curp;
4263	rsmhash_table_t *rhash = &rsm_import_segs;
4264	rsmseg_t	*seg;
4265	rsmipc_request_t request;
4266	DBG_DEFINE(category,
4267	    RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
4268
4269	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "importer_suspend enter\n"));
4270
4271	rw_enter(&rhash->rsmhash_rw, RW_READER);
4272	for (i = 0; i < rsm_hash_size; i++) {
4273		p = rhash->bucket[i];
4274
4275		/*
4276		 * Suspend all importers with same <node, key> pair.
4277		 * After the last one of the shared importers has been
4278		 * suspended - suspend the shared mappings/connection.
4279		 */
4280		for (; p; p = p->rsmrc_next) {
4281			rsmseg_t *first = (rsmseg_t *)p;
4282			if ((first->s_node != src_node) ||
4283			    (first->s_state == RSM_STATE_DISCONNECT))
4284				continue; /* go to next entry */
4285			/*
4286			 * search the rest of the bucket for
4287			 * other siblings (importers with the same key)
4288			 * of "first" and suspend them.
4289			 * All importers with the same key fall in
4290			 * the same bucket.
4291			 */
4292			num_importers = 0;
4293			for (curp = p; curp; curp = curp->rsmrc_next) {
4294				seg = (rsmseg_t *)curp;
4295
4296				rsmseglock_acquire(seg);
4297
4298				if ((seg->s_node != first->s_node) ||
4299				    (seg->s_key != first->s_key) ||
4300				    (seg->s_state == RSM_STATE_DISCONNECT)) {
4301					/*
4302					 * either not a peer segment or it is a
4303					 * disconnected segment - skip it
4304					 */
4305					rsmseglock_release(seg);
4306					continue;
4307				}
4308
4309				rsmseg_suspend(seg, &susp_flg);
4310
4311				if (susp_flg) { /* seg already suspended */
4312					rsmseglock_release(seg);
4313					break; /* the inner for loop */
4314				}
4315
4316				num_importers++;
4317				rsmsharelock_acquire(seg);
4318				/*
4319				 * we've processed all importers that are
4320				 * siblings of "first"
4321				 */
4322				if (num_importers ==
4323				    seg->s_share->rsmsi_refcnt) {
4324					rsmsharelock_release(seg);
4325					rsmseglock_release(seg);
4326					break;
4327				}
4328				rsmsharelock_release(seg);
4329				rsmseglock_release(seg);
4330			}
4331
4332			/*
4333			 * All the importers with the same key and
4334			 * nodeid as "first" have been suspended.
4335			 * Now suspend the shared connect/mapping.
4336			 * This is done only once.
4337			 */
4338			if (!susp_flg) {
4339				rsmsegshare_suspend(seg);
4340			}
4341		}
4342	}
4343
4344	rw_exit(&rhash->rsmhash_rw);
4345
4346	/* send an ACK for SUSPEND message */
4347	request.rsmipc_hdr.rsmipc_type = RSMIPC_MSG_SUSPEND_DONE;
4348	(void) rsmipc_send(src_node, &request, RSM_NO_REPLY);
4349
4351	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "importer_suspend done\n"));
4352
4353}
4354
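/*
 * Move a single import segment to its quiesced state: a connected
 * segment goes to CONN_QUIESCE, an active (mapped) segment has its
 * mappings unloaded and goes to MAP_QUIESCE.  *susp_flg is set if the
 * segment had already been suspended.  Called with the segment lock
 * held.
 */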
4355static void
4356rsmseg_suspend(rsmseg_t *seg, int *susp_flg)
4357{
4358	int		recheck_state;
4359	rsmcookie_t	*hdl;
4360	DBG_DEFINE(category,
4361	    RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
4362
4363	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
4364	    "rsmseg_suspend enter: key=%u\n", seg->s_key));
4365
4366	*susp_flg = 0;
4367
4368	ASSERT(rsmseglock_held(seg));
4369	/* wait if putv/getv is in progress */
4370	while (seg->s_rdmacnt > 0)
4371		cv_wait(&seg->s_cv, &seg->s_lock);
4372
4373	do {
4374		recheck_state = 0;
4375
4376		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
4377		    "rsmseg_suspend:segment %x state=%d\n",
4378		    seg->s_key, seg->s_state));
4379
4380		switch (seg->s_state) {
4381		case RSM_STATE_NEW:
4382			/* not a valid state */
4383			break;
4384		case RSM_STATE_CONNECTING:
4385			seg->s_state = RSM_STATE_ABORT_CONNECT;
4386			break;
4387		case RSM_STATE_ABORT_CONNECT:
4388			break;
4389		case RSM_STATE_CONNECT:
4390			seg->s_handle.in = NULL;
4391			seg->s_state = RSM_STATE_CONN_QUIESCE;
4392			break;
4393		case RSM_STATE_MAPPING:
4394			/* wait until segment leaves the mapping state */
4395			while (seg->s_state == RSM_STATE_MAPPING)
4396				cv_wait(&seg->s_cv, &seg->s_lock);
4397			recheck_state = 1;
4398			break;
4399		case RSM_STATE_ACTIVE:
4400			/* unload the mappings */
4401			if (seg->s_ckl != NULL) {
4402				hdl = seg->s_ckl;
4403				for (; hdl != NULL; hdl = hdl->c_next) {
4404					(void) devmap_unload(hdl->c_dhp,
4405					    hdl->c_off, hdl->c_len);
4406				}
4407			}
4408			seg->s_mapinfo = NULL;
4409			seg->s_state = RSM_STATE_MAP_QUIESCE;
4410			break;
4411		case RSM_STATE_CONN_QUIESCE:
4412			/* FALLTHRU */
4413		case RSM_STATE_MAP_QUIESCE:
4414			/* rsmseg_suspend already done for seg */
4415			*susp_flg = 1;
4416			break;
4417		case RSM_STATE_DISCONNECT:
4418			break;
4419		default:
4420			ASSERT(0); /* invalid state */
4421		}
4422	} while (recheck_state);
4423
4424	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmseg_suspend done\n"));
4425}
4426
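/*
 * Suspend the shared connection/mapping of an import segment: undo the
 * RSMPI map and/or disconnect (for remote exporters) and move
 * rsmsi_state to the corresponding xxxx_QUIESCE state.
 */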
4427static void
4428rsmsegshare_suspend(rsmseg_t *seg)
4429{
4430	int			e;
4431	adapter_t		*adapter;
4432	rsm_import_share_t	*sharedp;
4433	DBG_DEFINE(category,
4434	    RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
4435
4436	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
4437	    "rsmsegshare_suspend enter\n"));
4438
4439	rsmseglock_acquire(seg);
4440	rsmsharelock_acquire(seg);
4441
4442	sharedp = seg->s_share;
4443	adapter = seg->s_adapter;
4444	switch (sharedp->rsmsi_state) {
4445	case RSMSI_STATE_NEW:
4446		break;
4447	case RSMSI_STATE_CONNECTING:
4448		sharedp->rsmsi_state = RSMSI_STATE_ABORT_CONNECT;
4449		break;
4450	case RSMSI_STATE_ABORT_CONNECT:
4451		break;
4452	case RSMSI_STATE_CONNECTED:
4453		/* do the rsmpi disconnect */
4454		if (sharedp->rsmsi_node != my_nodeid) {
4455			e = adapter->rsmpi_ops->
4456			    rsm_disconnect(sharedp->rsmsi_handle);
4457
4458			DBG_PRINTF((category, RSM_DEBUG,
4459			    "rsm:rsmpi disconnect seg=%x:err=%d\n",
4460			    sharedp->rsmsi_segid, e));
4461		}
4462
4463		sharedp->rsmsi_handle = NULL;
4464
4465		sharedp->rsmsi_state = RSMSI_STATE_CONN_QUIESCE;
4466		break;
4467	case RSMSI_STATE_CONN_QUIESCE:
4468		break;
4469	case RSMSI_STATE_MAPPED:
4470		/* do the rsmpi unmap and disconnect */
4471		if (sharedp->rsmsi_node != my_nodeid) {
4472			e = adapter->rsmpi_ops->rsm_unmap(seg->s_handle.in);
4473
4474			DBG_PRINTF((category, RSM_DEBUG,
4475			    "rsmshare_suspend: rsmpi unmap %d\n", e));
4476
4477			e = adapter->rsmpi_ops->
4478			    rsm_disconnect(sharedp->rsmsi_handle);
4479			DBG_PRINTF((category, RSM_DEBUG,
4480			    "rsm:rsmpi disconnect seg=%x:err=%d\n",
4481			    sharedp->rsmsi_segid, e));
4482		}
4483
4484		sharedp->rsmsi_handle = NULL;
4485
4486		sharedp->rsmsi_state = RSMSI_STATE_MAP_QUIESCE;
4487		break;
4488	case RSMSI_STATE_MAP_QUIESCE:
4489		break;
4490	case RSMSI_STATE_DISCONNECTED:
4491		break;
4492	default:
4493		ASSERT(0); /* invalid state */
4494	}
4495
4496	rsmsharelock_release(seg);
4497	rsmseglock_release(seg);
4498
4499	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
4500	    "rsmsegshare_suspend done\n"));
4501}
4502
4503/*
4504 * This should get called on receiving a RESUME message or from
4505 * the path manager if the node undergoing DR dies.
4506 */
4507static void
4508importer_resume(rsm_node_id_t src_node)
4509{
4510	int		i;
4511	rsmresource_t	*p = NULL;
4512	rsmhash_table_t *rhash = &rsm_import_segs;
4513	void		*cookie;
4514	DBG_DEFINE(category,
4515	    RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
4516
4517	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "importer_resume enter\n"));
4518
4519	rw_enter(&rhash->rsmhash_rw, RW_READER);
4520
4521	for (i = 0; i < rsm_hash_size; i++) {
4522		p = rhash->bucket[i];
4523
4524		for (; p; p = p->rsmrc_next) {
4525			rsmseg_t *seg = (rsmseg_t *)p;
4526
4527			rsmseglock_acquire(seg);
4528
4529			/* process only importers of node undergoing DR */
4530			if (seg->s_node != src_node) {
4531				rsmseglock_release(seg);
4532				continue;
4533			}
4534
4535			if (rsmseg_resume(seg, &cookie) != RSM_SUCCESS) {
4536				rsmipc_request_t	request;
4537				/*
4538				 * rsmpi map/connect failed
4539				 * inform the exporter so that it can
4540				 * remove the importer.
4541				 */
4542				request.rsmipc_hdr.rsmipc_type =
4543				    RSMIPC_MSG_NOTIMPORTING;
4544				request.rsmipc_key = seg->s_segid;
4545				request.rsmipc_segment_cookie = cookie;
4546				rsmseglock_release(seg);
4547				(void) rsmipc_send(seg->s_node, &request,
4548				    RSM_NO_REPLY);
4549			} else {
4550				rsmseglock_release(seg);
4551			}
4552		}
4553	}
4554
4555	rw_exit(&rhash->rsmhash_rw);
4556
4557	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "importer_resume done\n"));
4558}
4559
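/*
 * Resume a quiesced import segment.  The shared connection/mapping is
 * resumed first via rsmsegshare_resume; on success the segment is
 * remapped and returned to the CONNECT or ACTIVE state, on failure it
 * is disconnected (its mappings are redirected to the trash page) and
 * the cookie needed to notify the exporter is handed back through the
 * cookie argument.
 */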
4560static int
4561rsmseg_resume(rsmseg_t *seg, void **cookie)
4562{
4563	int			e;
4564	int			retc;
4565	off_t			dev_offset;
4566	size_t			maplen;
4567	uint_t			maxprot;
4568	rsm_mapinfo_t		*p;
4569	rsmcookie_t		*hdl;
4570	rsm_import_share_t	*sharedp;
4571	DBG_DEFINE(category,
4572	    RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
4573
4574	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
4575	    "rsmseg_resume enter: key=%u\n", seg->s_key));
4576
4577	*cookie = NULL;
4578
4579	ASSERT(rsmseglock_held(seg));
4580
4581	if ((seg->s_state != RSM_STATE_CONN_QUIESCE) &&
4582	    (seg->s_state != RSM_STATE_MAP_QUIESCE)) {
4583		return (RSM_SUCCESS);
4584	}
4585
4586	sharedp = seg->s_share;
4587
4588	rsmsharelock_acquire(seg);
4589
4590	/* resume the shared connection and/or mapping */
4591	retc = rsmsegshare_resume(seg);
4592
4593	if (seg->s_state == RSM_STATE_CONN_QUIESCE) {
4594		/* shared state can either be connected or mapped */
4595		if ((sharedp->rsmsi_state == RSMSI_STATE_CONNECTED) ||
4596		    (sharedp->rsmsi_state == RSMSI_STATE_MAPPED)) {
4597			ASSERT(retc == RSM_SUCCESS);
4598			seg->s_handle.in = sharedp->rsmsi_handle;
4599			rsmsharelock_release(seg);
4600			seg->s_state = RSM_STATE_CONNECT;
4601
4602		} else { /* error in rsmpi connect during resume */
4603			seg->s_handle.in = NULL;
4604			seg->s_state = RSM_STATE_DISCONNECT;
4605
4606			sharedp->rsmsi_refcnt--;
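			/* hand the shared cookie back for the NOTIMPORTING message */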
4607			*cookie = (void *)sharedp->rsmsi_cookie;
4608
4609			if (sharedp->rsmsi_refcnt == 0) {
4610				ASSERT(sharedp->rsmsi_mapcnt == 0);
4611				rsmsharelock_release(seg);
4612
4613				/* clean up the shared data structure */
4614				mutex_destroy(&sharedp->rsmsi_lock);
4615				cv_destroy(&sharedp->rsmsi_cv);
4616				kmem_free((void *)(sharedp),
4617				    sizeof (rsm_import_share_t));
4618
4619			} else {
4620				rsmsharelock_release(seg);
4621			}
4622			/*
4623			 * The following needs to be done after any
4624			 * rsmsharelock calls which use seg->s_share.
4625			 */
4626			seg->s_share = NULL;
4627		}
4628
4629		/* signal any waiting segment */
4630		cv_broadcast(&seg->s_cv);
4631
4632		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
4633		    "rsmseg_resume done:state=%d\n", seg->s_state));
4634		return (retc);
4635	}
4636
4637	ASSERT(seg->s_state == RSM_STATE_MAP_QUIESCE);
4638
4639	/* Setup protections for remap */
4640	maxprot = PROT_USER;
4641	if (seg->s_mode & RSM_PERM_READ) {
4642		maxprot |= PROT_READ;
4643	}
4644	if (seg->s_mode & RSM_PERM_WRITE) {
4645		maxprot |= PROT_WRITE;
4646	}
4647
4648	if (sharedp->rsmsi_state != RSMSI_STATE_MAPPED) {
4649		/* error in rsmpi connect or map during resume */
4650
4651		/* remap to trash page */
4652		ASSERT(seg->s_ckl != NULL);
4653
4654		for (hdl = seg->s_ckl; hdl != NULL; hdl = hdl->c_next) {
4655			e = devmap_umem_remap(hdl->c_dhp, rsm_dip,
4656			    remap_cookie, hdl->c_off, hdl->c_len,
4657			    maxprot, 0, NULL);
4658
4659			DBG_PRINTF((category, RSM_ERR,
4660			    "rsmseg_resume:remap=%d\n", e));
4661		}
4662
4663		seg->s_handle.in = NULL;
4664		seg->s_state = RSM_STATE_DISCONNECT;
4665
4666		sharedp->rsmsi_refcnt--;
4667
4668		sharedp->rsmsi_mapcnt--;
4669		seg->s_mapinfo = NULL;
4670
4671		if (sharedp->rsmsi_refcnt == 0) {
4672			ASSERT(sharedp->rsmsi_mapcnt == 0);
4673			rsmsharelock_release(seg);
4674
4675			/* clean up the shared data structure */
4676			mutex_destroy(&sharedp->rsmsi_lock);
4677			cv_destroy(&sharedp->rsmsi_cv);
4678			kmem_free((void *)(sharedp),
4679			    sizeof (rsm_import_share_t));
4680
4681		} else {
4682			rsmsharelock_release(seg);
4683		}
4684		/*
4685		 * The following needs to be done after any
4686		 * rsmsharelock calls which use seg->s_share.
4687		 */
4688		seg->s_share = NULL;
4689
4690		/* signal any waiting segment */
4691		cv_broadcast(&seg->s_cv);
4692
4693		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
4694		    "rsmseg_resume done:seg=%x,err=%d\n",
4695		    seg->s_key, retc));
4696		return (retc);
4697
4698	}
4699
4700	seg->s_handle.in = sharedp->rsmsi_handle;
4701
4702	if (seg->s_node == my_nodeid) { /* loopback */
4703		ASSERT(seg->s_mapinfo == NULL);
4704
4705		for (hdl = seg->s_ckl; hdl != NULL; hdl = hdl->c_next) {
4706			e = devmap_umem_remap(hdl->c_dhp,
4707			    rsm_dip, seg->s_cookie,
4708			    hdl->c_off, hdl->c_len,
4709			    maxprot, 0, NULL);
4710
4711			DBG_PRINTF((category, RSM_ERR,
4712			    "rsmseg_resume:remap=%d\n", e));
4713		}
4714	} else { /* remote exporter */
4715		/* remap to the new rsmpi maps */
4716		seg->s_mapinfo = sharedp->rsmsi_mapinfo;
4717
4718		for (hdl = seg->s_ckl; hdl != NULL; hdl = hdl->c_next) {
4719			p = rsm_get_mapinfo(seg, hdl->c_off, hdl->c_len,
4720			    &dev_offset, &maplen);
4721			e = devmap_devmem_remap(hdl->c_dhp,
4722			    p->dip, p->dev_register, dev_offset,
4723			    maplen, maxprot, 0, NULL);
4724
4725			DBG_PRINTF((category, RSM_ERR,
4726			    "rsmseg_resume:remap=%d\n", e));
4727		}
4728	}
4729
4730	rsmsharelock_release(seg);
4731
4732	seg->s_state = RSM_STATE_ACTIVE;
4733	cv_broadcast(&seg->s_cv);
4734
4735	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmseg_resume done\n"));
4736
4737	return (retc);
4738}
4739
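/*
 * Re-establish the shared RSMPI connection and, if the share was
 * mapped, redo the RSMPI maps with the same <offset, length>
 * fragmentation that was in use before the suspend.  Called with the
 * segment and share locks held.
 */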
4740static int
4741rsmsegshare_resume(rsmseg_t *seg)
4742{
4743	int			e = RSM_SUCCESS;
4744	adapter_t		*adapter;
4745	rsm_import_share_t	*sharedp;
4746	DBG_DEFINE(category,
4747	    RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
4748
4749	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmsegshare_resume enter\n"));
4750
4751	ASSERT(rsmseglock_held(seg));
4752	ASSERT(rsmsharelock_held(seg));
4753
4754	sharedp = seg->s_share;
4755
4756	/*
4757	 * If we are not in a xxxx_QUIESCE state, shared connect/mapping
4758	 * processing has already been done, so return success.
4760	 */
4761	if ((sharedp->rsmsi_state != RSMSI_STATE_CONN_QUIESCE) &&
4762	    (sharedp->rsmsi_state != RSMSI_STATE_MAP_QUIESCE)) {
4763		return (RSM_SUCCESS);
4764	}
4765
4766	adapter = seg->s_adapter;
4767
4768	if (sharedp->rsmsi_node != my_nodeid) {
4769		rsm_addr_t	hwaddr;
4770		hwaddr = get_remote_hwaddr(adapter, sharedp->rsmsi_node);
4771
4772		e = adapter->rsmpi_ops->rsm_connect(
4773		    adapter->rsmpi_handle, hwaddr,
4774		    sharedp->rsmsi_segid, &sharedp->rsmsi_handle);
4775
4776		DBG_PRINTF((category, RSM_DEBUG,
4777		    "rsmsegshare_resume:rsmpi connect seg=%x:err=%d\n",
4778		    sharedp->rsmsi_segid, e));
4779
4780		if (e != RSM_SUCCESS) {
4781			/* when do we send the NOT_IMPORTING message */
4782			sharedp->rsmsi_handle = NULL;
4783			sharedp->rsmsi_state = RSMSI_STATE_DISCONNECTED;
4784			/* signal any waiting segment */
4785			cv_broadcast(&sharedp->rsmsi_cv);
4786			return (e);
4787		}
4788	}
4789
4790	if (sharedp->rsmsi_state == RSMSI_STATE_CONN_QUIESCE) {
4791		sharedp->rsmsi_state = RSMSI_STATE_CONNECTED;
4792		/* signal any waiting segment */
4793		cv_broadcast(&sharedp->rsmsi_cv);
4794		return (e);
4795	}
4796
4797	ASSERT(sharedp->rsmsi_state == RSMSI_STATE_MAP_QUIESCE);
4798
4799	/* do the rsmpi map of the whole segment here */
4800	if (sharedp->rsmsi_node != my_nodeid) {
4801		size_t mapped_len;
4802		rsm_mapinfo_t *p;
4803
4804		/*
4805		 * We need to do the rsmpi maps with <off, len>s identical to
4806		 * those in the old mapinfo list because the segment mapping
4807		 * handles (dhp and such) need the fragmentation of the rsmpi
4808		 * maps to be identical to what it was during the mmap of the
4809		 * segment.
4809		 */
4810		p = sharedp->rsmsi_mapinfo;
4811
4812		while (p != NULL) {
4813			mapped_len = 0;
4814
4815			e = adapter->rsmpi_ops->rsm_map(
4816			    sharedp->rsmsi_handle, p->start_offset,
4817			    p->individual_len, &mapped_len,
4818			    &p->dip, &p->dev_register, &p->dev_offset,
4819			    NULL, NULL);
4820
4821			if (e != 0) {
4822				DBG_PRINTF((category, RSM_ERR,
4823				    "rsmsegshare_resume: rsmpi map err=%d\n",
4824				    e));
4825				break;
4826			}
4827
4828			if (mapped_len != p->individual_len) {
4829				DBG_PRINTF((category, RSM_ERR,
4830				    "rsmsegshare_resume: rsmpi maplen"
4831				    "< reqlen=%lx\n", mapped_len));
4832				e = RSMERR_BAD_LENGTH;
4833				break;
4834			}
4835
4836			p = p->next;
4837
4838		}
4839
4841		if (e != RSM_SUCCESS) { /* rsmpi map failed */
4842			int	err;
4843			/* unmap only if at least one rsm_map succeeded */
4844			if (p != sharedp->rsmsi_mapinfo) {
4845				/*
4846				 * A single rsm_unmap undoes multiple rsm_maps.
4847				 */
4848				(void) seg->s_adapter->rsmpi_ops->
4849				    rsm_unmap(sharedp->rsmsi_handle);
4850			}
4851
4852			rsm_free_mapinfo(sharedp->rsmsi_mapinfo);
4853			sharedp->rsmsi_mapinfo = NULL;
4854
4855			err = adapter->rsmpi_ops->
4856			    rsm_disconnect(sharedp->rsmsi_handle);
4857
4858			DBG_PRINTF((category, RSM_DEBUG,
4859			    "rsmsegshare_resume:disconn seg=%x:err=%d\n",
4860			    sharedp->rsmsi_segid, err));
4861
4862			sharedp->rsmsi_handle = NULL;
4863			sharedp->rsmsi_state = RSMSI_STATE_DISCONNECTED;
4864
4865			/* signal the waiting segments */
4866			cv_broadcast(&sharedp->rsmsi_cv);
4867			DBG_PRINTF((category, RSM_DEBUG,
4868			    "rsmsegshare_resume done: rsmpi map err\n"));
4869			return (e);
4870		}
4871	}
4872
4873	sharedp->rsmsi_state = RSMSI_STATE_MAPPED;
4874
4875	/* signal any waiting segment */
4876	cv_broadcast(&sharedp->rsmsi_cv);
4877
4878	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmsegshare_resume done\n"));
4879
4880	return (e);
4881}
4882
4883/*
4884 * This is the routine that runs on the recv_taskq worker thread and
4885 * processes the messages that are flow-controlled.
4886 */
4887static void
4888rsm_intr_proc_deferred(void *arg)
4889{
4890	path_t			*path = (path_t *)arg;
4891	rsmipc_request_t	*msg;
4892	rsmipc_msghdr_t		*msghdr;
4893	rsm_node_id_t		src_node;
4894	msgbuf_elem_t		*head;
4895	int			e;
4896	DBG_DEFINE(category,
4897	    RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
4898
4899	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
4900	    "rsm_intr_proc_deferred enter\n"));
4901
4902	mutex_enter(&path->mutex);
4903
4904	/* use the head of the msgbuf_queue */
4905	head = rsmka_gethead_msgbuf(path);
4906
4907	mutex_exit(&path->mutex);
4908
4909	msg = (rsmipc_request_t *)&(head->msg);
4910	msghdr = (rsmipc_msghdr_t *)msg;
4911
4912	src_node = msghdr->rsmipc_src;
4913
4914	/*
4915	 * Messages that need to send a reply must check the message version
4916	 * before processing the message, and all messages that need to
4917	 * send a reply must be processed here by the worker thread.
4918	 */
4919	switch (msghdr->rsmipc_type) {
4920	case RSMIPC_MSG_SEGCONNECT:
4921		if (msghdr->rsmipc_version != RSM_VERSION) {
4922			rsmipc_reply_t reply;
4923			reply.rsmipc_status = RSMERR_BAD_DRIVER_VERSION;
4924			reply.rsmipc_hdr.rsmipc_type = RSMIPC_MSG_REPLY;
4925			reply.rsmipc_hdr.rsmipc_cookie = msghdr->rsmipc_cookie;
4926			(void) rsmipc_send(msghdr->rsmipc_src, NULL, &reply);
4927		} else {
4928			rsm_intr_segconnect(src_node, msg);
4929		}
4930		break;
4931	case RSMIPC_MSG_DISCONNECT:
4932		rsm_force_unload(src_node, msg->rsmipc_key, DISCONNECT);
4933		break;
4934	case RSMIPC_MSG_SUSPEND:
4935		importer_suspend(src_node);
4936		break;
4937	case RSMIPC_MSG_SUSPEND_DONE:
4938		rsm_suspend_complete(src_node, 0);
4939		break;
4940	case RSMIPC_MSG_RESUME:
4941		importer_resume(src_node);
4942		break;
4943	default:
4944		ASSERT(0);
4945	}
4946
4947	mutex_enter(&path->mutex);
4948
4949	rsmka_dequeue_msgbuf(path);
4950
4951	/* increment procmsg_cnt; it can be at most RSMIPC_MAX_MESSAGES */
4952	if (path->procmsg_cnt < RSMIPC_MAX_MESSAGES)
4953		path->procmsg_cnt++;
4954
4955	ASSERT(path->procmsg_cnt <= RSMIPC_MAX_MESSAGES);
4956
4957	/* No need to send credits if path is going down */
4958	if ((path->state == RSMKA_PATH_ACTIVE) &&
4959	    (path->procmsg_cnt >= RSMIPC_LOTSFREE_MSGBUFS)) {
4960		/*
4961		 * send credits and reset procmsg_cnt if success otherwise
4962		 * credits will be sent after processing the next message
4963		 */
4964		e = rsmipc_send_controlmsg(path, RSMIPC_MSG_CREDIT);
4965		if (e == 0)
4966			path->procmsg_cnt = 0;
4967		else
4968			DBG_PRINTF((category, RSM_ERR,
4969			    "rsm_intr_proc_deferred:send credits err=%d\n", e));
4970	}
4971
4972	/*
4973	 * decrement the path refcnt since we incremented it in
4974	 * rsm_intr_callback_dispatch
4975	 */
4976	PATH_RELE_NOLOCK(path);
4977
4978	mutex_exit(&path->mutex);
4979
4980	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
4981	    "rsm_intr_proc_deferred done\n"));
4982}
4983
4984/*
4985 * Flow-controlled messages are enqueued and dispatched onto a taskq here
4986 */
4987static void
4988rsm_intr_callback_dispatch(void *data, rsm_addr_t src_hwaddr,
4989    rsm_intr_hand_arg_t arg)
4990{
4991	srv_handler_arg_t	*hdlr_argp = (srv_handler_arg_t *)arg;
4992	path_t			*path;
4993	rsmipc_msghdr_t *msghdr = (rsmipc_msghdr_t *)data;
4994	DBG_DEFINE(category,
4995	    RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
4996
4997	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
4998	    "rsm_intr_callback_dispatch enter\n"));
4999	ASSERT(data && hdlr_argp);
5000
5001	/* look up the path - incr the path refcnt */
5002	path = rsm_find_path(hdlr_argp->adapter_name,
5003	    hdlr_argp->adapter_instance, src_hwaddr);
5004
5005	/* the path has been removed - drop this message */
5006	if (path == NULL) {
5007		DBG_PRINTF((category, RSM_DEBUG,
5008		    "rsm_intr_callback_dispatch done: msg dropped\n"));
5009		return;
5010	}
5011	/* the path is not active - don't accept new messages */
5012	if (path->state != RSMKA_PATH_ACTIVE) {
5013		PATH_RELE_NOLOCK(path);
5014		mutex_exit(&path->mutex);
5015		DBG_PRINTF((category, RSM_DEBUG,
5016		    "rsm_intr_callback_dispatch done: msg dropped"
5017		    " path=%lx !ACTIVE\n", path));
5018		return;
5019	}
5020
5021	/*
5022	 * Check if this message was sent to an older incarnation
5023	 * of the path/sendq.
5024	 */
5025	if (path->local_incn != msghdr->rsmipc_incn) {
5026		/* decrement the refcnt */
5027		PATH_RELE_NOLOCK(path);
5028		mutex_exit(&path->mutex);
5029		DBG_PRINTF((category, RSM_DEBUG,
5030		    "rsm_intr_callback_dispatch done: old incn %lld\n",
5031		    msghdr->rsmipc_incn));
5032		return;
5033	}
5034
5035	/* copy and enqueue msg on the path's msgbuf queue */
5036	rsmka_enqueue_msgbuf(path, data);
5037
5038	/*
5039	 * schedule task to process messages - ignore retval from
5040	 * taskq_dispatch because the sender cannot send more than
5041	 * what the receiver can handle.
5042	 */
5043	(void) taskq_dispatch(path->recv_taskq,
5044	    rsm_intr_proc_deferred, path, KM_NOSLEEP);
5045
5046	mutex_exit(&path->mutex);
5047
5048	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
5049	    "rsm_intr_callback_dispatch done\n"));
5050}
5051
5052/*
5053 * This procedure is called from rsm_srv_func when a remote node creates
5054 * a send queue.  This event is used as a hint that an earlier failed
5055 * attempt to create a send queue to that remote node may now succeed and
5056 * should be retried.  Indication of an earlier failed attempt is provided
5057 * by the RSMKA_SQCREATE_PENDING flag.
5058 */
5059static void
5060rsm_sqcreateop_callback(rsm_addr_t src_hwaddr, rsm_intr_hand_arg_t arg)
5061{
5062	srv_handler_arg_t	*hdlr_argp = (srv_handler_arg_t *)arg;
5063	path_t			*path;
5064	DBG_DEFINE(category,
5065	    RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
5066
5067	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
5068	    "rsm_sqcreateop_callback enter\n"));
5069
5070	/* look up the path - incr the path refcnt */
5071	path = rsm_find_path(hdlr_argp->adapter_name,
5072	    hdlr_argp->adapter_instance, src_hwaddr);
5073
5074	if (path == NULL) {
5075		DBG_PRINTF((category, RSM_DEBUG,
5076		    "rsm_sqcreateop_callback done: no path\n"));
5077		return;
5078	}
5079
5080	if ((path->state == RSMKA_PATH_UP) &&
5081	    (path->flags & RSMKA_SQCREATE_PENDING)) {
5082		/*
5083		 * previous attempt to create sendq had failed, retry
5084		 * it and move to RSMKA_PATH_ACTIVE state if successful.
5085		 * the refcnt will be decremented in the do_deferred_work
5086		 */
5087		(void) rsmka_do_path_active(path, RSMKA_NO_SLEEP);
5088	} else {
5089		/* decrement the refcnt */
5090		PATH_RELE_NOLOCK(path);
5091	}
5092	mutex_exit(&path->mutex);
5093
5094	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
5095	    "rsm_sqcreateop_callback done\n"));
5096}
5097
5098static void
5099rsm_intr_callback(void *data, rsm_addr_t src_hwaddr, rsm_intr_hand_arg_t arg)
5100{
5101	rsmipc_msghdr_t *msghdr = (rsmipc_msghdr_t *)data;
5102	rsmipc_request_t *msg = (rsmipc_request_t *)data;
5103	rsmipc_controlmsg_t *ctrlmsg = (rsmipc_controlmsg_t *)data;
5104	rsm_node_id_t src_node;
5105	DBG_DEFINE(category,
5106	    RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
5107
5108	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_intr_callback enter:"
5109	    "src=%d, type=%d\n", msghdr->rsmipc_src,
5110	    msghdr->rsmipc_type));
5111
5112	/*
5113	 * Check for the version number in the msg header. If it is not
5114	 * RSM_VERSION, drop the message. In the future, we need to manage
5115	 * incompatible version numbers in some way
5116	 */
5117	if (msghdr->rsmipc_version != RSM_VERSION) {
5118		DBG_PRINTF((category, RSM_ERR, "wrong KA version\n"));
5119		/*
5120		 * Drop requests that don't expect a reply right here.
5121		 * Requests with a reply will send a BAD_VERSION reply
5122		 * when they get processed by the worker thread.
5123		 */
5124		if (msghdr->rsmipc_type != RSMIPC_MSG_SEGCONNECT) {
5125			return;
5126		}
5127
5128	}
5129
5130	src_node = msghdr->rsmipc_src;
5131
5132	switch (msghdr->rsmipc_type) {
5133	case RSMIPC_MSG_SEGCONNECT:
5134	case RSMIPC_MSG_DISCONNECT:
5135	case RSMIPC_MSG_SUSPEND:
5136	case RSMIPC_MSG_SUSPEND_DONE:
5137	case RSMIPC_MSG_RESUME:
5138		/*
5139		 * These message types are handled by a worker thread using
5140		 * the flow-control algorithm.
5141		 * Any message processing that does one or more of the
5142		 * following should be handled in a worker thread.
5143		 *	- allocates resources and might sleep
5144		 *	- makes RSMPI calls down to the interconnect driver
5145		 *	  this by definition includes requests with reply.
5146		 *	- takes a long duration of time
5147		 */
5148		rsm_intr_callback_dispatch(data, src_hwaddr, arg);
5149		break;
5150	case RSMIPC_MSG_NOTIMPORTING:
5151		importer_list_rm(src_node, msg->rsmipc_key,
5152		    msg->rsmipc_segment_cookie);
5153		break;
5154	case RSMIPC_MSG_SQREADY:
5155		rsm_proc_sqready(data, src_hwaddr, arg);
5156		break;
5157	case RSMIPC_MSG_SQREADY_ACK:
5158		rsm_proc_sqready_ack(data, src_hwaddr, arg);
5159		break;
5160	case RSMIPC_MSG_CREDIT:
5161		rsm_add_credits(ctrlmsg, src_hwaddr, arg);
5162		break;
5163	case RSMIPC_MSG_REPLY:
5164		rsm_intr_reply(msghdr);
5165		break;
5166	case RSMIPC_MSG_BELL:
5167		rsm_intr_event(msg);
5168		break;
5169	case RSMIPC_MSG_IMPORTING:
5170		importer_list_add(src_node, msg->rsmipc_key,
5171		    msg->rsmipc_adapter_hwaddr,
5172		    msg->rsmipc_segment_cookie);
5173		break;
5174	case RSMIPC_MSG_REPUBLISH:
5175		importer_update(src_node, msg->rsmipc_key, msg->rsmipc_perm);
5176		break;
5177	default:
5178		DBG_PRINTF((category, RSM_DEBUG,
5179		    "rsm_intr_callback: bad msg %lx type %d data %lx\n",
5180		    (size_t)msg, (int)(msghdr->rsmipc_type), (size_t)data));
5181	}
5182
5183	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_intr_callback done\n"));
5184
5185}
5186
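/*
 * Service handler registered with RSMPI for the kernel agent's
 * interrupt queue: dispatches send queue creation notifications and
 * received messages to the handlers above and claims the interrupt.
 */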
5187rsm_intr_hand_ret_t rsm_srv_func(rsm_controller_object_t *chd,
5188    rsm_intr_q_op_t opcode, rsm_addr_t src,
5189    void *data, size_t size, rsm_intr_hand_arg_t arg)
5190{
5191	DBG_DEFINE(category,
5192	    RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
5193
5194	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_srv_func enter\n"));
5195
5196	switch (opcode) {
5197	case RSM_INTR_Q_OP_CREATE:
5198		DBG_PRINTF((category, RSM_DEBUG, "rsm_srv_func:OP_CREATE\n"));
5199		rsm_sqcreateop_callback(src, arg);
5200		break;
5201	case RSM_INTR_Q_OP_DESTROY:
5202		DBG_PRINTF((category, RSM_DEBUG, "rsm_srv_func:OP_DESTROY\n"));
5203		break;
5204	case RSM_INTR_Q_OP_RECEIVE:
5205		rsm_intr_callback(data, src, arg);
5206		break;
5207	default:
5208		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
5209		    "rsm_srv_func: unknown opcode = %x\n", opcode));
5210	}
5211
5212	chd = chd;
5213	size = size;
5214
5215	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_srv_func done\n"));
5216
5217	return (RSM_INTR_HAND_CLAIMED);
5218}
5219
5220/* *************************** IPC slots ************************* */
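/*
 * An ipc slot is used to match a request with its reply: rsmipc_alloc
 * hands out one of the RSMIPC_SZ preallocated slots and stamps it with
 * a cookie made up of the slot index and a monotonically increasing
 * sequence number.  The cookie travels in the request header; when the
 * reply arrives, rsm_intr_reply uses it to locate the slot and wake
 * the sender waiting in rsmipc_send.
 */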
5221static rsmipc_slot_t *
5222rsmipc_alloc()
5223{
5224	int i;
5225	rsmipc_slot_t *slot;
5226	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);
5227
5228	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmipc_alloc enter\n"));
5229
5230	/* try to find a free slot; if there is none, wait */
5231	mutex_enter(&rsm_ipc.lock);
5232
5233	while (rsm_ipc.count == 0) {
5234		rsm_ipc.wanted = 1;
5235		cv_wait(&rsm_ipc.cv, &rsm_ipc.lock);
5236	}
5237
5238	/* An empty slot is available, find it */
5239	slot = &rsm_ipc.slots[0];
5240	for (i = 0; i < RSMIPC_SZ; i++, slot++) {
5241		if (RSMIPC_GET(slot, RSMIPC_FREE)) {
5242			RSMIPC_CLEAR(slot, RSMIPC_FREE);
5243			break;
5244		}
5245	}
5246
5247	ASSERT(i < RSMIPC_SZ);
5248	rsm_ipc.count--;	/* one less is available */
5249	rsm_ipc.sequence++; /* new sequence */
5250
5251	slot->rsmipc_cookie.ic.sequence = (uint_t)rsm_ipc.sequence;
5252	slot->rsmipc_cookie.ic.index = (uint_t)i;
5253
5254	mutex_exit(&rsm_ipc.lock);
5255
5256	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmipc_alloc done\n"));
5257
5258	return (slot);
5259}
5260
5261static void
5262rsmipc_free(rsmipc_slot_t *slot)
5263{
5264	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);
5265
5266	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmipc_free enter\n"));
5267
5268	ASSERT(MUTEX_HELD(&slot->rsmipc_lock));
5269	ASSERT(&rsm_ipc.slots[slot->rsmipc_cookie.ic.index] == slot);
5270
5271	mutex_enter(&rsm_ipc.lock);
5272
5273	RSMIPC_SET(slot, RSMIPC_FREE);
5274
5275	slot->rsmipc_cookie.ic.sequence = 0;
5276
5277	mutex_exit(&slot->rsmipc_lock);
5278	rsm_ipc.count++;
5279	ASSERT(rsm_ipc.count <= RSMIPC_SZ);
5280	if (rsm_ipc.wanted) {
5281		rsm_ipc.wanted = 0;
5282		cv_broadcast(&rsm_ipc.cv);
5283	}
5284
5285	mutex_exit(&rsm_ipc.lock);
5286
5287	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmipc_free done\n"));
5288}
5289
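/*
 * Send an ipc message to node dest.  With reply == NULL the request is
 * sent without waiting for an ack; with req == NULL a reply message is
 * sent; with both non-NULL the request is sent and the caller blocks
 * (with a 5 second timeout per attempt) until rsm_intr_reply fills in
 * the reply slot.  Requests to the local node are short-circuited, and
 * flow-controlled message types consume a send credit before going out
 * on the wire.
 */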
5290static int
5291rsmipc_send(rsm_node_id_t dest, rsmipc_request_t *req, rsmipc_reply_t *reply)
5292{
5293	int		e = 0;
5294	int		credit_check = 0;
5295	int		retry_cnt = 0;
5296	int		min_retry_cnt = 10;
5297	clock_t		ticks;
5298	rsm_send_t	is;
5299	rsmipc_slot_t	*rslot;
5300	adapter_t	*adapter;
5301	path_t		*path;
5302	sendq_token_t	*sendq_token;
5303	sendq_token_t	*used_sendq_token = NULL;
5304	rsm_send_q_handle_t	ipc_handle;
5305	DBG_DEFINE(category,
5306	    RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
5307
5308	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmipc_send enter:dest=%d",
5309	    dest));
5310
5311	/*
5312	 * Check if this is a local case
5313	 */
5314	if (dest == my_nodeid) {
5315		switch (req->rsmipc_hdr.rsmipc_type) {
5316		case RSMIPC_MSG_SEGCONNECT:
5317			reply->rsmipc_status = (short)rsmsegacl_validate(
5318			    req, dest, reply);
5319			break;
5320		case RSMIPC_MSG_BELL:
5321			req->rsmipc_hdr.rsmipc_src = dest;
5322			rsm_intr_event(req);
5323			break;
5324		case RSMIPC_MSG_IMPORTING:
5325			importer_list_add(dest, req->rsmipc_key,
5326			    req->rsmipc_adapter_hwaddr,
5327			    req->rsmipc_segment_cookie);
5328			break;
5329		case RSMIPC_MSG_NOTIMPORTING:
5330			importer_list_rm(dest, req->rsmipc_key,
5331			    req->rsmipc_segment_cookie);
5332			break;
5333		case RSMIPC_MSG_REPUBLISH:
5334			importer_update(dest, req->rsmipc_key,
5335			    req->rsmipc_perm);
5336			break;
5337		case RSMIPC_MSG_SUSPEND:
5338			importer_suspend(dest);
5339			break;
5340		case RSMIPC_MSG_SUSPEND_DONE:
5341			rsm_suspend_complete(dest, 0);
5342			break;
5343		case RSMIPC_MSG_RESUME:
5344			importer_resume(dest);
5345			break;
5346		default:
5347			ASSERT(0);
5348		}
5349		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
5350		    "rsmipc_send done\n"));
5351		return (0);
5352	}
5353
5354	if (dest >= MAX_NODES) {
5355		DBG_PRINTF((category, RSM_ERR,
5356		    "rsm: rsmipc_send bad node number %x\n", dest));
5357		return (RSMERR_REMOTE_NODE_UNREACHABLE);
5358	}
5359
5360	/*
5361	 * Oh boy! We are going remote.
5362	 */
5363
5364	/*
5365	 * identify if we need to have credits to send this message
5366	 * - only selected requests are flow controlled
5367	 */
5368	if (req != NULL) {
5369		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
5370		    "rsmipc_send:request type=%d\n",
5371		    req->rsmipc_hdr.rsmipc_type));
5372
5373		switch (req->rsmipc_hdr.rsmipc_type) {
5374		case RSMIPC_MSG_SEGCONNECT:
5375		case RSMIPC_MSG_DISCONNECT:
5376		case RSMIPC_MSG_IMPORTING:
5377		case RSMIPC_MSG_SUSPEND:
5378		case RSMIPC_MSG_SUSPEND_DONE:
5379		case RSMIPC_MSG_RESUME:
5380			credit_check = 1;
5381			break;
5382		default:
5383			credit_check = 0;
5384		}
5385	}
5386
5387again:
5388	if (retry_cnt++ == min_retry_cnt) {
5389		/* back off for 10ms before further retries */
5390		delay(drv_usectohz(10000));
5391		retry_cnt = 0; /* reset retry_cnt */
5392	}
5393	sendq_token = rsmka_get_sendq_token(dest, used_sendq_token);
5394	if (sendq_token == NULL) {
5395		DBG_PRINTF((category, RSM_ERR,
5396		    "rsm: rsmipc_send no device to reach node %d\n", dest));
5397		return (RSMERR_REMOTE_NODE_UNREACHABLE);
5398	}
5399
5400	if ((sendq_token == used_sendq_token) &&
5401	    ((e == RSMERR_CONN_ABORTED) || (e == RSMERR_TIMEOUT) ||
5402	    (e == RSMERR_COMM_ERR_MAYBE_DELIVERED))) {
5403		rele_sendq_token(sendq_token);
5404		DBG_PRINTF((category, RSM_DEBUG, "rsmipc_send done=%d\n", e));
5405		return (RSMERR_CONN_ABORTED);
5406	} else
5407		used_sendq_token = sendq_token;
5408
5409/* lint -save -e413 */
5410	path = SQ_TOKEN_TO_PATH(sendq_token);
5411	adapter = path->local_adapter;
5412/* lint -restore */
5413	ipc_handle = sendq_token->rsmpi_sendq_handle;
5414
5415	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
5416	    "rsmipc_send: path=%lx sendq_hdl=%lx\n", path, ipc_handle));
5417
5418	if (reply == NULL) {
5419		/* Send request without ack */
5420		/*
5421		 * Set the rsmipc_version number in the msghdr for KA
5422		 * communication versioning
5423		 */
5424		req->rsmipc_hdr.rsmipc_version = RSM_VERSION;
5425		req->rsmipc_hdr.rsmipc_src = my_nodeid;
5426		/*
5427		 * the remote endpoint's incn should match the value in our
5428		 * path's remote_incn field. No need to grab any lock
5429		 * since we have refcnted the path in rsmka_get_sendq_token
5430		 */
5431		req->rsmipc_hdr.rsmipc_incn = path->remote_incn;
5432
5433		is.is_data = (void *)req;
5434		is.is_size = sizeof (*req);
5435		is.is_flags = RSM_INTR_SEND_DELIVER | RSM_INTR_SEND_SLEEP;
5436		is.is_wait = 0;
5437
5438		if (credit_check) {
5439			mutex_enter(&path->mutex);
5440			/*
5441			 * wait till we receive credits or the path goes down.
5442			 * If the path goes down rsm_send will fail and we
5443			 * handle the error then
5444			 */
5445			while ((sendq_token->msgbuf_avail == 0) &&
5446			    (path->state == RSMKA_PATH_ACTIVE)) {
5447				e = cv_wait_sig(&sendq_token->sendq_cv,
5448				    &path->mutex);
5449				if (e == 0) {
5450					mutex_exit(&path->mutex);
5451					no_reply_cnt++;
5452					rele_sendq_token(sendq_token);
5453					DBG_PRINTF((category, RSM_DEBUG,
5454					    "rsmipc_send done: "
5455					    "cv_wait INTERRUPTED"));
5456					return (RSMERR_INTERRUPTED);
5457				}
5458			}
5459
5460			/*
5461			 * path is not active, retry on another path.
5462			 */
5463			if (path->state != RSMKA_PATH_ACTIVE) {
5464				mutex_exit(&path->mutex);
5465				rele_sendq_token(sendq_token);
5466				e = RSMERR_CONN_ABORTED;
5467				DBG_PRINTF((category, RSM_ERR,
5468				    "rsm: rsmipc_send: path !ACTIVE"));
5469				goto again;
5470			}
5471
5472			ASSERT(sendq_token->msgbuf_avail > 0);
5473
5474			/*
5475			 * reserve a msgbuf
5476			 */
5477			sendq_token->msgbuf_avail--;
5478
5479			mutex_exit(&path->mutex);
5480
5481			e = adapter->rsmpi_ops->rsm_send(ipc_handle, &is,
5482			    NULL);
5483
5484			if (e != RSM_SUCCESS) {
5485				mutex_enter(&path->mutex);
5486				/*
5487				 * release the reserved msgbuf since
5488				 * the send failed
5489				 */
5490				sendq_token->msgbuf_avail++;
5491				cv_broadcast(&sendq_token->sendq_cv);
5492				mutex_exit(&path->mutex);
5493			}
5494		} else
5495			e = adapter->rsmpi_ops->rsm_send(ipc_handle, &is,
5496			    NULL);
5497
5498		no_reply_cnt++;
5499		rele_sendq_token(sendq_token);
5500		if (e != RSM_SUCCESS) {
5501			DBG_PRINTF((category, RSM_ERR,
5502			    "rsm: rsmipc_send no reply send"
5503			    " err = %d no reply count = %d\n",
5504			    e, no_reply_cnt));
5505			ASSERT(e != RSMERR_QUEUE_FENCE_UP &&
5506			    e != RSMERR_BAD_BARRIER_HNDL);
5507			atomic_add_64(&rsm_ipcsend_errcnt, 1);
5508			goto again;
5509		} else {
5510			DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
5511			    "rsmipc_send done\n"));
5512			return (e);
5513		}
5514
5515	}
5516
5517	if (req == NULL) {
5518		/* Send reply - No flow control is done for reply */
5519		/*
5520		 * Set the version in the msg header for KA communication
5521		 * versioning
5522		 */
5523		reply->rsmipc_hdr.rsmipc_version = RSM_VERSION;
5524		reply->rsmipc_hdr.rsmipc_src = my_nodeid;
5525		/* incn number is not used for reply msgs currently */
5526		reply->rsmipc_hdr.rsmipc_incn = path->remote_incn;
5527
5528		is.is_data = (void *)reply;
5529		is.is_size = sizeof (*reply);
5530		is.is_flags = RSM_INTR_SEND_DELIVER | RSM_INTR_SEND_SLEEP;
5531		is.is_wait = 0;
5532		e = adapter->rsmpi_ops->rsm_send(ipc_handle, &is, NULL);
5533		rele_sendq_token(sendq_token);
5534		if (e != RSM_SUCCESS) {
5535			DBG_PRINTF((category, RSM_ERR,
5536			    "rsm: rsmipc_send reply send"
5537			    " err = %d\n", e));
5538			atomic_add_64(&rsm_ipcsend_errcnt, 1);
5539			goto again;
5540		} else {
5541			DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
5542			    "rsmipc_send done\n"));
5543			return (e);
5544		}
5545	}
5546
5547	/* Reply needed */
5548	rslot = rsmipc_alloc(); /* allocate a new ipc slot */
5549
5550	mutex_enter(&rslot->rsmipc_lock);
5551
5552	rslot->rsmipc_data = (void *)reply;
5553	RSMIPC_SET(rslot, RSMIPC_PENDING);
5554
5555	while (RSMIPC_GET(rslot, RSMIPC_PENDING)) {
5556		/*
5557		 * Set the rsmipc_version number in the msghdr for KA
5558		 * communication versioning
5559		 */
5560		req->rsmipc_hdr.rsmipc_version = RSM_VERSION;
5561		req->rsmipc_hdr.rsmipc_src = my_nodeid;
5562		req->rsmipc_hdr.rsmipc_cookie = rslot->rsmipc_cookie;
5563		/*
5564			 * remote endpoint's incn should match the value in our
5565		 * path's remote_incn field. No need to grab any lock
5566		 * since we have refcnted the path in rsmka_get_sendq_token
5567		 */
5568		req->rsmipc_hdr.rsmipc_incn = path->remote_incn;
5569
5570		is.is_data = (void *)req;
5571		is.is_size = sizeof (*req);
5572		is.is_flags = RSM_INTR_SEND_DELIVER | RSM_INTR_SEND_SLEEP;
5573		is.is_wait = 0;
5574		if (credit_check) {
5575
5576			mutex_enter(&path->mutex);
5577			/*
5578			 * wait till we receive credits or the path goes down.
5579			 * If the path goes down, rsm_send will fail and we
5580			 * handle the error then.
5581			 */
5582			while ((sendq_token->msgbuf_avail == 0) &&
5583			    (path->state == RSMKA_PATH_ACTIVE)) {
5584				e = cv_wait_sig(&sendq_token->sendq_cv,
5585				    &path->mutex);
5586				if (e == 0) {
5587					mutex_exit(&path->mutex);
5588					RSMIPC_CLEAR(rslot, RSMIPC_PENDING);
5589					rsmipc_free(rslot);
5590					rele_sendq_token(sendq_token);
5591					DBG_PRINTF((category, RSM_DEBUG,
5592					    "rsmipc_send done: "
5593					    "cv_wait INTERRUPTED"));
5594					return (RSMERR_INTERRUPTED);
5595				}
5596			}
5597
5598			/*
5599			 * path is not active, retry on another path.
5600			 */
5601			if (path->state != RSMKA_PATH_ACTIVE) {
5602				mutex_exit(&path->mutex);
5603				RSMIPC_CLEAR(rslot, RSMIPC_PENDING);
5604				rsmipc_free(rslot);
5605				rele_sendq_token(sendq_token);
5606				e = RSMERR_CONN_ABORTED;
5607				DBG_PRINTF((category, RSM_ERR,
5608				    "rsm: rsmipc_send: path !ACTIVE"));
5609				goto again;
5610			}
5611
5612			ASSERT(sendq_token->msgbuf_avail > 0);
5613
5614			/*
5615			 * reserve a msgbuf
5616			 */
5617			sendq_token->msgbuf_avail--;
5618
5619			mutex_exit(&path->mutex);
5620
5621			e = adapter->rsmpi_ops->rsm_send(ipc_handle, &is,
5622			    NULL);
5623
5624			if (e != RSM_SUCCESS) {
5625				mutex_enter(&path->mutex);
5626				/*
5627				 * release the reserved msgbuf since
5628				 * the send failed
5629				 */
5630				sendq_token->msgbuf_avail++;
5631				cv_broadcast(&sendq_token->sendq_cv);
5632				mutex_exit(&path->mutex);
5633			}
5634		} else
5635			e = adapter->rsmpi_ops->rsm_send(ipc_handle, &is,
5636			    NULL);
5637
5638		if (e != RSM_SUCCESS) {
5639			DBG_PRINTF((category, RSM_ERR,
5640			    "rsm: rsmipc_send rsmpi send err = %d\n", e));
5641			RSMIPC_CLEAR(rslot, RSMIPC_PENDING);
5642			rsmipc_free(rslot);
5643			rele_sendq_token(sendq_token);
5644			atomic_add_64(&rsm_ipcsend_errcnt, 1);
5645			goto again;
5646		}
5647
5648		/* wait for a reply signal, a SIGINT, or 5 sec. timeout */
5649		(void) drv_getparm(LBOLT, &ticks);
5650		ticks += drv_usectohz(5000000);
5651		e = cv_timedwait_sig(&rslot->rsmipc_cv, &rslot->rsmipc_lock,
5652		    ticks);
5653		if (e < 0) {
5654			/* timed out - retry */
5655			e = RSMERR_TIMEOUT;
5656		} else if (e == 0) {
5657			/* signalled - return error */
5658			e = RSMERR_INTERRUPTED;
5659			break;
5660		} else {
5661			e = RSM_SUCCESS;
5662		}
5663	}
5664
5665	RSMIPC_CLEAR(rslot, RSMIPC_PENDING);
5666	rsmipc_free(rslot);
5667	rele_sendq_token(sendq_token);
5668
5669	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmipc_send done=%d\n", e));
5670	return (e);
5671}
5672
5673static int
5674rsm_send_notimporting(rsm_node_id_t dest, rsm_memseg_id_t segid,  void *cookie)
5675{
5676	rsmipc_request_t request;
5677
5678	/*
5679	 *  inform the exporter to delete this importer
5680	 */
5681	request.rsmipc_hdr.rsmipc_type = RSMIPC_MSG_NOTIMPORTING;
5682	request.rsmipc_key = segid;
5683	request.rsmipc_segment_cookie = cookie;
5684	return (rsmipc_send(dest, &request, RSM_NO_REPLY));
5685}
5686
5687static void
5688rsm_send_republish(rsm_memseg_id_t segid, rsmapi_access_entry_t	*acl,
5689    int acl_len, rsm_permission_t default_permission)
5690{
5691	int			i;
5692	importing_token_t	*token;
5693	rsmipc_request_t	request;
5694	republish_token_t	*republish_list = NULL;
5695	republish_token_t	*rp;
5696	rsm_permission_t	permission;
5697	int			index;
5698
5699	/*
5700	 * send the new access mode to all the nodes that have imported
5701	 * this segment.
5702	 * If the new acl does not contain a node that was present in
5703	 * the old acl, an access permission of 0 is sent.
5704	 */
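	/*
	 * Worked example (illustrative values): if node 3 imports this
	 * segment and the new acl grants it ae_permission 0400, node 3
	 * receives a republish message carrying 0400; if node 3 is absent
	 * from the new acl it receives default_permission instead, which
	 * in the usual republish case is 0 and so revokes its access.
	 */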
5705
5706	index = rsmhash(segid);
5707
5708	/*
5709	 * create a list of node/permission pairs for the republish messages
5710	 */
5711	mutex_enter(&importer_list.lock);
5712
5713	token = importer_list.bucket[index];
5714	while (token != NULL) {
5715		if (segid == token->key) {
5716			permission = default_permission;
5717
5718			for (i = 0; i < acl_len; i++) {
5719				if (token->importing_node == acl[i].ae_node) {
5720					permission = acl[i].ae_permission;
5721					break;
5722				}
5723			}
5724			rp = kmem_zalloc(sizeof (republish_token_t), KM_SLEEP);
5725
5726			rp->key = segid;
5727			rp->importing_node = token->importing_node;
5728			rp->permission = permission;
5729			rp->next = republish_list;
5730			republish_list = rp;
5731		}
5732		token = token->next;
5733	}
5734
5735	mutex_exit(&importer_list.lock);
5736
5737	request.rsmipc_hdr.rsmipc_type = RSMIPC_MSG_REPUBLISH;
5738	request.rsmipc_key = segid;
5739
5740	while (republish_list != NULL) {
5741		request.rsmipc_perm = republish_list->permission;
5742		(void) rsmipc_send(republish_list->importing_node,
5743		    &request, RSM_NO_REPLY);
5744		rp = republish_list;
5745		republish_list = republish_list->next;
5746		kmem_free(rp, sizeof (republish_token_t));
5747	}
5748}
5749
5750static void
5751rsm_send_suspend()
5752{
5753	int			i, e;
5754	rsmipc_request_t 	request;
5755	list_element_t		*tokp;
5756	list_element_t		*head = NULL;
5757	importing_token_t	*token;
5758	DBG_PRINTF((RSM_KERNEL_AGENT | RSM_EXPORT, RSM_DEBUG_VERBOSE,
5759	    "rsm_send_suspend enter\n"));
5760
5761	/*
5762	 * create a list of nodes to send the suspend message to
5763	 *
5764	 * Currently the whole importer list is scanned and we obtain
5765	 * all the nodes - this basically gets all nodes that import at
5766	 * least one segment from the local node.
5767	 *
5768	 * no need to grab the rsm_suspend_list lock here since we are
5769	 * single threaded when suspend is called.
5770	 */
5771
5772	mutex_enter(&importer_list.lock);
5773	for (i = 0; i < rsm_hash_size; i++) {
5774
5775		token = importer_list.bucket[i];
5776
5777		while (token != NULL) {
5778
5779			tokp = head;
5780
5781			/*
5782			 * make sure that the token's node
5783			 * is not already on the suspend list
5784			 */
5785			while (tokp != NULL) {
5786				if (tokp->nodeid == token->importing_node) {
5787					break;
5788				}
5789				tokp = tokp->next;
5790			}
5791
5792			if (tokp == NULL) { /* not in suspend list */
5793				tokp = kmem_zalloc(sizeof (list_element_t),
5794				    KM_SLEEP);
5795				tokp->nodeid = token->importing_node;
5796				tokp->next = head;
5797				head = tokp;
5798			}
5799
5800			token = token->next;
5801		}
5802	}
5803	mutex_exit(&importer_list.lock);
5804
5805	if (head == NULL) { /* no importers so go ahead and quiesce segments */
5806		exporter_quiesce();
5807		return;
5808	}
5809
5810	mutex_enter(&rsm_suspend_list.list_lock);
5811	ASSERT(rsm_suspend_list.list_head == NULL);
5812	/*
5813	 * update the suspend list right away so that if a node dies the
5814	 * path manager can set the NODE dead flag
5815	 */
5816	rsm_suspend_list.list_head = head;
5817	mutex_exit(&rsm_suspend_list.list_lock);
5818
5819	tokp = head;
5820
5821	while (tokp != NULL) {
5822		request.rsmipc_hdr.rsmipc_type = RSMIPC_MSG_SUSPEND;
5823		e = rsmipc_send(tokp->nodeid, &request, RSM_NO_REPLY);
5824		/*
5825		 * An error from rsmipc_send currently happens only when the
5826		 * remote node is inaccessible - don't wait for an ack then.
5827		 */
5828		if (e == RSM_SUCCESS) { /* send succeeded - wait for ack */
5829			tokp->flags |= RSM_SUSPEND_ACKPENDING;
5830		}
5831
5832		tokp = tokp->next;
5833	}
5834
5835	DBG_PRINTF((RSM_KERNEL_AGENT | RSM_EXPORT, RSM_DEBUG_VERBOSE,
5836	    "rsm_send_suspend done\n"));
5837
5838}
5839
5840static void
5841rsm_send_resume()
5842{
5843	rsmipc_request_t 	request;
5844	list_element_t		*elem, *head;
5845
5846	/*
5847	 * save the suspend list so that we know where to send
5848	 * the resume messages and make the suspend list head
5849	 * NULL.
5850	 */
5851	mutex_enter(&rsm_suspend_list.list_lock);
5852	head = rsm_suspend_list.list_head;
5853	rsm_suspend_list.list_head = NULL;
5854	mutex_exit(&rsm_suspend_list.list_lock);
5855
5856	while (head != NULL) {
5857		elem = head;
5858		head = head->next;
5859
5860		request.rsmipc_hdr.rsmipc_type = RSMIPC_MSG_RESUME;
5861
5862		(void) rsmipc_send(elem->nodeid, &request, RSM_NO_REPLY);
5863
5864		kmem_free((void *)elem, sizeof (list_element_t));
5865
5866	}
5867
5868}
5869
5870/*
5871 * This function takes a path and sends a message using the sendq
5872 * corresponding to it. The RSMIPC_MSG_SQREADY, RSMIPC_MSG_SQREADY_ACK
5873 * and RSMIPC_MSG_CREDIT are sent using this function.
5874 */
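/*
 * A minimal usage sketch (hypothetical caller, for illustration only).
 * The path mutex must be held on entry; it is dropped around the actual
 * rsm_send and is held again when this function returns:
 *
 *	mutex_enter(&path->mutex);
 *	if (rsmipc_send_controlmsg(path, RSMIPC_MSG_SQREADY) != RSM_SUCCESS) {
 *		... remote node unreachable or path no longer active ...
 *	}
 *	mutex_exit(&path->mutex);
 */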
5875int
5876rsmipc_send_controlmsg(path_t *path, int msgtype)
5877{
5878	int			e;
5879	int			retry_cnt = 0;
5880	int			min_retry_cnt = 10;
5881	clock_t			timeout;
5882	adapter_t		*adapter;
5883	rsm_send_t		is;
5884	rsm_send_q_handle_t	ipc_handle;
5885	rsmipc_controlmsg_t	msg;
5886	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_FLOWCONTROL);
5887
5888	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
5889	    "rsmipc_send_controlmsg enter\n"));
5890
5891	ASSERT(MUTEX_HELD(&path->mutex));
5892
5893	adapter = path->local_adapter;
5894
5895	DBG_PRINTF((category, RSM_DEBUG, "rsmipc_send_controlmsg:path=%lx "
5896	    "msgtype=%d %lx:%llx->%lx:%llx procmsg=%d\n", path, msgtype,
5897	    my_nodeid, adapter->hwaddr, path->remote_node,
5898	    path->remote_hwaddr, path->procmsg_cnt));
5899
5900	if (path->state != RSMKA_PATH_ACTIVE) {
5901		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
5902		    "rsmipc_send_controlmsg done: ! RSMKA_PATH_ACTIVE"));
5903		return (1);
5904	}
5905
5906	ipc_handle = path->sendq_token.rsmpi_sendq_handle;
5907
5908	msg.rsmipc_hdr.rsmipc_version = RSM_VERSION;
5909	msg.rsmipc_hdr.rsmipc_src = my_nodeid;
5910	msg.rsmipc_hdr.rsmipc_type = msgtype;
5911	msg.rsmipc_hdr.rsmipc_incn = path->remote_incn;
5912
5913	if (msgtype == RSMIPC_MSG_CREDIT)
5914		msg.rsmipc_credits = path->procmsg_cnt;
5915
5916	msg.rsmipc_local_incn = path->local_incn;
5917
5918	msg.rsmipc_adapter_hwaddr = adapter->hwaddr;
5919	/* incr the sendq, path refcnt */
5920	PATH_HOLD_NOLOCK(path);
5921	SENDQ_TOKEN_HOLD(path);
5922
5923	do {
5924		/* drop the path lock before doing the rsm_send */
5925		mutex_exit(&path->mutex);
5926
5927		is.is_data = (void *)&msg;
5928		is.is_size = sizeof (msg);
5929		is.is_flags = RSM_INTR_SEND_DELIVER | RSM_INTR_SEND_SLEEP;
5930		is.is_wait = 0;
5931
5932		e = adapter->rsmpi_ops->rsm_send(ipc_handle, &is, NULL);
5933
5934		ASSERT(e != RSMERR_QUEUE_FENCE_UP &&
5935		    e != RSMERR_BAD_BARRIER_HNDL);
5936
5937		mutex_enter(&path->mutex);
5938
5939		if (e == RSM_SUCCESS) {
5940			break;
5941		}
5942		/* error counter for statistics */
5943		atomic_add_64(&rsm_ctrlmsg_errcnt, 1);
5944
5945		DBG_PRINTF((category, RSM_ERR,
5946		    "rsmipc_send_controlmsg:rsm_send error=%d", e));
5947
5948		if (++retry_cnt == min_retry_cnt) { /* backoff before retry */
5949			timeout = ddi_get_lbolt() + drv_usectohz(10000);
5950			(void) cv_timedwait(&path->sendq_token.sendq_cv,
5951			    &path->mutex, timeout);
5952			retry_cnt = 0;
5953		}
5954	} while (path->state == RSMKA_PATH_ACTIVE);
5955
5956	/* decrement the sendq,path refcnt that we incr before rsm_send */
5957	SENDQ_TOKEN_RELE(path);
5958	PATH_RELE_NOLOCK(path);
5959
5960	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
5961	    "rsmipc_send_controlmsg done=%d", e));
5962	return (e);
5963}
5964
5965/*
5966 * Called from rsm_force_unload and path_importer_disconnect. The memory
5967 * mapping for the imported segment is removed and the segment is
5968 * disconnected at the interconnect layer if disconnect_flag is TRUE.
5969 * rsm_force_unload will get disconnect_flag TRUE from rsm_intr_callback
5970 * and FALSE from rsm_rebind.
5971 *
5972 * When subsequent accesses cause page faulting, the dummy page is mapped
5973 * to resolve the fault, and the mapping generation number is incremented
5974 * so that the application can be notified on a close barrier operation.
5975 *
5976 * It is important to note that the caller of rsmseg_unload is responsible for
5977 * acquiring the segment lock before making a call to rsmseg_unload. This is
5978 * required to make the caller and rsmseg_unload thread safe. The segment lock
5979 * will be released by the rsmseg_unload function.
5980 */
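/*
 * A minimal sketch of that calling convention (illustration only):
 *
 *	rsmseglock_acquire(im_seg);
 *	rsmseg_unload(im_seg);
 *	... the segment lock has already been released on return ...
 */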
5981void
5982rsmseg_unload(rsmseg_t *im_seg)
5983{
5984	rsmcookie_t		*hdl;
5985	void			*shared_cookie;
5986	rsmipc_request_t	request;
5987	uint_t			maxprot;
5988
5989	DBG_DEFINE(category,
5990	    RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_INTR_CALLBACK);
5991
5992	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmseg_unload enter\n"));
5993
5994	ASSERT(im_seg->s_hdr.rsmrc_type == RSM_RESOURCE_IMPORT_SEGMENT);
5995
5996	/* wait until segment leaves the mapping state */
5997	while (im_seg->s_state == RSM_STATE_MAPPING)
5998		cv_wait(&im_seg->s_cv, &im_seg->s_lock);
5999	/*
6000	 * An unload is only necessary if the segment is connected. However,
6001	 * if the segment was on the import list in state RSM_STATE_CONNECTING
6002	 * then a connection was in progress. Change to RSM_STATE_ABORT_CONNECT
6003	 * here to cause an early exit from the connection process.
6004	 */
6005	if (im_seg->s_state == RSM_STATE_NEW) {
6006		rsmseglock_release(im_seg);
6007		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
6008		    "rsmseg_unload done: RSM_STATE_NEW\n"));
6009		return;
6010	} else if (im_seg->s_state == RSM_STATE_CONNECTING) {
6011		im_seg->s_state = RSM_STATE_ABORT_CONNECT;
6012		rsmsharelock_acquire(im_seg);
6013		im_seg->s_share->rsmsi_state = RSMSI_STATE_ABORT_CONNECT;
6014		rsmsharelock_release(im_seg);
6015		rsmseglock_release(im_seg);
6016		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
6017		    "rsmseg_unload done: RSM_STATE_CONNECTING\n"));
6018		return;
6019	}
6020
6021	if (im_seg->s_flags & RSM_FORCE_DISCONNECT) {
6022		if (im_seg->s_ckl != NULL) {
6023			int e;
6024			/* Setup protections for remap */
6025			maxprot = PROT_USER;
6026			if (im_seg->s_mode & RSM_PERM_READ) {
6027				maxprot |= PROT_READ;
6028			}
6029			if (im_seg->s_mode & RSM_PERM_WRITE) {
6030				maxprot |= PROT_WRITE;
6031			}
6032			hdl = im_seg->s_ckl;
6033			for (; hdl != NULL; hdl = hdl->c_next) {
6034				e = devmap_umem_remap(hdl->c_dhp, rsm_dip,
6035				    remap_cookie,
6036				    hdl->c_off, hdl->c_len,
6037				    maxprot, 0, NULL);
6038
6039				DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
6040				    "remap returns %d\n", e));
6041			}
6042		}
6043
6044		(void) rsm_closeconnection(im_seg, &shared_cookie);
6045
6046		if (shared_cookie != NULL) {
6047			/*
6048			 * inform the exporting node so this import
6049			 * can be deleted from the list of importers.
6050			 */
6051			request.rsmipc_hdr.rsmipc_type =
6052			    RSMIPC_MSG_NOTIMPORTING;
6053			request.rsmipc_key = im_seg->s_segid;
6054			request.rsmipc_segment_cookie = shared_cookie;
6055			rsmseglock_release(im_seg);
6056			(void) rsmipc_send(im_seg->s_node, &request,
6057			    RSM_NO_REPLY);
6058		} else {
6059			rsmseglock_release(im_seg);
6060		}
6061	} else {
6062		rsmseglock_release(im_seg);
6063	}
6064
6065	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmseg_unload done\n"));
6066
6067}
6068
6069/* ****************************** Importer Calls ************************ */
6070
6071static int
6072rsm_access(uid_t owner, gid_t group, int perm, int mode, const struct cred *cr)
6073{
6074	int shifts = 0;
6075
6076	if (crgetuid(cr) != owner) {
6077		shifts += 3;
6078		if (!groupmember(group, cr))
6079			shifts += 3;
6080	}
6081
6082	mode &= ~(perm << shifts);
6083
6084	if (mode == 0)
6085		return (0);
6086
6087	return (secpolicy_rsm_access(cr, owner, mode));
6088}
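/*
 * Worked example for rsm_access (illustrative octal values, assuming the
 * usual owner/group/other layout where RSM_PERM_READ is 0400 and
 * RSM_PERM_WRITE is 0200): with perm = 0604 (owner rw, other r) and a
 * requested mode of 0600 (rdwr), a caller who is neither the owner nor a
 * group member gets shifts = 6, so mode &= ~(perm << 6) clears only the
 * read bit and leaves 0200; since mode is nonzero, the request succeeds
 * only if secpolicy_rsm_access() grants the remaining write access.
 */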
6089
6090
6091static int
6092rsm_connect(rsmseg_t *seg, rsm_ioctlmsg_t *msg, cred_t *cred,
6093    intptr_t dataptr, int mode)
6094{
6095	int e;
6096	int			recheck_state = 0;
6097	void			*shared_cookie;
6098	rsmipc_request_t	request;
6099	rsmipc_reply_t		reply;
6100	rsm_permission_t	access;
6101	adapter_t		*adapter;
6102	rsm_addr_t		addr = 0;
6103	rsm_import_share_t	*sharedp;
6104	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_IMPORT);
6105
6106	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_connect enter\n"));
6107
6108	adapter = rsm_getadapter(msg, mode);
6109	if (adapter == NULL) {
6110		DBG_PRINTF((category, RSM_ERR,
6111		    "rsm_connect done:ENODEV adapter=NULL\n"));
6112		return (RSMERR_CTLR_NOT_PRESENT);
6113	}
6114
6115	if ((adapter == &loopback_adapter) && (msg->nodeid != my_nodeid)) {
6116		rsmka_release_adapter(adapter);
6117		DBG_PRINTF((category, RSM_ERR,
6118		    "rsm_connect done:ENODEV loopback\n"));
6119		return (RSMERR_CTLR_NOT_PRESENT);
6120	}
6121
6122
6123	ASSERT(seg->s_hdr.rsmrc_type == RSM_RESOURCE_IMPORT_SEGMENT);
6124	ASSERT(seg->s_state == RSM_STATE_NEW);
6125
6126	/*
6127	 * Translate perm to access
6128	 */
6129	if (msg->perm & ~RSM_PERM_RDWR) {
6130		rsmka_release_adapter(adapter);
6131		DBG_PRINTF((category, RSM_ERR,
6132		    "rsm_connect done:EINVAL invalid perms\n"));
6133		return (RSMERR_BAD_PERMS);
6134	}
6135	access = 0;
6136	if (msg->perm & RSM_PERM_READ)
6137		access |= RSM_ACCESS_READ;
6138	if (msg->perm & RSM_PERM_WRITE)
6139		access |= RSM_ACCESS_WRITE;
6140
6141	seg->s_node = msg->nodeid;
6142
6143	/*
6144	 * Adding to the import list locks the segment; release the segment
6145	 * lock so we can get the reply for the send.
6146	 */
6147	e = rsmimport_add(seg, msg->key);
6148	if (e) {
6149		rsmka_release_adapter(adapter);
6150		DBG_PRINTF((category, RSM_ERR,
6151		    "rsm_connect done:rsmimport_add failed %d\n", e));
6152		return (e);
6153	}
6154	seg->s_state = RSM_STATE_CONNECTING;
6155
6156	/*
6157	 * Set the s_adapter field here so as to have a valid comparison of
6158	 * the adapter and the s_adapter value during rsmshare_get. For
6159	 * any error, set s_adapter to NULL before doing a release_adapter
6160	 */
6161	seg->s_adapter = adapter;
6162
6163	rsmseglock_release(seg);
6164
6165	/*
6166	 * get the pointer to the shared data structure; the
6167	 * shared data is locked and refcount has been incremented
6168	 */
6169	sharedp = rsmshare_get(msg->key, msg->nodeid, adapter, seg);
6170
6171	ASSERT(rsmsharelock_held(seg));
6172
6173	do {
6174		/* flag indicates whether we need to recheck the state */
6175		recheck_state = 0;
6176		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
6177		    "rsm_connect:RSMSI_STATE=%d\n", sharedp->rsmsi_state));
6178		switch (sharedp->rsmsi_state) {
6179		case RSMSI_STATE_NEW:
6180			sharedp->rsmsi_state = RSMSI_STATE_CONNECTING;
6181			break;
6182		case RSMSI_STATE_CONNECTING:
6183			/* FALLTHRU */
6184		case RSMSI_STATE_CONN_QUIESCE:
6185			/* FALLTHRU */
6186		case RSMSI_STATE_MAP_QUIESCE:
6187			/* wait for the state to change */
6188			while ((sharedp->rsmsi_state ==
6189			    RSMSI_STATE_CONNECTING) ||
6190			    (sharedp->rsmsi_state ==
6191			    RSMSI_STATE_CONN_QUIESCE) ||
6192			    (sharedp->rsmsi_state ==
6193			    RSMSI_STATE_MAP_QUIESCE)) {
6194				if (cv_wait_sig(&sharedp->rsmsi_cv,
6195				    &sharedp->rsmsi_lock) == 0) {
6196					/* signalled - clean up and return */
6197					rsmsharelock_release(seg);
6198					rsmimport_rm(seg);
6199					seg->s_adapter = NULL;
6200					rsmka_release_adapter(adapter);
6201					seg->s_state = RSM_STATE_NEW;
6202					DBG_PRINTF((category, RSM_ERR,
6203					    "rsm_connect done: INTERRUPTED\n"));
6204					return (RSMERR_INTERRUPTED);
6205				}
6206			}
6207			/*
6208			 * the state changed, loop back and check what it is
6209			 */
6210			recheck_state = 1;
6211			break;
6212		case RSMSI_STATE_ABORT_CONNECT:
6213			/* exit the loop and clean up further down */
6214			break;
6215		case RSMSI_STATE_CONNECTED:
6216			/* already connected, good - fall through */
6217		case RSMSI_STATE_MAPPED:
6218			/* already mapped, wow - fall through */
6219			/* access validation etc is done further down */
6220			break;
6221		case RSMSI_STATE_DISCONNECTED:
6222			/* disconnected - so reconnect now */
6223			sharedp->rsmsi_state = RSMSI_STATE_CONNECTING;
6224			break;
6225		default:
6226			ASSERT(0); /* Invalid State */
6227		}
6228	} while (recheck_state);
6229
6230	if (sharedp->rsmsi_state == RSMSI_STATE_CONNECTING) {
6231		/* we are the first to connect */
6232		rsmsharelock_release(seg);
6233
6234		if (msg->nodeid != my_nodeid) {
6235			addr = get_remote_hwaddr(adapter, msg->nodeid);
6236
6237			if ((int64_t)addr < 0) {
6238				rsmsharelock_acquire(seg);
6239				rsmsharecv_signal(seg, RSMSI_STATE_CONNECTING,
6240				    RSMSI_STATE_NEW);
6241				rsmsharelock_release(seg);
6242				rsmimport_rm(seg);
6243				seg->s_adapter = NULL;
6244				rsmka_release_adapter(adapter);
6245				seg->s_state = RSM_STATE_NEW;
6246				DBG_PRINTF((category, RSM_ERR,
6247				    "rsm_connect done: hwaddr<0\n"));
6248				return (RSMERR_INTERNAL_ERROR);
6249			}
6250		} else {
6251			addr = adapter->hwaddr;
6252		}
6253
6254		/*
6255		 * send request to node [src, dest, key, msgid] and get back
6256		 * [status, msgid, cookie]
6257		 */
6258		request.rsmipc_key = msg->key;
6259		/*
6260		 * we need the s_mode of the exporter so pass
6261		 * RSM_ACCESS_TRUSTED
6262		 */
6263		request.rsmipc_perm = RSM_ACCESS_TRUSTED;
6264		request.rsmipc_hdr.rsmipc_type = RSMIPC_MSG_SEGCONNECT;
6265		request.rsmipc_adapter_hwaddr = addr;
6266		request.rsmipc_segment_cookie = sharedp;
6267
6268		e = (int)rsmipc_send(msg->nodeid, &request, &reply);
6269		if (e) {
6270			rsmsharelock_acquire(seg);
6271			rsmsharecv_signal(seg, RSMSI_STATE_CONNECTING,
6272			    RSMSI_STATE_NEW);
6273			rsmsharelock_release(seg);
6274			rsmimport_rm(seg);
6275			seg->s_adapter = NULL;
6276			rsmka_release_adapter(adapter);
6277			seg->s_state = RSM_STATE_NEW;
6278			DBG_PRINTF((category, RSM_ERR,
6279			    "rsm_connect done:rsmipc_send failed %d\n", e));
6280			return (e);
6281		}
6282
6283		if (reply.rsmipc_status != RSM_SUCCESS) {
6284			rsmsharelock_acquire(seg);
6285			rsmsharecv_signal(seg, RSMSI_STATE_CONNECTING,
6286			    RSMSI_STATE_NEW);
6287			rsmsharelock_release(seg);
6288			rsmimport_rm(seg);
6289			seg->s_adapter = NULL;
6290			rsmka_release_adapter(adapter);
6291			seg->s_state = RSM_STATE_NEW;
6292			DBG_PRINTF((category, RSM_ERR,
6293			    "rsm_connect done:rsmipc_send reply err %d\n",
6294			    reply.rsmipc_status));
6295			return (reply.rsmipc_status);
6296		}
6297
6298		rsmsharelock_acquire(seg);
6299		/* store the information recvd into the shared data struct */
6300		sharedp->rsmsi_mode = reply.rsmipc_mode;
6301		sharedp->rsmsi_uid = reply.rsmipc_uid;
6302		sharedp->rsmsi_gid = reply.rsmipc_gid;
6303		sharedp->rsmsi_seglen = reply.rsmipc_seglen;
6304		sharedp->rsmsi_cookie = sharedp;
6305	}
6306
6307	rsmsharelock_release(seg);
6308
6309	/*
6310	 * Get the segment lock and check for a force disconnect
6311	 * from the export side which would have changed the state
6312	 * back to RSM_STATE_NEW. Once the segment lock is acquired a
6313	 * force disconnect will be held off until the connection
6314	 * has completed.
6315	 */
6316	rsmseglock_acquire(seg);
6317	rsmsharelock_acquire(seg);
6318	ASSERT(seg->s_state == RSM_STATE_CONNECTING ||
6319	    seg->s_state == RSM_STATE_ABORT_CONNECT);
6320
6321	shared_cookie = sharedp->rsmsi_cookie;
6322
6323	if ((seg->s_state == RSM_STATE_ABORT_CONNECT) ||
6324	    (sharedp->rsmsi_state == RSMSI_STATE_ABORT_CONNECT)) {
6325		seg->s_state = RSM_STATE_NEW;
6326		seg->s_adapter = NULL;
6327		rsmsharelock_release(seg);
6328		rsmseglock_release(seg);
6329		rsmimport_rm(seg);
6330		rsmka_release_adapter(adapter);
6331
6332		rsmsharelock_acquire(seg);
6333		if (!(sharedp->rsmsi_flags & RSMSI_FLAGS_ABORTDONE)) {
6334			/*
6335			 * set a flag indicating abort handling has been
6336			 * done
6337			 */
6338			sharedp->rsmsi_flags |= RSMSI_FLAGS_ABORTDONE;
6339			rsmsharelock_release(seg);
6340			/* send a message to exporter - only once */
6341			(void) rsm_send_notimporting(msg->nodeid,
6342			    msg->key, shared_cookie);
6343			rsmsharelock_acquire(seg);
6344			/*
6345			 * wake up any waiting importers and inform them that
6346			 * the connection has been aborted
6347			 */
6348			cv_broadcast(&sharedp->rsmsi_cv);
6349		}
6350		rsmsharelock_release(seg);
6351
6352		DBG_PRINTF((category, RSM_ERR,
6353		    "rsm_connect done: RSM_STATE_ABORT_CONNECT\n"));
6354		return (RSMERR_INTERRUPTED);
6355	}
6356
6357
6358	/*
6359	 * We need to verify that this process has access
6360	 */
6361	e = rsm_access(sharedp->rsmsi_uid, sharedp->rsmsi_gid,
6362	    access & sharedp->rsmsi_mode,
6363	    (int)(msg->perm & RSM_PERM_RDWR), cred);
6364	if (e) {
6365		rsmsharelock_release(seg);
6366		seg->s_state = RSM_STATE_NEW;
6367		seg->s_adapter = NULL;
6368		rsmseglock_release(seg);
6369		rsmimport_rm(seg);
6370		rsmka_release_adapter(adapter);
6371		/*
6372		 * No need to lock the segment; it has been removed
6373		 * from the hash table
6374		 */
6375		rsmsharelock_acquire(seg);
6376		if (sharedp->rsmsi_state == RSMSI_STATE_CONNECTING) {
6377			rsmsharelock_release(seg);
6378			/* this is the first importer */
6379
6380			(void) rsm_send_notimporting(msg->nodeid, msg->key,
6381			    shared_cookie);
6382			rsmsharelock_acquire(seg);
6383			sharedp->rsmsi_state = RSMSI_STATE_NEW;
6384			cv_broadcast(&sharedp->rsmsi_cv);
6385		}
6386		rsmsharelock_release(seg);
6387
6388		DBG_PRINTF((category, RSM_ERR,
6389		    "rsm_connect done: ipcaccess failed\n"));
6390		return (RSMERR_PERM_DENIED);
6391	}
6392
6393	/* update state and cookie */
6394	seg->s_segid = sharedp->rsmsi_segid;
6395	seg->s_len = sharedp->rsmsi_seglen;
6396	seg->s_mode = access & sharedp->rsmsi_mode;
6397	seg->s_pid = ddi_get_pid();
6398	seg->s_mapinfo = NULL;
6399
6400	if (seg->s_node != my_nodeid) {
6401		if (sharedp->rsmsi_state == RSMSI_STATE_CONNECTING) {
6402			e = adapter->rsmpi_ops->rsm_connect(
6403			    adapter->rsmpi_handle,
6404			    addr, seg->s_segid, &sharedp->rsmsi_handle);
6405
6406			if (e != RSM_SUCCESS) {
6407				seg->s_state = RSM_STATE_NEW;
6408				seg->s_adapter = NULL;
6409				rsmsharelock_release(seg);
6410				rsmseglock_release(seg);
6411				rsmimport_rm(seg);
6412				rsmka_release_adapter(adapter);
6413				/*
6414				 *  inform the exporter to delete this importer
6415				 */
6416				(void) rsm_send_notimporting(msg->nodeid,
6417				    msg->key, shared_cookie);
6418
6419				/*
6420				 * Now inform any waiting importers to
6421				 * retry the connect. This needs to be done
6422				 * after sending notimporting so that
6423				 * the notimporting is sent before a waiting
6424				 * importer sends a segconnect while retrying.
6425				 *
6426				 * No need to lock the segment; it has been
6427				 * removed from the hash table
6428				 */
6429
6430				rsmsharelock_acquire(seg);
6431				sharedp->rsmsi_state = RSMSI_STATE_NEW;
6432				cv_broadcast(&sharedp->rsmsi_cv);
6433				rsmsharelock_release(seg);
6434
6435				DBG_PRINTF((category, RSM_ERR,
6436				    "rsm_connect error %d\n", e));
6437				if (e == RSMERR_SEG_NOT_PUBLISHED_TO_RSM_ADDR)
6438					return (
6439					    RSMERR_SEG_NOT_PUBLISHED_TO_NODE);
6440				else if ((e == RSMERR_RSM_ADDR_UNREACHABLE) ||
6441				    (e == RSMERR_UNKNOWN_RSM_ADDR))
6442					return (RSMERR_REMOTE_NODE_UNREACHABLE);
6443				else
6444					return (e);
6445			}
6446
6447		}
6448		seg->s_handle.in = sharedp->rsmsi_handle;
6449
6450	}
6451
6452	seg->s_state = RSM_STATE_CONNECT;
6453
6454
6455	seg->s_flags &= ~RSM_IMPORT_DUMMY;	/* clear dummy flag */
6456	if (bar_va) {
6457		/* increment generation number on barrier page */
6458		atomic_add_16(bar_va + seg->s_hdr.rsmrc_num, 1);
6459		/* return user off into barrier page where status will be */
6460		msg->off = (int)seg->s_hdr.rsmrc_num;
6461		msg->gnum = bar_va[msg->off]; 	/* gnum race */
6462	} else {
6463		msg->off = 0;
6464		msg->gnum = 0;	/* gnum race */
6465	}
6466
6467	msg->len = (int)sharedp->rsmsi_seglen;
6468	msg->rnum = seg->s_minor;
6469	rsmsharecv_signal(seg, RSMSI_STATE_CONNECTING, RSMSI_STATE_CONNECTED);
6470	rsmsharelock_release(seg);
6471	rsmseglock_release(seg);
6472
6473	/* Return the segment size & perm to the user in case they're needed */
6474
6475#ifdef _MULTI_DATAMODEL
6476	if ((mode & DATAMODEL_MASK) == DATAMODEL_ILP32) {
6477		rsm_ioctlmsg32_t msg32;
6478
6479		if (msg->len > UINT_MAX)
6480			msg32.len = RSM_MAXSZ_PAGE_ALIGNED;
6481		else
6482			msg32.len = msg->len;
6483		msg32.off = msg->off;
6484		msg32.perm = msg->perm;
6485		msg32.gnum = msg->gnum;
6486		msg32.rnum = msg->rnum;
6487
6488		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
6489		    "rsm_connect done\n"));
6490
6491		if (ddi_copyout((caddr_t)&msg32, (caddr_t)dataptr,
6492		    sizeof (msg32), mode))
6493			return (RSMERR_BAD_ADDR);
6494		else
6495			return (RSM_SUCCESS);
6496	}
6497#endif
6498	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_connect done\n"));
6499
6500	if (ddi_copyout((caddr_t)msg, (caddr_t)dataptr, sizeof (*msg),
6501	    mode))
6502		return (RSMERR_BAD_ADDR);
6503	else
6504		return (RSM_SUCCESS);
6505}
6506
6507static int
6508rsm_unmap(rsmseg_t *seg)
6509{
6510	int			err;
6511	adapter_t		*adapter;
6512	rsm_import_share_t	*sharedp;
6513	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_IMPORT);
6514
6515	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
6516	    "rsm_unmap enter %u\n", seg->s_segid));
6517
6518	ASSERT(seg->s_hdr.rsmrc_type == RSM_RESOURCE_IMPORT_SEGMENT);
6519
6520	/* assert seg is locked */
6521	ASSERT(rsmseglock_held(seg));
6522	ASSERT(seg->s_state != RSM_STATE_MAPPING);
6523
6524	if ((seg->s_state != RSM_STATE_ACTIVE) &&
6525	    (seg->s_state != RSM_STATE_MAP_QUIESCE)) {
6526		/* segment unmap has already been done */
6527		DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_unmap done\n"));
6528		return (RSM_SUCCESS);
6529	}
6530
6531	sharedp = seg->s_share;
6532
6533	rsmsharelock_acquire(seg);
6534
6535	/*
6536	 * the shared data struct must be in MAPPED or MAP_QUIESCE state
6537	 */
6538
6539	ASSERT(sharedp->rsmsi_state == RSMSI_STATE_MAPPED ||
6540	    sharedp->rsmsi_state == RSMSI_STATE_MAP_QUIESCE);
6541
6542	/*
6543	 * Unmap pages - previously rsm_memseg_import_unmap was called only if
6544	 * the segment cookie list was NULL; but it is always NULL when
6545	 * called from rsmmap_unmap and won't be NULL when called for
6546	 * a force disconnect - so the check for NULL cookie list was removed
6547	 */
6548
6549	ASSERT(sharedp->rsmsi_mapcnt > 0);
6550
6551	sharedp->rsmsi_mapcnt--;
6552
6553	if (sharedp->rsmsi_mapcnt == 0) {
6554		if (sharedp->rsmsi_state == RSMSI_STATE_MAPPED) {
6555			/* unmap the shared RSMPI mapping */
6556			adapter = seg->s_adapter;
6557			if (seg->s_node != my_nodeid) {
6558				ASSERT(sharedp->rsmsi_handle != NULL);
6559				err = adapter->rsmpi_ops->
6560				    rsm_unmap(sharedp->rsmsi_handle);
6561				DBG_PRINTF((category, RSM_DEBUG,
6562				    "rsm_unmap: rsmpi unmap %d\n", err));
6563				rsm_free_mapinfo(sharedp->rsmsi_mapinfo);
6564				sharedp->rsmsi_mapinfo = NULL;
6565			}
6566			sharedp->rsmsi_state = RSMSI_STATE_CONNECTED;
6567		} else { /* MAP_QUIESCE --munmap()--> CONN_QUIESCE */
6568			sharedp->rsmsi_state = RSMSI_STATE_CONN_QUIESCE;
6569		}
6570	}
6571
6572	rsmsharelock_release(seg);
6573
6574	/*
6575	 * The s_cookie field is used to store the cookie returned from the
6576	 * ddi_umem_lock when binding the pages for an export segment. This
6577	 * is the primary use of the s_cookie field and does not normally
6578	 * pertain to any importing segment except in the loopback case.
6579	 * For the loopback case, the import segment and export segment are
6580	 * on the same node, the s_cookie field of the segment structure for
6581	 * the importer is initialized to the s_cookie field in the exported
6582	 * segment during the map operation and is used during the call to
6583	 * devmap_umem_setup for the import mapping.
6584	 * Thus, during unmap, we simply need to set s_cookie to NULL to
6585	 * indicate that the mapping no longer exists.
6586	 */
6587	seg->s_cookie = NULL;
6588
6589	seg->s_mapinfo = NULL;
6590
6591	if (seg->s_state == RSM_STATE_ACTIVE)
6592		seg->s_state = RSM_STATE_CONNECT;
6593	else
6594		seg->s_state = RSM_STATE_CONN_QUIESCE;
6595
6596	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_unmap done\n"));
6597
6598	return (RSM_SUCCESS);
6599}
6600
6601/*
6602 * The cookie returned here, if not NULL, indicates that this is the
6603 * last importer; the cookie can be used in the RSMIPC_MSG_NOTIMPORTING
6604 * message.
6605 */
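/*
 * Typical calling pattern (a sketch; rsm_disconnect below follows it):
 *
 *	rsmseglock_acquire(seg);
 *	(void) rsm_closeconnection(seg, &shared_cookie);
 *	if (shared_cookie != NULL) {
 *		... last importer - send an RSMIPC_MSG_NOTIMPORTING
 *		... request carrying shared_cookie to the exporting node
 *	}
 */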
6606static int
6607rsm_closeconnection(rsmseg_t *seg, void **cookie)
6608{
6609	int			e;
6610	adapter_t		*adapter;
6611	rsm_import_share_t	*sharedp;
6612	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_IMPORT);
6613
6614	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
6615	    "rsm_closeconnection enter\n"));
6616
6617	*cookie = (void *)NULL;
6618
6619	ASSERT(seg->s_hdr.rsmrc_type == RSM_RESOURCE_IMPORT_SEGMENT);
6620
6621	/* assert seg is locked */
6622	ASSERT(rsmseglock_held(seg));
6623
6624	if (seg->s_state == RSM_STATE_DISCONNECT) {
6625		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
6626		    "rsm_closeconnection done: already disconnected\n"));
6627		return (RSM_SUCCESS);
6628	}
6629
6630	/* wait for all putv/getv ops to get done */
6631	while (seg->s_rdmacnt > 0) {
6632		cv_wait(&seg->s_cv, &seg->s_lock);
6633	}
6634
6635	(void) rsm_unmap(seg);
6636
6637	ASSERT(seg->s_state == RSM_STATE_CONNECT ||
6638	    seg->s_state == RSM_STATE_CONN_QUIESCE);
6639
6640	adapter = seg->s_adapter;
6641	sharedp = seg->s_share;
6642
6643	ASSERT(sharedp != NULL);
6644
6645	rsmsharelock_acquire(seg);
6646
6647	/*
6648	 * Disconnect on adapter
6649	 *
6650	 * The current algorithm is stateless: we don't have to contact the
6651	 * server when we go away; it only gives out permissions. Of course,
6652	 * the adapters will talk to terminate the connection.
6653	 *
6654	 * disconnect is needed only if we are CONNECTED, not in CONN_QUIESCE
6655	 */
6656	if ((sharedp->rsmsi_state == RSMSI_STATE_CONNECTED) &&
6657	    (sharedp->rsmsi_node != my_nodeid)) {
6658
6659		if (sharedp->rsmsi_refcnt == 1) {
6660			/* this is the last importer */
6661			ASSERT(sharedp->rsmsi_mapcnt == 0);
6662
6663			e = adapter->rsmpi_ops->
6664			    rsm_disconnect(sharedp->rsmsi_handle);
6665			if (e != RSM_SUCCESS) {
6666				DBG_PRINTF((category, RSM_DEBUG,
6667				    "rsm:disconnect failed seg=%x:err=%d\n",
6668				    seg->s_key, e));
6669			}
6670		}
6671	}
6672
6673	seg->s_handle.in = NULL;
6674
6675	sharedp->rsmsi_refcnt--;
6676
6677	if (sharedp->rsmsi_refcnt == 0) {
6678		*cookie = (void *)sharedp->rsmsi_cookie;
6679		sharedp->rsmsi_state = RSMSI_STATE_DISCONNECTED;
6680		sharedp->rsmsi_handle = NULL;
6681		rsmsharelock_release(seg);
6682
6683		/* clean up the shared data structure */
6684		mutex_destroy(&sharedp->rsmsi_lock);
6685		cv_destroy(&sharedp->rsmsi_cv);
6686		kmem_free((void *)(sharedp), sizeof (rsm_import_share_t));
6687
6688	} else {
6689		rsmsharelock_release(seg);
6690	}
6691
6692	/* increment generation number on barrier page */
6693	if (bar_va) {
6694		atomic_add_16(bar_va + seg->s_hdr.rsmrc_num, 1);
6695	}
6696
6697	/*
6698	 * The following needs to be done after any
6699	 * rsmsharelock calls which use seg->s_share.
6700	 */
6701	seg->s_share = NULL;
6702
6703	seg->s_state = RSM_STATE_DISCONNECT;
6704	/* signal anyone waiting in the CONN_QUIESCE state */
6705	cv_broadcast(&seg->s_cv);
6706
6707	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
6708	    "rsm_closeconnection done\n"));
6709
6710	return (RSM_SUCCESS);
6711}
6712
6713int
6714rsm_disconnect(rsmseg_t *seg)
6715{
6716	rsmipc_request_t	request;
6717	void			*shared_cookie;
6718	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_IMPORT);
6719
6720	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_disconnect enter\n"));
6721
6722	ASSERT(seg->s_hdr.rsmrc_type == RSM_RESOURCE_IMPORT_SEGMENT);
6723
6724	/* assert seg isn't locked */
6725	ASSERT(!rsmseglock_held(seg));
6726
6727
6728	/* Remove segment from imported list */
6729	rsmimport_rm(seg);
6730
6731	/* acquire the segment */
6732	rsmseglock_acquire(seg);
6733
6734	/* wait until segment leaves the mapping state */
6735	while (seg->s_state == RSM_STATE_MAPPING)
6736		cv_wait(&seg->s_cv, &seg->s_lock);
6737
6738	if (seg->s_state == RSM_STATE_DISCONNECT) {
6739		seg->s_state = RSM_STATE_NEW;
6740		rsmseglock_release(seg);
6741		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
6742		    "rsm_disconnect done: already disconnected\n"));
6743		return (RSM_SUCCESS);
6744	}
6745
6746	(void) rsm_closeconnection(seg, &shared_cookie);
6747
6748	/* update state */
6749	seg->s_state = RSM_STATE_NEW;
6750
6751	if (shared_cookie != NULL) {
6752		/*
6753		 *  This is the last importer so inform the exporting node
6754		 *  so this import can be deleted from the list of importers.
6755		 */
6756		request.rsmipc_hdr.rsmipc_type = RSMIPC_MSG_NOTIMPORTING;
6757		request.rsmipc_key = seg->s_segid;
6758		request.rsmipc_segment_cookie = shared_cookie;
6759		rsmseglock_release(seg);
6760		(void) rsmipc_send(seg->s_node, &request, RSM_NO_REPLY);
6761	} else {
6762		rsmseglock_release(seg);
6763	}
6764
6765	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_disconnect done\n"));
6766
6767	return (DDI_SUCCESS);
6768}
6769
6770/*ARGSUSED*/
6771static int
6772rsm_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
6773    struct pollhead **phpp)
6774{
6775	minor_t		rnum;
6776	rsmresource_t	*res;
6777	rsmseg_t 	*seg;
6778	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_DDI);
6779
6780	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_chpoll enter\n"));
6781
6782	/* find minor, no lock */
6783	rnum = getminor(dev);
6784	res = rsmresource_lookup(rnum, RSM_NOLOCK);
6785
6786	/* poll is supported only for export/import segments */
6787	if ((res == NULL) || (res == RSMRC_RESERVED) ||
6788	    (res->rsmrc_type == RSM_RESOURCE_BAR)) {
6789		return (ENXIO);
6790	}
6791
6792	*reventsp = 0;
6793
6794	/*
6795	 * An exported segment must be in state RSM_STATE_EXPORT; an
6796	 * imported segment must be in state RSM_STATE_ACTIVE.
6797	 */
6798	seg = (rsmseg_t *)res;
6799
6800	if (seg->s_pollevent) {
6801		*reventsp = POLLRDNORM;
6802	} else if (!anyyet) {
6803		/* cannot take segment lock here */
6804		*phpp = &seg->s_poll;
6805		seg->s_pollflag |= RSM_SEGMENT_POLL;
6806	}
6807	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_chpoll done\n"));
6808	return (0);
6809}
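/*
 * From user level this poll support is reached via the RSMAPI event
 * interfaces. A hedged sketch, assuming librsm's rsm_memseg_get_pollfd(3RSM)
 * is used to obtain a pollable descriptor for an imported segment:
 *
 *	int fd;
 *	struct pollfd pfd;
 *
 *	(void) rsm_memseg_get_pollfd(im_memseg, &fd);
 *	pfd.fd = fd;
 *	pfd.events = POLLRDNORM;
 *	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLRDNORM)) {
 *		... a signal post arrived on the segment ...
 *	}
 */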
6810
6811
6812
6813/* ************************* IOCTL Commands ********************* */
6814
6815static rsmseg_t *
6816rsmresource_seg(rsmresource_t *res, minor_t rnum, cred_t *credp,
6817    rsm_resource_type_t type)
6818{
6819	/* get segment from resource handle */
6820	rsmseg_t *seg;
6821	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_IOCTL);
6822
6823	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmresource_seg enter\n"));
6824
6825
6826	if (res != RSMRC_RESERVED) {
6827		seg = (rsmseg_t *)res;
6828	} else {
6829		/* Allocate segment now and bind it */
6830		seg = rsmseg_alloc(rnum, credp);
6831
6832		/*
6833		 * if DR pre-processing is going on or DR is in progress
6834		 * then the new export segments should be in the NEW_QSCD state
6835		 */
6836		if (type == RSM_RESOURCE_EXPORT_SEGMENT) {
6837			mutex_enter(&rsm_drv_data.drv_lock);
6838			if ((rsm_drv_data.drv_state ==
6839			    RSM_DRV_PREDEL_STARTED) ||
6840			    (rsm_drv_data.drv_state ==
6841			    RSM_DRV_PREDEL_COMPLETED) ||
6842			    (rsm_drv_data.drv_state ==
6843			    RSM_DRV_DR_IN_PROGRESS)) {
6844				seg->s_state = RSM_STATE_NEW_QUIESCED;
6845			}
6846			mutex_exit(&rsm_drv_data.drv_lock);
6847		}
6848
6849		rsmresource_insert(rnum, (rsmresource_t *)seg, type);
6850	}
6851
6852	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmresource_seg done\n"));
6853
6854	return (seg);
6855}
6856
6857static int
6858rsmexport_ioctl(rsmseg_t *seg, rsm_ioctlmsg_t *msg, int cmd, intptr_t arg,
6859    int mode, cred_t *credp)
6860{
6861	int error;
6862	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT | RSM_IOCTL);
6863
6864	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmexport_ioctl enter\n"));
6865
6866	arg = arg;
6867	credp = credp;
6868
6869	ASSERT(seg != NULL);
6870
6871	switch (cmd) {
6872	case RSM_IOCTL_BIND:
6873		error = rsm_bind(seg, msg, arg, mode);
6874		break;
6875	case RSM_IOCTL_REBIND:
6876		error = rsm_rebind(seg, msg);
6877		break;
6878	case RSM_IOCTL_UNBIND:
6879		error = ENOTSUP;
6880		break;
6881	case RSM_IOCTL_PUBLISH:
6882		error = rsm_publish(seg, msg, arg, mode);
6883		break;
6884	case RSM_IOCTL_REPUBLISH:
6885		error = rsm_republish(seg, msg, mode);
6886		break;
6887	case RSM_IOCTL_UNPUBLISH:
6888		error = rsm_unpublish(seg, 1);
6889		break;
6890	default:
6891		error = EINVAL;
6892		break;
6893	}
6894
6895	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmexport_ioctl done: %d\n",
6896	    error));
6897
6898	return (error);
6899}
6900static int
6901rsmimport_ioctl(rsmseg_t *seg, rsm_ioctlmsg_t *msg, int cmd, intptr_t arg,
6902    int mode, cred_t *credp)
6903{
6904	int error;
6905	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_IMPORT | RSM_IOCTL);
6906
6907	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmimport_ioctl enter\n"));
6908
6909	ASSERT(seg);
6910
6911	switch (cmd) {
6912	case RSM_IOCTL_CONNECT:
6913		error = rsm_connect(seg, msg, credp, arg, mode);
6914		break;
6915	default:
6916		error = EINVAL;
6917		break;
6918	}
6919
6920	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmimport_ioctl done: %d\n",
6921	    error));
6922	return (error);
6923}
6924
6925static int
6926rsmbar_ioctl(rsmseg_t *seg, rsm_ioctlmsg_t *msg, int cmd, intptr_t arg,
6927    int mode)
6928{
6929	int e;
6930	adapter_t *adapter;
6931	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_IMPORT | RSM_IOCTL);
6932
6933	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmbar_ioctl enter\n"));
6934
6935
6936	if ((seg->s_flags & RSM_IMPORT_DUMMY) != 0) {
6937		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
6938		    "rsmbar_ioctl done: RSM_IMPORT_DUMMY\n"));
6939		return (RSMERR_CONN_ABORTED);
6940	} else if (seg->s_node == my_nodeid) {
6941		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
6942		    "rsmbar_ioctl done: loopback\n"));
6943		return (RSM_SUCCESS);
6944	}
6945
6946	adapter = seg->s_adapter;
6947
6948	switch (cmd) {
6949	case RSM_IOCTL_BAR_CHECK:
6950		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
6951		    "rsmbar_ioctl done: RSM_BAR_CHECK %d\n", bar_va));
6952		return (bar_va ? RSM_SUCCESS : EINVAL);
6953	case RSM_IOCTL_BAR_OPEN:
6954		e = adapter->rsmpi_ops->
6955		    rsm_open_barrier_ctrl(adapter->rsmpi_handle, &msg->bar);
6956		break;
6957	case RSM_IOCTL_BAR_ORDER:
6958		e = adapter->rsmpi_ops->rsm_order_barrier(&msg->bar);
6959		break;
6960	case RSM_IOCTL_BAR_CLOSE:
6961		e = adapter->rsmpi_ops->rsm_close_barrier(&msg->bar);
6962		break;
6963	default:
6964		e = EINVAL;
6965		break;
6966	}
6967
6968	if (e == RSM_SUCCESS) {
6969#ifdef _MULTI_DATAMODEL
6970		if ((mode & DATAMODEL_MASK) == DATAMODEL_ILP32) {
6971			rsm_ioctlmsg32_t msg32;
6972			int i;
6973
6974			for (i = 0; i < 4; i++) {
6975				msg32.bar.comp[i].u64 = msg->bar.comp[i].u64;
6976			}
6977
6978			DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
6979			    "rsmbar_ioctl done\n"));
6980			if (ddi_copyout((caddr_t)&msg32, (caddr_t)arg,
6981			    sizeof (msg32), mode))
6982				return (RSMERR_BAD_ADDR);
6983			else
6984				return (RSM_SUCCESS);
6985		}
6986#endif
6987		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
6988		    "rsmbar_ioctl done\n"));
6989		if (ddi_copyout((caddr_t)&msg->bar, (caddr_t)arg,
6990		    sizeof (*msg), mode))
6991			return (RSMERR_BAD_ADDR);
6992		else
6993			return (RSM_SUCCESS);
6994	}
6995
6996	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
6997	    "rsmbar_ioctl done: error=%d\n", e));
6998
6999	return (e);
7000}
7001
7002/*
7003 * Ring the doorbell of the export segment to which this segment is
7004 * connected.
7005 */
7006static int
7007exportbell_ioctl(rsmseg_t *seg, int cmd /*ARGSUSED*/)
7008{
7009	int e = 0;
7010	rsmipc_request_t request;
7011
7012	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_IMPORT | RSM_IOCTL);
7013
7014	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "exportbell_ioctl enter\n"));
7015
7016	request.rsmipc_key = seg->s_segid;
7017	request.rsmipc_hdr.rsmipc_type = RSMIPC_MSG_BELL;
7018	request.rsmipc_segment_cookie = NULL;
7019	e = rsmipc_send(seg->s_node, &request, RSM_NO_REPLY);
7020
7021	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7022	    "exportbell_ioctl done: %d\n", e));
7023
7024	return (e);
7025}
7026
7027/*
7028 * Ring the doorbells of all segments importing this segment
7029 */
7030static int
7031importbell_ioctl(rsmseg_t *seg, int cmd /*ARGSUSED*/)
7032{
7033	importing_token_t	*token = NULL;
7034	rsmipc_request_t	request;
7035	int			index;
7036
7037	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT | RSM_IOCTL);
7038
7039	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "importbell_ioctl enter\n"));
7040
7041	ASSERT(seg->s_state != RSM_STATE_NEW &&
7042	    seg->s_state != RSM_STATE_NEW_QUIESCED);
7043
7044	request.rsmipc_key = seg->s_segid;
7045	request.rsmipc_hdr.rsmipc_type = RSMIPC_MSG_BELL;
7046
7047	index = rsmhash(seg->s_segid);
7048
7049	token = importer_list.bucket[index];
7050
7051	while (token != NULL) {
7052		if (seg->s_key == token->key) {
7053			request.rsmipc_segment_cookie =
7054			    token->import_segment_cookie;
7055			(void) rsmipc_send(token->importing_node,
7056			    &request, RSM_NO_REPLY);
7057		}
7058		token = token->next;
7059	}
7060
7061	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7062	    "importbell_ioctl done\n"));
7063	return (RSM_SUCCESS);
7064}
7065
7066static int
7067rsm_consumeevent_copyin(caddr_t arg, rsm_consume_event_msg_t *msgp,
7068    rsm_poll_event_t **eventspp, int mode)
7069{
7070	rsm_poll_event_t	*evlist = NULL;
7071	size_t			evlistsz;
7072	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_IOCTL);
7073
7074#ifdef _MULTI_DATAMODEL
7075	if ((mode & DATAMODEL_MASK) == DATAMODEL_ILP32) {
7076		int i;
7077		rsm_consume_event_msg32_t cemsg32 = {0};
7078		rsm_poll_event32_t	event32[RSM_MAX_POLLFDS];
7079		rsm_poll_event32_t	*evlist32;
7080		size_t			evlistsz32;
7081
7082		/* copyin the ioctl message */
7083		if (ddi_copyin(arg, (caddr_t)&cemsg32,
7084		    sizeof (rsm_consume_event_msg32_t), mode)) {
7085			DBG_PRINTF((category, RSM_ERR,
7086			    "consumeevent_copyin msgp: RSMERR_BAD_ADDR\n"));
7087			return (RSMERR_BAD_ADDR);
7088		}
7089		msgp->seglist = (caddr_t)(uintptr_t)cemsg32.seglist;
7090		msgp->numents = (int)cemsg32.numents;
7091
7092		evlistsz32 = sizeof (rsm_poll_event32_t) * msgp->numents;
7093		/*
7094		 * If numents is large, allocate the events list on the heap;
7095		 * otherwise use the address of the array that was passed in.
7096		 */
7097		if (msgp->numents > RSM_MAX_POLLFDS) {
7098			if (msgp->numents > max_segs) { /* validate numents */
7099				DBG_PRINTF((category, RSM_ERR,
7100				    "consumeevent_copyin: "
7101				    "RSMERR_BAD_ARGS_ERRORS\n"));
7102				return (RSMERR_BAD_ARGS_ERRORS);
7103			}
7104			evlist32 = kmem_zalloc(evlistsz32, KM_SLEEP);
7105		} else {
7106			evlist32 = event32;
7107		}
7108
7109		/* copyin the seglist into the rsm_poll_event32_t array */
7110		if (ddi_copyin((caddr_t)msgp->seglist, (caddr_t)evlist32,
7111		    evlistsz32, mode)) {
7112			if ((msgp->numents > RSM_MAX_POLLFDS) && evlist32) {
7113				kmem_free(evlist32, evlistsz32);
7114			}
7115			DBG_PRINTF((category, RSM_ERR,
7116			    "consumeevent_copyin evlist: RSMERR_BAD_ADDR\n"));
7117			return (RSMERR_BAD_ADDR);
7118		}
7119
7120		/* evlist and evlistsz are based on rsm_poll_event_t type */
7121		evlistsz = sizeof (rsm_poll_event_t)* msgp->numents;
7122
7123		if (msgp->numents > RSM_MAX_POLLFDS) {
7124			evlist = kmem_zalloc(evlistsz, KM_SLEEP);
7125			*eventspp = evlist;
7126		} else {
7127			evlist = *eventspp;
7128		}
7129		/*
7130		 * copy the rsm_poll_event32_t array to the rsm_poll_event_t
7131		 * array
7132		 */
7133		for (i = 0; i < msgp->numents; i++) {
7134			evlist[i].rnum = evlist32[i].rnum;
7135			evlist[i].fdsidx = evlist32[i].fdsidx;
7136			evlist[i].revent = evlist32[i].revent;
7137		}
7138		/* free the temp 32-bit event list */
7139		if ((msgp->numents > RSM_MAX_POLLFDS) && evlist32) {
7140			kmem_free(evlist32, evlistsz32);
7141		}
7142
7143		return (RSM_SUCCESS);
7144	}
7145#endif
7146	/* copyin the ioctl message */
7147	if (ddi_copyin(arg, (caddr_t)msgp, sizeof (rsm_consume_event_msg_t),
7148	    mode)) {
7149		DBG_PRINTF((category, RSM_ERR,
7150		    "consumeevent_copyin msgp: RSMERR_BAD_ADDR\n"));
7151		return (RSMERR_BAD_ADDR);
7152	}
7153	/*
7154	 * If numents is large, allocate the events list on the heap;
7155	 * otherwise use the address of the array that was passed in.
7156	 */
7157	if (msgp->numents > RSM_MAX_POLLFDS) {
7158		if (msgp->numents > max_segs) { /* validate numents */
7159			DBG_PRINTF((category, RSM_ERR,
7160			    "consumeevent_copyin: RSMERR_BAD_ARGS_ERRORS\n"));
7161			return (RSMERR_BAD_ARGS_ERRORS);
7162		}
7163		evlistsz = sizeof (rsm_poll_event_t)*msgp->numents;
7164		evlist = kmem_zalloc(evlistsz, KM_SLEEP);
7165		*eventspp  = evlist;
7166	}
7167
7168	/* copyin the seglist */
7169	if (ddi_copyin((caddr_t)msgp->seglist, (caddr_t)(*eventspp),
7170	    sizeof (rsm_poll_event_t)*msgp->numents, mode)) {
7171		if (evlist) {
7172			kmem_free(evlist, evlistsz);
7173			*eventspp = NULL;
7174		}
7175		DBG_PRINTF((category, RSM_ERR,
7176		    "consumeevent_copyin evlist: RSMERR_BAD_ADDR\n"));
7177		return (RSMERR_BAD_ADDR);
7178	}
7179
7180	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7181	    "consumeevent_copyin done\n"));
7182	return (RSM_SUCCESS);
7183}
7184
7185static int
7186rsm_consumeevent_copyout(rsm_consume_event_msg_t *msgp,
7187    rsm_poll_event_t *eventsp, int mode)
7188{
7189	size_t			evlistsz;
7190	int			err = RSM_SUCCESS;
7191	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_IOCTL);
7192
7193	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7194	    "consumeevent_copyout enter: numents(%d) eventsp(%p)\n",
7195	    msgp->numents, eventsp));
7196
7197#ifdef _MULTI_DATAMODEL
7198	if ((mode & DATAMODEL_MASK) == DATAMODEL_ILP32) {
7199		int i;
7200		rsm_poll_event32_t	event32[RSM_MAX_POLLFDS];
7201		rsm_poll_event32_t	*evlist32;
7202		size_t			evlistsz32;
7203
7204		evlistsz32 = sizeof (rsm_poll_event32_t)*msgp->numents;
7205		if (msgp->numents > RSM_MAX_POLLFDS) {
7206			evlist32 = kmem_zalloc(evlistsz32, KM_SLEEP);
7207		} else {
7208			evlist32 = event32;
7209		}
7210
7211		/*
7212		 * copy the rsm_poll_event_t array to the rsm_poll_event32_t
7213		 * array
7214		 */
7215		for (i = 0; i < msgp->numents; i++) {
7216			evlist32[i].rnum = eventsp[i].rnum;
7217			evlist32[i].fdsidx = eventsp[i].fdsidx;
7218			evlist32[i].revent = eventsp[i].revent;
7219		}
7220
7221		if (ddi_copyout((caddr_t)evlist32, (caddr_t)msgp->seglist,
7222		    evlistsz32, mode)) {
7223			err = RSMERR_BAD_ADDR;
7224		}
7225
7226		if (msgp->numents > RSM_MAX_POLLFDS) {
7227			if (evlist32) {	/* free the temp 32-bit event list */
7228				kmem_free(evlist32, evlistsz32);
7229			}
7230			/*
7231			 * eventsp and evlistsz are based on rsm_poll_event_t
7232			 * type
7233			 */
7234			evlistsz = sizeof (rsm_poll_event_t)*msgp->numents;
7235			/* event list on the heap and needs to be freed here */
7236			if (eventsp) {
7237				kmem_free(eventsp, evlistsz);
7238			}
7239		}
7240
7241		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7242		    "consumeevent_copyout done: err=%d\n", err));
7243		return (err);
7244	}
7245#endif
7246	evlistsz = sizeof (rsm_poll_event_t)*msgp->numents;
7247
7248	if (ddi_copyout((caddr_t)eventsp, (caddr_t)msgp->seglist, evlistsz,
7249	    mode)) {
7250		err = RSMERR_BAD_ADDR;
7251	}
7252
7253	if ((msgp->numents > RSM_MAX_POLLFDS) && eventsp) {
7254		/* event list on the heap and needs to be freed here */
7255		kmem_free(eventsp, evlistsz);
7256	}
7257
7258	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7259	    "consumeevent_copyout done: err=%d\n", err));
7260	return (err);
7261}
7262
7263static int
7264rsm_consumeevent_ioctl(caddr_t arg, int mode)
7265{
7266	int	rc;
7267	int	i;
7268	minor_t	rnum;
7269	rsm_consume_event_msg_t	msg = {0};
7270	rsmseg_t		*seg;
7271	rsm_poll_event_t	*event_list;
7272	rsm_poll_event_t	events[RSM_MAX_POLLFDS];
7273	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_IOCTL);
7274
7275	event_list = events;
7276
7277	if ((rc = rsm_consumeevent_copyin(arg, &msg, &event_list, mode)) !=
7278	    RSM_SUCCESS) {
7279		return (rc);
7280	}
7281
7282	for (i = 0; i < msg.numents; i++) {
7283		rnum = event_list[i].rnum;
7284		event_list[i].revent = 0;
7285		/* get the segment structure */
7286		seg = (rsmseg_t *)rsmresource_lookup(rnum, RSM_LOCK);
7287		if (seg) {
7288			DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7289			    "consumeevent_ioctl: rnum(%d) seg(%p)\n", rnum,
7290			    seg));
7291			if (seg->s_pollevent) {
7292				/* consume the event */
7293				atomic_add_32(&seg->s_pollevent, -1);
7294				event_list[i].revent = POLLRDNORM;
7295			}
7296			rsmseglock_release(seg);
7297		}
7298	}
7299
7300	if ((rc = rsm_consumeevent_copyout(&msg, event_list, mode)) !=
7301	    RSM_SUCCESS) {
7302		return (rc);
7303	}
7304
7305	return (RSM_SUCCESS);
7306}
7307
7308static int
7309iovec_copyin(caddr_t user_vec, rsmka_iovec_t *iovec, int count, int mode)
7310{
7311	int size;
7312	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_IMPORT | RSM_IOCTL);
7313
7314	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "iovec_copyin enter\n"));
7315
7316#ifdef _MULTI_DATAMODEL
7317	if ((mode & DATAMODEL_MASK) == DATAMODEL_ILP32) {
7318		rsmka_iovec32_t	*iovec32, *iovec32_base;
7319		int i;
7320
7321		size = count * sizeof (rsmka_iovec32_t);
7322		iovec32_base = iovec32 = kmem_zalloc(size, KM_SLEEP);
7323		if (ddi_copyin((caddr_t)user_vec,
7324		    (caddr_t)iovec32, size, mode)) {
7325			kmem_free(iovec32, size);
7326			DBG_PRINTF((category, RSM_DEBUG,
7327			    "iovec_copyin: returning RSMERR_BAD_ADDR\n"));
7328			return (RSMERR_BAD_ADDR);
7329		}
7330
7331		for (i = 0; i < count; i++, iovec++, iovec32++) {
7332			iovec->io_type = (int)iovec32->io_type;
7333			if (iovec->io_type == RSM_HANDLE_TYPE)
7334				iovec->local.segid = (rsm_memseg_id_t)
7335				    iovec32->local;
7336			else
7337				iovec->local.vaddr =
7338				    (caddr_t)(uintptr_t)iovec32->local;
7339			iovec->local_offset = (size_t)iovec32->local_offset;
7340			iovec->remote_offset = (size_t)iovec32->remote_offset;
7341			iovec->transfer_len = (size_t)iovec32->transfer_len;
7342
7343		}
7344		kmem_free(iovec32_base, size);
7345		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7346		    "iovec_copyin done\n"));
7347		return (DDI_SUCCESS);
7348	}
7349#endif
7350
7351	size = count * sizeof (rsmka_iovec_t);
7352	if (ddi_copyin((caddr_t)user_vec, (caddr_t)iovec, size, mode)) {
7353		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7354		    "iovec_copyin done: RSMERR_BAD_ADDR\n"));
7355		return (RSMERR_BAD_ADDR);
7356	}
7357
7358	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "iovec_copyin done\n"));
7359
7360	return (DDI_SUCCESS);
7361}
7362
7363
7364static int
7365sgio_copyin(caddr_t arg, rsmka_scat_gath_t *sg_io, int mode)
7366{
7367	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_IMPORT | RSM_IOCTL);
7368
7369	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "sgio_copyin enter\n"));
7370
7371#ifdef _MULTI_DATAMODEL
7372	if ((mode & DATAMODEL_MASK) == DATAMODEL_ILP32) {
7373		rsmka_scat_gath32_t sg_io32;
7374
7375		if (ddi_copyin(arg, (caddr_t)&sg_io32, sizeof (sg_io32),
7376		    mode)) {
7377			DBG_PRINTF((category, RSM_DEBUG,
7378			    "sgio_copyin done: returning EFAULT\n"));
7379			return (RSMERR_BAD_ADDR);
7380		}
7381		sg_io->local_nodeid = (rsm_node_id_t)sg_io32.local_nodeid;
7382		sg_io->io_request_count = (size_t)sg_io32.io_request_count;
7383		sg_io->io_residual_count = (size_t)sg_io32.io_residual_count;
7384		sg_io->flags = (size_t)sg_io32.flags;
7385		sg_io->remote_handle = (rsm_memseg_import_handle_t)
7386		    (uintptr_t)sg_io32.remote_handle;
7387		sg_io->iovec = (rsmka_iovec_t *)(uintptr_t)sg_io32.iovec;
7388		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7389		    "sgio_copyin done\n"));
7390		return (DDI_SUCCESS);
7391	}
7392#endif
7393	if (ddi_copyin(arg, (caddr_t)sg_io, sizeof (rsmka_scat_gath_t),
7394	    mode)) {
7395		DBG_PRINTF((category, RSM_DEBUG,
7396		    "sgio_copyin done: returning EFAULT\n"));
7397		return (RSMERR_BAD_ADDR);
7398	}
7399	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "sgio_copyin done\n"));
7400	return (DDI_SUCCESS);
7401}
7402
7403static int
7404sgio_resid_copyout(caddr_t arg, rsmka_scat_gath_t *sg_io, int mode)
7405{
7406	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_IMPORT | RSM_IOCTL);
7407
7408	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7409	    "sgio_resid_copyout enter\n"));
7410
7411#ifdef _MULTI_DATAMODEL
7412	if ((mode & DATAMODEL_MASK) == DATAMODEL_ILP32) {
7413		rsmka_scat_gath32_t sg_io32;
7414
7415		sg_io32.io_residual_count = sg_io->io_residual_count;
7416		sg_io32.flags = sg_io->flags;
7417
7418		if (ddi_copyout((caddr_t)&sg_io32.io_residual_count,
7419		    (caddr_t)&((rsmka_scat_gath32_t *)arg)->io_residual_count,
7420		    sizeof (uint32_t), mode)) {
7421
7422			DBG_PRINTF((category, RSM_ERR,
7423			    "sgio_resid_copyout error: rescnt\n"));
7424			return (RSMERR_BAD_ADDR);
7425		}
7426
7427		if (ddi_copyout((caddr_t)&sg_io32.flags,
7428		    (caddr_t)&((rsmka_scat_gath32_t *)arg)->flags,
7429		    sizeof (uint32_t), mode)) {
7430
7431			DBG_PRINTF((category, RSM_ERR,
7432			    "sgio_resid_copyout error: flags\n"));
7433			return (RSMERR_BAD_ADDR);
7434		}
7435		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7436		    "sgio_resid_copyout done\n"));
7437		return (DDI_SUCCESS);
7438	}
7439#endif
7440	if (ddi_copyout((caddr_t)&sg_io->io_residual_count,
7441	    (caddr_t)&((rsmka_scat_gath_t *)arg)->io_residual_count,
7442	    sizeof (ulong_t), mode)) {
7443
7444		DBG_PRINTF((category, RSM_ERR,
7445		    "sgio_resid_copyout error:rescnt\n"));
7446		return (RSMERR_BAD_ADDR);
7447	}
7448
7449	if (ddi_copyout((caddr_t)&sg_io->flags,
7450	    (caddr_t)&((rsmka_scat_gath_t *)arg)->flags,
7451	    sizeof (uint_t), mode)) {
7452
7453		DBG_PRINTF((category, RSM_ERR,
7454		    "sgio_resid_copyout error:flags\n"));
7455		return (RSMERR_BAD_ADDR);
7456	}
7457
7458	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "sgio_resid_copyout done\n"));
7459	return (DDI_SUCCESS);
7460}
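
/*
 * For context: the RSM_IOCTL_PUTV/GETV path below is driven from user
 * level by the RSMAPI scatter/gather calls. A minimal sketch of the
 * library-side usage (illustrative only; field and constant names here
 * mirror the kernel-side rsmka_* structures and may differ slightly in
 * the user-level headers):
 *
 *	rsmka_scat_gath_t sg;
 *	rsmka_iovec_t iov;
 *
 *	iov.io_type = RSM_HANDLE_TYPE;	(local side named by segment id)
 *	iov.local.segid = local_segid;
 *	iov.local_offset = 0;
 *	iov.remote_offset = 0;
 *	iov.transfer_len = len;
 *	sg.io_request_count = 1;
 *	sg.remote_handle = im_handle;
 *	sg.iovec = &iov;
 *	(an RSM_IOCTL_PUTV ioctl with &sg then lands in rsm_iovec_ioctl)
 */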
7461
7462
7463static int
7464rsm_iovec_ioctl(dev_t dev, caddr_t arg, int cmd, int mode, cred_t *credp)
7465{
7466	rsmka_scat_gath_t	sg_io;
7467	rsmka_iovec_t		ka_iovec_arr[RSM_MAX_IOVLEN];
7468	rsmka_iovec_t		*ka_iovec;
7469	rsmka_iovec_t		*ka_iovec_start;
7470	rsmpi_scat_gath_t	rsmpi_sg_io;
7471	rsmpi_iovec_t		iovec_arr[RSM_MAX_IOVLEN];
7472	rsmpi_iovec_t		*iovec;
7473	rsmpi_iovec_t		*iovec_start = NULL;
7474	rsmapi_access_entry_t	*acl;
7475	rsmresource_t		*res;
7476	minor_t			rnum;
7477	rsmseg_t		*im_seg, *ex_seg;
7478	int			e;
7479	int			error = 0;
7480	uint_t			i;
7481	uint_t			iov_proc = 0; /* num of iovecs processed */
7482	size_t			size = 0;
7483	size_t			ka_size;
7484
7485	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_IMPORT | RSM_IOCTL);
7486
7487	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_iovec_ioctl enter\n"));
7488
7489	credp = credp;
7490
	/*
	 * Copyin the scatter/gather structure and build a new structure
	 * for rsmpi.
	 */
7495	e = sgio_copyin(arg, &sg_io, mode);
7496	if (e != DDI_SUCCESS) {
7497		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7498		    "rsm_iovec_ioctl done: sgio_copyin %d\n", e));
7499		return (e);
7500	}
7501
7502	if (sg_io.io_request_count > RSM_MAX_SGIOREQS) {
7503		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7504		    "rsm_iovec_ioctl done: request_count(%d) too large\n",
7505		    sg_io.io_request_count));
7506		return (RSMERR_BAD_SGIO);
7507	}
7508
7509	rsmpi_sg_io.io_request_count = sg_io.io_request_count;
7510	rsmpi_sg_io.io_residual_count = sg_io.io_request_count;
7511	rsmpi_sg_io.io_segflg = 0;
7512
7513	/* Allocate memory and copyin io vector array  */
7514	if (sg_io.io_request_count > RSM_MAX_IOVLEN) {
7515		ka_size =  sg_io.io_request_count * sizeof (rsmka_iovec_t);
7516		ka_iovec_start = ka_iovec = kmem_zalloc(ka_size, KM_SLEEP);
7517	} else {
7518		ka_iovec_start = ka_iovec = ka_iovec_arr;
7519	}
7520	e = iovec_copyin((caddr_t)sg_io.iovec, ka_iovec,
7521	    sg_io.io_request_count, mode);
7522	if (e != DDI_SUCCESS) {
7523		if (sg_io.io_request_count > RSM_MAX_IOVLEN)
7524			kmem_free(ka_iovec, ka_size);
7525		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7526		    "rsm_iovec_ioctl done: iovec_copyin %d\n", e));
7527		return (e);
7528	}
7529
7530	/* get the import segment descriptor */
7531	rnum = getminor(dev);
7532	res = rsmresource_lookup(rnum, RSM_LOCK);
7533
7534	/*
7535	 * The following sequence of locking may (or MAY NOT) cause a
7536	 * deadlock but this is currently not addressed here since the
7537	 * implementation will be changed to incorporate the use of
7538	 * reference counting for both the import and the export segments.
7539	 */
7540
7541	/* rsmseglock_acquire(im_seg) done in rsmresource_lookup */
7542
7543	im_seg = (rsmseg_t *)res;
7544
7545	if (im_seg == NULL) {
7546		if (sg_io.io_request_count > RSM_MAX_IOVLEN)
7547			kmem_free(ka_iovec, ka_size);
7548		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7549		    "rsm_iovec_ioctl done: rsmresource_lookup failed\n"));
7550		return (EINVAL);
7551	}
	/* putv/getv is supported only on import segments */
7553	if (im_seg->s_type != RSM_RESOURCE_IMPORT_SEGMENT) {
7554		rsmseglock_release(im_seg);
7555		if (sg_io.io_request_count > RSM_MAX_IOVLEN)
7556			kmem_free(ka_iovec, ka_size);
7557		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7558		    "rsm_iovec_ioctl done: not an import segment\n"));
7559		return (EINVAL);
7560	}
7561
	/*
	 * Wait for a remote DR to complete, i.e., for segments to get
	 * UNQUIESCED, as well as for a local DR to complete.
	 */
7566	while ((im_seg->s_state == RSM_STATE_CONN_QUIESCE) ||
7567	    (im_seg->s_state == RSM_STATE_MAP_QUIESCE) ||
7568	    (im_seg->s_flags & RSM_DR_INPROGRESS)) {
7569		if (cv_wait_sig(&im_seg->s_cv, &im_seg->s_lock) == 0) {
7570			DBG_PRINTF((category, RSM_DEBUG,
7571			    "rsm_iovec_ioctl done: cv_wait INTR"));
7572			rsmseglock_release(im_seg);
7573			return (RSMERR_INTERRUPTED);
7574		}
7575	}
7576
7577	if ((im_seg->s_state != RSM_STATE_CONNECT) &&
7578	    (im_seg->s_state != RSM_STATE_ACTIVE)) {
7579
7580		ASSERT(im_seg->s_state == RSM_STATE_DISCONNECT ||
7581		    im_seg->s_state == RSM_STATE_NEW);
7582
7583		DBG_PRINTF((category, RSM_DEBUG,
7584		    "rsm_iovec_ioctl done: im_seg not conn/map"));
7585		rsmseglock_release(im_seg);
7586		e = RSMERR_BAD_SGIO;
7587		goto out;
7588	}
7589
7590	im_seg->s_rdmacnt++;
7591	rsmseglock_release(im_seg);
7592
7593	/*
7594	 * Allocate and set up the io vector for rsmpi
7595	 */
7596	if (sg_io.io_request_count > RSM_MAX_IOVLEN) {
7597		size = sg_io.io_request_count * sizeof (rsmpi_iovec_t);
7598		iovec_start = iovec = kmem_zalloc(size, KM_SLEEP);
7599	} else {
7600		iovec_start = iovec = iovec_arr;
7601	}
7602
7603	rsmpi_sg_io.iovec = iovec;
7604	for (iov_proc = 0; iov_proc < sg_io.io_request_count; iov_proc++) {
7605		if (ka_iovec->io_type == RSM_HANDLE_TYPE) {
7606			ex_seg = rsmexport_lookup(ka_iovec->local.segid);
7607
7608			if (ex_seg == NULL) {
7609				e = RSMERR_BAD_SGIO;
7610				break;
7611			}
7612			ASSERT(ex_seg->s_state == RSM_STATE_EXPORT);
7613
7614			acl = ex_seg->s_acl;
7615			if (acl[0].ae_permission == 0) {
7616				struct buf *xbuf;
7617				dev_t sdev = 0;
7618
7619				xbuf = ddi_umem_iosetup(ex_seg->s_cookie,
7620				    0, ex_seg->s_len, B_WRITE,
7621				    sdev, 0, NULL, DDI_UMEM_SLEEP);
7622
7623				ASSERT(xbuf != NULL);
7624
7625				iovec->local_mem.ms_type = RSM_MEM_BUF;
7626				iovec->local_mem.ms_memory.bp = xbuf;
7627			} else {
7628				iovec->local_mem.ms_type = RSM_MEM_HANDLE;
7629				iovec->local_mem.ms_memory.handle =
7630				    ex_seg->s_handle.out;
7631			}
7632			ex_seg->s_rdmacnt++; /* refcnt the handle */
7633			rsmseglock_release(ex_seg);
7634		} else {
7635			iovec->local_mem.ms_type = RSM_MEM_VADDR;
7636			iovec->local_mem.ms_memory.vr.vaddr =
7637			    ka_iovec->local.vaddr;
7638		}
7639
7640		iovec->local_offset = ka_iovec->local_offset;
7641		iovec->remote_handle = im_seg->s_handle.in;
7642		iovec->remote_offset = ka_iovec->remote_offset;
7643		iovec->transfer_length = ka_iovec->transfer_len;
7644		iovec++;
7645		ka_iovec++;
7646	}
7647
	if (iov_proc < sg_io.io_request_count) {
7649		/* error while processing handle */
7650		rsmseglock_acquire(im_seg);
7651		im_seg->s_rdmacnt--;   /* decrement the refcnt for importseg */
7652		if (im_seg->s_rdmacnt == 0) {
7653			cv_broadcast(&im_seg->s_cv);
7654		}
7655		rsmseglock_release(im_seg);
7656		goto out;
7657	}
7658
7659	/* call rsmpi */
7660	if (cmd == RSM_IOCTL_PUTV)
7661		e = im_seg->s_adapter->rsmpi_ops->rsm_memseg_import_putv(
7662		    im_seg->s_adapter->rsmpi_handle,
7663		    &rsmpi_sg_io);
7664	else if (cmd == RSM_IOCTL_GETV)
7665		e = im_seg->s_adapter->rsmpi_ops->rsm_memseg_import_getv(
7666		    im_seg->s_adapter->rsmpi_handle,
7667		    &rsmpi_sg_io);
7668	else {
7669		e = EINVAL;
7670		DBG_PRINTF((category, RSM_DEBUG,
7671		    "iovec_ioctl: bad command = %x\n", cmd));
7672	}
7673
7674
7675	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7676	    "rsm_iovec_ioctl RSMPI oper done %d\n", e));
7677
7678	sg_io.io_residual_count = rsmpi_sg_io.io_residual_count;
7679
7680	/*
7681	 * Check for implicit signal post flag and do the signal
7682	 * post if needed
7683	 */
7684	if (sg_io.flags & RSM_IMPLICIT_SIGPOST &&
7685	    e == RSM_SUCCESS) {
7686		rsmipc_request_t request;
7687
7688		request.rsmipc_key = im_seg->s_segid;
7689		request.rsmipc_hdr.rsmipc_type = RSMIPC_MSG_BELL;
7690		request.rsmipc_segment_cookie = NULL;
7691		e = rsmipc_send(im_seg->s_node, &request, RSM_NO_REPLY);
7692		/*
7693		 * Reset the implicit signal post flag to 0 to indicate
7694		 * that the signal post has been done and need not be
7695		 * done in the RSMAPI library
7696		 */
7697		sg_io.flags &= ~RSM_IMPLICIT_SIGPOST;
7698	}
7699
7700	rsmseglock_acquire(im_seg);
7701	im_seg->s_rdmacnt--;
7702	if (im_seg->s_rdmacnt == 0) {
7703		cv_broadcast(&im_seg->s_cv);
7704	}
7705	rsmseglock_release(im_seg);
7706	error = sgio_resid_copyout(arg, &sg_io, mode);
7707out:
7708	iovec = iovec_start;
7709	ka_iovec = ka_iovec_start;
7710	for (i = 0; i < iov_proc; i++) {
7711		if (ka_iovec->io_type == RSM_HANDLE_TYPE) {
7712			ex_seg = rsmexport_lookup(ka_iovec->local.segid);
7713
7714			ASSERT(ex_seg != NULL);
7715			ASSERT(ex_seg->s_state == RSM_STATE_EXPORT);
7716
7717			ex_seg->s_rdmacnt--; /* unrefcnt the handle */
7718			if (ex_seg->s_rdmacnt == 0) {
7719				cv_broadcast(&ex_seg->s_cv);
7720			}
7721			rsmseglock_release(ex_seg);
7722		}
7723
7724		ASSERT(iovec != NULL); /* true if iov_proc > 0 */
7725
		/*
		 * At present there is no dependency on the existence of xbufs
		 * created by ddi_umem_iosetup for each of the iovecs. So we
		 * can free these xbufs here.
		 */
7731		if (iovec->local_mem.ms_type == RSM_MEM_BUF) {
7732			freerbuf(iovec->local_mem.ms_memory.bp);
7733		}
7734
7735		iovec++;
7736		ka_iovec++;
7737	}
7738
7739	if (sg_io.io_request_count > RSM_MAX_IOVLEN) {
7740		if (iovec_start)
7741			kmem_free(iovec_start, size);
7742		kmem_free(ka_iovec_start, ka_size);
7743	}
7744
7745	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7746	    "rsm_iovec_ioctl done %d\n", e));
7747	/* if RSMPI call fails return that else return copyout's retval */
7748	return ((e != RSM_SUCCESS) ? e : error);
7749
7750}
7751
7752
7753static int
7754rsmaddr_ioctl(int cmd, rsm_ioctlmsg_t *msg, int mode)
7755{
7756	adapter_t	*adapter;
7757	rsm_addr_t	addr;
7758	rsm_node_id_t	node;
7759	int		rval = DDI_SUCCESS;
7760	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_IOCTL);
7761
7762	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmaddr_ioctl enter\n"));
7763
	adapter = rsm_getadapter(msg, mode);
7765	if (adapter == NULL) {
7766		DBG_PRINTF((category, RSM_DEBUG,
7767		    "rsmaddr_ioctl done: adapter not found\n"));
7768		return (RSMERR_CTLR_NOT_PRESENT);
7769	}
7770
7771	switch (cmd) {
7772	case RSM_IOCTL_MAP_TO_ADDR: /* nodeid to hwaddr mapping */
7773		/* returns the hwaddr in msg->hwaddr */
7774		if (msg->nodeid == my_nodeid) {
7775			msg->hwaddr = adapter->hwaddr;
7776		} else {
7777			addr = get_remote_hwaddr(adapter, msg->nodeid);
7778			if ((int64_t)addr < 0) {
7779				rval = RSMERR_INTERNAL_ERROR;
7780			} else {
7781				msg->hwaddr = addr;
7782			}
7783		}
7784		break;
7785	case RSM_IOCTL_MAP_TO_NODEID: /* hwaddr to nodeid mapping */
7786		/* returns the nodeid in msg->nodeid */
7787		if (msg->hwaddr == adapter->hwaddr) {
7788			msg->nodeid = my_nodeid;
7789		} else {
7790			node = get_remote_nodeid(adapter, msg->hwaddr);
7791			if ((int)node < 0) {
7792				rval = RSMERR_INTERNAL_ERROR;
7793			} else {
7794				msg->nodeid = (rsm_node_id_t)node;
7795			}
7796		}
7797		break;
7798	default:
7799		rval = EINVAL;
7800		break;
7801	}
7802
7803	rsmka_release_adapter(adapter);
7804	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7805	    "rsmaddr_ioctl done: %d\n", rval));
7806	return (rval);
7807}
7808
7809static int
7810rsm_ddi_copyin(caddr_t arg, rsm_ioctlmsg_t *msg, int mode)
7811{
7812	DBG_DEFINE(category,
7813	    RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_IOCTL | RSM_DDI);
7814
7815	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_ddi_copyin enter\n"));
7816
7817#ifdef _MULTI_DATAMODEL
7818
7819	if ((mode & DATAMODEL_MASK) == DATAMODEL_ILP32) {
7820		rsm_ioctlmsg32_t msg32;
7821		int i;
7822
7823		if (ddi_copyin(arg, (caddr_t)&msg32, sizeof (msg32), mode)) {
7824			DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
			    "rsm_ddi_copyin done: RSMERR_BAD_ADDR\n"));
7826			return (RSMERR_BAD_ADDR);
7827		}
7828		msg->len = msg32.len;
7829		msg->vaddr = (caddr_t)(uintptr_t)msg32.vaddr;
7830		msg->arg = (caddr_t)(uintptr_t)msg32.arg;
7831		msg->key = msg32.key;
7832		msg->acl_len = msg32.acl_len;
7833		msg->acl = (rsmapi_access_entry_t *)(uintptr_t)msg32.acl;
7834		msg->cnum = msg32.cnum;
7835		msg->cname = (caddr_t)(uintptr_t)msg32.cname;
7836		msg->cname_len = msg32.cname_len;
7837		msg->nodeid = msg32.nodeid;
7838		msg->hwaddr = msg32.hwaddr;
7839		msg->perm = msg32.perm;
7840		for (i = 0; i < 4; i++) {
7841			msg->bar.comp[i].u64 = msg32.bar.comp[i].u64;
7842		}
7843		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7844		    "rsm_ddi_copyin done\n"));
7845		return (RSM_SUCCESS);
7846	}
7847#endif
7848	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_ddi_copyin done\n"));
7849	if (ddi_copyin(arg, (caddr_t)msg, sizeof (*msg), mode))
7850		return (RSMERR_BAD_ADDR);
7851	else
7852		return (RSM_SUCCESS);
7853}
7854
7855static int
7856rsmattr_ddi_copyout(adapter_t *adapter, caddr_t arg, int mode)
7857{
7858	rsmka_int_controller_attr_t	rsm_cattr;
7859	DBG_DEFINE(category,
7860	    RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_IOCTL | RSM_DDI);
7861
7862	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7863	    "rsmattr_ddi_copyout enter\n"));
7864	/*
7865	 * need to copy appropriate data from rsm_controller_attr_t
7866	 * to rsmka_int_controller_attr_t
7867	 */
7868#ifdef	_MULTI_DATAMODEL
7869	if ((mode & DATAMODEL_MASK) == DATAMODEL_ILP32) {
7870		rsmka_int_controller_attr32_t rsm_cattr32;
7871
7872		rsm_cattr32.attr_direct_access_sizes =
7873		    adapter->rsm_attr.attr_direct_access_sizes;
7874		rsm_cattr32.attr_atomic_sizes =
7875		    adapter->rsm_attr.attr_atomic_sizes;
7876		rsm_cattr32.attr_page_size =
7877		    adapter->rsm_attr.attr_page_size;
7878		if (adapter->rsm_attr.attr_max_export_segment_size >
7879		    UINT_MAX)
7880			rsm_cattr32.attr_max_export_segment_size =
7881			    RSM_MAXSZ_PAGE_ALIGNED;
7882		else
7883			rsm_cattr32.attr_max_export_segment_size =
7884			    adapter->rsm_attr.attr_max_export_segment_size;
7885		if (adapter->rsm_attr.attr_tot_export_segment_size >
7886		    UINT_MAX)
7887			rsm_cattr32.attr_tot_export_segment_size =
7888			    RSM_MAXSZ_PAGE_ALIGNED;
7889		else
7890			rsm_cattr32.attr_tot_export_segment_size =
7891			    adapter->rsm_attr.attr_tot_export_segment_size;
7892		if (adapter->rsm_attr.attr_max_export_segments >
7893		    UINT_MAX)
7894			rsm_cattr32.attr_max_export_segments =
7895			    UINT_MAX;
7896		else
7897			rsm_cattr32.attr_max_export_segments =
7898			    adapter->rsm_attr.attr_max_export_segments;
7899		if (adapter->rsm_attr.attr_max_import_map_size >
7900		    UINT_MAX)
7901			rsm_cattr32.attr_max_import_map_size =
7902			    RSM_MAXSZ_PAGE_ALIGNED;
7903		else
7904			rsm_cattr32.attr_max_import_map_size =
7905			    adapter->rsm_attr.attr_max_import_map_size;
7906		if (adapter->rsm_attr.attr_tot_import_map_size >
7907		    UINT_MAX)
7908			rsm_cattr32.attr_tot_import_map_size =
7909			    RSM_MAXSZ_PAGE_ALIGNED;
7910		else
7911			rsm_cattr32.attr_tot_import_map_size =
7912			    adapter->rsm_attr.attr_tot_import_map_size;
7913		if (adapter->rsm_attr.attr_max_import_segments >
7914		    UINT_MAX)
7915			rsm_cattr32.attr_max_import_segments =
7916			    UINT_MAX;
7917		else
7918			rsm_cattr32.attr_max_import_segments =
7919			    adapter->rsm_attr.attr_max_import_segments;
7920		rsm_cattr32.attr_controller_addr =
7921		    adapter->rsm_attr.attr_controller_addr;
7922
7923		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7924		    "rsmattr_ddi_copyout done\n"));
7925		if (ddi_copyout((caddr_t)&rsm_cattr32, arg,
7926		    sizeof (rsmka_int_controller_attr32_t), mode)) {
7927			return (RSMERR_BAD_ADDR);
		} else {
			return (RSM_SUCCESS);
		}
7931	}
7932#endif
7933	rsm_cattr.attr_direct_access_sizes =
7934	    adapter->rsm_attr.attr_direct_access_sizes;
7935	rsm_cattr.attr_atomic_sizes =
7936	    adapter->rsm_attr.attr_atomic_sizes;
7937	rsm_cattr.attr_page_size =
7938	    adapter->rsm_attr.attr_page_size;
7939	rsm_cattr.attr_max_export_segment_size =
7940	    adapter->rsm_attr.attr_max_export_segment_size;
7941	rsm_cattr.attr_tot_export_segment_size =
7942	    adapter->rsm_attr.attr_tot_export_segment_size;
7943	rsm_cattr.attr_max_export_segments =
7944	    adapter->rsm_attr.attr_max_export_segments;
7945	rsm_cattr.attr_max_import_map_size =
7946	    adapter->rsm_attr.attr_max_import_map_size;
7947	rsm_cattr.attr_tot_import_map_size =
7948	    adapter->rsm_attr.attr_tot_import_map_size;
7949	rsm_cattr.attr_max_import_segments =
7950	    adapter->rsm_attr.attr_max_import_segments;
7951	rsm_cattr.attr_controller_addr =
7952	    adapter->rsm_attr.attr_controller_addr;
7953	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7954	    "rsmattr_ddi_copyout done\n"));
7955	if (ddi_copyout((caddr_t)&rsm_cattr, arg,
7956	    sizeof (rsmka_int_controller_attr_t), mode)) {
7957		return (RSMERR_BAD_ADDR);
	} else {
		return (RSM_SUCCESS);
	}
7961}
7962
7963/*ARGSUSED*/
7964static int
7965rsm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
7966    int *rvalp)
7967{
7968	rsmseg_t *seg;
7969	rsmresource_t	*res;
7970	minor_t		rnum;
7971	rsm_ioctlmsg_t msg = {0};
7972	int error;
7973	adapter_t *adapter;
7974	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_IOCTL);
7975
7976	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_ioctl enter\n"));
7977
7978	if (cmd == RSM_IOCTL_CONSUMEEVENT) {
7979		error = rsm_consumeevent_ioctl((caddr_t)arg, mode);
7980		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7981		    "rsm_ioctl RSM_IOCTL_CONSUMEEVENT done: %d\n", error));
7982		return (error);
7983	}
7984
7985	/* topology cmd does not use the arg common to other cmds */
7986	if (RSM_IOCTL_CMDGRP(cmd) == RSM_IOCTL_TOPOLOGY) {
7987		error = rsmka_topology_ioctl((caddr_t)arg, cmd, mode);
7988		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7989		    "rsm_ioctl done: %d\n", error));
7990		return (error);
7991	}
7992
7993	if (RSM_IOCTL_CMDGRP(cmd) == RSM_IOCTL_IOVEC) {
7994		error = rsm_iovec_ioctl(dev, (caddr_t)arg, cmd, mode, credp);
7995		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
7996		    "rsm_ioctl done: %d\n", error));
7997		return (error);
7998	}
7999
8000	/*
8001	 * try to load arguments
8002	 */
8003	if (cmd != RSM_IOCTL_RING_BELL &&
8004	    rsm_ddi_copyin((caddr_t)arg, &msg, mode)) {
8005		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
		    "rsm_ioctl done: RSMERR_BAD_ADDR\n"));
8007		return (RSMERR_BAD_ADDR);
8008	}
8009
8010	if (cmd == RSM_IOCTL_ATTR) {
		adapter = rsm_getadapter(&msg, mode);
8012		if (adapter == NULL) {
8013			DBG_PRINTF((category, RSM_DEBUG,
			    "rsm_ioctl done: RSMERR_CTLR_NOT_PRESENT\n"));
8015			return (RSMERR_CTLR_NOT_PRESENT);
8016		}
8017		error = rsmattr_ddi_copyout(adapter, msg.arg, mode);
8018		rsmka_release_adapter(adapter);
8019		DBG_PRINTF((category, RSM_DEBUG,
8020		    "rsm_ioctl:after copyout %d\n", error));
8021		return (error);
8022	}
8023
8024	if (cmd == RSM_IOCTL_BAR_INFO) {
8025		/* Return library off,len of barrier page */
8026		msg.off = barrier_offset;
8027		msg.len = (int)barrier_size;
8028#ifdef _MULTI_DATAMODEL
8029		if ((mode & DATAMODEL_MASK) == DATAMODEL_ILP32) {
8030			rsm_ioctlmsg32_t msg32;
8031
8032			if (msg.len > UINT_MAX)
				msg32.len = RSM_MAXSZ_PAGE_ALIGNED;
8034			else
8035				msg32.len = (int32_t)msg.len;
8036			msg32.off = (int32_t)msg.off;
8037			DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
8038			    "rsm_ioctl done\n"));
8039			if (ddi_copyout((caddr_t)&msg32, (caddr_t)arg,
8040			    sizeof (msg32), mode))
8041				return (RSMERR_BAD_ADDR);
8042			else
8043				return (RSM_SUCCESS);
8044		}
8045#endif
8046		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
8047		    "rsm_ioctl done\n"));
8048		if (ddi_copyout((caddr_t)&msg, (caddr_t)arg,
8049		    sizeof (msg), mode))
8050			return (RSMERR_BAD_ADDR);
8051		else
8052			return (RSM_SUCCESS);
8053	}
8054
8055	if (RSM_IOCTL_CMDGRP(cmd) == RSM_IOCTL_MAP_ADDR) {
8056		/* map the nodeid or hwaddr */
8057		error = rsmaddr_ioctl(cmd, &msg, mode);
8058		if (error == RSM_SUCCESS) {
8059#ifdef _MULTI_DATAMODEL
8060			if ((mode & DATAMODEL_MASK) == DATAMODEL_ILP32) {
8061				rsm_ioctlmsg32_t msg32;
8062
8063				msg32.hwaddr = (uint64_t)msg.hwaddr;
8064				msg32.nodeid = (uint32_t)msg.nodeid;
8065
8066				DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
8067				    "rsm_ioctl done\n"));
8068				if (ddi_copyout((caddr_t)&msg32, (caddr_t)arg,
8069				    sizeof (msg32), mode))
8070					return (RSMERR_BAD_ADDR);
8071				else
8072					return (RSM_SUCCESS);
8073			}
8074#endif
8075			DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
8076			    "rsm_ioctl done\n"));
8077			if (ddi_copyout((caddr_t)&msg, (caddr_t)arg,
8078			    sizeof (msg), mode))
8079				return (RSMERR_BAD_ADDR);
8080			else
8081				return (RSM_SUCCESS);
8082		}
8083		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
8084		    "rsm_ioctl done: %d\n", error));
8085		return (error);
8086	}
8087
	/* Find the resource and look it up */
8089	rnum = getminor(dev);
8090	res = rsmresource_lookup(rnum, RSM_NOLOCK);
8091	ASSERT(res != NULL);
8092
8093	/*
8094	 * Find command group
8095	 */
8096	switch (RSM_IOCTL_CMDGRP(cmd)) {
8097	case RSM_IOCTL_EXPORT_SEG:
8098		/*
8099		 * Export list is searched during publish, loopback and
8100		 * remote lookup call.
8101		 */
8102		seg = rsmresource_seg(res, rnum, credp,
8103		    RSM_RESOURCE_EXPORT_SEGMENT);
8104		if (seg->s_type == RSM_RESOURCE_EXPORT_SEGMENT) {
8105			error = rsmexport_ioctl(seg, &msg, cmd, arg, mode,
8106			    credp);
8107		} else { /* export ioctl on an import/barrier resource */
8108			error = RSMERR_BAD_SEG_HNDL;
8109		}
8110		break;
8111	case RSM_IOCTL_IMPORT_SEG:
8112		/* Import list is searched during remote unmap call. */
8113		seg = rsmresource_seg(res, rnum, credp,
8114		    RSM_RESOURCE_IMPORT_SEGMENT);
8115		if (seg->s_type == RSM_RESOURCE_IMPORT_SEGMENT) {
8116			error = rsmimport_ioctl(seg, &msg, cmd, arg, mode,
8117			    credp);
8118		} else  { /* import ioctl on an export/barrier resource */
8119			error = RSMERR_BAD_SEG_HNDL;
8120		}
8121		break;
8122	case RSM_IOCTL_BAR:
8123		if (res != RSMRC_RESERVED &&
8124		    res->rsmrc_type == RSM_RESOURCE_IMPORT_SEGMENT) {
8125			error = rsmbar_ioctl((rsmseg_t *)res, &msg, cmd, arg,
8126			    mode);
8127		} else { /* invalid res value */
8128			error = RSMERR_BAD_SEG_HNDL;
8129		}
8130		break;
8131	case RSM_IOCTL_BELL:
8132		if (res != RSMRC_RESERVED) {
8133			if (res->rsmrc_type == RSM_RESOURCE_IMPORT_SEGMENT)
8134				error = exportbell_ioctl((rsmseg_t *)res, cmd);
8135			else if (res->rsmrc_type == RSM_RESOURCE_EXPORT_SEGMENT)
8136				error = importbell_ioctl((rsmseg_t *)res, cmd);
8137			else /* RSM_RESOURCE_BAR */
8138				error = RSMERR_BAD_SEG_HNDL;
8139		} else { /* invalid res value */
8140			error = RSMERR_BAD_SEG_HNDL;
8141		}
8142		break;
8143	default:
8144		error = EINVAL;
8145	}
8146
8147	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_ioctl done: %d\n",
8148	    error));
8149	return (error);
8150}
8151
8152
8153/* **************************** Segment Mapping Operations ********* */
8154static rsm_mapinfo_t *
8155rsm_get_mapinfo(rsmseg_t *seg, off_t off, size_t len, off_t *dev_offset,
8156    size_t *map_len)
8157{
8158	rsm_mapinfo_t	*p;
8159	/*
8160	 * Find the correct mapinfo structure to use during the mapping
8161	 * from the seg->s_mapinfo list.
8162	 * The seg->s_mapinfo list contains in reverse order the mappings
8163	 * as returned by the RSMPI rsm_map. In rsm_devmap, we need to
8164	 * access the correct entry within this list for the mapping
8165	 * requested.
8166	 *
8167	 * The algorithm for selecting a list entry is as follows:
8168	 *
8169	 * When start_offset of an entry <= off we have found the entry
8170	 * we were looking for. Adjust the dev_offset and map_len (needs
8171	 * to be PAGESIZE aligned).
8172	 */
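	/*
	 * Worked example (hypothetical values): with the reverse-ordered
	 * entries {start_offset 0x4000, dev_offset 0xA000} followed by
	 * {start_offset 0x0, dev_offset 0x8000}, a request for off 0x5000
	 * matches the first entry and yields
	 * dev_offset = 0xA000 + 0x5000 - 0x4000 = 0xB000.
	 */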
	for (p = seg->s_mapinfo; p != NULL; p = p->next) {
		if (p->start_offset <= off) {
			*dev_offset = p->dev_offset + off - p->start_offset;
			*map_len = (len > p->individual_len) ?
			    p->individual_len : ptob(btopr(len));
			return (p);
		}
	}
8183
8184	return (NULL);
8185}
8186
8187static void
8188rsm_free_mapinfo(rsm_mapinfo_t  *mapinfo)
8189{
8190	rsm_mapinfo_t *p;
8191
8192	while (mapinfo != NULL) {
8193		p = mapinfo;
8194		mapinfo = mapinfo->next;
8195		kmem_free(p, sizeof (*p));
8196	}
8197}
8198
8199static int
8200rsmmap_map(devmap_cookie_t dhp, dev_t dev, uint_t flags, offset_t off,
8201    size_t len, void **pvtp)
8202{
8203	rsmcookie_t	*p;
8204	rsmresource_t	*res;
8205	rsmseg_t	*seg;
8206	minor_t rnum;
8207	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_DDI);
8208
8209	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmmap_map enter\n"));
8210
8211	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
8212	    "rsmmap_map: dhp = %x\n", dhp));
8213
8214	flags = flags;
8215
8216	rnum = getminor(dev);
8217	res = (rsmresource_t *)rsmresource_lookup(rnum, RSM_NOLOCK);
8218	ASSERT(res != NULL);
8219
8220	seg = (rsmseg_t *)res;
8221
8222	rsmseglock_acquire(seg);
8223
8224	ASSERT(seg->s_hdr.rsmrc_type == RSM_RESOURCE_IMPORT_SEGMENT);
8225
8226	/*
8227	 * Allocate structure and add cookie to segment list
8228	 */
8229	p = kmem_alloc(sizeof (*p), KM_SLEEP);
8230
8231	p->c_dhp = dhp;
8232	p->c_off = off;
8233	p->c_len = len;
8234	p->c_next = seg->s_ckl;
8235	seg->s_ckl = p;
8236
8237	*pvtp = (void *)seg;
8238
8239	rsmseglock_release(seg);
8240
8241	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmmap_map done\n"));
8242	return (DDI_SUCCESS);
8243}
8244
8245/*
8246 * Page fault handling is done here. The prerequisite mapping setup
8247 * has been done in rsm_devmap with calls to ddi_devmem_setup or
8248 * ddi_umem_setup
8249 */
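/*
 * Flow, as implemented below: a user access faults, the devmap framework
 * invokes this access callback, we block while the segment is quiesced
 * for DR, and then devmap_load() validates and loads the translations
 * for the faulting range.
 */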
8250static int
8251rsmmap_access(devmap_cookie_t dhp, void *pvt, offset_t offset, size_t len,
8252    uint_t type, uint_t rw)
8253{
8254	int e;
8255	rsmseg_t *seg = (rsmseg_t *)pvt;
8256	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_DDI);
8257
8258	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmmap_access enter\n"));
8259
8260	rsmseglock_acquire(seg);
8261
8262	ASSERT(seg->s_hdr.rsmrc_type == RSM_RESOURCE_IMPORT_SEGMENT);
8263
8264	while (seg->s_state == RSM_STATE_MAP_QUIESCE) {
8265		if (cv_wait_sig(&seg->s_cv, &seg->s_lock) == 0) {
8266			DBG_PRINTF((category, RSM_DEBUG,
8267			    "rsmmap_access done: cv_wait INTR"));
8268			rsmseglock_release(seg);
8269			return (RSMERR_INTERRUPTED);
8270		}
8271	}
8272
8273	ASSERT(seg->s_state == RSM_STATE_DISCONNECT ||
8274	    seg->s_state == RSM_STATE_ACTIVE);
8275
8276	if (seg->s_state == RSM_STATE_DISCONNECT)
8277		seg->s_flags |= RSM_IMPORT_DUMMY;
8278
8279	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
8280	    "rsmmap_access: dhp = %x\n", dhp));
8281
8282	rsmseglock_release(seg);
8283
8284	if (e = devmap_load(dhp, offset, len, type, rw)) {
8285		DBG_PRINTF((category, RSM_ERR, "devmap_load failed\n"));
8286	}
8287
8288
8289	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmmap_access done\n"));
8290
8291	return (e);
8292}
8293
8294static int
8295rsmmap_dup(devmap_cookie_t dhp, void *oldpvt, devmap_cookie_t new_dhp,
    void **newpvt)
8297{
8298	rsmseg_t	*seg = (rsmseg_t *)oldpvt;
8299	rsmcookie_t	*p, *old;
8300	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_DDI);
8301
8302	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmmap_dup enter\n"));
8303
8304	/*
8305	 * Same as map, create an entry to hold cookie and add it to
8306	 * connect segment list. The oldpvt is a pointer to segment.
8307	 * Return segment pointer in newpvt.
8308	 */
8309	rsmseglock_acquire(seg);
8310
8311	ASSERT(seg->s_hdr.rsmrc_type == RSM_RESOURCE_IMPORT_SEGMENT);
8312
8313	/*
8314	 * Find old cookie
8315	 */
8316	for (old = seg->s_ckl; old != NULL; old = old->c_next) {
8317		if (old->c_dhp == dhp) {
8318			break;
8319		}
8320	}
8321	if (old == NULL) {
8322		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
8323		    "rsmmap_dup done: EINVAL\n"));
8324		rsmseglock_release(seg);
8325		return (EINVAL);
8326	}
8327
8328	p = kmem_alloc(sizeof (*p), KM_SLEEP);
8329
8330	p->c_dhp = new_dhp;
8331	p->c_off = old->c_off;
8332	p->c_len = old->c_len;
8333	p->c_next = seg->s_ckl;
8334	seg->s_ckl = p;
8335
8336	*newpvt = (void *)seg;
8337
8338	rsmseglock_release(seg);
8339
8340	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmmap_dup done\n"));
8341
8342	return (DDI_SUCCESS);
8343}
8344
8345static void
8346rsmmap_unmap(devmap_cookie_t dhp, void *pvtp, offset_t off, size_t len,
    devmap_cookie_t new_dhp1, void **pvtp1,
    devmap_cookie_t new_dhp2, void **pvtp2)
8349{
8350	/*
8351	 * Remove pvtp structure from segment list.
8352	 */
8353	rsmseg_t	*seg = (rsmseg_t *)pvtp;
8354	int freeflag;
8355
8356	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_DDI);
8357
8358	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmmap_unmap enter\n"));
8359
8360	off = off; len = len;
8361	pvtp1 = pvtp1; pvtp2 = pvtp2;
8362
8363	rsmseglock_acquire(seg);
8364
8365	ASSERT(seg->s_hdr.rsmrc_type == RSM_RESOURCE_IMPORT_SEGMENT);
8366
8367	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
8368	    "rsmmap_unmap: dhp = %x\n", dhp));
8369	/*
8370	 * We can go ahead and remove the dhps even if we are in
8371	 * the MAPPING state because the dhps being removed here
8372	 * belong to a different mmap and we are holding the segment
8373	 * lock.
8374	 */
8375	if (new_dhp1 == NULL && new_dhp2 == NULL) {
8376		/* find and remove dhp handle */
8377		rsmcookie_t *tmp, **back = &seg->s_ckl;
8378
8379		while (*back != NULL) {
8380			tmp = *back;
8381			if (tmp->c_dhp == dhp) {
8382				*back = tmp->c_next;
8383				kmem_free(tmp, sizeof (*tmp));
8384				break;
8385			}
8386			back = &tmp->c_next;
8387		}
8388	} else {
8389		DBG_PRINTF((category, RSM_DEBUG_LVL2,
		    "rsmmap_unmap: partial unmap "
		    "new_dhp1 %lx, new_dhp2 %lx\n",
8392		    (size_t)new_dhp1, (size_t)new_dhp2));
8393	}
8394
8395	/*
8396	 * rsmmap_unmap is called for each mapping cookie on the list.
8397	 * When the list becomes empty and we are not in the MAPPING
8398	 * state then unmap in the rsmpi driver.
8399	 */
8400	if ((seg->s_ckl == NULL) && (seg->s_state != RSM_STATE_MAPPING))
8401		(void) rsm_unmap(seg);
8402
8403	if (seg->s_state == RSM_STATE_END && seg->s_ckl == NULL) {
8404		freeflag = 1;
8405	} else {
8406		freeflag = 0;
8407	}
8408
8409	rsmseglock_release(seg);
8410
8411	if (freeflag) {
8412		/* Free the segment structure */
8413		rsmseg_free(seg);
8414	}
8415	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmmap_unmap done\n"));
8416
8417}
8418
8419static struct devmap_callback_ctl rsmmap_ops = {
8420	DEVMAP_OPS_REV,	/* devmap_ops version number	*/
8421	rsmmap_map,	/* devmap_ops map routine */
8422	rsmmap_access,	/* devmap_ops access routine */
8423	rsmmap_dup,		/* devmap_ops dup routine		*/
8424	rsmmap_unmap,	/* devmap_ops unmap routine */
8425};
8426
8427static int
8428rsm_devmap(dev_t dev, devmap_cookie_t dhc, offset_t off, size_t len,
8429    size_t *maplen, uint_t model /*ARGSUSED*/)
8430{
8431	struct devmap_callback_ctl *callbackops = &rsmmap_ops;
8432	int		err;
8433	uint_t		maxprot;
8434	minor_t		rnum;
8435	rsmseg_t	*seg;
8436	off_t		dev_offset;
8437	size_t		cur_len;
8438	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_DDI);
8439
8440	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_devmap enter\n"));
8441
8442	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
8443	    "rsm_devmap: off = %lx, len = %lx\n", off, len));
8444	rnum = getminor(dev);
8445	seg = (rsmseg_t *)rsmresource_lookup(rnum, RSM_NOLOCK);
8446	ASSERT(seg != NULL);
8447
8448	if (seg->s_hdr.rsmrc_type == RSM_RESOURCE_BAR) {
8449		if ((off == barrier_offset) &&
8450		    (len == barrier_size)) {
8451
8452			ASSERT(bar_va != NULL && bar_cookie != NULL);
8453
8454			/*
8455			 * The offset argument in devmap_umem_setup represents
8456			 * the offset within the kernel memory defined by the
8457			 * cookie. We use this offset as barrier_offset.
8458			 */
8459			err = devmap_umem_setup(dhc, rsm_dip, NULL, bar_cookie,
8460			    barrier_offset, len, PROT_USER|PROT_READ,
8461			    DEVMAP_DEFAULTS, 0);
8462
8463			if (err != 0) {
8464				DBG_PRINTF((category, RSM_ERR,
8465				    "rsm_devmap done: %d\n", err));
8466				return (RSMERR_MAP_FAILED);
8467			}
8468			DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
8469			    "rsm_devmap done: %d\n", err));
8470
8471			*maplen = barrier_size;
8472
8473			return (err);
8474		} else {
			DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
			    "rsm_devmap done: %d\n", RSMERR_MAP_FAILED));
8477			return (RSMERR_MAP_FAILED);
8478		}
8479	}
8480
8481	ASSERT(seg->s_hdr.rsmrc_type == RSM_RESOURCE_IMPORT_SEGMENT);
8482	ASSERT(seg->s_state == RSM_STATE_MAPPING);
8483
8484	/*
8485	 * Make sure we still have permission for the map operation.
8486	 */
8487	maxprot = PROT_USER;
8488	if (seg->s_mode & RSM_PERM_READ) {
8489		maxprot |= PROT_READ;
8490	}
8491
8492	if (seg->s_mode & RSM_PERM_WRITE) {
8493		maxprot |= PROT_WRITE;
8494	}
8495
8496	/*
8497	 * For each devmap call, rsmmap_map is called. This maintains driver
8498	 * private information for the mapping. Thus, if there are multiple
8499	 * devmap calls there will be multiple rsmmap_map calls and for each
8500	 * call, the mapping information will be stored.
8501	 * In case of an error during the processing of the devmap call, error
8502	 * will be returned. This error return causes the caller of rsm_devmap
8503	 * to undo all the mappings by calling rsmmap_unmap for each one.
8504	 * rsmmap_unmap will free up the private information for the requested
8505	 * mapping.
8506	 */
8507	if (seg->s_node != my_nodeid) {
8508		rsm_mapinfo_t *p;
8509
8510		p = rsm_get_mapinfo(seg, off, len, &dev_offset, &cur_len);
8511		if (p == NULL) {
8512			DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
8513			    "rsm_devmap: incorrect mapping info\n"));
8514			return (RSMERR_MAP_FAILED);
8515		}
8516		err = devmap_devmem_setup(dhc, p->dip,
8517		    callbackops, p->dev_register,
8518		    dev_offset, cur_len, maxprot,
8519		    DEVMAP_ALLOW_REMAP | DEVMAP_DEFAULTS, 0);
8520
8521		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
8522		    "rsm_devmap: dip=%lx,dreg=%lu,doff=%lx,"
8523		    "off=%lx,len=%lx\n",
8524		    p->dip, p->dev_register, dev_offset, off, cur_len));
8525
8526		if (err != 0) {
8527			DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
8528			    "rsm_devmap: devmap_devmem_setup failed %d\n",
8529			    err));
8530			return (RSMERR_MAP_FAILED);
8531		}
		/* cur_len is always an integral multiple of PAGESIZE */
8533		ASSERT((cur_len & (PAGESIZE-1)) == 0);
8534		*maplen = cur_len;
8535		return (err);
8536
8537	} else {
8538		err = devmap_umem_setup(dhc, rsm_dip, callbackops,
8539		    seg->s_cookie, off, len, maxprot,
8540		    DEVMAP_ALLOW_REMAP|DEVMAP_DEFAULTS, 0);
8541		if (err != 0) {
8542			DBG_PRINTF((category, RSM_DEBUG,
8543			    "rsm_devmap: devmap_umem_setup failed %d\n",
8544			    err));
8545			return (RSMERR_MAP_FAILED);
8546		}
8547		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
8548		    "rsm_devmap: loopback done\n"));
8549
8550		*maplen = ptob(btopr(len));
8551
8552		return (err);
8553	}
8554}
8555
8556/*
8557 * We can use the devmap framework for mapping device memory to user space by
8558 * specifying this routine in the rsm_cb_ops structure. The kernel mmap
8559 * processing calls this entry point and devmap_setup is called within this
8560 * function, which eventually calls rsm_devmap
8561 */
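/*
 * Rough call-flow sketch for an import-segment mapping:
 *
 *	mmap(2) on the rsm device
 *	  -> rsm_segmap (this routine, via the cb_ops segmap entry)
 *	    -> devmap_setup()
 *	      -> rsm_devmap (via the cb_ops devmap entry)
 *	        -> devmap_devmem_setup()/devmap_umem_setup()
 *	          -> rsmmap_map (devmap_callback_ctl map callback)
 *
 * Subsequent faults on the mapping arrive at rsmmap_access.
 */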
8562static int
8563rsm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
8564    uint_t prot, uint_t maxprot, uint_t flags, struct cred *cred)
8565{
8566	int			error = 0;
8567	int			old_state;
8568	minor_t			rnum;
8569	rsmseg_t		*seg, *eseg;
8570	adapter_t		*adapter;
8571	rsm_import_share_t	*sharedp;
8572	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_DDI);
8573
8574	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_segmap enter\n"));
8575
8576	/*
8577	 * find segment
8578	 */
8579	rnum = getminor(dev);
8580	seg = (rsmseg_t *)rsmresource_lookup(rnum, RSM_LOCK);
8581
8582	if (seg == NULL) {
8583		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
8584		    "rsm_segmap done: invalid segment\n"));
8585		return (EINVAL);
8586	}
8587
8588	/*
8589	 * the user is trying to map a resource that has not been
8590	 * defined yet. The library uses this to map in the
8591	 * barrier page.
8592	 */
8593	if (seg->s_hdr.rsmrc_type == RSM_RESOURCE_BAR) {
8594		rsmseglock_release(seg);
8595
8596		/*
8597		 * The mapping for the barrier page is identified
8598		 * by the special offset barrier_offset
8599		 */
8600
		if (off == (off_t)barrier_offset &&
		    len == (off_t)barrier_size) {
8603			if (bar_cookie == NULL || bar_va == NULL) {
8604				DBG_PRINTF((category, RSM_DEBUG,
8605				    "rsm_segmap: bar cookie/va is NULL\n"));
8606				return (EINVAL);
8607			}
8608
8609			error = devmap_setup(dev, (offset_t)off, as, addrp,
			    (size_t)len, prot, maxprot, flags, cred);
8611
8612			DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
8613			    "rsm_segmap done: %d\n", error));
8614			return (error);
8615		} else {
8616			DBG_PRINTF((category, RSM_DEBUG,
8617			    "rsm_segmap: bad offset/length\n"));
8618			return (EINVAL);
8619		}
8620	}
8621
8622	/* Make sure you can only map imported segments */
8623	if (seg->s_hdr.rsmrc_type != RSM_RESOURCE_IMPORT_SEGMENT) {
8624		rsmseglock_release(seg);
8625		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
8626		    "rsm_segmap done: not an import segment\n"));
8627		return (EINVAL);
8628	}
8629	/* check means library is broken */
8630	ASSERT(seg->s_hdr.rsmrc_num == rnum);
8631
8632	/* wait for the segment to become unquiesced */
8633	while (seg->s_state == RSM_STATE_CONN_QUIESCE) {
8634		if (cv_wait_sig(&seg->s_cv, &seg->s_lock) == 0) {
8635			rsmseglock_release(seg);
8636			DBG_PRINTF((category, RSM_DEBUG,
8637			    "rsm_segmap done: cv_wait INTR"));
8638			return (ENODEV);
8639		}
8640	}
8641
8642	/* wait until segment leaves the mapping state */
8643	while (seg->s_state == RSM_STATE_MAPPING)
8644		cv_wait(&seg->s_cv, &seg->s_lock);
8645
8646	/*
8647	 * we allow multiple maps of the same segment in the KA
8648	 * and it works because we do an rsmpi map of the whole
8649	 * segment during the first map and all the device mapping
8650	 * information needed in rsm_devmap is in the mapinfo list.
8651	 */
8652	if ((seg->s_state != RSM_STATE_CONNECT) &&
8653	    (seg->s_state != RSM_STATE_ACTIVE)) {
8654		rsmseglock_release(seg);
8655		DBG_PRINTF((category, RSM_DEBUG,
8656		    "rsm_segmap done: segment not connected\n"));
8657		return (ENODEV);
8658	}
8659
8660	/*
8661	 * Make sure we are not mapping a larger segment than what's
8662	 * exported
8663	 */
8664	if ((size_t)off + ptob(btopr(len)) > seg->s_len) {
8665		rsmseglock_release(seg);
8666		DBG_PRINTF((category, RSM_DEBUG,
8667		    "rsm_segmap done: off+len>seg size\n"));
8668		return (ENXIO);
8669	}
8670
8671	/*
8672	 * Make sure we still have permission for the map operation.
8673	 */
8674	maxprot = PROT_USER;
8675	if (seg->s_mode & RSM_PERM_READ) {
8676		maxprot |= PROT_READ;
8677	}
8678
8679	if (seg->s_mode & RSM_PERM_WRITE) {
8680		maxprot |= PROT_WRITE;
8681	}
8682
8683	if ((prot & maxprot) != prot) {
8684		/* No permission */
8685		rsmseglock_release(seg);
8686		DBG_PRINTF((category, RSM_DEBUG,
8687		    "rsm_segmap done: no permission\n"));
8688		return (EACCES);
8689	}
8690
8691	old_state = seg->s_state;
8692
8693	ASSERT(seg->s_share != NULL);
8694
8695	rsmsharelock_acquire(seg);
8696
8697	sharedp = seg->s_share;
8698
8699	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
8700	    "rsm_segmap:RSMSI_STATE=%d\n", sharedp->rsmsi_state));
8701
8702	if ((sharedp->rsmsi_state != RSMSI_STATE_CONNECTED) &&
8703	    (sharedp->rsmsi_state != RSMSI_STATE_MAPPED)) {
8704		rsmsharelock_release(seg);
8705		rsmseglock_release(seg);
8706		DBG_PRINTF((category, RSM_DEBUG,
8707		    "rsm_segmap done:RSMSI_STATE %d invalid\n",
8708		    sharedp->rsmsi_state));
8709		return (ENODEV);
8710	}
8711
8712	/*
8713	 * Do the map - since we want importers to share mappings
8714	 * we do the rsmpi map for the whole segment
8715	 */
8716	if (seg->s_node != my_nodeid) {
8717		uint_t dev_register;
8718		off_t dev_offset;
8719		dev_info_t *dip;
8720		size_t tmp_len;
8721		size_t total_length_mapped = 0;
8722		size_t length_to_map = seg->s_len;
8723		off_t tmp_off = 0;
8724		rsm_mapinfo_t *p;
8725
8726		/*
8727		 * length_to_map = seg->s_len is always an integral
8728		 * multiple of PAGESIZE. Length mapped in each entry in mapinfo
8729		 * list is a multiple of PAGESIZE - RSMPI map ensures this
8730		 */
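		/*
		 * Worked example (hypothetical sizes): for a 24K segment
		 * whose rsm_map returns at most 16K per call, the loop
		 * below runs twice, mapping 16K and then 8K, and leaves
		 * two mapinfo entries with start_offsets 0x0 and 0x4000.
		 */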
8731
8732		adapter = seg->s_adapter;
8733		ASSERT(sharedp->rsmsi_state == RSMSI_STATE_CONNECTED ||
8734		    sharedp->rsmsi_state == RSMSI_STATE_MAPPED);
8735
8736		if (sharedp->rsmsi_state == RSMSI_STATE_CONNECTED) {
8737			error = 0;
8738			/* map the whole segment */
8739			while (total_length_mapped < seg->s_len) {
8740				tmp_len = 0;
8741
8742				error = adapter->rsmpi_ops->rsm_map(
8743				    seg->s_handle.in, tmp_off,
8744				    length_to_map, &tmp_len,
8745				    &dip, &dev_register, &dev_offset,
8746				    NULL, NULL);
8747
8748				if (error != 0)
8749					break;
8750
8751				/*
8752				 * Store the mapping info obtained from rsm_map
8753				 */
8754				p = kmem_alloc(sizeof (*p), KM_SLEEP);
8755				p->dev_register = dev_register;
8756				p->dev_offset = dev_offset;
8757				p->dip = dip;
8758				p->individual_len = tmp_len;
8759				p->start_offset = tmp_off;
8760				p->next = sharedp->rsmsi_mapinfo;
8761				sharedp->rsmsi_mapinfo = p;
8762
8763				total_length_mapped += tmp_len;
8764				length_to_map -= tmp_len;
8765				tmp_off += tmp_len;
8766			}
8767			seg->s_mapinfo = sharedp->rsmsi_mapinfo;
8768
8769			if (error != RSM_SUCCESS) {
				/* Check if this is the first rsm_map */
8771				if (sharedp->rsmsi_mapinfo != NULL) {
8772					/*
8773					 * A single rsm_unmap undoes
8774					 * multiple rsm_maps.
8775					 */
8776					(void) seg->s_adapter->rsmpi_ops->
8777					    rsm_unmap(sharedp->rsmsi_handle);
8778					rsm_free_mapinfo(sharedp->
8779					    rsmsi_mapinfo);
8780				}
8781				sharedp->rsmsi_mapinfo = NULL;
8782				sharedp->rsmsi_state = RSMSI_STATE_CONNECTED;
8783				rsmsharelock_release(seg);
8784				rsmseglock_release(seg);
8785				DBG_PRINTF((category, RSM_DEBUG,
8786				    "rsm_segmap done: rsmpi map err %d\n",
8787				    error));
8788				ASSERT(error != RSMERR_BAD_LENGTH &&
8789				    error != RSMERR_BAD_MEM_ALIGNMENT &&
8790				    error != RSMERR_BAD_SEG_HNDL);
8791				if (error == RSMERR_UNSUPPORTED_OPERATION)
8792					return (ENOTSUP);
8793				else if (error == RSMERR_INSUFFICIENT_RESOURCES)
8794					return (EAGAIN);
8795				else if (error == RSMERR_CONN_ABORTED)
8796					return (ENODEV);
8797				else
8798					return (error);
8799			} else {
8800				sharedp->rsmsi_state = RSMSI_STATE_MAPPED;
8801			}
8802		} else {
8803			seg->s_mapinfo = sharedp->rsmsi_mapinfo;
8804		}
8805
8806		sharedp->rsmsi_mapcnt++;
8807
8808		rsmsharelock_release(seg);
8809
8810		/* move to an intermediate mapping state */
8811		seg->s_state = RSM_STATE_MAPPING;
8812		rsmseglock_release(seg);
8813
8814		error = devmap_setup(dev, (offset_t)off, as, addrp,
8815		    len, prot, maxprot, flags, cred);
8816
8817		rsmseglock_acquire(seg);
8818		ASSERT(seg->s_state == RSM_STATE_MAPPING);
8819
8820		if (error == DDI_SUCCESS) {
8821			seg->s_state = RSM_STATE_ACTIVE;
8822		} else {
8823			rsmsharelock_acquire(seg);
8824
8825			ASSERT(sharedp->rsmsi_state == RSMSI_STATE_MAPPED);
8826
8827			sharedp->rsmsi_mapcnt--;
8828			if (sharedp->rsmsi_mapcnt == 0) {
8829				/* unmap the shared RSMPI mapping */
8830				ASSERT(sharedp->rsmsi_handle != NULL);
8831				(void) adapter->rsmpi_ops->
8832				    rsm_unmap(sharedp->rsmsi_handle);
8833				rsm_free_mapinfo(sharedp->rsmsi_mapinfo);
8834				sharedp->rsmsi_mapinfo = NULL;
8835				sharedp->rsmsi_state = RSMSI_STATE_CONNECTED;
8836			}
8837
8838			rsmsharelock_release(seg);
8839			seg->s_state = old_state;
8840			DBG_PRINTF((category, RSM_ERR,
8841			    "rsm: devmap_setup failed %d\n", error));
8842		}
8843		cv_broadcast(&seg->s_cv);
8844		rsmseglock_release(seg);
8845		DBG_PRINTF((category, RSM_DEBUG_LVL2, "rsm_segmap done: %d\n",
8846		    error));
8847		return (error);
8848	} else {
8849		/*
8850		 * For loopback, the export segment mapping cookie (s_cookie)
8851		 * is also used as the s_cookie value for its import segments
8852		 * during mapping.
8853		 * Note that reference counting for s_cookie of the export
8854		 * segment is not required due to the following:
8855		 * We never have a case of the export segment being destroyed,
8856		 * leaving the import segments with a stale value for the
8857		 * s_cookie field, since a force disconnect is done prior to a
8858		 * destroy of an export segment. The force disconnect causes
8859		 * the s_cookie value to be reset to NULL. Also for the
8860		 * rsm_rebind operation, we change the s_cookie value of the
8861		 * export segment as well as of all its local (loopback)
8862		 * importers.
8863		 */
8864		DBG_ADDCATEGORY(category, RSM_LOOPBACK);
8865
8866		rsmsharelock_release(seg);
8867		/*
8868		 * In order to maintain the lock ordering between the export
8869		 * and import segment locks, we need to acquire the export
8870		 * segment lock first and only then acquire the import
8871		 * segment lock.
8872		 * The above is necessary to avoid any deadlock scenarios
8873		 * with rsm_rebind which also acquires both the export
8874		 * and import segment locks in the above mentioned order.
8875		 * Based on code inspection, there seem to be no other
8876		 * situations in which both the export and import segment
8877		 * locks are acquired either in the same or opposite order
8878		 * as mentioned above.
8879		 * Thus in order to conform to the above lock order, we
8880		 * need to change the state of the import segment to
8881		 * RSM_STATE_MAPPING, release the lock. Once this is done we
8882		 * can now safely acquire the export segment lock first
8883		 * followed by the import segment lock which is as per
8884		 * the lock order mentioned above.
8885		 */
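		/*
		 * In short, the lock-order rule applied in this block is:
		 * export segment lock first, then import segment lock;
		 * the RSM_STATE_MAPPING state keeps the unlocked import
		 * segment from being touched in the meantime.
		 */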
8886		/* move to an intermediate mapping state */
8887		seg->s_state = RSM_STATE_MAPPING;
8888		rsmseglock_release(seg);
8889
8890		eseg = rsmexport_lookup(seg->s_key);
8891
8892		if (eseg == NULL) {
8893			rsmseglock_acquire(seg);
8894			/*
8895			 * Revert to old_state and signal any waiters
8896			 * The shared state is not changed
8897			 */
8898
8899			seg->s_state = old_state;
8900			cv_broadcast(&seg->s_cv);
8901			rsmseglock_release(seg);
8902			DBG_PRINTF((category, RSM_DEBUG,
8903			    "rsm_segmap done: key %d not found\n", seg->s_key));
8904			return (ENODEV);
8905		}
8906
8907		rsmsharelock_acquire(seg);
8908		ASSERT(sharedp->rsmsi_state == RSMSI_STATE_CONNECTED ||
8909		    sharedp->rsmsi_state == RSMSI_STATE_MAPPED);
8910
8911		sharedp->rsmsi_mapcnt++;
8912		sharedp->rsmsi_state = RSMSI_STATE_MAPPED;
8913		rsmsharelock_release(seg);
8914
8915		ASSERT(eseg->s_cookie != NULL);
8916
8917		/*
8918		 * It is not required or necessary to acquire the import
8919		 * segment lock here to change the value of s_cookie since
8920		 * no one will touch the import segment as long as it is
8921		 * in the RSM_STATE_MAPPING state.
8922		 */
8923		seg->s_cookie = eseg->s_cookie;
8924
8925		rsmseglock_release(eseg);
8926
8927		error = devmap_setup(dev, (offset_t)off, as, addrp, (size_t)len,
8928		    prot, maxprot, flags, cred);
8929
8930		rsmseglock_acquire(seg);
8931		ASSERT(seg->s_state == RSM_STATE_MAPPING);
8932		if (error == 0) {
8933			seg->s_state = RSM_STATE_ACTIVE;
8934		} else {
8935			rsmsharelock_acquire(seg);
8936
8937			ASSERT(sharedp->rsmsi_state == RSMSI_STATE_MAPPED);
8938
8939			sharedp->rsmsi_mapcnt--;
8940			if (sharedp->rsmsi_mapcnt == 0) {
8941				sharedp->rsmsi_mapinfo = NULL;
8942				sharedp->rsmsi_state = RSMSI_STATE_CONNECTED;
8943			}
8944			rsmsharelock_release(seg);
8945			seg->s_state = old_state;
8946			seg->s_cookie = NULL;
8947		}
8948		cv_broadcast(&seg->s_cv);
8949		rsmseglock_release(seg);
8950		DBG_PRINTF((category, RSM_DEBUG_LVL2,
8951		    "rsm_segmap done: %d\n", error));
8952		return (error);
8953	}
8954}
8955
8956int
8957rsmka_null_seg_create(
8958    rsm_controller_handle_t argcp,
8959    rsm_memseg_export_handle_t *handle,
8960    size_t size,
8961    uint_t flags,
8962    rsm_memory_local_t *memory,
8963    rsm_resource_callback_t callback,
8964    rsm_resource_callback_arg_t callback_arg	/*ARGSUSED*/)
8965{
8966	return (RSM_SUCCESS);
8967}
8968
8969
8970int
8971rsmka_null_seg_destroy(
8972    rsm_memseg_export_handle_t argmemseg	/*ARGSUSED*/)
8973{
8974	return (RSM_SUCCESS);
8975}
8976
8977
8978int
8979rsmka_null_bind(
8980    rsm_memseg_export_handle_t argmemseg,
8981    off_t offset,
8982    rsm_memory_local_t *argmemory,
8983    rsm_resource_callback_t callback,
8984    rsm_resource_callback_arg_t callback_arg	/*ARGSUSED*/)
8985{
8986	return (RSM_SUCCESS);
8987}
8988
8989
8990int
8991rsmka_null_unbind(
8992    rsm_memseg_export_handle_t argmemseg,
8993    off_t offset,
8994    size_t length	/*ARGSUSED*/)
8995{
8996	return (DDI_SUCCESS);
8997}
8998
8999int
9000rsmka_null_rebind(
9001    rsm_memseg_export_handle_t argmemseg,
9002    off_t offset,
9003    rsm_memory_local_t *memory,
9004    rsm_resource_callback_t callback,
9005    rsm_resource_callback_arg_t callback_arg	/*ARGSUSED*/)
9006{
9007	return (RSM_SUCCESS);
9008}
9009
9010int
9011rsmka_null_publish(
9012    rsm_memseg_export_handle_t argmemseg,
9013    rsm_access_entry_t access_list[],
9014    uint_t access_list_length,
9015    rsm_memseg_id_t segment_id,
9016    rsm_resource_callback_t callback,
9017    rsm_resource_callback_arg_t callback_arg	/*ARGSUSED*/)
9018{
9019	return (RSM_SUCCESS);
9020}
9021
9022
9023int
9024rsmka_null_republish(
9025    rsm_memseg_export_handle_t memseg,
9026    rsm_access_entry_t access_list[],
9027    uint_t access_list_length,
9028    rsm_resource_callback_t callback,
9029    rsm_resource_callback_arg_t callback_arg	/*ARGSUSED*/)
9030{
9031	return (RSM_SUCCESS);
9032}
9033
9034int
9035rsmka_null_unpublish(
9036    rsm_memseg_export_handle_t argmemseg	/*ARGSUSED*/)
9037{
9038	return (RSM_SUCCESS);
9039}
9040
9041
9042void
9043rsmka_init_loopback()
9044{
9045	rsm_ops_t	*ops = &null_rsmpi_ops;
9046	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL | RSM_LOOPBACK);
9047
9048	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
9049	    "rsmka_init_loopback enter\n"));
9050
9051	/* initialize null ops vector */
9052	ops->rsm_seg_create = rsmka_null_seg_create;
9053	ops->rsm_seg_destroy = rsmka_null_seg_destroy;
9054	ops->rsm_bind = rsmka_null_bind;
9055	ops->rsm_unbind = rsmka_null_unbind;
9056	ops->rsm_rebind = rsmka_null_rebind;
9057	ops->rsm_publish = rsmka_null_publish;
9058	ops->rsm_unpublish = rsmka_null_unpublish;
9059	ops->rsm_republish = rsmka_null_republish;
9060
9061	/* initialize attributes for loopback adapter */
9062	loopback_attr.attr_name = loopback_str;
9063	loopback_attr.attr_page_size = 0x8; /* 8K */
9064
9065	/* initialize loopback adapter */
9066	loopback_adapter.rsm_attr = loopback_attr;
9067	loopback_adapter.rsmpi_ops = &null_rsmpi_ops;
9068	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
9069	    "rsmka_init_loopback done\n"));
9070}
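
/*
 * With the null ops vector installed, the loopback path is driven
 * through the same rsmpi_ops indirection as a real controller; a call
 * such as adapter->rsmpi_ops->rsm_publish(...) on the loopback adapter
 * resolves to rsmka_null_publish and simply returns RSM_SUCCESS, since
 * purely local connections need no transport-level work.
 */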
9071
9072/* ************** DR functions ********************************** */
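/*
 * Summary of the export-segment quiesce transitions handled below
 * (suspend side; rsm_unquiesce_exp_seg reverses them after DR):
 *
 *	RSM_STATE_NEW    -> RSM_STATE_NEW_QUIESCED
 *	RSM_STATE_BIND   -> RSM_STATE_BIND_QUIESCED	(pages unbound)
 *	RSM_STATE_EXPORT -> RSM_STATE_EXPORT_QUIESCING	(after in-flight
 *			    putv/getv on the segment drain)
 */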
9073static void
9074rsm_quiesce_exp_seg(rsmresource_t *resp)
9075{
9076	int		recheck_state;
9077	rsmseg_t	*segp = (rsmseg_t *)resp;
9078	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);
	DBG_DEFINE_STR(function, "rsm_quiesce_exp_seg");
9080
9081	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
9082	    "%s enter: key=%u\n", function, segp->s_key));
9083
9084	rsmseglock_acquire(segp);
9085	do {
9086		recheck_state = 0;
9087		if ((segp->s_state == RSM_STATE_NEW_QUIESCED) ||
9088		    (segp->s_state == RSM_STATE_BIND_QUIESCED) ||
9089		    (segp->s_state == RSM_STATE_EXPORT_QUIESCING) ||
9090		    (segp->s_state == RSM_STATE_EXPORT_QUIESCED)) {
9091			rsmseglock_release(segp);
9092			DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
9093			    "%s done:state =%d\n", function,
9094			    segp->s_state));
9095			return;
9096		}
9097
9098		if (segp->s_state == RSM_STATE_NEW) {
9099			segp->s_state = RSM_STATE_NEW_QUIESCED;
9100			rsmseglock_release(segp);
9101			DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
9102			    "%s done:state =%d\n", function,
9103			    segp->s_state));
9104			return;
9105		}
9106
9107		if (segp->s_state == RSM_STATE_BIND) {
9108			/* unbind */
9109			(void) rsm_unbind_pages(segp);
9110			segp->s_state = RSM_STATE_BIND_QUIESCED;
9111			rsmseglock_release(segp);
9112			DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
9113			    "%s done:state =%d\n", function,
9114			    segp->s_state));
9115			return;
9116		}
9117
9118		if (segp->s_state == RSM_STATE_EXPORT) {
9119			/*
9120			 * wait for putv/getv to complete if the segp is
9121			 * a local memory handle
9122			 */
9123			while ((segp->s_state == RSM_STATE_EXPORT) &&
9124			    (segp->s_rdmacnt != 0)) {
9125				cv_wait(&segp->s_cv, &segp->s_lock);
9126			}
9127
9128			if (segp->s_state != RSM_STATE_EXPORT) {
9129				/*
9130				 * state changed need to see what it
9131				 * should be changed to.
9132				 */
9133				recheck_state = 1;
9134				continue;
9135			}
9136
9137			segp->s_state = RSM_STATE_EXPORT_QUIESCING;
9138			rsmseglock_release(segp);
9139			/*
9140			 * send SUSPEND messages - currently it will be
9141			 * done at the end
9142			 */
9143			DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
9144			    "%s done:state =%d\n", function,
9145			    segp->s_state));
9146			return;
9147		}
9148	} while (recheck_state);
9149
9150	rsmseglock_release(segp);
9151}
9152
static void
rsm_unquiesce_exp_seg(rsmresource_t *resp)
{
	int			ret;
	rsmseg_t		*segp = (rsmseg_t *)resp;
	rsmapi_access_entry_t	*acl;
	rsm_access_entry_t	*rsmpi_acl;
	int			acl_len;
	int			create_flags = 0;
	struct buf		*xbuf;
	rsm_memory_local_t	mem;
	adapter_t		*adapter;
	dev_t			sdev = 0;
	rsm_resource_callback_t callback_flag;
	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);
	DBG_DEFINE_STR(function, "rsm_unquiesce_exp_seg");

	rsmseglock_acquire(segp);

	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
	    "%s enter: key=%u, state=%d\n", function, segp->s_key,
	    segp->s_state));

	if ((segp->s_state == RSM_STATE_NEW) ||
	    (segp->s_state == RSM_STATE_BIND) ||
	    (segp->s_state == RSM_STATE_EXPORT)) {
		rsmseglock_release(segp);
		DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "%s done:state=%d\n",
		    function, segp->s_state));
		return;
	}

	if (segp->s_state == RSM_STATE_NEW_QUIESCED) {
		segp->s_state = RSM_STATE_NEW;
		cv_broadcast(&segp->s_cv);
		rsmseglock_release(segp);
		DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "%s done:state=%d\n",
		    function, segp->s_state));
		return;
	}

	if (segp->s_state == RSM_STATE_BIND_QUIESCED) {
		/* bind the segment */
		ret = rsm_bind_pages(&segp->s_cookie, segp->s_region.r_vaddr,
		    segp->s_len, segp->s_proc);
		if (ret == RSM_SUCCESS) { /* bind successful */
			segp->s_state = RSM_STATE_BIND;
		} else { /* bind failed - resource unavailable */
			segp->s_state = RSM_STATE_NEW;
		}
		cv_broadcast(&segp->s_cv);
		rsmseglock_release(segp);
		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
		    "%s done: bind_qscd bind = %d\n", function, ret));
		return;
	}

	while (segp->s_state == RSM_STATE_EXPORT_QUIESCING) {
		/* wait for the segment to move to EXPORT_QUIESCED state */
		cv_wait(&segp->s_cv, &segp->s_lock);
	}

	if (segp->s_state == RSM_STATE_EXPORT_QUIESCED) {
		/* bind the segment */
		ret = rsm_bind_pages(&segp->s_cookie, segp->s_region.r_vaddr,
		    segp->s_len, segp->s_proc);

		if (ret != RSM_SUCCESS) {
			/* bind failed - resource unavailable */
			acl_len = segp->s_acl_len;
			acl = segp->s_acl;
			rsmpi_acl = segp->s_acl_in;
			segp->s_acl_len = 0;
			segp->s_acl = NULL;
			segp->s_acl_in = NULL;
			rsmseglock_release(segp);

			rsmexport_rm(segp);
			rsmacl_free(acl, acl_len);
			rsmpiacl_free(rsmpi_acl, acl_len);

			rsmseglock_acquire(segp);
			segp->s_state = RSM_STATE_NEW;
			cv_broadcast(&segp->s_cv);
			rsmseglock_release(segp);
			DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
			    "%s done: exp_qscd bind failed = %d\n",
			    function, ret));
			return;
		}
		/*
		 * Re-create and publish the segment: on success the state
		 * becomes RSM_STATE_EXPORT, on failure it falls back to
		 * RSM_STATE_BIND.
		 */

		/* check whether it is a local_memory_handle */
		if (segp->s_acl != (rsmapi_access_entry_t *)NULL) {
			if ((segp->s_acl[0].ae_node == my_nodeid) &&
			    (segp->s_acl[0].ae_permission == 0)) {
				segp->s_state = RSM_STATE_EXPORT;
				cv_broadcast(&segp->s_cv);
				rsmseglock_release(segp);
				DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
				    "%s done:exp_qscd\n", function));
				return;
			}
		}
		xbuf = ddi_umem_iosetup(segp->s_cookie, 0, segp->s_len,
		    B_WRITE, sdev, 0, NULL, DDI_UMEM_SLEEP);
		ASSERT(xbuf != NULL);

		mem.ms_type = RSM_MEM_BUF;
		mem.ms_bp = xbuf;

		adapter = segp->s_adapter;

		if (segp->s_flags & RSMKA_ALLOW_UNBIND_REBIND) {
			create_flags = RSM_ALLOW_UNBIND_REBIND;
		}

		if (segp->s_flags & RSMKA_SET_RESOURCE_DONTWAIT) {
			callback_flag = RSM_RESOURCE_DONTWAIT;
		} else {
			callback_flag = RSM_RESOURCE_SLEEP;
		}

		ret = adapter->rsmpi_ops->rsm_seg_create(
		    adapter->rsmpi_handle, &segp->s_handle.out,
		    segp->s_len, create_flags, &mem,
		    callback_flag, NULL);

		if (ret != RSM_SUCCESS) {
			acl_len = segp->s_acl_len;
			acl = segp->s_acl;
			rsmpi_acl = segp->s_acl_in;
			segp->s_acl_len = 0;
			segp->s_acl = NULL;
			segp->s_acl_in = NULL;
			rsmseglock_release(segp);

			rsmexport_rm(segp);
			rsmacl_free(acl, acl_len);
			rsmpiacl_free(rsmpi_acl, acl_len);

			rsmseglock_acquire(segp);
			segp->s_state = RSM_STATE_BIND;
			cv_broadcast(&segp->s_cv);
			rsmseglock_release(segp);
			DBG_PRINTF((category, RSM_ERR,
			    "%s done: exp_qscd create failed = %d\n",
			    function, ret));
			return;
		}

		ret = adapter->rsmpi_ops->rsm_publish(
		    segp->s_handle.out, segp->s_acl_in, segp->s_acl_len,
		    segp->s_segid, RSM_RESOURCE_DONTWAIT, NULL);

		if (ret != RSM_SUCCESS) {
			acl_len = segp->s_acl_len;
			acl = segp->s_acl;
			rsmpi_acl = segp->s_acl_in;
			segp->s_acl_len = 0;
			segp->s_acl = NULL;
			segp->s_acl_in = NULL;
			adapter->rsmpi_ops->rsm_seg_destroy(segp->s_handle.out);
			rsmseglock_release(segp);

			rsmexport_rm(segp);
			rsmacl_free(acl, acl_len);
			rsmpiacl_free(rsmpi_acl, acl_len);

			rsmseglock_acquire(segp);
			segp->s_state = RSM_STATE_BIND;
			cv_broadcast(&segp->s_cv);
			rsmseglock_release(segp);
			DBG_PRINTF((category, RSM_ERR,
			    "%s done: exp_qscd publish failed = %d\n",
			    function, ret));
			return;
		}

		segp->s_state = RSM_STATE_EXPORT;
		cv_broadcast(&segp->s_cv);
		rsmseglock_release(segp);
		DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "%s done: exp_qscd\n",
		    function));
		return;
	}

	rsmseglock_release(segp);

	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "%s done\n", function));
}
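
/*
 * rsm_quiesce_imp_seg: DR pre-delete handler for an import segment.
 * Marks the segment with RSM_DR_INPROGRESS so that new putv/getv
 * operations block, then waits for any RDMA already in flight
 * (s_rdmacnt) to drain before the memory delete proceeds.
 */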
static void
rsm_quiesce_imp_seg(rsmresource_t *resp)
{
	rsmseg_t	*segp = (rsmseg_t *)resp;
	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);
	DBG_DEFINE_STR(function, "rsm_quiesce_imp_seg");

	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
	    "%s enter: key=%u\n", function, segp->s_key));

	rsmseglock_acquire(segp);
	segp->s_flags |= RSM_DR_INPROGRESS;

	while (segp->s_rdmacnt != 0) {
		/* wait for the RDMA to complete */
		cv_wait(&segp->s_cv, &segp->s_lock);
	}

	rsmseglock_release(segp);

	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "%s done\n", function));
}
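
/*
 * rsm_unquiesce_imp_seg: DR post-delete handler for an import segment.
 * Clears RSM_DR_INPROGRESS and wakes up any putv/getv operations that
 * blocked on the flag while the memory delete was in progress.
 */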
static void
rsm_unquiesce_imp_seg(rsmresource_t *resp)
{
	rsmseg_t	*segp = (rsmseg_t *)resp;
	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);
	DBG_DEFINE_STR(function, "rsm_unquiesce_imp_seg");

	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
	    "%s enter: key=%u\n", function, segp->s_key));

	rsmseglock_acquire(segp);

	segp->s_flags &= ~RSM_DR_INPROGRESS;
	/* wake up any waiting putv/getv ops */
	cv_broadcast(&segp->s_cv);

	rsmseglock_release(segp);

	DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "%s done\n", function));
}
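
/*
 * Helpers that dispatch a DR event to the appropriate quiesce or
 * unquiesce routine, depending on whether the resource is an export
 * or an import segment.
 */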
static void
rsm_process_exp_seg(rsmresource_t *resp, int event)
{
	if (event == RSM_DR_QUIESCE)
		rsm_quiesce_exp_seg(resp);
	else /* UNQUIESCE */
		rsm_unquiesce_exp_seg(resp);
}

static void
rsm_process_imp_seg(rsmresource_t *resp, int event)
{
	if (event == RSM_DR_QUIESCE)
		rsm_quiesce_imp_seg(resp);
	else /* UNQUIESCE */
		rsm_unquiesce_imp_seg(resp);
}
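
/*
 * rsm_dr_process_local_segments: walk the global resource table under
 * rsm_resource.rsmrc_lock (as reader) and apply the given DR event
 * (RSM_DR_QUIESCE or RSM_DR_UNQUIESCE) to every valid export and
 * import segment. Slots that are NULL or RSMRC_RESERVED are skipped.
 */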
static void
rsm_dr_process_local_segments(int event)
{
	int i, j;
	rsmresource_blk_t	*blk;
	rsmresource_t		*p;
	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);

	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
	    "rsm_dr_process_local_segments enter\n"));

	/* iterate through the resource structure */

	rw_enter(&rsm_resource.rsmrc_lock, RW_READER);

	for (i = 0; i < rsm_resource.rsmrc_len; i++) {
		blk = rsm_resource.rsmrc_root[i];
		if (blk != NULL) {
			for (j = 0; j < RSMRC_BLKSZ; j++) {
				p = blk->rsmrcblk_blks[j];
				if ((p != NULL) && (p != RSMRC_RESERVED)) {
					/* valid resource */
					if (p->rsmrc_type ==
					    RSM_RESOURCE_EXPORT_SEGMENT)
						rsm_process_exp_seg(p, event);
					else if (p->rsmrc_type ==
					    RSM_RESOURCE_IMPORT_SEGMENT)
						rsm_process_imp_seg(p, event);
				}
			}
		}
	}

	rw_exit(&rsm_resource.rsmrc_lock);

	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
	    "rsm_dr_process_local_segments done\n"));
}

/* *************** DR callback functions ************ */
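
/*
 * The callbacks below implement the kernel agent's side of the memory
 * DR protocol. They drive rsm_drv_data.drv_state through the following
 * sequence (all transitions are made under drv_lock and announced via
 * drv_cv):
 *
 *	RSM_DRV_OK
 *	  --pre-del-->			RSM_DRV_PREDEL_STARTED
 *	  --all suspend_completes received (exporter_quiesce)-->
 *					RSM_DRV_PREDEL_COMPLETED
 *	  --pre-del resumes-->		RSM_DRV_DR_IN_PROGRESS
 *	  --final post-del-->		RSM_DRV_POSTDEL_IN_PROGRESS
 *	  --resume sent-->		RSM_DRV_OK
 *
 * Overlapping memory deletes are tracked in drv_memdel_cnt: additional
 * pre-dels arriving in RSM_DRV_DR_IN_PROGRESS just bump the count, and
 * only the post-del that drops it back to zero unquiesces. These
 * callbacks are presumably registered with the kernel memory DR
 * framework (kphysm_setup_func_register) elsewhere in this file.
 */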
/* ARGSUSED */
static void
rsm_dr_callback_post_add(void *arg, pgcnt_t delta)
{
	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);
	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
	    "rsm_dr_callback_post_add is a no-op\n"));
	/* Noop: added memory requires no quiesce/unquiesce work */
}
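
/*
 * rsm_dr_callback_pre_del: called by the DR framework before memory is
 * deleted. Once the driver reaches RSM_DRV_OK it moves to
 * RSM_DRV_PREDEL_STARTED, quiesces all local export/import segments,
 * and sends SUSPEND messages to the importing nodes. It then waits for
 * exporter_quiesce to move the state to RSM_DRV_PREDEL_COMPLETED (all
 * suspend_complete messages received) before declaring
 * RSM_DRV_DR_IN_PROGRESS. A pre-del that arrives while a delete is
 * already in progress merely increments drv_memdel_cnt.
 */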
/* ARGSUSED */
static int
rsm_dr_callback_pre_del(void *arg, pgcnt_t delta)
{
	int	recheck_state = 0;
	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);

	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
	    "rsm_dr_callback_pre_del enter\n"));

	mutex_enter(&rsm_drv_data.drv_lock);

	do {
		recheck_state = 0;
		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
		    "rsm_dr_callback_pre_del:state=%d\n",
		    rsm_drv_data.drv_state));

		switch (rsm_drv_data.drv_state) {
		case RSM_DRV_NEW:
			/*
			 * The state should never be RSM_DRV_NEW here,
			 * since the callbacks have not yet been
			 * registered in that state. So, ASSERT.
			 */
			ASSERT(0);
			mutex_exit(&rsm_drv_data.drv_lock);
			return (0);
		case RSM_DRV_REG_PROCESSING:
			/*
			 * The driver is in the process of registering
			 * with the DR framework. Wait until the
			 * registration process is complete.
			 */
			recheck_state = 1;
			cv_wait(&rsm_drv_data.drv_cv, &rsm_drv_data.drv_lock);
			break;
		case RSM_DRV_UNREG_PROCESSING:
			/*
			 * If the state is RSM_DRV_UNREG_PROCESSING, the
			 * module is in the process of detaching and
			 * unregistering the callbacks from the DR
			 * framework. So, simply return.
			 */
			mutex_exit(&rsm_drv_data.drv_lock);
			DBG_PRINTF((category, RSM_DEBUG,
			    "rsm_dr_callback_pre_del:"
			    "pre-del on NEW/UNREG\n"));
			return (0);
		case RSM_DRV_OK:
			rsm_drv_data.drv_state = RSM_DRV_PREDEL_STARTED;
			break;
		case RSM_DRV_PREDEL_STARTED:
			/* FALLTHRU */
		case RSM_DRV_PREDEL_COMPLETED:
			/* FALLTHRU */
		case RSM_DRV_POSTDEL_IN_PROGRESS:
			recheck_state = 1;
			cv_wait(&rsm_drv_data.drv_cv, &rsm_drv_data.drv_lock);
			break;
		case RSM_DRV_DR_IN_PROGRESS:
			rsm_drv_data.drv_memdel_cnt++;
			mutex_exit(&rsm_drv_data.drv_lock);
			DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
			    "rsm_dr_callback_pre_del done\n"));
			return (0);
		default:
			ASSERT(0);
			break;
		}

	} while (recheck_state);

	rsm_drv_data.drv_memdel_cnt++;

	mutex_exit(&rsm_drv_data.drv_lock);

	/* Quiesce all the local segments now */
	DBG_PRINTF((category, RSM_DEBUG,
	    "rsm_dr_callback_pre_del: quiesce things now\n"));

	rsm_dr_process_local_segments(RSM_DR_QUIESCE);

	/*
	 * now that all local segments have been quiesced, inform
	 * the importers
	 */
	rsm_send_suspend();

	/*
	 * In response to the suspend message the remote node(s) will
	 * process the segments and send a suspend_complete message. We
	 * wait in the RSM_DRV_PREDEL_STARTED state until all the nodes
	 * have sent their suspend_complete message; the exporter_quiesce
	 * function then transitions us to RSM_DRV_PREDEL_COMPLETED.
	 */
	mutex_enter(&rsm_drv_data.drv_lock);

	while (rsm_drv_data.drv_state == RSM_DRV_PREDEL_STARTED) {
		cv_wait(&rsm_drv_data.drv_cv, &rsm_drv_data.drv_lock);
	}

	ASSERT(rsm_drv_data.drv_state == RSM_DRV_PREDEL_COMPLETED);

	rsm_drv_data.drv_state = RSM_DRV_DR_IN_PROGRESS;
	cv_broadcast(&rsm_drv_data.drv_cv);

	mutex_exit(&rsm_drv_data.drv_lock);

	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
	    "rsm_dr_callback_pre_del done\n"));

	return (0);
}
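
/*
 * rsm_dr_callback_post_del: called by the DR framework after a memory
 * delete completes (or is cancelled). Decrements drv_memdel_cnt; the
 * call that drops it to zero moves the state to
 * RSM_DRV_POSTDEL_IN_PROGRESS, unquiesces all local segments, sends
 * RESUME messages to the importers, and finally restores RSM_DRV_OK.
 */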
/* ARGSUSED */
static void
rsm_dr_callback_post_del(void *arg, pgcnt_t delta, int cancelled)
{
	int	recheck_state = 0;
	DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);

	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
	    "rsm_dr_callback_post_del enter\n"));

	mutex_enter(&rsm_drv_data.drv_lock);

	do {
		recheck_state = 0;
		DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
		    "rsm_dr_callback_post_del:state=%d\n",
		    rsm_drv_data.drv_state));

		switch (rsm_drv_data.drv_state) {
		case RSM_DRV_NEW:
			/*
			 * The driver state cannot be RSM_DRV_NEW
			 * since in this state the callbacks have not
			 * yet been registered.
			 */
			ASSERT(0);
			mutex_exit(&rsm_drv_data.drv_lock);
			return;
		case RSM_DRV_REG_PROCESSING:
			/*
			 * The driver is in the process of registering with
			 * the DR framework. Wait until the registration is
			 * complete.
			 */
			recheck_state = 1;
			cv_wait(&rsm_drv_data.drv_cv, &rsm_drv_data.drv_lock);
			break;
		case RSM_DRV_UNREG_PROCESSING:
			/*
			 * RSM_DRV_UNREG_PROCESSING state means the module
			 * is detaching and unregistering the callbacks
			 * from the DR framework. So simply return.
			 */
			/* FALLTHRU */
		case RSM_DRV_OK:
			/*
			 * RSM_DRV_OK means we missed the pre-del
			 * corresponding to this post-del because we had
			 * not registered yet, so simply return.
			 */
			mutex_exit(&rsm_drv_data.drv_lock);
			DBG_PRINTF((category, RSM_DEBUG,
			    "rsm_dr_callback_post_del:"
			    "post-del on OK/UNREG\n"));
			return;
		case RSM_DRV_PREDEL_STARTED:
			/* FALLTHRU */
		case RSM_DRV_PREDEL_COMPLETED:
			/* FALLTHRU */
		case RSM_DRV_POSTDEL_IN_PROGRESS:
			recheck_state = 1;
			cv_wait(&rsm_drv_data.drv_cv, &rsm_drv_data.drv_lock);
			break;
		case RSM_DRV_DR_IN_PROGRESS:
			rsm_drv_data.drv_memdel_cnt--;
			if (rsm_drv_data.drv_memdel_cnt > 0) {
				/* more deletes outstanding; nothing to do */
				mutex_exit(&rsm_drv_data.drv_lock);
				DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
				    "rsm_dr_callback_post_del done\n"));
				return;
			}
			rsm_drv_data.drv_state = RSM_DRV_POSTDEL_IN_PROGRESS;
			break;
		default:
			ASSERT(0);
			mutex_exit(&rsm_drv_data.drv_lock);
			return;
		}
	} while (recheck_state);

	mutex_exit(&rsm_drv_data.drv_lock);

	/* Unquiesce all the local segments now */
	DBG_PRINTF((category, RSM_DEBUG,
	    "rsm_dr_callback_post_del: unquiesce things now\n"));

	rsm_dr_process_local_segments(RSM_DR_UNQUIESCE);

	/*
	 * now that all local segments have been unquiesced, inform
	 * the importers
	 */
	rsm_send_resume();

	mutex_enter(&rsm_drv_data.drv_lock);

	rsm_drv_data.drv_state = RSM_DRV_OK;

	cv_broadcast(&rsm_drv_data.drv_cv);

	mutex_exit(&rsm_drv_data.drv_lock);

	DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
	    "rsm_dr_callback_post_del done\n"));
}