rdsv3_impl.c revision 12198:4db936bda957
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24#include <sys/types.h>
25#include <sys/stream.h>
26#include <sys/dlpi.h>
27#include <sys/stropts.h>
28#include <sys/strsun.h>
29#include <sys/sysmacros.h>
30#include <sys/strlog.h>
31#include <sys/ddi.h>
32#include <sys/cmn_err.h>
33#include <sys/socket.h>
34#include <net/if.h>
35#include <net/if_types.h>
36#include <netinet/in.h>
37#include <sys/ethernet.h>
38#include <inet/arp.h>
39#include <inet/ip.h>
40#include <inet/ip6.h>
41#include <inet/ip_ire.h>
42#include <inet/ip_if.h>
43#include <inet/ip_ftable.h>
44
45#include <sys/sunddi.h>
46#include <sys/ksynch.h>
47
48#include <sys/rds.h>
49#include <sys/socket.h>
50#include <sys/socketvar.h>
51#include <sys/sockio.h>
52#include <sys/sysmacros.h>
53#include <inet/common.h>
54#include <inet/ip.h>
55#include <net/if_types.h>
56
57#include <sys/ib/clients/rdsv3/rdsv3.h>
58#include <sys/ib/clients/rdsv3/rdma.h>
59#include <sys/ib/clients/rdsv3/ib.h>
60#include <sys/ib/clients/rdsv3/rdsv3_impl.h>
61#include <sys/ib/clients/rdsv3/rdsv3_debug.h>
62
63#include <sys/dls.h>
64#include <sys/mac.h>
65#include <sys/mac_client.h>
66#include <sys/mac_provider.h>
67#include <sys/mac_client_priv.h>
68
/*
 * Task queue the work-queue worker is dispatched on; created in
 * rdsv3_create_task_workqueue() and destroyed in
 * rdsv3_destroy_task_workqueue().
 */
ddi_taskq_t		*rdsv3_taskq = NULL;
/* kmem cache that rdsv3_sk_alloc()/rdsv3_sock_exit_data() use for rsocks */
extern kmem_cache_t	*rdsv3_alloc_cache;

/* Checksum primitive defined elsewhere; used by rdsv3_ip_fast_csum() */
extern unsigned int 	ip_ocsum(ushort_t *address, int halfword_count,
    unsigned int sum);
74
75/*
76 * Check if the IP interface named by `lifrp' is RDS-capable.
77 */
78boolean_t
79rdsv3_capable_interface(struct lifreq *lifrp)
80{
81	char	ifname[LIFNAMSIZ];
82	char	drv[MAXLINKNAMELEN];
83	uint_t	ppa;
84	char 	*cp;
85
86	RDSV3_DPRINTF4("rdsv3_capable_interface", "Enter");
87
88	if (lifrp->lifr_type == IFT_IB)
89		return (B_TRUE);
90
91	/*
92	 * Strip off the logical interface portion before getting
93	 * intimate with the name.
94	 */
95	(void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ);
96	if ((cp = strchr(ifname, ':')) != NULL)
97		*cp = '\0';
98
99	if (strcmp("lo0", ifname) == 0) {
100		/*
101		 * loopback is considered RDS-capable
102		 */
103		return (B_TRUE);
104	}
105
106	return (ddi_parse(ifname, drv, &ppa) == DDI_SUCCESS &&
107	    rdsv3_if_lookup_by_name(drv));
108}
109
110int
111rdsv3_do_ip_ioctl(ksocket_t so4, void **ipaddrs, int *size, int *nifs)
112{
113	struct lifnum		lifn;
114	struct lifconf		lifc;
115	struct lifreq		*lp, *rlp, lifr;
116	int			rval = 0;
117	int			numifs;
118	int			bufsize, rbufsize;
119	void			*buf, *rbuf;
120	int			i, j, n, rc;
121
122	*ipaddrs = NULL;
123	*size = 0;
124	*nifs = 0;
125
126	RDSV3_DPRINTF4("rdsv3_do_ip_ioctl", "Enter");
127
128retry_count:
129	/* snapshot the current number of interfaces */
130	lifn.lifn_family = PF_UNSPEC;
131	lifn.lifn_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES;
132	lifn.lifn_count = 0;
133	rval = ksocket_ioctl(so4, SIOCGLIFNUM, (intptr_t)&lifn, &rval,
134	    CRED());
135	if (rval != 0) {
136		RDSV3_DPRINTF2("rdsv3_do_ip_ioctl",
137		    "ksocket_ioctl returned: %d", rval);
138		return (rval);
139	}
140
141	numifs = lifn.lifn_count;
142	if (numifs <= 0) {
143		RDSV3_DPRINTF2("rdsv3_do_ip_ioctl", "No interfaces found");
144		return (0);
145	}
146
147	/* allocate extra room in case more interfaces appear */
148	numifs += 10;
149
150	/* get the interface names and ip addresses */
151	bufsize = numifs * sizeof (struct lifreq);
152	buf = kmem_alloc(bufsize, KM_SLEEP);
153
154	lifc.lifc_family = AF_UNSPEC;
155	lifc.lifc_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES;
156	lifc.lifc_len = bufsize;
157	lifc.lifc_buf = buf;
158	rc = ksocket_ioctl(so4, SIOCGLIFCONF, (intptr_t)&lifc, &rval, CRED());
159	if (rc != 0) {
160		RDSV3_DPRINTF2("rdsv3_do_ip_ioctl", "SIOCGLIFCONF failed");
161		kmem_free(buf, bufsize);
162		return (rc);
163	}
164	/* if our extra room is used up, try again */
165	if (bufsize <= lifc.lifc_len) {
166		kmem_free(buf, bufsize);
167		buf = NULL;
168		goto retry_count;
169	}
170	/* calc actual number of ifconfs */
171	n = lifc.lifc_len / sizeof (struct lifreq);
172
173	/*
174	 * Count the RDS interfaces
175	 */
176	for (i = 0, j = 0, lp = lifc.lifc_req; i < n; i++, lp++) {
177
178		/*
179		 * Copy as the SIOCGLIFFLAGS ioctl is destructive
180		 */
181		bcopy(lp, &lifr, sizeof (struct lifreq));
182		/*
183		 * fetch the flags using the socket of the correct family
184		 */
185		switch (lifr.lifr_addr.ss_family) {
186		case AF_INET:
187			rc = ksocket_ioctl(so4, SIOCGLIFFLAGS, (intptr_t)&lifr,
188			    &rval, CRED());
189			break;
190		default:
191			continue;
192		}
193
194		if (rc != 0) continue;
195
196		/*
197		 * If we got the flags, skip uninteresting
198		 * interfaces based on flags
199		 */
200		if ((lifr.lifr_flags & IFF_UP) != IFF_UP)
201			continue;
202		if (lifr.lifr_flags &
203		    (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED))
204			continue;
205		if (!rdsv3_capable_interface(&lifr))
206			continue;
207		j++;
208	}
209
210	if (j <= 0) {
211		RDSV3_DPRINTF2("rdsv3_do_ip_ioctl", "No RDS interfaces");
212		kmem_free(buf, bufsize);
213		return (rval);
214	}
215
216	numifs = j;
217
218	/* This is the buffer we pass back */
219	rbufsize = numifs * sizeof (struct lifreq);
220	rbuf = kmem_alloc(rbufsize, KM_SLEEP);
221	rlp = (struct lifreq *)rbuf;
222
223	/*
224	 * Examine the array of interfaces and filter uninteresting ones
225	 */
226	for (i = 0, lp = lifc.lifc_req; i < n; i++, lp++) {
227
228		/*
229		 * Copy the address as the SIOCGLIFFLAGS ioctl is destructive
230		 */
231		bcopy(lp, &lifr, sizeof (struct lifreq));
232		/*
233		 * fetch the flags using the socket of the correct family
234		 */
235		switch (lifr.lifr_addr.ss_family) {
236		case AF_INET:
237			rc = ksocket_ioctl(so4, SIOCGLIFFLAGS, (intptr_t)&lifr,
238			    &rval, CRED());
239			break;
240		default:
241			continue;
242		}
243
244
245		if (rc != 0) {
246			RDSV3_DPRINTF2("rdsv3_do_ip_ioctl",
247			    "ksocket_ioctl failed" " for %s", lifr.lifr_name);
248			continue;
249		}
250
251		/*
252		 * If we got the flags, skip uninteresting
253		 * interfaces based on flags
254		 */
255		if ((lifr.lifr_flags & IFF_UP) != IFF_UP)
256			continue;
257		if (lifr.lifr_flags &
258		    (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED))
259			continue;
260		if (!rdsv3_capable_interface(&lifr))
261			continue;
262
263		/* save the record */
264		bcopy(lp, rlp, sizeof (struct lifreq));
265		rlp++;
266	}
267
268	kmem_free(buf, bufsize);
269
270	*ipaddrs = rbuf;
271	*size = rbufsize;
272	*nifs = numifs;
273
274	RDSV3_DPRINTF4("rdsv3_do_ip_ioctl", "Return");
275
276	return (rval);
277}
278
279/*
280 * Check if the IP interface named by `ifrp' is RDS-capable.
281 */
282boolean_t
283rdsv3_capable_interface_old(struct ifreq *ifrp)
284{
285	char	ifname[IFNAMSIZ];
286	char	drv[MAXLINKNAMELEN];
287	uint_t	ppa;
288	char 	*cp;
289
290	RDSV3_DPRINTF4("rdsv3_capable_interface_old", "Enter");
291
292	/*
293	 * Strip off the logical interface portion before getting
294	 * intimate with the name.
295	 */
296	(void) strlcpy(ifname, ifrp->ifr_name, IFNAMSIZ);
297	if ((cp = strchr(ifname, ':')) != NULL)
298		*cp = '\0';
299
300	RDSV3_DPRINTF4("rdsv3_capable_interface_old", "ifname: %s", ifname);
301
302	if ((strcmp("lo0", ifname) == 0) ||
303	    (strncmp("ibd", ifname, 3) == 0)) {
304		/*
305		 * loopback and IB are considered RDS-capable
306		 */
307		return (B_TRUE);
308	}
309
310	return (ddi_parse(ifname, drv, &ppa) == DDI_SUCCESS &&
311	    rdsv3_if_lookup_by_name(drv));
312}
313
314int
315rdsv3_do_ip_ioctl_old(ksocket_t so4, void **ipaddrs, int *size, int *nifs)
316{
317	uint_t			ifn;
318	struct ifconf		ifc;
319	struct ifreq		*lp, *rlp, ifr;
320	int			rval = 0;
321	int			numifs;
322	int			bufsize, rbufsize;
323	void			*buf, *rbuf;
324	int			i, j, n, rc;
325
326	*ipaddrs = NULL;
327	*size = 0;
328	*nifs = 0;
329
330	RDSV3_DPRINTF4("rdsv3_do_ip_ioctl_old", "Enter");
331
332retry_count:
333	rval = ksocket_ioctl(so4, SIOCGIFNUM, (intptr_t)&ifn, &rval,
334	    CRED());
335	if (rval != 0) {
336		RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old",
337		    "ksocket_ioctl(SIOCGIFNUM) returned: %d", rval);
338		return (rval);
339	}
340
341	numifs = ifn;
342	if (numifs <= 0) {
343		RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", "No interfaces found");
344		return (0);
345	}
346
347	/* allocate extra room in case more interfaces appear */
348	numifs += 10;
349
350	/* get the interface names and ip addresses */
351	bufsize = numifs * sizeof (struct ifreq);
352	buf = kmem_alloc(bufsize, KM_SLEEP);
353
354	ifc.ifc_len = bufsize;
355	ifc.ifc_buf = buf;
356	rc = ksocket_ioctl(so4, SIOCGIFCONF, (intptr_t)&ifc, &rval, CRED());
357	if (rc != 0) {
358		RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old",
359		    "SIOCGLIFCONF failed: %d", rc);
360		kmem_free(buf, bufsize);
361		return (rc);
362	}
363	/* if our extra room is used up, try again */
364	if (bufsize <= ifc.ifc_len) {
365		kmem_free(buf, bufsize);
366		buf = NULL;
367		goto retry_count;
368	}
369	/* calc actual number of ifconfs */
370	n = ifc.ifc_len / sizeof (struct ifreq);
371
372	/*
373	 * Count the RDS interfaces
374	 */
375	for (i = 0, j = 0, lp = ifc.ifc_req; i < n; i++, lp++) {
376
377		/*
378		 * Copy as the SIOCGIFFLAGS ioctl is destructive
379		 */
380		bcopy(lp, &ifr, sizeof (struct ifreq));
381		/*
382		 * fetch the flags using the socket of the correct family
383		 */
384		switch (ifr.ifr_addr.sa_family) {
385		case AF_INET:
386			rc = ksocket_ioctl(so4, SIOCGIFFLAGS, (intptr_t)&ifr,
387			    &rval, CRED());
388			break;
389		default:
390			continue;
391		}
392
393		if (rc != 0) continue;
394
395		RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old",
396		    "1. ifr_name: %s, flags: %d", ifr.ifr_name,
397		    (ushort_t)ifr.ifr_flags);
398
399		/*
400		 * If we got the flags, skip uninteresting
401		 * interfaces based on flags
402		 */
403		if ((((ushort_t)ifr.ifr_flags) & IFF_UP) != IFF_UP)
404			continue;
405		RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old",
406		    "2. ifr_name: %s, flags: %d", ifr.ifr_name,
407		    (ushort_t)ifr.ifr_flags);
408		if (((ushort_t)ifr.ifr_flags) &
409		    (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED))
410			continue;
411		RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old",
412		    "3. ifr_name: %s, flags: %d", ifr.ifr_name,
413		    (ushort_t)ifr.ifr_flags);
414		if (!rdsv3_capable_interface_old(&ifr))
415			continue;
416		RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old",
417		    "4. ifr_name: %s, flags: %d", ifr.ifr_name,
418		    (ushort_t)ifr.ifr_flags);
419		j++;
420	}
421
422	if (j <= 0) {
423		RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", "No RDS interfaces");
424		kmem_free(buf, bufsize);
425		return (rval);
426	}
427
428	numifs = j;
429
430	/* This is the buffer we pass back */
431	rbufsize = numifs * sizeof (struct ifreq);
432	rbuf = kmem_alloc(rbufsize, KM_SLEEP);
433	rlp = (struct ifreq *)rbuf;
434
435	/*
436	 * Examine the array of interfaces and filter uninteresting ones
437	 */
438	for (i = 0, lp = ifc.ifc_req; i < n; i++, lp++) {
439
440		/*
441		 * Copy the address as the SIOCGIFFLAGS ioctl is destructive
442		 */
443		bcopy(lp, &ifr, sizeof (struct ifreq));
444		/*
445		 * fetch the flags using the socket of the correct family
446		 */
447		switch (ifr.ifr_addr.sa_family) {
448		case AF_INET:
449			rc = ksocket_ioctl(so4, SIOCGIFFLAGS, (intptr_t)&ifr,
450			    &rval, CRED());
451			break;
452		default:
453			continue;
454		}
455
456
457		if (rc != 0) {
458			RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old",
459			    "ksocket_ioctl failed: %d for %s",
460			    rc, ifr.ifr_name);
461			continue;
462		}
463
464		/*
465		 * If we got the flags, skip uninteresting
466		 * interfaces based on flags
467		 */
468		if ((((ushort_t)ifr.ifr_flags) & IFF_UP) != IFF_UP)
469			continue;
470		if (((ushort_t)ifr.ifr_flags) &
471		    (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED))
472			continue;
473		if (!rdsv3_capable_interface_old(&ifr))
474			continue;
475
476		/* save the record */
477		bcopy(lp, rlp, sizeof (struct ifreq));
478		rlp++;
479	}
480
481	kmem_free(buf, bufsize);
482
483	*ipaddrs = rbuf;
484	*size = rbufsize;
485	*nifs = numifs;
486
487	RDSV3_DPRINTF4("rdsv3_do_ip_ioctl_old", "Return");
488
489	return (rval);
490}
491
492boolean_t
493rdsv3_isloopback(ipaddr_t addr)
494{
495	ip_stack_t *ipst;
496
497	ipst = netstack_find_by_zoneid(GLOBAL_ZONEID)->netstack_ip;
498	ASSERT(ipst != NULL);
499	if (ip_type_v4(addr, ipst) != IRE_LOOPBACK) {
500		netstack_rele(ipst->ips_netstack);
501		return (B_FALSE);
502	}
503	netstack_rele(ipst->ips_netstack);
504	return (B_TRUE);
505}
506
507/*
508 * Work Queue Implementation
509 */
510
/*
 * Values of wq_state:
 *   IDLE     - no worker dispatched; rdsv3_queue_work() must dispatch one
 *   RUNNING  - a worker has been dispatched and will pick up queued work
 *   FLUSHING - rdsv3_flush_workqueue() is waiting for the worker to drain
 *   EXITING  - rdsv3_destroy_task_workqueue() is tearing the queue down
 */
#define	RDSV3_WQ_THREAD_IDLE		0
#define	RDSV3_WQ_THREAD_RUNNING		1
#define	RDSV3_WQ_THREAD_FLUSHING	2
#define	RDSV3_WQ_THREAD_EXITING		3
515
516/* worker thread */
517void
518rdsv3_worker_thread(void *arg)
519{
520	rdsv3_workqueue_struct_t *wq = arg;
521	rdsv3_work_t *work;
522
523	RDSV3_DPRINTF4("rdsv3_worker_thread", "Enter(wq: 0x%p)", wq);
524
525	mutex_enter(&wq->wq_lock);
526	work = list_remove_head(&wq->wq_queue);
527	while (work) {
528		mutex_exit(&wq->wq_lock);
529
530		/* process work */
531		work->func(work);
532
533		mutex_enter(&wq->wq_lock);
534		work = list_remove_head(&wq->wq_queue);
535	}
536
537	/* No more work, go home, until called again */
538	if (wq->wq_state != RDSV3_WQ_THREAD_EXITING) {
539		wq->wq_state = RDSV3_WQ_THREAD_IDLE;
540	}
541	mutex_exit(&wq->wq_lock);
542
543	RDSV3_DPRINTF4("rdsv3_worker_thread", "Return(wq: 0x%p)", wq);
544}
545
546/* XXX */
547void
548rdsv3_flush_workqueue(rdsv3_workqueue_struct_t *wq)
549{
550	RDSV3_DPRINTF4("rdsv3_flush_workqueue", "Enter(wq: %p)", wq);
551
552	mutex_enter(&wq->wq_lock);
553	switch (wq->wq_state) {
554	case RDSV3_WQ_THREAD_IDLE:
555		/* nothing to do */
556		ASSERT(list_is_empty(&wq->wq_queue));
557		break;
558
559	case RDSV3_WQ_THREAD_RUNNING:
560		wq->wq_state = RDSV3_WQ_THREAD_FLUSHING;
561		/* FALLTHRU */
562	case RDSV3_WQ_THREAD_FLUSHING:
563		/* already flushing, wait until the flushing is complete */
564		do {
565			mutex_exit(&wq->wq_lock);
566			delay(drv_usectohz(1000000));
567			mutex_enter(&wq->wq_lock);
568		} while (wq->wq_state == RDSV3_WQ_THREAD_FLUSHING);
569		break;
570	case RDSV3_WQ_THREAD_EXITING:
571		mutex_exit(&wq->wq_lock);
572		rdsv3_worker_thread(wq);
573		return;
574	}
575	mutex_exit(&wq->wq_lock);
576
577	RDSV3_DPRINTF4("rdsv3_flush_workqueue", "Return(wq: %p)", wq);
578}
579
580void
581rdsv3_queue_work(rdsv3_workqueue_struct_t *wq, rdsv3_work_t *wp)
582{
583	RDSV3_DPRINTF4("rdsv3_queue_work", "Enter(wq: %p, wp: %p)", wq, wp);
584
585	mutex_enter(&wq->wq_lock);
586
587	if (list_link_active(&wp->work_item)) {
588		/* This is already in the queue, ignore this call */
589		mutex_exit(&wq->wq_lock);
590		RDSV3_DPRINTF3("rdsv3_queue_work", "already queued: %p", wp);
591		return;
592	}
593
594	switch (wq->wq_state) {
595	case RDSV3_WQ_THREAD_RUNNING:
596		list_insert_tail(&wq->wq_queue, wp);
597		mutex_exit(&wq->wq_lock);
598		break;
599
600	case RDSV3_WQ_THREAD_FLUSHING:
601		do {
602			mutex_exit(&wq->wq_lock);
603			delay(drv_usectohz(1000000));
604			mutex_enter(&wq->wq_lock);
605		} while (wq->wq_state == RDSV3_WQ_THREAD_FLUSHING);
606
607		if (wq->wq_state == RDSV3_WQ_THREAD_RUNNING) {
608			list_insert_tail(&wq->wq_queue, wp);
609			mutex_exit(&wq->wq_lock);
610			break;
611		}
612		/* FALLTHRU */
613
614	case RDSV3_WQ_THREAD_IDLE:
615		list_insert_tail(&wq->wq_queue, wp);
616		wq->wq_state = RDSV3_WQ_THREAD_RUNNING;
617		mutex_exit(&wq->wq_lock);
618
619		(void) ddi_taskq_dispatch(rdsv3_taskq, rdsv3_worker_thread, wq,
620		    DDI_SLEEP);
621		break;
622
623	case RDSV3_WQ_THREAD_EXITING:
624		mutex_exit(&wq->wq_lock);
625		break;
626	}
627
628	RDSV3_DPRINTF4("rdsv3_queue_work", "Return(wq: %p, wp: %p)", wq, wp);
629}
630
631/* timeout handler for delayed work queuing */
632void
633rdsv3_work_timeout_handler(void *arg)
634{
635	rdsv3_delayed_work_t *dwp = (rdsv3_delayed_work_t *)arg;
636
637	RDSV3_DPRINTF4("rdsv3_work_timeout_handler",
638	    "Enter(wq: %p, wp: %p)", dwp->wq, &dwp->work);
639
640	mutex_enter(&dwp->lock);
641	dwp->timeid = 0;
642	mutex_exit(&dwp->lock);
643
644	mutex_enter(&dwp->wq->wq_lock);
645	dwp->wq->wq_pending--;
646	if (dwp->wq->wq_state == RDSV3_WQ_THREAD_EXITING) {
647		mutex_exit(&dwp->wq->wq_lock);
648		return;
649	}
650	mutex_exit(&dwp->wq->wq_lock);
651
652	rdsv3_queue_work(dwp->wq, &dwp->work);
653
654	RDSV3_DPRINTF4("rdsv3_work_timeout_handler",
655	    "Return(wq: %p, wp: %p)", dwp->wq, &dwp->work);
656}
657
658void
659rdsv3_queue_delayed_work(rdsv3_workqueue_struct_t *wq,
660    rdsv3_delayed_work_t *dwp, uint_t delay)
661{
662	RDSV3_DPRINTF4("rdsv3_queue_delayed_work",
663	    "Enter(wq: %p, wp: %p)", wq, dwp);
664
665	if (delay == 0) {
666		rdsv3_queue_work(wq, &dwp->work);
667		return;
668	}
669
670	mutex_enter(&wq->wq_lock);
671	if (wq->wq_state == RDSV3_WQ_THREAD_EXITING) {
672		mutex_exit(&wq->wq_lock);
673		RDSV3_DPRINTF4("rdsv3_queue_delayed_work",
674		    "WQ exiting - don't queue (wq: %p, wp: %p)", wq, dwp);
675		return;
676	}
677	wq->wq_pending++;
678	mutex_exit(&wq->wq_lock);
679
680	mutex_enter(&dwp->lock);
681	if (dwp->timeid == 0) {
682		dwp->wq = wq;
683		dwp->timeid = timeout(rdsv3_work_timeout_handler, dwp,
684		    jiffies + (delay * rdsv3_one_sec_in_hz));
685		mutex_exit(&dwp->lock);
686	} else {
687		mutex_exit(&dwp->lock);
688		RDSV3_DPRINTF4("rdsv3_queue_delayed_work", "Already queued: %p",
689		    dwp);
690		mutex_enter(&wq->wq_lock);
691		wq->wq_pending--;
692		mutex_exit(&wq->wq_lock);
693	}
694
695	RDSV3_DPRINTF4("rdsv3_queue_delayed_work",
696	    "Return(wq: %p, wp: %p)", wq, dwp);
697}
698
699void
700rdsv3_cancel_delayed_work(rdsv3_delayed_work_t *dwp)
701{
702	RDSV3_DPRINTF4("rdsv3_cancel_delayed_work",
703	    "Enter(wq: %p, dwp: %p)", dwp->wq, dwp);
704
705	mutex_enter(&dwp->lock);
706	if (dwp->timeid != 0) {
707		(void) untimeout(dwp->timeid);
708		dwp->timeid = 0;
709	} else {
710		RDSV3_DPRINTF4("rdsv3_cancel_delayed_work",
711		    "Nothing to cancel (wq: %p, dwp: %p)", dwp->wq, dwp);
712		mutex_exit(&dwp->lock);
713		return;
714	}
715	mutex_exit(&dwp->lock);
716
717	mutex_enter(&dwp->wq->wq_lock);
718	dwp->wq->wq_pending--;
719	mutex_exit(&dwp->wq->wq_lock);
720
721	RDSV3_DPRINTF4("rdsv3_cancel_delayed_work",
722	    "Return(wq: %p, dwp: %p)", dwp->wq, dwp);
723}
724
725void
726rdsv3_destroy_task_workqueue(rdsv3_workqueue_struct_t *wq)
727{
728	RDSV3_DPRINTF2("rdsv3_destroy_workqueue", "Enter");
729
730	ASSERT(wq);
731
732	mutex_enter(&wq->wq_lock);
733	wq->wq_state = RDSV3_WQ_THREAD_EXITING;
734
735	while (wq->wq_pending > 0) {
736		mutex_exit(&wq->wq_lock);
737		delay(drv_usectohz(1000000));
738		mutex_enter(&wq->wq_lock);
739	};
740	mutex_exit(&wq->wq_lock);
741
742	rdsv3_flush_workqueue(wq);
743
744	list_destroy(&wq->wq_queue);
745	mutex_destroy(&wq->wq_lock);
746	kmem_free(wq, sizeof (rdsv3_workqueue_struct_t));
747
748	ASSERT(rdsv3_taskq);
749	ddi_taskq_destroy(rdsv3_taskq);
750
751	wq = NULL;
752	rdsv3_taskq = NULL;
753
754	RDSV3_DPRINTF2("rdsv3_destroy_workqueue", "Return");
755}
756
/*
 * Work-item handler that simply calls rdsv3_rdma_init(); the work
 * argument is not used.
 */
/* ARGSUSED */
void
rdsv3_rdma_init_worker(struct rdsv3_work_s *work)
{
	rdsv3_rdma_init();
}
763
764#define	RDSV3_NUM_TASKQ_THREADS	4
765rdsv3_workqueue_struct_t *
766rdsv3_create_task_workqueue(char *name)
767{
768	rdsv3_workqueue_struct_t	*wq;
769
770	RDSV3_DPRINTF2("create_singlethread_workqueue", "Enter (dip: %p)",
771	    rdsv3_dev_info);
772
773	rdsv3_taskq = ddi_taskq_create(rdsv3_dev_info, name,
774	    RDSV3_NUM_TASKQ_THREADS, TASKQ_DEFAULTPRI, 0);
775	if (rdsv3_taskq == NULL) {
776		RDSV3_DPRINTF1(__FILE__,
777		    "ddi_taskq_create failed for rdsv3_taskq");
778		return (NULL);
779	}
780
781	wq = kmem_zalloc(sizeof (rdsv3_workqueue_struct_t), KM_NOSLEEP);
782	if (wq == NULL) {
783		RDSV3_DPRINTF1(__FILE__, "kmem_zalloc failed for wq");
784		ddi_taskq_destroy(rdsv3_taskq);
785		return (NULL);
786	}
787
788	list_create(&wq->wq_queue, sizeof (struct rdsv3_work_s),
789	    offsetof(struct rdsv3_work_s, work_item));
790	mutex_init(&wq->wq_lock, NULL, MUTEX_DRIVER, NULL);
791	wq->wq_state = RDSV3_WQ_THREAD_IDLE;
792	wq->wq_pending = 0;
793	rdsv3_one_sec_in_hz = drv_usectohz(1000000);
794
795	RDSV3_DPRINTF2("create_singlethread_workqueue", "Return");
796
797	return (wq);
798}
799
800/*
801 * Implementation for struct sock
802 */
803
804void
805rdsv3_sock_exit_data(struct rsock *sk)
806{
807	struct rdsv3_sock *rs = sk->sk_protinfo;
808
809	RDSV3_DPRINTF4("rdsv3_sock_exit_data", "rs: %p sk: %p", rs, sk);
810
811	ASSERT(rs != NULL);
812	ASSERT(rdsv3_sk_sock_flag(sk, SOCK_DEAD));
813
814	rs->rs_sk = NULL;
815
816	list_destroy(&rs->rs_send_queue);
817	list_destroy(&rs->rs_notify_queue);
818	list_destroy(&rs->rs_recv_queue);
819
820	rw_destroy(&rs->rs_recv_lock);
821	mutex_destroy(&rs->rs_lock);
822
823	mutex_destroy(&rs->rs_rdma_lock);
824	avl_destroy(&rs->rs_rdma_keys);
825
826	rdsv3_exit_waitqueue(sk->sk_sleep);
827	kmem_free(sk->sk_sleep, sizeof (rdsv3_wait_queue_t));
828	mutex_destroy(&sk->sk_lock);
829
830	kmem_cache_free(rdsv3_alloc_cache, sk);
831	RDSV3_DPRINTF4("rdsv3_sock_exit_data", "rs: %p sk: %p", rs, sk);
832}
833
/*
 * Socket buffer watermarks.  The HIWATER values are the sk_rcvbuf and
 * sk_sndbuf defaults set by rdsv3_sock_init_data().
 */
/* XXX - figure out right values */
#define	RDSV3_RECV_HIWATER	(256 * 1024)
#define	RDSV3_RECV_LOWATER	128
#define	RDSV3_XMIT_HIWATER	(256 * 1024)
#define	RDSV3_XMIT_LOWATER	1024
839
840struct rsock *
841rdsv3_sk_alloc()
842{
843	struct rsock *sk;
844
845	sk = kmem_cache_alloc(rdsv3_alloc_cache, KM_SLEEP);
846	if (sk == NULL) {
847		RDSV3_DPRINTF2("rdsv3_create", "kmem_cache_alloc failed");
848		return (NULL);
849	}
850
851	bzero(sk, sizeof (struct rsock) + sizeof (struct rdsv3_sock));
852	return (sk);
853}
854
855void
856rdsv3_sock_init_data(struct rsock *sk)
857{
858	sk->sk_sleep = kmem_zalloc(sizeof (rdsv3_wait_queue_t), KM_SLEEP);
859	rdsv3_init_waitqueue(sk->sk_sleep);
860
861	mutex_init(&sk->sk_lock, NULL, MUTEX_DRIVER, NULL);
862	sk->sk_refcount = 1;
863	sk->sk_protinfo = (struct rdsv3_sock *)(sk + 1);
864	sk->sk_sndbuf = RDSV3_XMIT_HIWATER;
865	sk->sk_rcvbuf = RDSV3_RECV_HIWATER;
866}
867
868/* XXX - not complete */
869void
870rdsv3_poll_wait(struct rsock *sk, rdsv3_wait_queue_t *waitq, short events)
871{
872	struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
873
874	if (events & POLLIN) {
875		rw_enter(&rs->rs_recv_lock, RW_READER);
876		while (list_is_empty(&rs->rs_recv_queue) &&
877		    list_is_empty(&rs->rs_notify_queue)) {
878			rw_exit(&rs->rs_recv_lock);
879			mutex_enter(&waitq->waitq_mutex);
880			(void) cv_wait_sig(&waitq->waitq_cv,
881			    &waitq->waitq_mutex);
882			mutex_exit(&waitq->waitq_mutex);
883			rw_enter(&rs->rs_recv_lock, RW_READER);
884		}
885		rw_exit(&rs->rs_recv_lock);
886	}
887}
888
889/*
890 * Connection cache
891 */
892/* ARGSUSED */
893int
894rdsv3_conn_constructor(void *buf, void *arg, int kmflags)
895{
896	struct rdsv3_connection *conn = buf;
897
898	bzero(conn, sizeof (struct rdsv3_connection));
899
900	conn->c_next_tx_seq = 1;
901	mutex_init(&conn->c_lock, NULL, MUTEX_DRIVER, NULL);
902	mutex_init(&conn->c_send_lock, NULL, MUTEX_DRIVER, NULL);
903	list_create(&conn->c_send_queue, sizeof (struct rdsv3_message),
904	    offsetof(struct rdsv3_message, m_conn_item));
905	list_create(&conn->c_retrans, sizeof (struct rdsv3_message),
906	    offsetof(struct rdsv3_message, m_conn_item));
907	return (0);
908}
909
910/* ARGSUSED */
911void
912rdsv3_conn_destructor(void *buf, void *arg)
913{
914	struct rdsv3_connection *conn = buf;
915
916	ASSERT(list_is_empty(&conn->c_send_queue));
917	ASSERT(list_is_empty(&conn->c_retrans));
918	list_destroy(&conn->c_send_queue);
919	list_destroy(&conn->c_retrans);
920	mutex_destroy(&conn->c_send_lock);
921	mutex_destroy(&conn->c_lock);
922}
923
924int
925rdsv3_conn_compare(const void *conn1, const void *conn2)
926{
927	uint32_be_t	laddr1, faddr1, laddr2, faddr2;
928
929	laddr1 = ((rdsv3_conn_info_t *)conn1)->c_laddr;
930	laddr2 = ((struct rdsv3_connection *)conn2)->c_laddr;
931
932	if (laddr1 == laddr2) {
933		faddr1 = ((rdsv3_conn_info_t *)conn1)->c_faddr;
934		faddr2 = ((struct rdsv3_connection *)conn2)->c_faddr;
935		if (faddr1 == faddr2)
936			return (0);
937		if (faddr1 < faddr2)
938			return (-1);
939		return (1);
940	}
941
942	if (laddr1 < laddr2)
943		return (-1);
944
945	return (1);
946}
947
/* loop.c */
/* defined elsewhere; initialised by rdsv3_loop_init() below */
extern kmutex_t loop_conns_lock;
extern list_t loop_conns;

/* element type of the loop_conns list, linked through loop_node */
struct rdsv3_loop_connection
{
	struct list_node loop_node;
	struct rdsv3_connection *conn;
};
957
958void
959rdsv3_loop_init(void)
960{
961	list_create(&loop_conns, sizeof (struct rdsv3_loop_connection),
962	    offsetof(struct rdsv3_loop_connection, loop_node));
963	mutex_init(&loop_conns_lock, NULL, MUTEX_DRIVER, NULL);
964}
965
966/* rdma.c */
967/* IB Rkey is used here for comparison */
968int
969rdsv3_mr_compare(const void *mr1, const void *mr2)
970{
971	uint32_t key1 = *(uint32_t *)mr1;
972	uint32_t key2 = ((struct rdsv3_mr *)mr2)->r_key;
973
974	if (key1 < key2)
975		return (-1);
976	if (key1 > key2)
977		return (1);
978	return (0);
979}
980
/* transport.c */
/* list of registered transports, linked through t_item */
extern list_t			transports;
/* reader/writer lock taken around accesses to `transports' */
extern krwlock_t		trans_sem;
984
985void
986rdsv3_trans_exit(void)
987{
988	struct rdsv3_transport *trans;
989
990	RDSV3_DPRINTF2("rdsv3_trans_exit", "Enter");
991
992	/* currently, only IB transport */
993	rw_enter(&trans_sem, RW_READER);
994	if (!list_is_empty(&transports))
995		trans = list_head(&transports);
996	else
997		trans = NULL;
998	rw_exit(&trans_sem);
999
1000	/* trans->exit() will remove the trans from the list */
1001	if (trans)
1002		trans->exit();
1003
1004	list_destroy(&transports);
1005	rw_destroy(&trans_sem);
1006
1007	RDSV3_DPRINTF2("rdsv3_trans_exit", "Return");
1008}
1009
1010void
1011rdsv3_trans_init()
1012{
1013	RDSV3_DPRINTF2("rdsv3_trans_init", "Enter");
1014
1015	list_create(&transports, sizeof (struct rdsv3_transport),
1016	    offsetof(struct rdsv3_transport, t_item));
1017	rw_init(&trans_sem, NULL, RW_DRIVER, NULL);
1018
1019	RDSV3_DPRINTF2("rdsv3_trans_init", "Return");
1020}
1021
1022int
1023rdsv3_put_cmsg(struct nmsghdr *msg, int level, int type, size_t size,
1024	void *payload)
1025{
1026	struct cmsghdr *cp;
1027	char *bp;
1028	size_t cmlen;
1029	size_t cmspace;
1030	size_t bufsz;
1031
1032	RDSV3_DPRINTF4("rdsv3_put_cmsg",
1033	    "Enter(msg: %p level: %d type: %d sz: %d)",
1034	    msg, level, type, size);
1035
1036	if (msg == NULL || msg->msg_controllen == 0 || payload == NULL) {
1037		return (0);
1038	}
1039	/* check for first cmsg or this is another cmsg to be appended */
1040	if (msg->msg_control == NULL)
1041		msg->msg_controllen = 0;
1042
1043	cmlen = CMSG_LEN(size);
1044	cmspace = CMSG_SPACE(size);
1045	bufsz = msg->msg_controllen + cmspace;
1046
1047	/* extend the existing cmsg to append the next cmsg */
1048	bp = kmem_alloc(bufsz, KM_SLEEP);
1049	if (msg->msg_control) {
1050		bcopy(msg->msg_control, bp, msg->msg_controllen);
1051		kmem_free(msg->msg_control, (size_t)msg->msg_controllen);
1052	}
1053
1054	/* assign payload the proper cmsg location */
1055	cp = (struct cmsghdr *)(bp + msg->msg_controllen);
1056	cp->cmsg_len = cmlen;
1057	cp->cmsg_level = level;
1058	cp->cmsg_type = type;
1059
1060	bcopy(payload, CMSG_DATA(cp), cmlen -
1061	    (unsigned int)_CMSG_DATA_ALIGN(sizeof (struct cmsghdr)));
1062
1063	msg->msg_control = bp;
1064	msg->msg_controllen = bufsz;
1065
1066	RDSV3_DPRINTF4("rdsv3_put_cmsg", "Return(cmsg_len: %d)", cp->cmsg_len);
1067
1068	return (0);
1069}
1070
/* bind.c */
/* defined elsewhere; set up and torn down by rdsv3_bind_tree_init/exit() */
extern kmutex_t rdsv3_bind_lock;
extern avl_tree_t rdsv3_bind_tree;
1074
/*
 * Placeholder check of a bind address: `addr' is not examined and 1 is
 * always returned.
 */
/* ARGSUSED */
int
rdsv3_verify_bind_address(ipaddr_t addr)
{
	return (1);
}
1081
1082/* XXX - need to enhance to compare IP address and port */
1083int
1084rdsv3_bind_node_compare(const void *a, const void *b)
1085{
1086	uint16_be_t			port = *(in_port_t *)a;
1087	struct rdsv3_sock		*rs = (struct rdsv3_sock *)b;
1088
1089	RDSV3_DPRINTF5("rdsv3_bind_node_compare", "Enter (%x %x)", port,
1090	    rs->rs_bound_port);
1091
1092	if (port > rs->rs_bound_port)
1093		return (+1);
1094	else if (port < rs->rs_bound_port)
1095		return (-1);
1096
1097	return (0);
1098}
1099
1100void
1101rdsv3_bind_tree_init()
1102{
1103	RDSV3_DPRINTF4("rdsv3_bind_tree_init", "Enter");
1104
1105	mutex_init(&rdsv3_bind_lock, NULL, MUTEX_DRIVER, NULL);
1106	avl_create(&rdsv3_bind_tree, rdsv3_bind_node_compare,
1107	    sizeof (struct rdsv3_sock),
1108	    offsetof(struct rdsv3_sock, rs_bound_node));
1109
1110	RDSV3_DPRINTF4("rdsv3_bind_tree_init", "Return");
1111}
1112
1113void
1114rdsv3_bind_tree_exit()
1115{
1116	RDSV3_DPRINTF2("rdsv3_bind_tree_exit", "Enter");
1117
1118	ASSERT(avl_is_empty(&rdsv3_bind_tree));
1119	avl_destroy(&rdsv3_bind_tree);
1120	mutex_destroy(&rdsv3_bind_lock);
1121
1122	RDSV3_DPRINTF2("rdsv3_bind_tree_exit", "Return");
1123}
1124
1125/* checksum */
1126uint16_t
1127rdsv3_ip_fast_csum(void *hdr, size_t length)
1128{
1129	return (0xffff &
1130	    (uint16_t)(~ip_ocsum((ushort_t *)hdr, (int)length <<1, 0)));
1131}
1132
/* scatterlist implementation */
/*
 * Stub: none of the arguments is used and 0 is always returned.
 */
/* ARGSUSED */
caddr_t
rdsv3_ib_sg_dma_address(ib_device_t *dev, struct rdsv3_scatterlist *scat,
    uint_t offset)
{
	return (0);
}
1141
1142uint_t
1143rdsv3_ib_dma_map_sg(struct ib_device *dev, struct rdsv3_scatterlist *scat,
1144    uint_t num)
1145{
1146	struct rdsv3_scatterlist *s, *first;
1147	ibt_iov_t *iov;
1148	ibt_wr_ds_t *sgl;
1149	ibt_iov_attr_t iov_attr;
1150	ibt_send_wr_t swr;
1151	uint_t i;
1152
1153	RDSV3_DPRINTF4("rdsv3_ib_dma_map_sg", "scat %p, num: %d", scat, num);
1154
1155	s = first = &scat[0];
1156	ASSERT(first->mihdl == NULL);
1157
1158	iov = kmem_alloc(num * sizeof (ibt_iov_t), KM_SLEEP);
1159	sgl = kmem_zalloc((num * 2) *  sizeof (ibt_wr_ds_t), KM_SLEEP);
1160
1161	for (i = 0; i < num; i++, s++) {
1162		iov[i].iov_addr = s->vaddr;
1163		iov[i].iov_len = s->length;
1164	}
1165
1166	iov_attr.iov_as = NULL;
1167	iov_attr.iov = iov;
1168	iov_attr.iov_buf = NULL;
1169	iov_attr.iov_list_len = num;
1170	iov_attr.iov_wr_nds = num * 2;
1171	iov_attr.iov_lso_hdr_sz = 0;
1172	iov_attr.iov_flags = IBT_IOV_SLEEP;
1173
1174	swr.wr_sgl = sgl;
1175
1176	i = ibt_map_mem_iov(ib_get_ibt_hca_hdl(dev),
1177	    &iov_attr, (ibt_all_wr_t *)&swr, &first->mihdl);
1178	kmem_free(iov, num * sizeof (ibt_iov_t));
1179	if (i != IBT_SUCCESS) {
1180		RDSV3_DPRINTF2("rdsv3_ib_dma_map_sg",
1181		    "ibt_map_mem_iov returned: %d", i);
1182		return (0);
1183	}
1184
1185	s = first;
1186	for (i = 0; i < num; i++, s++, sgl++) {
1187		s->sgl = sgl;
1188	}
1189
1190	return (num);
1191}
1192
1193void
1194rdsv3_ib_dma_unmap_sg(ib_device_t *dev, struct rdsv3_scatterlist *scat,
1195    uint_t num)
1196{
1197	/* Zero length messages have no scatter gather entries */
1198	if (num != 0) {
1199		ASSERT(scat->mihdl != NULL);
1200		ASSERT(scat->sgl != NULL);
1201
1202		(void) ibt_unmap_mem_iov(ib_get_ibt_hca_hdl(dev), scat->mihdl);
1203
1204		kmem_free(scat->sgl, (num * 2)  * sizeof (ibt_wr_ds_t));
1205		scat->sgl = NULL;
1206		scat->mihdl = NULL;
1207	}
1208}
1209
1210int
1211rdsv3_ib_alloc_hdrs(ib_device_t *dev, struct rdsv3_ib_connection *ic)
1212{
1213	caddr_t addr;
1214	size_t size;
1215	ibt_mr_attr_t mr_attr;
1216	ibt_mr_desc_t mr_desc;
1217	ibt_mr_hdl_t mr_hdl;
1218	int ret;
1219
1220	RDSV3_DPRINTF4("rdsv3_ib_alloc_hdrs", "Enter(dev: %p)", dev);
1221
1222	ASSERT(ic->i_mr == NULL);
1223
1224	size = (ic->i_send_ring.w_nr + ic->i_recv_ring.w_nr + 1) *
1225	    sizeof (struct rdsv3_header);
1226
1227	addr = kmem_zalloc(size, KM_NOSLEEP);
1228	if (addr == NULL)
1229		return (-1);
1230
1231	mr_attr.mr_vaddr = (ib_vaddr_t)(uintptr_t)addr;
1232	mr_attr.mr_len = size;
1233	mr_attr.mr_as = NULL;
1234	mr_attr.mr_flags = IBT_MR_ENABLE_LOCAL_WRITE;
1235	ret = ibt_register_mr(ib_get_ibt_hca_hdl(dev), RDSV3_PD2PDHDL(ic->i_pd),
1236	    &mr_attr, &mr_hdl, &mr_desc);
1237	if (ret != IBT_SUCCESS) {
1238		RDSV3_DPRINTF2("rdsv3_ib_alloc_hdrs",
1239		    "ibt_register_mr returned: " "%d", ret);
1240		return (-1);
1241	}
1242
1243	ic->i_mr =
1244	    (struct rdsv3_hdrs_mr *)kmem_alloc(sizeof (struct rdsv3_hdrs_mr),
1245	    KM_SLEEP);
1246	ic->i_mr->addr = addr;
1247	ic->i_mr->size = size;
1248	ic->i_mr->hdl =	mr_hdl;
1249	ic->i_mr->lkey = mr_desc.md_lkey;
1250
1251	ic->i_send_hdrs = (struct rdsv3_header *)addr;
1252	ic->i_send_hdrs_dma = (uint64_t)(uintptr_t)addr;
1253
1254	ic->i_recv_hdrs = (struct rdsv3_header *)(addr +
1255	    (ic->i_send_ring.w_nr * sizeof (struct rdsv3_header)));
1256	ic->i_recv_hdrs_dma = (uint64_t)(uintptr_t)(addr +
1257	    (ic->i_send_ring.w_nr * sizeof (struct rdsv3_header)));
1258	ic->i_recv_tasklet_cpuid = -1;
1259
1260	ic->i_ack = (struct rdsv3_header *)(addr +
1261	    ((ic->i_send_ring.w_nr + ic->i_recv_ring.w_nr) *
1262	    sizeof (struct rdsv3_header)));
1263	ic->i_ack_dma = (uint64_t)(uintptr_t)(addr +
1264	    ((ic->i_send_ring.w_nr + ic->i_recv_ring.w_nr) *
1265	    sizeof (struct rdsv3_header)));
1266
1267	RDSV3_DPRINTF4("rdsv3_ib_alloc_hdrs", "Return(dev: %p)", dev);
1268
1269	return (0);
1270}
1271
1272void
1273rdsv3_ib_free_hdrs(ib_device_t *dev, struct rdsv3_ib_connection *ic)
1274{
1275	RDSV3_DPRINTF4("rdsv3_ib_free_hdrs", "Enter(dev: %p)", dev);
1276	ASSERT(ic->i_mr != NULL);
1277
1278	ic->i_send_hdrs = NULL;
1279	ic->i_send_hdrs_dma = NULL;
1280
1281	ic->i_recv_hdrs = NULL;
1282	ic->i_recv_hdrs_dma = NULL;
1283
1284	ic->i_ack = NULL;
1285	ic->i_ack_dma = NULL;
1286
1287	(void) ibt_deregister_mr(ib_get_ibt_hca_hdl(dev), ic->i_mr->hdl);
1288
1289	kmem_free(ic->i_mr->addr, ic->i_mr->size);
1290	kmem_free(ic->i_mr, sizeof (struct rdsv3_hdrs_mr));
1291
1292	ic->i_mr = NULL;
1293	RDSV3_DPRINTF4("rdsv3_ib_free_hdrs", "Return(dev: %p)", dev);
1294}
1295