cong.c revision 12198:4db936bda957
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25/*
26 * Copyright (c) 2007 Oracle.  All rights reserved.
27 *
28 * This software is available to you under a choice of one of two
29 * licenses.  You may choose to be licensed under the terms of the GNU
30 * General Public License (GPL) Version 2, available from the file
31 * COPYING in the main directory of this source tree, or the
32 * OpenIB.org BSD license below:
33 *
34 *     Redistribution and use in source and binary forms, with or
35 *     without modification, are permitted provided that the following
36 *     conditions are met:
37 *
38 *      - Redistributions of source code must retain the above
39 *        copyright notice, this list of conditions and the following
40 *        disclaimer.
41 *
42 *      - Redistributions in binary form must reproduce the above
43 *        copyright notice, this list of conditions and the following
44 *        disclaimer in the documentation and/or other materials
45 *        provided with the distribution.
46 *
47 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
48 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
49 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
50 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
51 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
52 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
53 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
54 * SOFTWARE.
55 *
56 */
57#include <sys/rds.h>
58
59#include <sys/ib/clients/rdsv3/rdsv3.h>
60#include <sys/ib/clients/rdsv3/rdsv3_impl.h>
61#include <sys/ib/clients/rdsv3/rdsv3_debug.h>
62
63/*
64 * This file implements the receive side of the unconventional congestion
65 * management in RDS.
66 *
 * Messages waiting in the receive queue on the receiving socket are accounted
 * against the socket's SO_RCVBUF option value.  Only the payload bytes in the
 * message are accounted for.  If the number of bytes queued equals or exceeds
 * rcvbuf then the socket is congested.  All sends attempted to this socket's
 * address should block or return -EWOULDBLOCK.
72 *
73 * Applications are expected to be reasonably tuned such that this situation
74 * very rarely occurs.  An application encountering this "back-pressure" is
75 * considered a bug.
76 *
77 * This is implemented by having each node maintain bitmaps which indicate
78 * which ports on bound addresses are congested.  As the bitmap changes it is
79 * sent through all the connections which terminate in the local address of the
80 * bitmap which changed.
81 *
82 * The bitmaps are allocated as connections are brought up.  This avoids
83 * allocation in the interrupt handling path which queues messages on sockets.
84 * The dense bitmaps let transports send the entire bitmap on any bitmap change
85 * reasonably efficiently.  This is much easier to implement than some
86 * finer-grained communication of per-port congestion.  The sender does a very
87 * inexpensive bit test to test if the port it's about to send to is congested
88 * or not.
89 */
90
91/*
92 * Interaction with poll is a tad tricky. We want all processes stuck in
93 * poll to wake up and check whether a congested destination became uncongested.
94 * The really sad thing is we have no idea which destinations the application
95 * wants to send to - we don't even know which rdsv3_connections are involved.
96 * So until we implement a more flexible rds poll interface, we have to make
97 * do with this:
98 * We maintain a global counter that is incremented each time a congestion map
99 * update is received. Each rds socket tracks this value, and if rdsv3_poll
100 * finds that the saved generation number is smaller than the global generation
101 * number, it wakes up the process.
102 */
103static atomic_t		rdsv3_cong_generation = ATOMIC_INIT(0);
104
105/*
106 * Congestion monitoring
107 */
108static struct list rdsv3_cong_monitor;
109static krwlock_t rdsv3_cong_monitor_lock;
110
111/*
112 * Yes, a global lock.  It's used so infrequently that it's worth keeping it
113 * global to simplify the locking.  It's only used in the following
114 * circumstances:
115 *
116 *  - on connection buildup to associate a conn with its maps
117 *  - on map changes to inform conns of a new map to send
118 *
119 *  It's sadly ordered under the socket callback lock and the connection lock.
120 *  Receive paths can mark ports congested from interrupt context so the
121 *  lock masks interrupts.
122 */
123static kmutex_t rdsv3_cong_lock;
124static struct avl_tree rdsv3_cong_tree;
125
126static struct rdsv3_cong_map *
127rdsv3_cong_tree_walk(uint32_be_t addr, struct rdsv3_cong_map *insert)
128{
129	struct rdsv3_cong_map *map;
130	avl_index_t where;
131
132	if (insert) {
133		map = avl_find(&rdsv3_cong_tree, insert, &where);
134		if (map == NULL) {
135			avl_insert(&rdsv3_cong_tree, insert, where);
136			return (NULL);
137		}
138	} else {
139		struct rdsv3_cong_map map1;
140		map1.m_addr = addr;
141		map = avl_find(&rdsv3_cong_tree, &map1, &where);
142	}
143
144	return (map);
145}
146
147/*
148 * There is only ever one bitmap for any address.  Connections try and allocate
149 * these bitmaps in the process getting pointers to them.  The bitmaps are only
150 * ever freed as the module is removed after all connections have been freed.
151 */
152static struct rdsv3_cong_map *
153rdsv3_cong_from_addr(uint32_be_t addr)
154{
155	struct rdsv3_cong_map *map;
156	struct rdsv3_cong_map *ret = NULL;
157	unsigned long zp;
158	unsigned long i;
159
160	RDSV3_DPRINTF4("rdsv3_cong_from_addr", "Enter(addr: %x)", ntohl(addr));
161
162	map = kmem_zalloc(sizeof (struct rdsv3_cong_map), KM_NOSLEEP);
163	if (map == NULL)
164		return (NULL);
165
166	map->m_addr = addr;
167	rdsv3_init_waitqueue(&map->m_waitq);
168	list_create(&map->m_conn_list, sizeof (struct rdsv3_connection),
169	    offsetof(struct rdsv3_connection, c_map_item));
170
171	for (i = 0; i < RDSV3_CONG_MAP_PAGES; i++) {
172		zp = (unsigned long)kmem_zalloc(PAGE_SIZE, KM_NOSLEEP);
173		if (zp == 0)
174			goto out;
175		map->m_page_addrs[i] = zp;
176	}
177
178	mutex_enter(&rdsv3_cong_lock);
179	ret = rdsv3_cong_tree_walk(addr, map);
180	mutex_exit(&rdsv3_cong_lock);
181
182	if (ret == NULL) {
183		ret = map;
184		map = NULL;
185	}
186
187out:
188	if (map) {
189		for (i = 0; i < RDSV3_CONG_MAP_PAGES && map->m_page_addrs[i];
190		    i++)
191			kmem_free((void *)map->m_page_addrs[i], PAGE_SIZE);
192		kmem_free(map, sizeof (*map));
193	}
194
195	RDSV3_DPRINTF5("rdsv3_cong_from_addr", "map %p for addr %x",
196	    ret, ntohl(addr));
197
198	return (ret);
199}
200
201/*
202 * Put the conn on its local map's list.  This is called when the conn is
203 * really added to the hash.  It's nested under the rdsv3_conn_lock, sadly.
204 */
205void
206rdsv3_cong_add_conn(struct rdsv3_connection *conn)
207{
208	RDSV3_DPRINTF4("rdsv3_cong_add_conn", "Enter(conn: %p)", conn);
209
210	RDSV3_DPRINTF5("rdsv3_cong_add_conn", "conn %p now on map %p",
211	    conn, conn->c_lcong);
212	mutex_enter(&rdsv3_cong_lock);
213	list_insert_tail(&conn->c_lcong->m_conn_list, conn);
214	mutex_exit(&rdsv3_cong_lock);
215
216	RDSV3_DPRINTF4("rdsv3_cong_add_conn", "Return(conn: %p)", conn);
217}
218
219void
220rdsv3_cong_remove_conn(struct rdsv3_connection *conn)
221{
222	RDSV3_DPRINTF4("rdsv3_cong_remove_conn", "Enter(conn: %p)", conn);
223
224	RDSV3_DPRINTF5("rdsv3_cong_remove_conn", "removing conn %p from map %p",
225	    conn, conn->c_lcong);
226	mutex_enter(&rdsv3_cong_lock);
227	list_remove_node(&conn->c_map_item);
228	mutex_exit(&rdsv3_cong_lock);
229
230	RDSV3_DPRINTF4("rdsv3_cong_remove_conn", "Return(conn: %p)", conn);
231}
232
233int
234rdsv3_cong_get_maps(struct rdsv3_connection *conn)
235{
236	conn->c_lcong = rdsv3_cong_from_addr(conn->c_laddr);
237	conn->c_fcong = rdsv3_cong_from_addr(conn->c_faddr);
238
239	if (conn->c_lcong == NULL || conn->c_fcong == NULL)
240		return (-ENOMEM);
241
242	return (0);
243}
244
245void
246rdsv3_cong_queue_updates(struct rdsv3_cong_map *map)
247{
248	struct rdsv3_connection *conn;
249
250	RDSV3_DPRINTF4("rdsv3_cong_queue_updates", "Enter(map: %p)", map);
251
252	mutex_enter(&rdsv3_cong_lock);
253
254	RDSV3_FOR_EACH_LIST_NODE(conn, &map->m_conn_list, c_map_item) {
255		if (!test_and_set_bit(0, &conn->c_map_queued)) {
256			rdsv3_stats_inc(s_cong_update_queued);
257			rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 0);
258		}
259	}
260
261	mutex_exit(&rdsv3_cong_lock);
262
263	RDSV3_DPRINTF4("rdsv3_cong_queue_updates", "Return(map: %p)", map);
264}
265
266void
267rdsv3_cong_map_updated(struct rdsv3_cong_map *map, uint64_t portmask)
268{
269	RDSV3_DPRINTF4("rdsv3_cong_map_updated",
270	    "waking map %p for %u.%u.%u.%u",
271	    map, NIPQUAD(map->m_addr));
272	rdsv3_stats_inc(s_cong_update_received);
273	atomic_add_32(&rdsv3_cong_generation, 1);
274#if 0
275XXX
276	if (waitqueue_active(&map->m_waitq))
277#endif
278		rdsv3_wake_up(&map->m_waitq);
279#if 0
280XXX
281	if (waitqueue_active(&rds_poll_waitq))
282#endif
283		rdsv3_wake_up_all(&rdsv3_poll_waitq);
284
285	if (portmask && !list_is_empty(&rdsv3_cong_monitor)) {
286		struct rdsv3_sock *rs;
287
288		rw_enter(&rdsv3_cong_monitor_lock, RW_READER);
289		RDSV3_FOR_EACH_LIST_NODE(rs, &rdsv3_cong_monitor,
290		    rs_cong_list) {
291			mutex_enter(&rs->rs_lock);
292			rs->rs_cong_notify |= (rs->rs_cong_mask & portmask);
293			rs->rs_cong_mask &= ~portmask;
294			mutex_exit(&rs->rs_lock);
295			if (rs->rs_cong_notify)
296				rdsv3_wake_sk_sleep(rs);
297		}
298		rw_exit(&rdsv3_cong_monitor_lock);
299	}
300
301	RDSV3_DPRINTF4("rdsv3_cong_map_updated", "Return(map: %p)", map);
302}
303
304int
305rdsv3_cong_updated_since(unsigned long *recent)
306{
307	unsigned long gen = atomic_get(&rdsv3_cong_generation);
308
309	if (*recent == gen)
310		return (0);
311	*recent = gen;
312	return (1);
313}
314
315/*
316 * These should be using generic_{test,__{clear,set}}_le_bit() but some old
317 * kernels don't have them.  Sigh.
318 */
319#if defined(sparc)
320#define	LE_BIT_XOR	((BITS_PER_LONG-1) & ~0x7)
321#else
322#define	LE_BIT_XOR	0
323#endif
324
325/*
326 * We're called under the locking that protects the sockets receive buffer
327 * consumption.  This makes it a lot easier for the caller to only call us
328 * when it knows that an existing set bit needs to be cleared, and vice versa.
329 * We can't block and we need to deal with concurrent sockets working against
330 * the same per-address map.
331 */
332void
333rdsv3_cong_set_bit(struct rdsv3_cong_map *map, uint16_be_t port)
334{
335	unsigned long i;
336	unsigned long off;
337
338	RDSV3_DPRINTF4("rdsv3_cong_set_bit",
339	    "setting congestion for %u.%u.%u.%u:%u in map %p",
340	    NIPQUAD(map->m_addr), ntohs(port), map);
341
342	i = ntohs(port) / RDSV3_CONG_MAP_PAGE_BITS;
343	off = ntohs(port) % RDSV3_CONG_MAP_PAGE_BITS;
344
345	set_bit(off ^ LE_BIT_XOR, (void *)map->m_page_addrs[i]);
346}
347
348void
349rdsv3_cong_clear_bit(struct rdsv3_cong_map *map, uint16_be_t port)
350{
351	unsigned long i;
352	unsigned long off;
353
354	RDSV3_DPRINTF4("rdsv3_cong_clear_bit",
355	    "clearing congestion for %u.%u.%u.%u:%u in map %p\n",
356	    NIPQUAD(map->m_addr), ntohs(port), map);
357
358	i = ntohs(port) / RDSV3_CONG_MAP_PAGE_BITS;
359	off = ntohs(port) % RDSV3_CONG_MAP_PAGE_BITS;
360
361	clear_bit(off ^ LE_BIT_XOR, (void *)map->m_page_addrs[i]);
362}
363
364static int
365rdsv3_cong_test_bit(struct rdsv3_cong_map *map, uint16_be_t port)
366{
367	unsigned long i;
368	unsigned long off;
369
370	i = ntohs(port) / RDSV3_CONG_MAP_PAGE_BITS;
371	off = ntohs(port) % RDSV3_CONG_MAP_PAGE_BITS;
372
373	RDSV3_DPRINTF5("rdsv3_cong_test_bit", "port: 0x%x i = %lx off = %lx",
374	    ntohs(port), i, off);
375
376	return (test_bit(off ^ LE_BIT_XOR, (void *)map->m_page_addrs[i]));
377}
378
379#undef LE_BIT_XOR
380
381void
382rdsv3_cong_add_socket(struct rdsv3_sock *rs)
383{
384	RDSV3_DPRINTF4("rdsv3_cong_add_socket", "Enter(rs: %p)", rs);
385
386	rw_enter(&rdsv3_cong_monitor_lock, RW_WRITER);
387	if (!list_link_active(&rs->rs_cong_list))
388		list_insert_head(&rdsv3_cong_monitor, rs);
389	rw_exit(&rdsv3_cong_monitor_lock);
390}
391
392void
393rdsv3_cong_remove_socket(struct rdsv3_sock *rs)
394{
395	struct rdsv3_cong_map *map;
396
397	RDSV3_DPRINTF4("rdsv3_cong_remove_socket", "Enter(rs: %p)", rs);
398
399	rw_enter(&rdsv3_cong_monitor_lock, RW_WRITER);
400	list_remove_node(&rs->rs_cong_list);
401	rw_exit(&rdsv3_cong_monitor_lock);
402
403	/* update congestion map for now-closed port */
404	mutex_enter(&rdsv3_cong_lock);
405	map = rdsv3_cong_tree_walk(rs->rs_bound_addr, NULL);
406	mutex_exit(&rdsv3_cong_lock);
407
408	if (map && rdsv3_cong_test_bit(map, rs->rs_bound_port)) {
409		rdsv3_cong_clear_bit(map, rs->rs_bound_port);
410		rdsv3_cong_queue_updates(map);
411	}
412}
413
414int
415rdsv3_cong_wait(struct rdsv3_cong_map *map, uint16_be_t port, int nonblock,
416    struct rdsv3_sock *rs)
417{
418	int	ret = 0;
419
420	RDSV3_DPRINTF4("rdsv3_cong_wait", "Enter(rs: %p, mode: %d)",
421	    rs, nonblock);
422
423	if (!rdsv3_cong_test_bit(map, port))
424		return (0);
425	if (nonblock) {
426		if (rs && rs->rs_cong_monitor) {
427			/*
428			 * It would have been nice to have an atomic set_bit on
429			 * a uint64_t.
430			 */
431			mutex_enter(&rs->rs_lock);
432			rs->rs_cong_mask |=
433			    RDSV3_CONG_MONITOR_MASK(ntohs(port));
434			mutex_exit(&rs->rs_lock);
435
436			/*
437			 * Test again - a congestion update may have arrived in
438			 * the meantime.
439			 */
440			if (!rdsv3_cong_test_bit(map, port))
441				return (0);
442		}
443		rdsv3_stats_inc(s_cong_send_error);
444		return (-ENOBUFS);
445	}
446
447	rdsv3_stats_inc(s_cong_send_blocked);
448	RDSV3_DPRINTF3("rdsv3_cong_wait", "waiting on map %p for port %u",
449	    map, ntohs(port));
450
451	mutex_enter(&map->m_waitq.waitq_mutex);
452	while (rdsv3_cong_test_bit(map, port)) {
453		if (cv_wait_sig(&map->m_waitq.waitq_cv,
454		    &map->m_waitq.waitq_mutex) == 0) {
455			ret = -ERESTART;
456			break;
457		}
458	}
459	mutex_exit(&map->m_waitq.waitq_mutex);
460
461	return (ret);
462}
463
464void
465rdsv3_cong_exit(void)
466{
467	struct rdsv3_cong_map *map;
468	unsigned long i;
469
470	RDSV3_DPRINTF4("rdsv3_cong_exit", "Enter");
471
472	while ((map = avl_first(&rdsv3_cong_tree))) {
473		RDSV3_DPRINTF5("rdsv3_cong_exit", "freeing map %p\n", map);
474		avl_remove(&rdsv3_cong_tree, map);
475		for (i = 0; i < RDSV3_CONG_MAP_PAGES && map->m_page_addrs[i];
476		    i++)
477			kmem_free((void *)map->m_page_addrs[i], PAGE_SIZE);
478		kmem_free(map, sizeof (*map));
479	}
480
481	RDSV3_DPRINTF4("rdsv3_cong_exit", "Return");
482}
483
484/*
485 * Allocate a RDS message containing a congestion update.
486 */
487struct rdsv3_message *
488rdsv3_cong_update_alloc(struct rdsv3_connection *conn)
489{
490	struct rdsv3_cong_map *map = conn->c_lcong;
491	struct rdsv3_message *rm;
492
493	rm = rdsv3_message_map_pages(map->m_page_addrs, RDSV3_CONG_MAP_BYTES);
494	if (!IS_ERR(rm))
495		rm->m_inc.i_hdr.h_flags = RDSV3_FLAG_CONG_BITMAP;
496
497	return (rm);
498}
499
500static int
501rdsv3_cong_compare(const void *map1, const void *map2)
502{
503#define	addr1	((struct rdsv3_cong_map *)map1)->m_addr
504#define	addr2	((struct rdsv3_cong_map *)map2)->m_addr
505
506	if (addr1 < addr2)
507		return (-1);
508	if (addr1 > addr2)
509		return (1);
510	return (0);
511}
512
513void
514rdsv3_cong_init(void)
515{
516	list_create(&rdsv3_cong_monitor, sizeof (struct rdsv3_sock),
517	    offsetof(struct rdsv3_sock, rs_cong_list));
518	rw_init(&rdsv3_cong_monitor_lock, NULL, RW_DRIVER, NULL);
519	mutex_init(&rdsv3_cong_lock, NULL, MUTEX_DRIVER, NULL);
520	avl_create(&rdsv3_cong_tree, rdsv3_cong_compare,
521	    sizeof (struct rdsv3_cong_map), offsetof(struct rdsv3_cong_map,
522	    m_rb_node));
523}
524