cong.c revision 12198:4db936bda957
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25/* 26 * Copyright (c) 2007 Oracle. All rights reserved. 27 * 28 * This software is available to you under a choice of one of two 29 * licenses. You may choose to be licensed under the terms of the GNU 30 * General Public License (GPL) Version 2, available from the file 31 * COPYING in the main directory of this source tree, or the 32 * OpenIB.org BSD license below: 33 * 34 * Redistribution and use in source and binary forms, with or 35 * without modification, are permitted provided that the following 36 * conditions are met: 37 * 38 * - Redistributions of source code must retain the above 39 * copyright notice, this list of conditions and the following 40 * disclaimer. 41 * 42 * - Redistributions in binary form must reproduce the above 43 * copyright notice, this list of conditions and the following 44 * disclaimer in the documentation and/or other materials 45 * provided with the distribution. 46 * 47 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 48 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 49 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 50 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 51 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 52 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 53 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 54 * SOFTWARE. 55 * 56 */ 57#include <sys/rds.h> 58 59#include <sys/ib/clients/rdsv3/rdsv3.h> 60#include <sys/ib/clients/rdsv3/rdsv3_impl.h> 61#include <sys/ib/clients/rdsv3/rdsv3_debug.h> 62 63/* 64 * This file implements the receive side of the unconventional congestion 65 * management in RDS. 66 * 67 * Messages waiting in the receive queue on the receiving socket are accounted 68 * against the sockets SO_RCVBUF option value. Only the payload bytes in the 69 * message are accounted for. If the number of bytes queued equals or exceeds 70 * rcvbuf then the socket is congested. All sends attempted to this socket's 71 * address should return block or return -EWOULDBLOCK. 72 * 73 * Applications are expected to be reasonably tuned such that this situation 74 * very rarely occurs. An application encountering this "back-pressure" is 75 * considered a bug. 76 * 77 * This is implemented by having each node maintain bitmaps which indicate 78 * which ports on bound addresses are congested. As the bitmap changes it is 79 * sent through all the connections which terminate in the local address of the 80 * bitmap which changed. 81 * 82 * The bitmaps are allocated as connections are brought up. This avoids 83 * allocation in the interrupt handling path which queues messages on sockets. 84 * The dense bitmaps let transports send the entire bitmap on any bitmap change 85 * reasonably efficiently. This is much easier to implement than some 86 * finer-grained communication of per-port congestion. The sender does a very 87 * inexpensive bit test to test if the port it's about to send to is congested 88 * or not. 89 */ 90 91/* 92 * Interaction with poll is a tad tricky. We want all processes stuck in 93 * poll to wake up and check whether a congested destination became uncongested. 94 * The really sad thing is we have no idea which destinations the application 95 * wants to send to - we don't even know which rdsv3_connections are involved. 96 * So until we implement a more flexible rds poll interface, we have to make 97 * do with this: 98 * We maintain a global counter that is incremented each time a congestion map 99 * update is received. Each rds socket tracks this value, and if rdsv3_poll 100 * finds that the saved generation number is smaller than the global generation 101 * number, it wakes up the process. 102 */ 103static atomic_t rdsv3_cong_generation = ATOMIC_INIT(0); 104 105/* 106 * Congestion monitoring 107 */ 108static struct list rdsv3_cong_monitor; 109static krwlock_t rdsv3_cong_monitor_lock; 110 111/* 112 * Yes, a global lock. It's used so infrequently that it's worth keeping it 113 * global to simplify the locking. It's only used in the following 114 * circumstances: 115 * 116 * - on connection buildup to associate a conn with its maps 117 * - on map changes to inform conns of a new map to send 118 * 119 * It's sadly ordered under the socket callback lock and the connection lock. 120 * Receive paths can mark ports congested from interrupt context so the 121 * lock masks interrupts. 122 */ 123static kmutex_t rdsv3_cong_lock; 124static struct avl_tree rdsv3_cong_tree; 125 126static struct rdsv3_cong_map * 127rdsv3_cong_tree_walk(uint32_be_t addr, struct rdsv3_cong_map *insert) 128{ 129 struct rdsv3_cong_map *map; 130 avl_index_t where; 131 132 if (insert) { 133 map = avl_find(&rdsv3_cong_tree, insert, &where); 134 if (map == NULL) { 135 avl_insert(&rdsv3_cong_tree, insert, where); 136 return (NULL); 137 } 138 } else { 139 struct rdsv3_cong_map map1; 140 map1.m_addr = addr; 141 map = avl_find(&rdsv3_cong_tree, &map1, &where); 142 } 143 144 return (map); 145} 146 147/* 148 * There is only ever one bitmap for any address. Connections try and allocate 149 * these bitmaps in the process getting pointers to them. The bitmaps are only 150 * ever freed as the module is removed after all connections have been freed. 151 */ 152static struct rdsv3_cong_map * 153rdsv3_cong_from_addr(uint32_be_t addr) 154{ 155 struct rdsv3_cong_map *map; 156 struct rdsv3_cong_map *ret = NULL; 157 unsigned long zp; 158 unsigned long i; 159 160 RDSV3_DPRINTF4("rdsv3_cong_from_addr", "Enter(addr: %x)", ntohl(addr)); 161 162 map = kmem_zalloc(sizeof (struct rdsv3_cong_map), KM_NOSLEEP); 163 if (map == NULL) 164 return (NULL); 165 166 map->m_addr = addr; 167 rdsv3_init_waitqueue(&map->m_waitq); 168 list_create(&map->m_conn_list, sizeof (struct rdsv3_connection), 169 offsetof(struct rdsv3_connection, c_map_item)); 170 171 for (i = 0; i < RDSV3_CONG_MAP_PAGES; i++) { 172 zp = (unsigned long)kmem_zalloc(PAGE_SIZE, KM_NOSLEEP); 173 if (zp == 0) 174 goto out; 175 map->m_page_addrs[i] = zp; 176 } 177 178 mutex_enter(&rdsv3_cong_lock); 179 ret = rdsv3_cong_tree_walk(addr, map); 180 mutex_exit(&rdsv3_cong_lock); 181 182 if (ret == NULL) { 183 ret = map; 184 map = NULL; 185 } 186 187out: 188 if (map) { 189 for (i = 0; i < RDSV3_CONG_MAP_PAGES && map->m_page_addrs[i]; 190 i++) 191 kmem_free((void *)map->m_page_addrs[i], PAGE_SIZE); 192 kmem_free(map, sizeof (*map)); 193 } 194 195 RDSV3_DPRINTF5("rdsv3_cong_from_addr", "map %p for addr %x", 196 ret, ntohl(addr)); 197 198 return (ret); 199} 200 201/* 202 * Put the conn on its local map's list. This is called when the conn is 203 * really added to the hash. It's nested under the rdsv3_conn_lock, sadly. 204 */ 205void 206rdsv3_cong_add_conn(struct rdsv3_connection *conn) 207{ 208 RDSV3_DPRINTF4("rdsv3_cong_add_conn", "Enter(conn: %p)", conn); 209 210 RDSV3_DPRINTF5("rdsv3_cong_add_conn", "conn %p now on map %p", 211 conn, conn->c_lcong); 212 mutex_enter(&rdsv3_cong_lock); 213 list_insert_tail(&conn->c_lcong->m_conn_list, conn); 214 mutex_exit(&rdsv3_cong_lock); 215 216 RDSV3_DPRINTF4("rdsv3_cong_add_conn", "Return(conn: %p)", conn); 217} 218 219void 220rdsv3_cong_remove_conn(struct rdsv3_connection *conn) 221{ 222 RDSV3_DPRINTF4("rdsv3_cong_remove_conn", "Enter(conn: %p)", conn); 223 224 RDSV3_DPRINTF5("rdsv3_cong_remove_conn", "removing conn %p from map %p", 225 conn, conn->c_lcong); 226 mutex_enter(&rdsv3_cong_lock); 227 list_remove_node(&conn->c_map_item); 228 mutex_exit(&rdsv3_cong_lock); 229 230 RDSV3_DPRINTF4("rdsv3_cong_remove_conn", "Return(conn: %p)", conn); 231} 232 233int 234rdsv3_cong_get_maps(struct rdsv3_connection *conn) 235{ 236 conn->c_lcong = rdsv3_cong_from_addr(conn->c_laddr); 237 conn->c_fcong = rdsv3_cong_from_addr(conn->c_faddr); 238 239 if (conn->c_lcong == NULL || conn->c_fcong == NULL) 240 return (-ENOMEM); 241 242 return (0); 243} 244 245void 246rdsv3_cong_queue_updates(struct rdsv3_cong_map *map) 247{ 248 struct rdsv3_connection *conn; 249 250 RDSV3_DPRINTF4("rdsv3_cong_queue_updates", "Enter(map: %p)", map); 251 252 mutex_enter(&rdsv3_cong_lock); 253 254 RDSV3_FOR_EACH_LIST_NODE(conn, &map->m_conn_list, c_map_item) { 255 if (!test_and_set_bit(0, &conn->c_map_queued)) { 256 rdsv3_stats_inc(s_cong_update_queued); 257 rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 0); 258 } 259 } 260 261 mutex_exit(&rdsv3_cong_lock); 262 263 RDSV3_DPRINTF4("rdsv3_cong_queue_updates", "Return(map: %p)", map); 264} 265 266void 267rdsv3_cong_map_updated(struct rdsv3_cong_map *map, uint64_t portmask) 268{ 269 RDSV3_DPRINTF4("rdsv3_cong_map_updated", 270 "waking map %p for %u.%u.%u.%u", 271 map, NIPQUAD(map->m_addr)); 272 rdsv3_stats_inc(s_cong_update_received); 273 atomic_add_32(&rdsv3_cong_generation, 1); 274#if 0 275XXX 276 if (waitqueue_active(&map->m_waitq)) 277#endif 278 rdsv3_wake_up(&map->m_waitq); 279#if 0 280XXX 281 if (waitqueue_active(&rds_poll_waitq)) 282#endif 283 rdsv3_wake_up_all(&rdsv3_poll_waitq); 284 285 if (portmask && !list_is_empty(&rdsv3_cong_monitor)) { 286 struct rdsv3_sock *rs; 287 288 rw_enter(&rdsv3_cong_monitor_lock, RW_READER); 289 RDSV3_FOR_EACH_LIST_NODE(rs, &rdsv3_cong_monitor, 290 rs_cong_list) { 291 mutex_enter(&rs->rs_lock); 292 rs->rs_cong_notify |= (rs->rs_cong_mask & portmask); 293 rs->rs_cong_mask &= ~portmask; 294 mutex_exit(&rs->rs_lock); 295 if (rs->rs_cong_notify) 296 rdsv3_wake_sk_sleep(rs); 297 } 298 rw_exit(&rdsv3_cong_monitor_lock); 299 } 300 301 RDSV3_DPRINTF4("rdsv3_cong_map_updated", "Return(map: %p)", map); 302} 303 304int 305rdsv3_cong_updated_since(unsigned long *recent) 306{ 307 unsigned long gen = atomic_get(&rdsv3_cong_generation); 308 309 if (*recent == gen) 310 return (0); 311 *recent = gen; 312 return (1); 313} 314 315/* 316 * These should be using generic_{test,__{clear,set}}_le_bit() but some old 317 * kernels don't have them. Sigh. 318 */ 319#if defined(sparc) 320#define LE_BIT_XOR ((BITS_PER_LONG-1) & ~0x7) 321#else 322#define LE_BIT_XOR 0 323#endif 324 325/* 326 * We're called under the locking that protects the sockets receive buffer 327 * consumption. This makes it a lot easier for the caller to only call us 328 * when it knows that an existing set bit needs to be cleared, and vice versa. 329 * We can't block and we need to deal with concurrent sockets working against 330 * the same per-address map. 331 */ 332void 333rdsv3_cong_set_bit(struct rdsv3_cong_map *map, uint16_be_t port) 334{ 335 unsigned long i; 336 unsigned long off; 337 338 RDSV3_DPRINTF4("rdsv3_cong_set_bit", 339 "setting congestion for %u.%u.%u.%u:%u in map %p", 340 NIPQUAD(map->m_addr), ntohs(port), map); 341 342 i = ntohs(port) / RDSV3_CONG_MAP_PAGE_BITS; 343 off = ntohs(port) % RDSV3_CONG_MAP_PAGE_BITS; 344 345 set_bit(off ^ LE_BIT_XOR, (void *)map->m_page_addrs[i]); 346} 347 348void 349rdsv3_cong_clear_bit(struct rdsv3_cong_map *map, uint16_be_t port) 350{ 351 unsigned long i; 352 unsigned long off; 353 354 RDSV3_DPRINTF4("rdsv3_cong_clear_bit", 355 "clearing congestion for %u.%u.%u.%u:%u in map %p\n", 356 NIPQUAD(map->m_addr), ntohs(port), map); 357 358 i = ntohs(port) / RDSV3_CONG_MAP_PAGE_BITS; 359 off = ntohs(port) % RDSV3_CONG_MAP_PAGE_BITS; 360 361 clear_bit(off ^ LE_BIT_XOR, (void *)map->m_page_addrs[i]); 362} 363 364static int 365rdsv3_cong_test_bit(struct rdsv3_cong_map *map, uint16_be_t port) 366{ 367 unsigned long i; 368 unsigned long off; 369 370 i = ntohs(port) / RDSV3_CONG_MAP_PAGE_BITS; 371 off = ntohs(port) % RDSV3_CONG_MAP_PAGE_BITS; 372 373 RDSV3_DPRINTF5("rdsv3_cong_test_bit", "port: 0x%x i = %lx off = %lx", 374 ntohs(port), i, off); 375 376 return (test_bit(off ^ LE_BIT_XOR, (void *)map->m_page_addrs[i])); 377} 378 379#undef LE_BIT_XOR 380 381void 382rdsv3_cong_add_socket(struct rdsv3_sock *rs) 383{ 384 RDSV3_DPRINTF4("rdsv3_cong_add_socket", "Enter(rs: %p)", rs); 385 386 rw_enter(&rdsv3_cong_monitor_lock, RW_WRITER); 387 if (!list_link_active(&rs->rs_cong_list)) 388 list_insert_head(&rdsv3_cong_monitor, rs); 389 rw_exit(&rdsv3_cong_monitor_lock); 390} 391 392void 393rdsv3_cong_remove_socket(struct rdsv3_sock *rs) 394{ 395 struct rdsv3_cong_map *map; 396 397 RDSV3_DPRINTF4("rdsv3_cong_remove_socket", "Enter(rs: %p)", rs); 398 399 rw_enter(&rdsv3_cong_monitor_lock, RW_WRITER); 400 list_remove_node(&rs->rs_cong_list); 401 rw_exit(&rdsv3_cong_monitor_lock); 402 403 /* update congestion map for now-closed port */ 404 mutex_enter(&rdsv3_cong_lock); 405 map = rdsv3_cong_tree_walk(rs->rs_bound_addr, NULL); 406 mutex_exit(&rdsv3_cong_lock); 407 408 if (map && rdsv3_cong_test_bit(map, rs->rs_bound_port)) { 409 rdsv3_cong_clear_bit(map, rs->rs_bound_port); 410 rdsv3_cong_queue_updates(map); 411 } 412} 413 414int 415rdsv3_cong_wait(struct rdsv3_cong_map *map, uint16_be_t port, int nonblock, 416 struct rdsv3_sock *rs) 417{ 418 int ret = 0; 419 420 RDSV3_DPRINTF4("rdsv3_cong_wait", "Enter(rs: %p, mode: %d)", 421 rs, nonblock); 422 423 if (!rdsv3_cong_test_bit(map, port)) 424 return (0); 425 if (nonblock) { 426 if (rs && rs->rs_cong_monitor) { 427 /* 428 * It would have been nice to have an atomic set_bit on 429 * a uint64_t. 430 */ 431 mutex_enter(&rs->rs_lock); 432 rs->rs_cong_mask |= 433 RDSV3_CONG_MONITOR_MASK(ntohs(port)); 434 mutex_exit(&rs->rs_lock); 435 436 /* 437 * Test again - a congestion update may have arrived in 438 * the meantime. 439 */ 440 if (!rdsv3_cong_test_bit(map, port)) 441 return (0); 442 } 443 rdsv3_stats_inc(s_cong_send_error); 444 return (-ENOBUFS); 445 } 446 447 rdsv3_stats_inc(s_cong_send_blocked); 448 RDSV3_DPRINTF3("rdsv3_cong_wait", "waiting on map %p for port %u", 449 map, ntohs(port)); 450 451 mutex_enter(&map->m_waitq.waitq_mutex); 452 while (rdsv3_cong_test_bit(map, port)) { 453 if (cv_wait_sig(&map->m_waitq.waitq_cv, 454 &map->m_waitq.waitq_mutex) == 0) { 455 ret = -ERESTART; 456 break; 457 } 458 } 459 mutex_exit(&map->m_waitq.waitq_mutex); 460 461 return (ret); 462} 463 464void 465rdsv3_cong_exit(void) 466{ 467 struct rdsv3_cong_map *map; 468 unsigned long i; 469 470 RDSV3_DPRINTF4("rdsv3_cong_exit", "Enter"); 471 472 while ((map = avl_first(&rdsv3_cong_tree))) { 473 RDSV3_DPRINTF5("rdsv3_cong_exit", "freeing map %p\n", map); 474 avl_remove(&rdsv3_cong_tree, map); 475 for (i = 0; i < RDSV3_CONG_MAP_PAGES && map->m_page_addrs[i]; 476 i++) 477 kmem_free((void *)map->m_page_addrs[i], PAGE_SIZE); 478 kmem_free(map, sizeof (*map)); 479 } 480 481 RDSV3_DPRINTF4("rdsv3_cong_exit", "Return"); 482} 483 484/* 485 * Allocate a RDS message containing a congestion update. 486 */ 487struct rdsv3_message * 488rdsv3_cong_update_alloc(struct rdsv3_connection *conn) 489{ 490 struct rdsv3_cong_map *map = conn->c_lcong; 491 struct rdsv3_message *rm; 492 493 rm = rdsv3_message_map_pages(map->m_page_addrs, RDSV3_CONG_MAP_BYTES); 494 if (!IS_ERR(rm)) 495 rm->m_inc.i_hdr.h_flags = RDSV3_FLAG_CONG_BITMAP; 496 497 return (rm); 498} 499 500static int 501rdsv3_cong_compare(const void *map1, const void *map2) 502{ 503#define addr1 ((struct rdsv3_cong_map *)map1)->m_addr 504#define addr2 ((struct rdsv3_cong_map *)map2)->m_addr 505 506 if (addr1 < addr2) 507 return (-1); 508 if (addr1 > addr2) 509 return (1); 510 return (0); 511} 512 513void 514rdsv3_cong_init(void) 515{ 516 list_create(&rdsv3_cong_monitor, sizeof (struct rdsv3_sock), 517 offsetof(struct rdsv3_sock, rs_cong_list)); 518 rw_init(&rdsv3_cong_monitor_lock, NULL, RW_DRIVER, NULL); 519 mutex_init(&rdsv3_cong_lock, NULL, MUTEX_DRIVER, NULL); 520 avl_create(&rdsv3_cong_tree, rdsv3_cong_compare, 521 sizeof (struct rdsv3_cong_map), offsetof(struct rdsv3_cong_map, 522 m_rb_node)); 523} 524