112198SEiji.Ota@Sun.COM /*
212198SEiji.Ota@Sun.COM * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
312198SEiji.Ota@Sun.COM */
412198SEiji.Ota@Sun.COM
512198SEiji.Ota@Sun.COM /*
612763SGiri.Adari@Sun.COM * This file contains code imported from the OFED rds source file cong.c
712763SGiri.Adari@Sun.COM * Oracle elects to have and use the contents of cong.c under and governed
812763SGiri.Adari@Sun.COM * by the OpenIB.org BSD license (see below for full license text). However,
912763SGiri.Adari@Sun.COM * the following notice accompanied the original version of this file:
1012763SGiri.Adari@Sun.COM */
1112763SGiri.Adari@Sun.COM
1212763SGiri.Adari@Sun.COM
1312763SGiri.Adari@Sun.COM /*
1412198SEiji.Ota@Sun.COM * Copyright (c) 2007 Oracle. All rights reserved.
1512198SEiji.Ota@Sun.COM *
1612198SEiji.Ota@Sun.COM * This software is available to you under a choice of one of two
1712198SEiji.Ota@Sun.COM * licenses. You may choose to be licensed under the terms of the GNU
1812198SEiji.Ota@Sun.COM * General Public License (GPL) Version 2, available from the file
1912198SEiji.Ota@Sun.COM * COPYING in the main directory of this source tree, or the
2012198SEiji.Ota@Sun.COM * OpenIB.org BSD license below:
2112198SEiji.Ota@Sun.COM *
2212198SEiji.Ota@Sun.COM * Redistribution and use in source and binary forms, with or
2312198SEiji.Ota@Sun.COM * without modification, are permitted provided that the following
2412198SEiji.Ota@Sun.COM * conditions are met:
2512198SEiji.Ota@Sun.COM *
2612198SEiji.Ota@Sun.COM * - Redistributions of source code must retain the above
2712198SEiji.Ota@Sun.COM * copyright notice, this list of conditions and the following
2812198SEiji.Ota@Sun.COM * disclaimer.
2912198SEiji.Ota@Sun.COM *
3012198SEiji.Ota@Sun.COM * - Redistributions in binary form must reproduce the above
3112198SEiji.Ota@Sun.COM * copyright notice, this list of conditions and the following
3212198SEiji.Ota@Sun.COM * disclaimer in the documentation and/or other materials
3312198SEiji.Ota@Sun.COM * provided with the distribution.
3412198SEiji.Ota@Sun.COM *
3512198SEiji.Ota@Sun.COM * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
3612198SEiji.Ota@Sun.COM * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
3712198SEiji.Ota@Sun.COM * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
3812198SEiji.Ota@Sun.COM * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
3912198SEiji.Ota@Sun.COM * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
4012198SEiji.Ota@Sun.COM * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
4112198SEiji.Ota@Sun.COM * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
4212198SEiji.Ota@Sun.COM * SOFTWARE.
4312198SEiji.Ota@Sun.COM *
4412198SEiji.Ota@Sun.COM */
4512198SEiji.Ota@Sun.COM #include <sys/rds.h>
4612198SEiji.Ota@Sun.COM
4712198SEiji.Ota@Sun.COM #include <sys/ib/clients/rdsv3/rdsv3.h>
4812198SEiji.Ota@Sun.COM #include <sys/ib/clients/rdsv3/rdsv3_impl.h>
4912198SEiji.Ota@Sun.COM #include <sys/ib/clients/rdsv3/rdsv3_debug.h>
5012198SEiji.Ota@Sun.COM
5112198SEiji.Ota@Sun.COM /*
5212198SEiji.Ota@Sun.COM * This file implements the receive side of the unconventional congestion
5312198SEiji.Ota@Sun.COM * management in RDS.
5412198SEiji.Ota@Sun.COM *
5512198SEiji.Ota@Sun.COM * Messages waiting in the receive queue on the receiving socket are accounted
5612198SEiji.Ota@Sun.COM * against the sockets SO_RCVBUF option value. Only the payload bytes in the
5712198SEiji.Ota@Sun.COM * message are accounted for. If the number of bytes queued equals or exceeds
 * rcvbuf then the socket is congested. All sends attempted to this socket's
 * address should either block or fail with an error (rdsv3_cong_wait() in
 * this file returns -ENOBUFS to non-blocking callers).
6012198SEiji.Ota@Sun.COM *
6112198SEiji.Ota@Sun.COM * Applications are expected to be reasonably tuned such that this situation
6212198SEiji.Ota@Sun.COM * very rarely occurs. An application encountering this "back-pressure" is
6312198SEiji.Ota@Sun.COM * considered a bug.
6412198SEiji.Ota@Sun.COM *
6512198SEiji.Ota@Sun.COM * This is implemented by having each node maintain bitmaps which indicate
6612198SEiji.Ota@Sun.COM * which ports on bound addresses are congested. As the bitmap changes it is
6712198SEiji.Ota@Sun.COM * sent through all the connections which terminate in the local address of the
6812198SEiji.Ota@Sun.COM * bitmap which changed.
6912198SEiji.Ota@Sun.COM *
7012198SEiji.Ota@Sun.COM * The bitmaps are allocated as connections are brought up. This avoids
7112198SEiji.Ota@Sun.COM * allocation in the interrupt handling path which queues messages on sockets.
7212198SEiji.Ota@Sun.COM * The dense bitmaps let transports send the entire bitmap on any bitmap change
7312198SEiji.Ota@Sun.COM * reasonably efficiently. This is much easier to implement than some
7412198SEiji.Ota@Sun.COM * finer-grained communication of per-port congestion. The sender does a very
7512198SEiji.Ota@Sun.COM * inexpensive bit test to test if the port it's about to send to is congested
7612198SEiji.Ota@Sun.COM * or not.
7712198SEiji.Ota@Sun.COM */
7812198SEiji.Ota@Sun.COM
/*
 * Interaction with poll is a tad tricky. We want all processes stuck in
 * poll to wake up and check whether a congested destination became
 * uncongested. The really sad thing is we have no idea which destinations
 * the application wants to send to - we don't even know which
 * rdsv3_connections are involved. So until we implement a more flexible
 * rds poll interface, we have to make do with this:
 *
 * We maintain a global counter that is incremented each time a congestion
 * map update is received (see rdsv3_cong_map_updated()). Each rds socket
 * tracks the last value it saw; rdsv3_cong_updated_since() reports a change
 * whenever the saved value differs from the global generation number, and
 * the poll path uses that to wake up the process.
 */
static atomic_t rdsv3_cong_generation = ATOMIC_INIT(0);
9212198SEiji.Ota@Sun.COM
/*
 * Congestion monitoring
 *
 * rdsv3_cong_monitor is the list of sockets (linked through rs_cong_list)
 * that rdsv3_cong_map_updated() notifies when a port in their rs_cong_mask
 * is reported in an update. rdsv3_cong_monitor_lock is taken as writer to
 * add or remove a socket and as reader to walk the list.
 */
static struct list rdsv3_cong_monitor;
static krwlock_t rdsv3_cong_monitor_lock;
9812198SEiji.Ota@Sun.COM
/*
 * Yes, a global lock. It's used so infrequently that it's worth keeping it
 * global to simplify the locking. It's only used in the following
 * circumstances:
 *
 * - on connection buildup to associate a conn with its maps
 * - on connection teardown to take a conn off its map's list
 * - on map changes to inform conns of a new map to send
 * - on socket removal to look up the map for the bound address
 *
 * It's sadly ordered under the socket callback lock and the connection lock.
 *
 * NOTE(review): the comment inherited from the OFED source says receive
 * paths can mark ports congested from interrupt context and that the lock
 * therefore masks interrupts. In this file the lock is a kmutex_t set up by
 * mutex_init(..., MUTEX_DRIVER, NULL) in rdsv3_cong_init(); confirm whether
 * it is ever taken from interrupt context.
 *
 * rdsv3_cong_tree holds one rdsv3_cong_map per address, ordered by
 * rdsv3_cong_compare(), and is protected by rdsv3_cong_lock.
 */
static kmutex_t rdsv3_cong_lock;
static struct avl_tree rdsv3_cong_tree;
11312198SEiji.Ota@Sun.COM
11412198SEiji.Ota@Sun.COM static struct rdsv3_cong_map *
rdsv3_cong_tree_walk(uint32_be_t addr,struct rdsv3_cong_map * insert)11512198SEiji.Ota@Sun.COM rdsv3_cong_tree_walk(uint32_be_t addr, struct rdsv3_cong_map *insert)
11612198SEiji.Ota@Sun.COM {
11712198SEiji.Ota@Sun.COM struct rdsv3_cong_map *map;
11812198SEiji.Ota@Sun.COM avl_index_t where;
11912198SEiji.Ota@Sun.COM
12012198SEiji.Ota@Sun.COM if (insert) {
12112198SEiji.Ota@Sun.COM map = avl_find(&rdsv3_cong_tree, insert, &where);
12212198SEiji.Ota@Sun.COM if (map == NULL) {
12312198SEiji.Ota@Sun.COM avl_insert(&rdsv3_cong_tree, insert, where);
12412198SEiji.Ota@Sun.COM return (NULL);
12512198SEiji.Ota@Sun.COM }
12612198SEiji.Ota@Sun.COM } else {
12712198SEiji.Ota@Sun.COM struct rdsv3_cong_map map1;
12812198SEiji.Ota@Sun.COM map1.m_addr = addr;
12912198SEiji.Ota@Sun.COM map = avl_find(&rdsv3_cong_tree, &map1, &where);
13012198SEiji.Ota@Sun.COM }
13112198SEiji.Ota@Sun.COM
13212198SEiji.Ota@Sun.COM return (map);
13312198SEiji.Ota@Sun.COM }
13412198SEiji.Ota@Sun.COM
13512198SEiji.Ota@Sun.COM /*
13612198SEiji.Ota@Sun.COM * There is only ever one bitmap for any address. Connections try and allocate
13712198SEiji.Ota@Sun.COM * these bitmaps in the process getting pointers to them. The bitmaps are only
13812198SEiji.Ota@Sun.COM * ever freed as the module is removed after all connections have been freed.
13912198SEiji.Ota@Sun.COM */
14012198SEiji.Ota@Sun.COM static struct rdsv3_cong_map *
rdsv3_cong_from_addr(uint32_be_t addr)14112198SEiji.Ota@Sun.COM rdsv3_cong_from_addr(uint32_be_t addr)
14212198SEiji.Ota@Sun.COM {
14312198SEiji.Ota@Sun.COM struct rdsv3_cong_map *map;
14412198SEiji.Ota@Sun.COM struct rdsv3_cong_map *ret = NULL;
14512198SEiji.Ota@Sun.COM unsigned long zp;
14612198SEiji.Ota@Sun.COM unsigned long i;
14712198SEiji.Ota@Sun.COM
14812198SEiji.Ota@Sun.COM RDSV3_DPRINTF4("rdsv3_cong_from_addr", "Enter(addr: %x)", ntohl(addr));
14912198SEiji.Ota@Sun.COM
15012198SEiji.Ota@Sun.COM map = kmem_zalloc(sizeof (struct rdsv3_cong_map), KM_NOSLEEP);
15112676SEiji.Ota@Sun.COM if (!map)
15212198SEiji.Ota@Sun.COM return (NULL);
15312198SEiji.Ota@Sun.COM
15412198SEiji.Ota@Sun.COM map->m_addr = addr;
15512198SEiji.Ota@Sun.COM rdsv3_init_waitqueue(&map->m_waitq);
15612198SEiji.Ota@Sun.COM list_create(&map->m_conn_list, sizeof (struct rdsv3_connection),
15712198SEiji.Ota@Sun.COM offsetof(struct rdsv3_connection, c_map_item));
15812198SEiji.Ota@Sun.COM
15912198SEiji.Ota@Sun.COM for (i = 0; i < RDSV3_CONG_MAP_PAGES; i++) {
16012198SEiji.Ota@Sun.COM zp = (unsigned long)kmem_zalloc(PAGE_SIZE, KM_NOSLEEP);
16112198SEiji.Ota@Sun.COM if (zp == 0)
16212198SEiji.Ota@Sun.COM goto out;
16312198SEiji.Ota@Sun.COM map->m_page_addrs[i] = zp;
16412198SEiji.Ota@Sun.COM }
16512198SEiji.Ota@Sun.COM
16612198SEiji.Ota@Sun.COM mutex_enter(&rdsv3_cong_lock);
16712198SEiji.Ota@Sun.COM ret = rdsv3_cong_tree_walk(addr, map);
16812198SEiji.Ota@Sun.COM mutex_exit(&rdsv3_cong_lock);
16912198SEiji.Ota@Sun.COM
17012676SEiji.Ota@Sun.COM if (!ret) {
17112198SEiji.Ota@Sun.COM ret = map;
17212198SEiji.Ota@Sun.COM map = NULL;
17312198SEiji.Ota@Sun.COM }
17412198SEiji.Ota@Sun.COM
17512198SEiji.Ota@Sun.COM out:
17612198SEiji.Ota@Sun.COM if (map) {
17712198SEiji.Ota@Sun.COM for (i = 0; i < RDSV3_CONG_MAP_PAGES && map->m_page_addrs[i];
17812198SEiji.Ota@Sun.COM i++)
17912198SEiji.Ota@Sun.COM kmem_free((void *)map->m_page_addrs[i], PAGE_SIZE);
18012198SEiji.Ota@Sun.COM kmem_free(map, sizeof (*map));
18112198SEiji.Ota@Sun.COM }
18212198SEiji.Ota@Sun.COM
18312198SEiji.Ota@Sun.COM RDSV3_DPRINTF5("rdsv3_cong_from_addr", "map %p for addr %x",
18412198SEiji.Ota@Sun.COM ret, ntohl(addr));
18512198SEiji.Ota@Sun.COM
18612198SEiji.Ota@Sun.COM return (ret);
18712198SEiji.Ota@Sun.COM }
18812198SEiji.Ota@Sun.COM
18912198SEiji.Ota@Sun.COM /*
19012198SEiji.Ota@Sun.COM * Put the conn on its local map's list. This is called when the conn is
19112198SEiji.Ota@Sun.COM * really added to the hash. It's nested under the rdsv3_conn_lock, sadly.
19212198SEiji.Ota@Sun.COM */
19312198SEiji.Ota@Sun.COM void
rdsv3_cong_add_conn(struct rdsv3_connection * conn)19412198SEiji.Ota@Sun.COM rdsv3_cong_add_conn(struct rdsv3_connection *conn)
19512198SEiji.Ota@Sun.COM {
19612198SEiji.Ota@Sun.COM RDSV3_DPRINTF4("rdsv3_cong_add_conn", "Enter(conn: %p)", conn);
19712198SEiji.Ota@Sun.COM
19812198SEiji.Ota@Sun.COM RDSV3_DPRINTF5("rdsv3_cong_add_conn", "conn %p now on map %p",
19912198SEiji.Ota@Sun.COM conn, conn->c_lcong);
20012198SEiji.Ota@Sun.COM mutex_enter(&rdsv3_cong_lock);
20112198SEiji.Ota@Sun.COM list_insert_tail(&conn->c_lcong->m_conn_list, conn);
20212198SEiji.Ota@Sun.COM mutex_exit(&rdsv3_cong_lock);
20312198SEiji.Ota@Sun.COM
20412198SEiji.Ota@Sun.COM RDSV3_DPRINTF4("rdsv3_cong_add_conn", "Return(conn: %p)", conn);
20512198SEiji.Ota@Sun.COM }
20612198SEiji.Ota@Sun.COM
20712198SEiji.Ota@Sun.COM void
rdsv3_cong_remove_conn(struct rdsv3_connection * conn)20812198SEiji.Ota@Sun.COM rdsv3_cong_remove_conn(struct rdsv3_connection *conn)
20912198SEiji.Ota@Sun.COM {
21012198SEiji.Ota@Sun.COM RDSV3_DPRINTF4("rdsv3_cong_remove_conn", "Enter(conn: %p)", conn);
21112198SEiji.Ota@Sun.COM
21212198SEiji.Ota@Sun.COM RDSV3_DPRINTF5("rdsv3_cong_remove_conn", "removing conn %p from map %p",
21312198SEiji.Ota@Sun.COM conn, conn->c_lcong);
21412198SEiji.Ota@Sun.COM mutex_enter(&rdsv3_cong_lock);
21512198SEiji.Ota@Sun.COM list_remove_node(&conn->c_map_item);
21612198SEiji.Ota@Sun.COM mutex_exit(&rdsv3_cong_lock);
21712198SEiji.Ota@Sun.COM
21812198SEiji.Ota@Sun.COM RDSV3_DPRINTF4("rdsv3_cong_remove_conn", "Return(conn: %p)", conn);
21912198SEiji.Ota@Sun.COM }
22012198SEiji.Ota@Sun.COM
22112198SEiji.Ota@Sun.COM int
rdsv3_cong_get_maps(struct rdsv3_connection * conn)22212198SEiji.Ota@Sun.COM rdsv3_cong_get_maps(struct rdsv3_connection *conn)
22312198SEiji.Ota@Sun.COM {
22412198SEiji.Ota@Sun.COM conn->c_lcong = rdsv3_cong_from_addr(conn->c_laddr);
22512198SEiji.Ota@Sun.COM conn->c_fcong = rdsv3_cong_from_addr(conn->c_faddr);
22612198SEiji.Ota@Sun.COM
22712676SEiji.Ota@Sun.COM if (!(conn->c_lcong && conn->c_fcong))
22812198SEiji.Ota@Sun.COM return (-ENOMEM);
22912198SEiji.Ota@Sun.COM
23012198SEiji.Ota@Sun.COM return (0);
23112198SEiji.Ota@Sun.COM }
23212198SEiji.Ota@Sun.COM
23312198SEiji.Ota@Sun.COM void
rdsv3_cong_queue_updates(struct rdsv3_cong_map * map)23412198SEiji.Ota@Sun.COM rdsv3_cong_queue_updates(struct rdsv3_cong_map *map)
23512198SEiji.Ota@Sun.COM {
23612198SEiji.Ota@Sun.COM struct rdsv3_connection *conn;
23712198SEiji.Ota@Sun.COM
23812198SEiji.Ota@Sun.COM RDSV3_DPRINTF4("rdsv3_cong_queue_updates", "Enter(map: %p)", map);
23912198SEiji.Ota@Sun.COM
24012198SEiji.Ota@Sun.COM mutex_enter(&rdsv3_cong_lock);
24112198SEiji.Ota@Sun.COM
24212198SEiji.Ota@Sun.COM RDSV3_FOR_EACH_LIST_NODE(conn, &map->m_conn_list, c_map_item) {
24312198SEiji.Ota@Sun.COM if (!test_and_set_bit(0, &conn->c_map_queued)) {
24412198SEiji.Ota@Sun.COM rdsv3_stats_inc(s_cong_update_queued);
24512676SEiji.Ota@Sun.COM (void) rdsv3_send_xmit(conn);
24612198SEiji.Ota@Sun.COM }
24712198SEiji.Ota@Sun.COM }
24812198SEiji.Ota@Sun.COM
24912198SEiji.Ota@Sun.COM mutex_exit(&rdsv3_cong_lock);
25012198SEiji.Ota@Sun.COM
25112198SEiji.Ota@Sun.COM RDSV3_DPRINTF4("rdsv3_cong_queue_updates", "Return(map: %p)", map);
25212198SEiji.Ota@Sun.COM }
25312198SEiji.Ota@Sun.COM
25412198SEiji.Ota@Sun.COM void
rdsv3_cong_map_updated(struct rdsv3_cong_map * map,uint64_t portmask)25512198SEiji.Ota@Sun.COM rdsv3_cong_map_updated(struct rdsv3_cong_map *map, uint64_t portmask)
25612198SEiji.Ota@Sun.COM {
25712198SEiji.Ota@Sun.COM RDSV3_DPRINTF4("rdsv3_cong_map_updated",
25812198SEiji.Ota@Sun.COM "waking map %p for %u.%u.%u.%u",
25912198SEiji.Ota@Sun.COM map, NIPQUAD(map->m_addr));
26012414SEiji.Ota@Sun.COM
26112198SEiji.Ota@Sun.COM rdsv3_stats_inc(s_cong_update_received);
26212198SEiji.Ota@Sun.COM atomic_add_32(&rdsv3_cong_generation, 1);
26312198SEiji.Ota@Sun.COM #if 0
26412198SEiji.Ota@Sun.COM XXX
26512198SEiji.Ota@Sun.COM if (waitqueue_active(&map->m_waitq))
26612198SEiji.Ota@Sun.COM #endif
26712198SEiji.Ota@Sun.COM rdsv3_wake_up(&map->m_waitq);
26812198SEiji.Ota@Sun.COM
26912198SEiji.Ota@Sun.COM if (portmask && !list_is_empty(&rdsv3_cong_monitor)) {
27012198SEiji.Ota@Sun.COM struct rdsv3_sock *rs;
27112198SEiji.Ota@Sun.COM
27212198SEiji.Ota@Sun.COM rw_enter(&rdsv3_cong_monitor_lock, RW_READER);
27312198SEiji.Ota@Sun.COM RDSV3_FOR_EACH_LIST_NODE(rs, &rdsv3_cong_monitor,
27412198SEiji.Ota@Sun.COM rs_cong_list) {
27512198SEiji.Ota@Sun.COM mutex_enter(&rs->rs_lock);
27612198SEiji.Ota@Sun.COM rs->rs_cong_notify |= (rs->rs_cong_mask & portmask);
27712198SEiji.Ota@Sun.COM rs->rs_cong_mask &= ~portmask;
27812198SEiji.Ota@Sun.COM mutex_exit(&rs->rs_lock);
27912198SEiji.Ota@Sun.COM if (rs->rs_cong_notify)
28012198SEiji.Ota@Sun.COM rdsv3_wake_sk_sleep(rs);
28112198SEiji.Ota@Sun.COM }
28212198SEiji.Ota@Sun.COM rw_exit(&rdsv3_cong_monitor_lock);
28312198SEiji.Ota@Sun.COM }
28412198SEiji.Ota@Sun.COM
28512198SEiji.Ota@Sun.COM RDSV3_DPRINTF4("rdsv3_cong_map_updated", "Return(map: %p)", map);
28612198SEiji.Ota@Sun.COM }
28712198SEiji.Ota@Sun.COM
28812198SEiji.Ota@Sun.COM int
rdsv3_cong_updated_since(unsigned long * recent)28912198SEiji.Ota@Sun.COM rdsv3_cong_updated_since(unsigned long *recent)
29012198SEiji.Ota@Sun.COM {
29112198SEiji.Ota@Sun.COM unsigned long gen = atomic_get(&rdsv3_cong_generation);
29212198SEiji.Ota@Sun.COM
29312198SEiji.Ota@Sun.COM if (*recent == gen)
29412198SEiji.Ota@Sun.COM return (0);
29512198SEiji.Ota@Sun.COM *recent = gen;
29612198SEiji.Ota@Sun.COM return (1);
29712198SEiji.Ota@Sun.COM }
29812198SEiji.Ota@Sun.COM
29912198SEiji.Ota@Sun.COM /*
30012198SEiji.Ota@Sun.COM * We're called under the locking that protects the sockets receive buffer
30112198SEiji.Ota@Sun.COM * consumption. This makes it a lot easier for the caller to only call us
30212198SEiji.Ota@Sun.COM * when it knows that an existing set bit needs to be cleared, and vice versa.
30312198SEiji.Ota@Sun.COM * We can't block and we need to deal with concurrent sockets working against
30412198SEiji.Ota@Sun.COM * the same per-address map.
30512198SEiji.Ota@Sun.COM */
30612198SEiji.Ota@Sun.COM void
rdsv3_cong_set_bit(struct rdsv3_cong_map * map,uint16_be_t port)30712198SEiji.Ota@Sun.COM rdsv3_cong_set_bit(struct rdsv3_cong_map *map, uint16_be_t port)
30812198SEiji.Ota@Sun.COM {
30912198SEiji.Ota@Sun.COM unsigned long i;
31012198SEiji.Ota@Sun.COM unsigned long off;
31112198SEiji.Ota@Sun.COM
31212198SEiji.Ota@Sun.COM RDSV3_DPRINTF4("rdsv3_cong_set_bit",
31312198SEiji.Ota@Sun.COM "setting congestion for %u.%u.%u.%u:%u in map %p",
31412198SEiji.Ota@Sun.COM NIPQUAD(map->m_addr), ntohs(port), map);
31512198SEiji.Ota@Sun.COM
31612198SEiji.Ota@Sun.COM i = ntohs(port) / RDSV3_CONG_MAP_PAGE_BITS;
31712198SEiji.Ota@Sun.COM off = ntohs(port) % RDSV3_CONG_MAP_PAGE_BITS;
31812414SEiji.Ota@Sun.COM set_le_bit(off, (void *)map->m_page_addrs[i]);
31912198SEiji.Ota@Sun.COM }
32012198SEiji.Ota@Sun.COM
32112198SEiji.Ota@Sun.COM void
rdsv3_cong_clear_bit(struct rdsv3_cong_map * map,uint16_be_t port)32212198SEiji.Ota@Sun.COM rdsv3_cong_clear_bit(struct rdsv3_cong_map *map, uint16_be_t port)
32312198SEiji.Ota@Sun.COM {
32412198SEiji.Ota@Sun.COM unsigned long i;
32512198SEiji.Ota@Sun.COM unsigned long off;
32612198SEiji.Ota@Sun.COM
32712198SEiji.Ota@Sun.COM RDSV3_DPRINTF4("rdsv3_cong_clear_bit",
32812198SEiji.Ota@Sun.COM "clearing congestion for %u.%u.%u.%u:%u in map %p\n",
32912198SEiji.Ota@Sun.COM NIPQUAD(map->m_addr), ntohs(port), map);
33012198SEiji.Ota@Sun.COM
33112198SEiji.Ota@Sun.COM i = ntohs(port) / RDSV3_CONG_MAP_PAGE_BITS;
33212198SEiji.Ota@Sun.COM off = ntohs(port) % RDSV3_CONG_MAP_PAGE_BITS;
33312414SEiji.Ota@Sun.COM clear_le_bit(off, (void *)map->m_page_addrs[i]);
33412198SEiji.Ota@Sun.COM }
33512198SEiji.Ota@Sun.COM
33612198SEiji.Ota@Sun.COM static int
rdsv3_cong_test_bit(struct rdsv3_cong_map * map,uint16_be_t port)33712198SEiji.Ota@Sun.COM rdsv3_cong_test_bit(struct rdsv3_cong_map *map, uint16_be_t port)
33812198SEiji.Ota@Sun.COM {
33912198SEiji.Ota@Sun.COM unsigned long i;
34012198SEiji.Ota@Sun.COM unsigned long off;
34112198SEiji.Ota@Sun.COM
34212198SEiji.Ota@Sun.COM i = ntohs(port) / RDSV3_CONG_MAP_PAGE_BITS;
34312198SEiji.Ota@Sun.COM off = ntohs(port) % RDSV3_CONG_MAP_PAGE_BITS;
34412198SEiji.Ota@Sun.COM
34512198SEiji.Ota@Sun.COM RDSV3_DPRINTF5("rdsv3_cong_test_bit", "port: 0x%x i = %lx off = %lx",
34612198SEiji.Ota@Sun.COM ntohs(port), i, off);
34712198SEiji.Ota@Sun.COM
34812414SEiji.Ota@Sun.COM return (test_le_bit(off, (void *)map->m_page_addrs[i]));
34912198SEiji.Ota@Sun.COM }
35012198SEiji.Ota@Sun.COM
35112198SEiji.Ota@Sun.COM void
rdsv3_cong_add_socket(struct rdsv3_sock * rs)35212198SEiji.Ota@Sun.COM rdsv3_cong_add_socket(struct rdsv3_sock *rs)
35312198SEiji.Ota@Sun.COM {
35412198SEiji.Ota@Sun.COM RDSV3_DPRINTF4("rdsv3_cong_add_socket", "Enter(rs: %p)", rs);
35512198SEiji.Ota@Sun.COM
35612198SEiji.Ota@Sun.COM rw_enter(&rdsv3_cong_monitor_lock, RW_WRITER);
35712198SEiji.Ota@Sun.COM if (!list_link_active(&rs->rs_cong_list))
35812198SEiji.Ota@Sun.COM list_insert_head(&rdsv3_cong_monitor, rs);
35912198SEiji.Ota@Sun.COM rw_exit(&rdsv3_cong_monitor_lock);
36012198SEiji.Ota@Sun.COM }
36112198SEiji.Ota@Sun.COM
36212198SEiji.Ota@Sun.COM void
rdsv3_cong_remove_socket(struct rdsv3_sock * rs)36312198SEiji.Ota@Sun.COM rdsv3_cong_remove_socket(struct rdsv3_sock *rs)
36412198SEiji.Ota@Sun.COM {
36512198SEiji.Ota@Sun.COM struct rdsv3_cong_map *map;
36612198SEiji.Ota@Sun.COM
36712198SEiji.Ota@Sun.COM RDSV3_DPRINTF4("rdsv3_cong_remove_socket", "Enter(rs: %p)", rs);
36812198SEiji.Ota@Sun.COM
36912198SEiji.Ota@Sun.COM rw_enter(&rdsv3_cong_monitor_lock, RW_WRITER);
37012198SEiji.Ota@Sun.COM list_remove_node(&rs->rs_cong_list);
37112198SEiji.Ota@Sun.COM rw_exit(&rdsv3_cong_monitor_lock);
37212198SEiji.Ota@Sun.COM
37312198SEiji.Ota@Sun.COM /* update congestion map for now-closed port */
37412198SEiji.Ota@Sun.COM mutex_enter(&rdsv3_cong_lock);
37512198SEiji.Ota@Sun.COM map = rdsv3_cong_tree_walk(rs->rs_bound_addr, NULL);
37612198SEiji.Ota@Sun.COM mutex_exit(&rdsv3_cong_lock);
37712198SEiji.Ota@Sun.COM
37812198SEiji.Ota@Sun.COM if (map && rdsv3_cong_test_bit(map, rs->rs_bound_port)) {
37912198SEiji.Ota@Sun.COM rdsv3_cong_clear_bit(map, rs->rs_bound_port);
38012198SEiji.Ota@Sun.COM rdsv3_cong_queue_updates(map);
38112198SEiji.Ota@Sun.COM }
38212198SEiji.Ota@Sun.COM }
38312198SEiji.Ota@Sun.COM
38412198SEiji.Ota@Sun.COM int
rdsv3_cong_wait(struct rdsv3_cong_map * map,uint16_be_t port,int nonblock,struct rdsv3_sock * rs)38512198SEiji.Ota@Sun.COM rdsv3_cong_wait(struct rdsv3_cong_map *map, uint16_be_t port, int nonblock,
38612198SEiji.Ota@Sun.COM struct rdsv3_sock *rs)
38712198SEiji.Ota@Sun.COM {
38812320SGiri.Adari@Sun.COM int ret = 0;
38912198SEiji.Ota@Sun.COM
39012198SEiji.Ota@Sun.COM RDSV3_DPRINTF4("rdsv3_cong_wait", "Enter(rs: %p, mode: %d)",
39112198SEiji.Ota@Sun.COM rs, nonblock);
39212198SEiji.Ota@Sun.COM
39312198SEiji.Ota@Sun.COM if (!rdsv3_cong_test_bit(map, port))
39412198SEiji.Ota@Sun.COM return (0);
39512198SEiji.Ota@Sun.COM if (nonblock) {
39612198SEiji.Ota@Sun.COM if (rs && rs->rs_cong_monitor) {
39712198SEiji.Ota@Sun.COM /*
39812198SEiji.Ota@Sun.COM * It would have been nice to have an atomic set_bit on
39912198SEiji.Ota@Sun.COM * a uint64_t.
40012198SEiji.Ota@Sun.COM */
40112198SEiji.Ota@Sun.COM mutex_enter(&rs->rs_lock);
40212198SEiji.Ota@Sun.COM rs->rs_cong_mask |=
40312863SEiji.Ota@Sun.COM RDS_CONG_MONITOR_MASK(ntohs(port));
40412198SEiji.Ota@Sun.COM mutex_exit(&rs->rs_lock);
40512198SEiji.Ota@Sun.COM
40612198SEiji.Ota@Sun.COM /*
40712198SEiji.Ota@Sun.COM * Test again - a congestion update may have arrived in
40812198SEiji.Ota@Sun.COM * the meantime.
40912198SEiji.Ota@Sun.COM */
41012198SEiji.Ota@Sun.COM if (!rdsv3_cong_test_bit(map, port))
41112198SEiji.Ota@Sun.COM return (0);
41212198SEiji.Ota@Sun.COM }
41312198SEiji.Ota@Sun.COM rdsv3_stats_inc(s_cong_send_error);
41412198SEiji.Ota@Sun.COM return (-ENOBUFS);
41512198SEiji.Ota@Sun.COM }
41612198SEiji.Ota@Sun.COM
41712198SEiji.Ota@Sun.COM rdsv3_stats_inc(s_cong_send_blocked);
41812198SEiji.Ota@Sun.COM RDSV3_DPRINTF3("rdsv3_cong_wait", "waiting on map %p for port %u",
41912198SEiji.Ota@Sun.COM map, ntohs(port));
42012198SEiji.Ota@Sun.COM
42112320SGiri.Adari@Sun.COM #if 0
42212320SGiri.Adari@Sun.COM ret = rdsv3_wait_sig(&map->m_waitq, !rdsv3_cong_test_bit(map, port));
42312320SGiri.Adari@Sun.COM if (ret == 0)
42412320SGiri.Adari@Sun.COM return (-ERESTART);
42512320SGiri.Adari@Sun.COM return (0);
42612320SGiri.Adari@Sun.COM #else
42712198SEiji.Ota@Sun.COM mutex_enter(&map->m_waitq.waitq_mutex);
42812320SGiri.Adari@Sun.COM map->m_waitq.waitq_waiters++;
42912198SEiji.Ota@Sun.COM while (rdsv3_cong_test_bit(map, port)) {
43012320SGiri.Adari@Sun.COM ret = cv_wait_sig(&map->m_waitq.waitq_cv,
43112320SGiri.Adari@Sun.COM &map->m_waitq.waitq_mutex);
43212320SGiri.Adari@Sun.COM if (ret == 0) {
433*13118SEiji.Ota@Sun.COM ret = -EINTR;
43412198SEiji.Ota@Sun.COM break;
43512198SEiji.Ota@Sun.COM }
43612198SEiji.Ota@Sun.COM }
43712320SGiri.Adari@Sun.COM map->m_waitq.waitq_waiters--;
43812198SEiji.Ota@Sun.COM mutex_exit(&map->m_waitq.waitq_mutex);
43912198SEiji.Ota@Sun.COM return (ret);
44012320SGiri.Adari@Sun.COM #endif
44112198SEiji.Ota@Sun.COM }
44212198SEiji.Ota@Sun.COM
44312198SEiji.Ota@Sun.COM void
rdsv3_cong_exit(void)44412198SEiji.Ota@Sun.COM rdsv3_cong_exit(void)
44512198SEiji.Ota@Sun.COM {
44612198SEiji.Ota@Sun.COM struct rdsv3_cong_map *map;
44712198SEiji.Ota@Sun.COM unsigned long i;
44812198SEiji.Ota@Sun.COM
44912198SEiji.Ota@Sun.COM RDSV3_DPRINTF4("rdsv3_cong_exit", "Enter");
45012198SEiji.Ota@Sun.COM
45112198SEiji.Ota@Sun.COM while ((map = avl_first(&rdsv3_cong_tree))) {
45212198SEiji.Ota@Sun.COM RDSV3_DPRINTF5("rdsv3_cong_exit", "freeing map %p\n", map);
45312198SEiji.Ota@Sun.COM avl_remove(&rdsv3_cong_tree, map);
45412198SEiji.Ota@Sun.COM for (i = 0; i < RDSV3_CONG_MAP_PAGES && map->m_page_addrs[i];
45512198SEiji.Ota@Sun.COM i++)
45612198SEiji.Ota@Sun.COM kmem_free((void *)map->m_page_addrs[i], PAGE_SIZE);
45712198SEiji.Ota@Sun.COM kmem_free(map, sizeof (*map));
45812198SEiji.Ota@Sun.COM }
45912198SEiji.Ota@Sun.COM
46012198SEiji.Ota@Sun.COM RDSV3_DPRINTF4("rdsv3_cong_exit", "Return");
46112198SEiji.Ota@Sun.COM }
46212198SEiji.Ota@Sun.COM
46312198SEiji.Ota@Sun.COM /*
46412198SEiji.Ota@Sun.COM * Allocate a RDS message containing a congestion update.
46512198SEiji.Ota@Sun.COM */
46612198SEiji.Ota@Sun.COM struct rdsv3_message *
rdsv3_cong_update_alloc(struct rdsv3_connection * conn)46712198SEiji.Ota@Sun.COM rdsv3_cong_update_alloc(struct rdsv3_connection *conn)
46812198SEiji.Ota@Sun.COM {
46912198SEiji.Ota@Sun.COM struct rdsv3_cong_map *map = conn->c_lcong;
47012198SEiji.Ota@Sun.COM struct rdsv3_message *rm;
47112198SEiji.Ota@Sun.COM
47212198SEiji.Ota@Sun.COM rm = rdsv3_message_map_pages(map->m_page_addrs, RDSV3_CONG_MAP_BYTES);
47312198SEiji.Ota@Sun.COM if (!IS_ERR(rm))
47412198SEiji.Ota@Sun.COM rm->m_inc.i_hdr.h_flags = RDSV3_FLAG_CONG_BITMAP;
47512198SEiji.Ota@Sun.COM
47612198SEiji.Ota@Sun.COM return (rm);
47712198SEiji.Ota@Sun.COM }
47812198SEiji.Ota@Sun.COM
47912198SEiji.Ota@Sun.COM static int
rdsv3_cong_compare(const void * map1,const void * map2)48012198SEiji.Ota@Sun.COM rdsv3_cong_compare(const void *map1, const void *map2)
48112198SEiji.Ota@Sun.COM {
48212198SEiji.Ota@Sun.COM #define addr1 ((struct rdsv3_cong_map *)map1)->m_addr
48312198SEiji.Ota@Sun.COM #define addr2 ((struct rdsv3_cong_map *)map2)->m_addr
48412198SEiji.Ota@Sun.COM
48512198SEiji.Ota@Sun.COM if (addr1 < addr2)
48612198SEiji.Ota@Sun.COM return (-1);
48712198SEiji.Ota@Sun.COM if (addr1 > addr2)
48812198SEiji.Ota@Sun.COM return (1);
48912198SEiji.Ota@Sun.COM return (0);
49012198SEiji.Ota@Sun.COM }
49112198SEiji.Ota@Sun.COM
49212198SEiji.Ota@Sun.COM void
rdsv3_cong_init(void)49312198SEiji.Ota@Sun.COM rdsv3_cong_init(void)
49412198SEiji.Ota@Sun.COM {
49512198SEiji.Ota@Sun.COM list_create(&rdsv3_cong_monitor, sizeof (struct rdsv3_sock),
49612198SEiji.Ota@Sun.COM offsetof(struct rdsv3_sock, rs_cong_list));
49712198SEiji.Ota@Sun.COM rw_init(&rdsv3_cong_monitor_lock, NULL, RW_DRIVER, NULL);
49812198SEiji.Ota@Sun.COM mutex_init(&rdsv3_cong_lock, NULL, MUTEX_DRIVER, NULL);
49912198SEiji.Ota@Sun.COM avl_create(&rdsv3_cong_tree, rdsv3_cong_compare,
50012198SEiji.Ota@Sun.COM sizeof (struct rdsv3_cong_map), offsetof(struct rdsv3_cong_map,
50112198SEiji.Ota@Sun.COM m_rb_node));
50212198SEiji.Ota@Sun.COM }
503