xref: /onnv-gate/usr/src/uts/common/io/ib/clients/rdsv3/cong.c (revision 13118:e192495818d4)
112198SEiji.Ota@Sun.COM /*
212198SEiji.Ota@Sun.COM  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
312198SEiji.Ota@Sun.COM  */
412198SEiji.Ota@Sun.COM 
512198SEiji.Ota@Sun.COM /*
612763SGiri.Adari@Sun.COM  * This file contains code imported from the OFED rds source file cong.c
712763SGiri.Adari@Sun.COM  * Oracle elects to have and use the contents of cong.c under and governed
812763SGiri.Adari@Sun.COM  * by the OpenIB.org BSD license (see below for full license text). However,
912763SGiri.Adari@Sun.COM  * the following notice accompanied the original version of this file:
1012763SGiri.Adari@Sun.COM  */
1112763SGiri.Adari@Sun.COM 
1212763SGiri.Adari@Sun.COM 
1312763SGiri.Adari@Sun.COM /*
1412198SEiji.Ota@Sun.COM  * Copyright (c) 2007 Oracle.  All rights reserved.
1512198SEiji.Ota@Sun.COM  *
1612198SEiji.Ota@Sun.COM  * This software is available to you under a choice of one of two
1712198SEiji.Ota@Sun.COM  * licenses.  You may choose to be licensed under the terms of the GNU
1812198SEiji.Ota@Sun.COM  * General Public License (GPL) Version 2, available from the file
1912198SEiji.Ota@Sun.COM  * COPYING in the main directory of this source tree, or the
2012198SEiji.Ota@Sun.COM  * OpenIB.org BSD license below:
2112198SEiji.Ota@Sun.COM  *
2212198SEiji.Ota@Sun.COM  *     Redistribution and use in source and binary forms, with or
2312198SEiji.Ota@Sun.COM  *     without modification, are permitted provided that the following
2412198SEiji.Ota@Sun.COM  *     conditions are met:
2512198SEiji.Ota@Sun.COM  *
2612198SEiji.Ota@Sun.COM  *      - Redistributions of source code must retain the above
2712198SEiji.Ota@Sun.COM  *        copyright notice, this list of conditions and the following
2812198SEiji.Ota@Sun.COM  *        disclaimer.
2912198SEiji.Ota@Sun.COM  *
3012198SEiji.Ota@Sun.COM  *      - Redistributions in binary form must reproduce the above
3112198SEiji.Ota@Sun.COM  *        copyright notice, this list of conditions and the following
3212198SEiji.Ota@Sun.COM  *        disclaimer in the documentation and/or other materials
3312198SEiji.Ota@Sun.COM  *        provided with the distribution.
3412198SEiji.Ota@Sun.COM  *
3512198SEiji.Ota@Sun.COM  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
3612198SEiji.Ota@Sun.COM  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
3712198SEiji.Ota@Sun.COM  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
3812198SEiji.Ota@Sun.COM  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
3912198SEiji.Ota@Sun.COM  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
4012198SEiji.Ota@Sun.COM  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
4112198SEiji.Ota@Sun.COM  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
4212198SEiji.Ota@Sun.COM  * SOFTWARE.
4312198SEiji.Ota@Sun.COM  *
4412198SEiji.Ota@Sun.COM  */
4512198SEiji.Ota@Sun.COM #include <sys/rds.h>
4612198SEiji.Ota@Sun.COM 
4712198SEiji.Ota@Sun.COM #include <sys/ib/clients/rdsv3/rdsv3.h>
4812198SEiji.Ota@Sun.COM #include <sys/ib/clients/rdsv3/rdsv3_impl.h>
4912198SEiji.Ota@Sun.COM #include <sys/ib/clients/rdsv3/rdsv3_debug.h>
5012198SEiji.Ota@Sun.COM 
5112198SEiji.Ota@Sun.COM /*
5212198SEiji.Ota@Sun.COM  * This file implements the receive side of the unconventional congestion
5312198SEiji.Ota@Sun.COM  * management in RDS.
5412198SEiji.Ota@Sun.COM  *
5512198SEiji.Ota@Sun.COM  * Messages waiting in the receive queue on the receiving socket are accounted
5612198SEiji.Ota@Sun.COM  * against the sockets SO_RCVBUF option value.  Only the payload bytes in the
5712198SEiji.Ota@Sun.COM  * message are accounted for.  If the number of bytes queued equals or exceeds
5812198SEiji.Ota@Sun.COM  * rcvbuf then the socket is congested.  All sends attempted to this socket's
5912198SEiji.Ota@Sun.COM  * address should return block or return -EWOULDBLOCK.
6012198SEiji.Ota@Sun.COM  *
6112198SEiji.Ota@Sun.COM  * Applications are expected to be reasonably tuned such that this situation
6212198SEiji.Ota@Sun.COM  * very rarely occurs.  An application encountering this "back-pressure" is
6312198SEiji.Ota@Sun.COM  * considered a bug.
6412198SEiji.Ota@Sun.COM  *
6512198SEiji.Ota@Sun.COM  * This is implemented by having each node maintain bitmaps which indicate
6612198SEiji.Ota@Sun.COM  * which ports on bound addresses are congested.  As the bitmap changes it is
6712198SEiji.Ota@Sun.COM  * sent through all the connections which terminate in the local address of the
6812198SEiji.Ota@Sun.COM  * bitmap which changed.
6912198SEiji.Ota@Sun.COM  *
7012198SEiji.Ota@Sun.COM  * The bitmaps are allocated as connections are brought up.  This avoids
7112198SEiji.Ota@Sun.COM  * allocation in the interrupt handling path which queues messages on sockets.
7212198SEiji.Ota@Sun.COM  * The dense bitmaps let transports send the entire bitmap on any bitmap change
7312198SEiji.Ota@Sun.COM  * reasonably efficiently.  This is much easier to implement than some
7412198SEiji.Ota@Sun.COM  * finer-grained communication of per-port congestion.  The sender does a very
7512198SEiji.Ota@Sun.COM  * inexpensive bit test to test if the port it's about to send to is congested
7612198SEiji.Ota@Sun.COM  * or not.
7712198SEiji.Ota@Sun.COM  */
7812198SEiji.Ota@Sun.COM 
7912198SEiji.Ota@Sun.COM /*
8012198SEiji.Ota@Sun.COM  * Interaction with poll is a tad tricky. We want all processes stuck in
8112198SEiji.Ota@Sun.COM  * poll to wake up and check whether a congested destination became uncongested.
8212198SEiji.Ota@Sun.COM  * The really sad thing is we have no idea which destinations the application
8312198SEiji.Ota@Sun.COM  * wants to send to - we don't even know which rdsv3_connections are involved.
8412198SEiji.Ota@Sun.COM  * So until we implement a more flexible rds poll interface, we have to make
8512198SEiji.Ota@Sun.COM  * do with this:
8612198SEiji.Ota@Sun.COM  * We maintain a global counter that is incremented each time a congestion map
8712198SEiji.Ota@Sun.COM  * update is received. Each rds socket tracks this value, and if rdsv3_poll
8812198SEiji.Ota@Sun.COM  * finds that the saved generation number is smaller than the global generation
8912198SEiji.Ota@Sun.COM  * number, it wakes up the process.
9012198SEiji.Ota@Sun.COM  */
9112198SEiji.Ota@Sun.COM static atomic_t		rdsv3_cong_generation = ATOMIC_INIT(0);
9212198SEiji.Ota@Sun.COM 
9312198SEiji.Ota@Sun.COM /*
9412198SEiji.Ota@Sun.COM  * Congestion monitoring
9512198SEiji.Ota@Sun.COM  */
9612198SEiji.Ota@Sun.COM static struct list rdsv3_cong_monitor;
9712198SEiji.Ota@Sun.COM static krwlock_t rdsv3_cong_monitor_lock;
9812198SEiji.Ota@Sun.COM 
9912198SEiji.Ota@Sun.COM /*
10012198SEiji.Ota@Sun.COM  * Yes, a global lock.  It's used so infrequently that it's worth keeping it
10112198SEiji.Ota@Sun.COM  * global to simplify the locking.  It's only used in the following
10212198SEiji.Ota@Sun.COM  * circumstances:
10312198SEiji.Ota@Sun.COM  *
10412198SEiji.Ota@Sun.COM  *  - on connection buildup to associate a conn with its maps
10512198SEiji.Ota@Sun.COM  *  - on map changes to inform conns of a new map to send
10612198SEiji.Ota@Sun.COM  *
10712198SEiji.Ota@Sun.COM  *  It's sadly ordered under the socket callback lock and the connection lock.
10812198SEiji.Ota@Sun.COM  *  Receive paths can mark ports congested from interrupt context so the
10912198SEiji.Ota@Sun.COM  *  lock masks interrupts.
11012198SEiji.Ota@Sun.COM  */
11112198SEiji.Ota@Sun.COM static kmutex_t rdsv3_cong_lock;
11212198SEiji.Ota@Sun.COM static struct avl_tree rdsv3_cong_tree;
11312198SEiji.Ota@Sun.COM 
11412198SEiji.Ota@Sun.COM static struct rdsv3_cong_map *
rdsv3_cong_tree_walk(uint32_be_t addr,struct rdsv3_cong_map * insert)11512198SEiji.Ota@Sun.COM rdsv3_cong_tree_walk(uint32_be_t addr, struct rdsv3_cong_map *insert)
11612198SEiji.Ota@Sun.COM {
11712198SEiji.Ota@Sun.COM 	struct rdsv3_cong_map *map;
11812198SEiji.Ota@Sun.COM 	avl_index_t where;
11912198SEiji.Ota@Sun.COM 
12012198SEiji.Ota@Sun.COM 	if (insert) {
12112198SEiji.Ota@Sun.COM 		map = avl_find(&rdsv3_cong_tree, insert, &where);
12212198SEiji.Ota@Sun.COM 		if (map == NULL) {
12312198SEiji.Ota@Sun.COM 			avl_insert(&rdsv3_cong_tree, insert, where);
12412198SEiji.Ota@Sun.COM 			return (NULL);
12512198SEiji.Ota@Sun.COM 		}
12612198SEiji.Ota@Sun.COM 	} else {
12712198SEiji.Ota@Sun.COM 		struct rdsv3_cong_map map1;
12812198SEiji.Ota@Sun.COM 		map1.m_addr = addr;
12912198SEiji.Ota@Sun.COM 		map = avl_find(&rdsv3_cong_tree, &map1, &where);
13012198SEiji.Ota@Sun.COM 	}
13112198SEiji.Ota@Sun.COM 
13212198SEiji.Ota@Sun.COM 	return (map);
13312198SEiji.Ota@Sun.COM }
13412198SEiji.Ota@Sun.COM 
13512198SEiji.Ota@Sun.COM /*
13612198SEiji.Ota@Sun.COM  * There is only ever one bitmap for any address.  Connections try and allocate
13712198SEiji.Ota@Sun.COM  * these bitmaps in the process getting pointers to them.  The bitmaps are only
13812198SEiji.Ota@Sun.COM  * ever freed as the module is removed after all connections have been freed.
13912198SEiji.Ota@Sun.COM  */
14012198SEiji.Ota@Sun.COM static struct rdsv3_cong_map *
rdsv3_cong_from_addr(uint32_be_t addr)14112198SEiji.Ota@Sun.COM rdsv3_cong_from_addr(uint32_be_t addr)
14212198SEiji.Ota@Sun.COM {
14312198SEiji.Ota@Sun.COM 	struct rdsv3_cong_map *map;
14412198SEiji.Ota@Sun.COM 	struct rdsv3_cong_map *ret = NULL;
14512198SEiji.Ota@Sun.COM 	unsigned long zp;
14612198SEiji.Ota@Sun.COM 	unsigned long i;
14712198SEiji.Ota@Sun.COM 
14812198SEiji.Ota@Sun.COM 	RDSV3_DPRINTF4("rdsv3_cong_from_addr", "Enter(addr: %x)", ntohl(addr));
14912198SEiji.Ota@Sun.COM 
15012198SEiji.Ota@Sun.COM 	map = kmem_zalloc(sizeof (struct rdsv3_cong_map), KM_NOSLEEP);
15112676SEiji.Ota@Sun.COM 	if (!map)
15212198SEiji.Ota@Sun.COM 		return (NULL);
15312198SEiji.Ota@Sun.COM 
15412198SEiji.Ota@Sun.COM 	map->m_addr = addr;
15512198SEiji.Ota@Sun.COM 	rdsv3_init_waitqueue(&map->m_waitq);
15612198SEiji.Ota@Sun.COM 	list_create(&map->m_conn_list, sizeof (struct rdsv3_connection),
15712198SEiji.Ota@Sun.COM 	    offsetof(struct rdsv3_connection, c_map_item));
15812198SEiji.Ota@Sun.COM 
15912198SEiji.Ota@Sun.COM 	for (i = 0; i < RDSV3_CONG_MAP_PAGES; i++) {
16012198SEiji.Ota@Sun.COM 		zp = (unsigned long)kmem_zalloc(PAGE_SIZE, KM_NOSLEEP);
16112198SEiji.Ota@Sun.COM 		if (zp == 0)
16212198SEiji.Ota@Sun.COM 			goto out;
16312198SEiji.Ota@Sun.COM 		map->m_page_addrs[i] = zp;
16412198SEiji.Ota@Sun.COM 	}
16512198SEiji.Ota@Sun.COM 
16612198SEiji.Ota@Sun.COM 	mutex_enter(&rdsv3_cong_lock);
16712198SEiji.Ota@Sun.COM 	ret = rdsv3_cong_tree_walk(addr, map);
16812198SEiji.Ota@Sun.COM 	mutex_exit(&rdsv3_cong_lock);
16912198SEiji.Ota@Sun.COM 
17012676SEiji.Ota@Sun.COM 	if (!ret) {
17112198SEiji.Ota@Sun.COM 		ret = map;
17212198SEiji.Ota@Sun.COM 		map = NULL;
17312198SEiji.Ota@Sun.COM 	}
17412198SEiji.Ota@Sun.COM 
17512198SEiji.Ota@Sun.COM out:
17612198SEiji.Ota@Sun.COM 	if (map) {
17712198SEiji.Ota@Sun.COM 		for (i = 0; i < RDSV3_CONG_MAP_PAGES && map->m_page_addrs[i];
17812198SEiji.Ota@Sun.COM 		    i++)
17912198SEiji.Ota@Sun.COM 			kmem_free((void *)map->m_page_addrs[i], PAGE_SIZE);
18012198SEiji.Ota@Sun.COM 		kmem_free(map, sizeof (*map));
18112198SEiji.Ota@Sun.COM 	}
18212198SEiji.Ota@Sun.COM 
18312198SEiji.Ota@Sun.COM 	RDSV3_DPRINTF5("rdsv3_cong_from_addr", "map %p for addr %x",
18412198SEiji.Ota@Sun.COM 	    ret, ntohl(addr));
18512198SEiji.Ota@Sun.COM 
18612198SEiji.Ota@Sun.COM 	return (ret);
18712198SEiji.Ota@Sun.COM }
18812198SEiji.Ota@Sun.COM 
18912198SEiji.Ota@Sun.COM /*
19012198SEiji.Ota@Sun.COM  * Put the conn on its local map's list.  This is called when the conn is
19112198SEiji.Ota@Sun.COM  * really added to the hash.  It's nested under the rdsv3_conn_lock, sadly.
19212198SEiji.Ota@Sun.COM  */
/*
 * Put "conn" on its local congestion map's connection list so it is
 * asked to transmit bitmap updates when the map changes.  Called when
 * the conn is really added to the hash; nested under rdsv3_conn_lock.
 * Assumes conn->c_lcong has already been set by rdsv3_cong_get_maps().
 */
void
rdsv3_cong_add_conn(struct rdsv3_connection *conn)
{
	RDSV3_DPRINTF4("rdsv3_cong_add_conn", "Enter(conn: %p)", conn);

	RDSV3_DPRINTF5("rdsv3_cong_add_conn", "conn %p now on map %p",
	    conn, conn->c_lcong);
	/* rdsv3_cong_lock serializes all access to a map's conn list. */
	mutex_enter(&rdsv3_cong_lock);
	list_insert_tail(&conn->c_lcong->m_conn_list, conn);
	mutex_exit(&rdsv3_cong_lock);

	RDSV3_DPRINTF4("rdsv3_cong_add_conn", "Return(conn: %p)", conn);
}
20612198SEiji.Ota@Sun.COM 
/*
 * Undo rdsv3_cong_add_conn(): unlink "conn" from its local congestion
 * map's connection list so it no longer receives bitmap-update work.
 */
void
rdsv3_cong_remove_conn(struct rdsv3_connection *conn)
{
	RDSV3_DPRINTF4("rdsv3_cong_remove_conn", "Enter(conn: %p)", conn);

	RDSV3_DPRINTF5("rdsv3_cong_remove_conn", "removing conn %p from map %p",
	    conn, conn->c_lcong);
	/* Same lock that protects insertion in rdsv3_cong_add_conn(). */
	mutex_enter(&rdsv3_cong_lock);
	list_remove_node(&conn->c_map_item);
	mutex_exit(&rdsv3_cong_lock);

	RDSV3_DPRINTF4("rdsv3_cong_remove_conn", "Return(conn: %p)", conn);
}
22012198SEiji.Ota@Sun.COM 
22112198SEiji.Ota@Sun.COM int
rdsv3_cong_get_maps(struct rdsv3_connection * conn)22212198SEiji.Ota@Sun.COM rdsv3_cong_get_maps(struct rdsv3_connection *conn)
22312198SEiji.Ota@Sun.COM {
22412198SEiji.Ota@Sun.COM 	conn->c_lcong = rdsv3_cong_from_addr(conn->c_laddr);
22512198SEiji.Ota@Sun.COM 	conn->c_fcong = rdsv3_cong_from_addr(conn->c_faddr);
22612198SEiji.Ota@Sun.COM 
22712676SEiji.Ota@Sun.COM 	if (!(conn->c_lcong && conn->c_fcong))
22812198SEiji.Ota@Sun.COM 		return (-ENOMEM);
22912198SEiji.Ota@Sun.COM 
23012198SEiji.Ota@Sun.COM 	return (0);
23112198SEiji.Ota@Sun.COM }
23212198SEiji.Ota@Sun.COM 
23312198SEiji.Ota@Sun.COM void
rdsv3_cong_queue_updates(struct rdsv3_cong_map * map)23412198SEiji.Ota@Sun.COM rdsv3_cong_queue_updates(struct rdsv3_cong_map *map)
23512198SEiji.Ota@Sun.COM {
23612198SEiji.Ota@Sun.COM 	struct rdsv3_connection *conn;
23712198SEiji.Ota@Sun.COM 
23812198SEiji.Ota@Sun.COM 	RDSV3_DPRINTF4("rdsv3_cong_queue_updates", "Enter(map: %p)", map);
23912198SEiji.Ota@Sun.COM 
24012198SEiji.Ota@Sun.COM 	mutex_enter(&rdsv3_cong_lock);
24112198SEiji.Ota@Sun.COM 
24212198SEiji.Ota@Sun.COM 	RDSV3_FOR_EACH_LIST_NODE(conn, &map->m_conn_list, c_map_item) {
24312198SEiji.Ota@Sun.COM 		if (!test_and_set_bit(0, &conn->c_map_queued)) {
24412198SEiji.Ota@Sun.COM 			rdsv3_stats_inc(s_cong_update_queued);
24512676SEiji.Ota@Sun.COM 			(void) rdsv3_send_xmit(conn);
24612198SEiji.Ota@Sun.COM 		}
24712198SEiji.Ota@Sun.COM 	}
24812198SEiji.Ota@Sun.COM 
24912198SEiji.Ota@Sun.COM 	mutex_exit(&rdsv3_cong_lock);
25012198SEiji.Ota@Sun.COM 
25112198SEiji.Ota@Sun.COM 	RDSV3_DPRINTF4("rdsv3_cong_queue_updates", "Return(map: %p)", map);
25212198SEiji.Ota@Sun.COM }
25312198SEiji.Ota@Sun.COM 
/*
 * Note that a congestion map update arrived for "map": bump the global
 * generation counter (so rdsv3_poll sees a change and wakes pollers),
 * wake anyone blocked in rdsv3_cong_wait() on this map, and notify any
 * monitoring sockets whose monitored-port mask intersects "portmask".
 */
void
rdsv3_cong_map_updated(struct rdsv3_cong_map *map, uint64_t portmask)
{
	RDSV3_DPRINTF4("rdsv3_cong_map_updated",
	    "waking map %p for %u.%u.%u.%u",
	    map, NIPQUAD(map->m_addr));

	rdsv3_stats_inc(s_cong_update_received);
	atomic_add_32(&rdsv3_cong_generation, 1);
#if 0
XXX
	if (waitqueue_active(&map->m_waitq))
#endif
		/* waitqueue_active() check disabled: wake unconditionally. */
		rdsv3_wake_up(&map->m_waitq);

	if (portmask && !list_is_empty(&rdsv3_cong_monitor)) {
		struct rdsv3_sock *rs;

		rw_enter(&rdsv3_cong_monitor_lock, RW_READER);
		RDSV3_FOR_EACH_LIST_NODE(rs, &rdsv3_cong_monitor,
		    rs_cong_list) {
			mutex_enter(&rs->rs_lock);
			/*
			 * Latch notifications for the ports this socket is
			 * monitoring, then disarm those ports until the
			 * socket re-arms them.
			 */
			rs->rs_cong_notify |= (rs->rs_cong_mask & portmask);
			rs->rs_cong_mask &= ~portmask;
			mutex_exit(&rs->rs_lock);
			if (rs->rs_cong_notify)
				rdsv3_wake_sk_sleep(rs);
		}
		rw_exit(&rdsv3_cong_monitor_lock);
	}

	RDSV3_DPRINTF4("rdsv3_cong_map_updated", "Return(map: %p)", map);
}
28712198SEiji.Ota@Sun.COM 
28812198SEiji.Ota@Sun.COM int
rdsv3_cong_updated_since(unsigned long * recent)28912198SEiji.Ota@Sun.COM rdsv3_cong_updated_since(unsigned long *recent)
29012198SEiji.Ota@Sun.COM {
29112198SEiji.Ota@Sun.COM 	unsigned long gen = atomic_get(&rdsv3_cong_generation);
29212198SEiji.Ota@Sun.COM 
29312198SEiji.Ota@Sun.COM 	if (*recent == gen)
29412198SEiji.Ota@Sun.COM 		return (0);
29512198SEiji.Ota@Sun.COM 	*recent = gen;
29612198SEiji.Ota@Sun.COM 	return (1);
29712198SEiji.Ota@Sun.COM }
29812198SEiji.Ota@Sun.COM 
29912198SEiji.Ota@Sun.COM /*
30012198SEiji.Ota@Sun.COM  * We're called under the locking that protects the sockets receive buffer
30112198SEiji.Ota@Sun.COM  * consumption.  This makes it a lot easier for the caller to only call us
30212198SEiji.Ota@Sun.COM  * when it knows that an existing set bit needs to be cleared, and vice versa.
30312198SEiji.Ota@Sun.COM  * We can't block and we need to deal with concurrent sockets working against
30412198SEiji.Ota@Sun.COM  * the same per-address map.
30512198SEiji.Ota@Sun.COM  */
30612198SEiji.Ota@Sun.COM void
rdsv3_cong_set_bit(struct rdsv3_cong_map * map,uint16_be_t port)30712198SEiji.Ota@Sun.COM rdsv3_cong_set_bit(struct rdsv3_cong_map *map, uint16_be_t port)
30812198SEiji.Ota@Sun.COM {
30912198SEiji.Ota@Sun.COM 	unsigned long i;
31012198SEiji.Ota@Sun.COM 	unsigned long off;
31112198SEiji.Ota@Sun.COM 
31212198SEiji.Ota@Sun.COM 	RDSV3_DPRINTF4("rdsv3_cong_set_bit",
31312198SEiji.Ota@Sun.COM 	    "setting congestion for %u.%u.%u.%u:%u in map %p",
31412198SEiji.Ota@Sun.COM 	    NIPQUAD(map->m_addr), ntohs(port), map);
31512198SEiji.Ota@Sun.COM 
31612198SEiji.Ota@Sun.COM 	i = ntohs(port) / RDSV3_CONG_MAP_PAGE_BITS;
31712198SEiji.Ota@Sun.COM 	off = ntohs(port) % RDSV3_CONG_MAP_PAGE_BITS;
31812414SEiji.Ota@Sun.COM 	set_le_bit(off, (void *)map->m_page_addrs[i]);
31912198SEiji.Ota@Sun.COM }
32012198SEiji.Ota@Sun.COM 
32112198SEiji.Ota@Sun.COM void
rdsv3_cong_clear_bit(struct rdsv3_cong_map * map,uint16_be_t port)32212198SEiji.Ota@Sun.COM rdsv3_cong_clear_bit(struct rdsv3_cong_map *map, uint16_be_t port)
32312198SEiji.Ota@Sun.COM {
32412198SEiji.Ota@Sun.COM 	unsigned long i;
32512198SEiji.Ota@Sun.COM 	unsigned long off;
32612198SEiji.Ota@Sun.COM 
32712198SEiji.Ota@Sun.COM 	RDSV3_DPRINTF4("rdsv3_cong_clear_bit",
32812198SEiji.Ota@Sun.COM 	    "clearing congestion for %u.%u.%u.%u:%u in map %p\n",
32912198SEiji.Ota@Sun.COM 	    NIPQUAD(map->m_addr), ntohs(port), map);
33012198SEiji.Ota@Sun.COM 
33112198SEiji.Ota@Sun.COM 	i = ntohs(port) / RDSV3_CONG_MAP_PAGE_BITS;
33212198SEiji.Ota@Sun.COM 	off = ntohs(port) % RDSV3_CONG_MAP_PAGE_BITS;
33312414SEiji.Ota@Sun.COM 	clear_le_bit(off, (void *)map->m_page_addrs[i]);
33412198SEiji.Ota@Sun.COM }
33512198SEiji.Ota@Sun.COM 
33612198SEiji.Ota@Sun.COM static int
rdsv3_cong_test_bit(struct rdsv3_cong_map * map,uint16_be_t port)33712198SEiji.Ota@Sun.COM rdsv3_cong_test_bit(struct rdsv3_cong_map *map, uint16_be_t port)
33812198SEiji.Ota@Sun.COM {
33912198SEiji.Ota@Sun.COM 	unsigned long i;
34012198SEiji.Ota@Sun.COM 	unsigned long off;
34112198SEiji.Ota@Sun.COM 
34212198SEiji.Ota@Sun.COM 	i = ntohs(port) / RDSV3_CONG_MAP_PAGE_BITS;
34312198SEiji.Ota@Sun.COM 	off = ntohs(port) % RDSV3_CONG_MAP_PAGE_BITS;
34412198SEiji.Ota@Sun.COM 
34512198SEiji.Ota@Sun.COM 	RDSV3_DPRINTF5("rdsv3_cong_test_bit", "port: 0x%x i = %lx off = %lx",
34612198SEiji.Ota@Sun.COM 	    ntohs(port), i, off);
34712198SEiji.Ota@Sun.COM 
34812414SEiji.Ota@Sun.COM 	return (test_le_bit(off, (void *)map->m_page_addrs[i]));
34912198SEiji.Ota@Sun.COM }
35012198SEiji.Ota@Sun.COM 
/*
 * Register "rs" for congestion-monitor notifications.  Idempotent:
 * a socket already on the monitor list (list_link_active) is skipped.
 */
void
rdsv3_cong_add_socket(struct rdsv3_sock *rs)
{
	RDSV3_DPRINTF4("rdsv3_cong_add_socket", "Enter(rs: %p)", rs);

	rw_enter(&rdsv3_cong_monitor_lock, RW_WRITER);
	if (!list_link_active(&rs->rs_cong_list))
		list_insert_head(&rdsv3_cong_monitor, rs);
	rw_exit(&rdsv3_cong_monitor_lock);
}
36112198SEiji.Ota@Sun.COM 
36212198SEiji.Ota@Sun.COM void
rdsv3_cong_remove_socket(struct rdsv3_sock * rs)36312198SEiji.Ota@Sun.COM rdsv3_cong_remove_socket(struct rdsv3_sock *rs)
36412198SEiji.Ota@Sun.COM {
36512198SEiji.Ota@Sun.COM 	struct rdsv3_cong_map *map;
36612198SEiji.Ota@Sun.COM 
36712198SEiji.Ota@Sun.COM 	RDSV3_DPRINTF4("rdsv3_cong_remove_socket", "Enter(rs: %p)", rs);
36812198SEiji.Ota@Sun.COM 
36912198SEiji.Ota@Sun.COM 	rw_enter(&rdsv3_cong_monitor_lock, RW_WRITER);
37012198SEiji.Ota@Sun.COM 	list_remove_node(&rs->rs_cong_list);
37112198SEiji.Ota@Sun.COM 	rw_exit(&rdsv3_cong_monitor_lock);
37212198SEiji.Ota@Sun.COM 
37312198SEiji.Ota@Sun.COM 	/* update congestion map for now-closed port */
37412198SEiji.Ota@Sun.COM 	mutex_enter(&rdsv3_cong_lock);
37512198SEiji.Ota@Sun.COM 	map = rdsv3_cong_tree_walk(rs->rs_bound_addr, NULL);
37612198SEiji.Ota@Sun.COM 	mutex_exit(&rdsv3_cong_lock);
37712198SEiji.Ota@Sun.COM 
37812198SEiji.Ota@Sun.COM 	if (map && rdsv3_cong_test_bit(map, rs->rs_bound_port)) {
37912198SEiji.Ota@Sun.COM 		rdsv3_cong_clear_bit(map, rs->rs_bound_port);
38012198SEiji.Ota@Sun.COM 		rdsv3_cong_queue_updates(map);
38112198SEiji.Ota@Sun.COM 	}
38212198SEiji.Ota@Sun.COM }
38312198SEiji.Ota@Sun.COM 
/*
 * Gate a send to "port" on its congestion state.
 *
 * Returns 0 immediately if the port is not congested.  For non-blocking
 * callers: if the socket has congestion monitoring enabled, arm the
 * monitor mask for this port (so the caller is notified when it clears),
 * re-test, and otherwise fail with -ENOBUFS.  Blocking callers sleep
 * interruptibly on the map's waitqueue until the bit clears or a signal
 * arrives (-EINTR).
 *
 * NOTE(review): on a normal wakeup, ret carries cv_wait_sig()'s positive
 * return value rather than 0 — callers appear to treat >= 0 as success;
 * confirm before relying on an exact-zero return.
 */
int
rdsv3_cong_wait(struct rdsv3_cong_map *map, uint16_be_t port, int nonblock,
    struct rdsv3_sock *rs)
{
	int ret = 0;

	RDSV3_DPRINTF4("rdsv3_cong_wait", "Enter(rs: %p, mode: %d)",
	    rs, nonblock);

	if (!rdsv3_cong_test_bit(map, port))
		return (0);
	if (nonblock) {
		if (rs && rs->rs_cong_monitor) {
			/*
			 * It would have been nice to have an atomic set_bit on
			 * a uint64_t.
			 */
			mutex_enter(&rs->rs_lock);
			rs->rs_cong_mask |=
			    RDS_CONG_MONITOR_MASK(ntohs(port));
			mutex_exit(&rs->rs_lock);

			/*
			 * Test again - a congestion update may have arrived in
			 * the meantime.
			 */
			if (!rdsv3_cong_test_bit(map, port))
				return (0);
		}
		rdsv3_stats_inc(s_cong_send_error);
		return (-ENOBUFS);
	}

	rdsv3_stats_inc(s_cong_send_blocked);
	RDSV3_DPRINTF3("rdsv3_cong_wait", "waiting on map %p for port %u",
	    map, ntohs(port));

#if 0
	ret = rdsv3_wait_sig(&map->m_waitq, !rdsv3_cong_test_bit(map, port));
	if (ret == 0)
		return (-ERESTART);
	return (0);
#else
	/* Open-coded interruptible wait on the map's waitqueue. */
	mutex_enter(&map->m_waitq.waitq_mutex);
	map->m_waitq.waitq_waiters++;
	while (rdsv3_cong_test_bit(map, port)) {
		/* cv_wait_sig() returns 0 when interrupted by a signal. */
		ret = cv_wait_sig(&map->m_waitq.waitq_cv,
		    &map->m_waitq.waitq_mutex);
		if (ret == 0) {
			ret = -EINTR;
			break;
		}
	}
	map->m_waitq.waitq_waiters--;
	mutex_exit(&map->m_waitq.waitq_mutex);
	return (ret);
#endif
}
44212198SEiji.Ota@Sun.COM 
44312198SEiji.Ota@Sun.COM void
rdsv3_cong_exit(void)44412198SEiji.Ota@Sun.COM rdsv3_cong_exit(void)
44512198SEiji.Ota@Sun.COM {
44612198SEiji.Ota@Sun.COM 	struct rdsv3_cong_map *map;
44712198SEiji.Ota@Sun.COM 	unsigned long i;
44812198SEiji.Ota@Sun.COM 
44912198SEiji.Ota@Sun.COM 	RDSV3_DPRINTF4("rdsv3_cong_exit", "Enter");
45012198SEiji.Ota@Sun.COM 
45112198SEiji.Ota@Sun.COM 	while ((map = avl_first(&rdsv3_cong_tree))) {
45212198SEiji.Ota@Sun.COM 		RDSV3_DPRINTF5("rdsv3_cong_exit", "freeing map %p\n", map);
45312198SEiji.Ota@Sun.COM 		avl_remove(&rdsv3_cong_tree, map);
45412198SEiji.Ota@Sun.COM 		for (i = 0; i < RDSV3_CONG_MAP_PAGES && map->m_page_addrs[i];
45512198SEiji.Ota@Sun.COM 		    i++)
45612198SEiji.Ota@Sun.COM 			kmem_free((void *)map->m_page_addrs[i], PAGE_SIZE);
45712198SEiji.Ota@Sun.COM 		kmem_free(map, sizeof (*map));
45812198SEiji.Ota@Sun.COM 	}
45912198SEiji.Ota@Sun.COM 
46012198SEiji.Ota@Sun.COM 	RDSV3_DPRINTF4("rdsv3_cong_exit", "Return");
46112198SEiji.Ota@Sun.COM }
46212198SEiji.Ota@Sun.COM 
46312198SEiji.Ota@Sun.COM /*
46412198SEiji.Ota@Sun.COM  * Allocate a RDS message containing a congestion update.
46512198SEiji.Ota@Sun.COM  */
46612198SEiji.Ota@Sun.COM struct rdsv3_message *
rdsv3_cong_update_alloc(struct rdsv3_connection * conn)46712198SEiji.Ota@Sun.COM rdsv3_cong_update_alloc(struct rdsv3_connection *conn)
46812198SEiji.Ota@Sun.COM {
46912198SEiji.Ota@Sun.COM 	struct rdsv3_cong_map *map = conn->c_lcong;
47012198SEiji.Ota@Sun.COM 	struct rdsv3_message *rm;
47112198SEiji.Ota@Sun.COM 
47212198SEiji.Ota@Sun.COM 	rm = rdsv3_message_map_pages(map->m_page_addrs, RDSV3_CONG_MAP_BYTES);
47312198SEiji.Ota@Sun.COM 	if (!IS_ERR(rm))
47412198SEiji.Ota@Sun.COM 		rm->m_inc.i_hdr.h_flags = RDSV3_FLAG_CONG_BITMAP;
47512198SEiji.Ota@Sun.COM 
47612198SEiji.Ota@Sun.COM 	return (rm);
47712198SEiji.Ota@Sun.COM }
47812198SEiji.Ota@Sun.COM 
47912198SEiji.Ota@Sun.COM static int
rdsv3_cong_compare(const void * map1,const void * map2)48012198SEiji.Ota@Sun.COM rdsv3_cong_compare(const void *map1, const void *map2)
48112198SEiji.Ota@Sun.COM {
48212198SEiji.Ota@Sun.COM #define	addr1	((struct rdsv3_cong_map *)map1)->m_addr
48312198SEiji.Ota@Sun.COM #define	addr2	((struct rdsv3_cong_map *)map2)->m_addr
48412198SEiji.Ota@Sun.COM 
48512198SEiji.Ota@Sun.COM 	if (addr1 < addr2)
48612198SEiji.Ota@Sun.COM 		return (-1);
48712198SEiji.Ota@Sun.COM 	if (addr1 > addr2)
48812198SEiji.Ota@Sun.COM 		return (1);
48912198SEiji.Ota@Sun.COM 	return (0);
49012198SEiji.Ota@Sun.COM }
49112198SEiji.Ota@Sun.COM 
49212198SEiji.Ota@Sun.COM void
rdsv3_cong_init(void)49312198SEiji.Ota@Sun.COM rdsv3_cong_init(void)
49412198SEiji.Ota@Sun.COM {
49512198SEiji.Ota@Sun.COM 	list_create(&rdsv3_cong_monitor, sizeof (struct rdsv3_sock),
49612198SEiji.Ota@Sun.COM 	    offsetof(struct rdsv3_sock, rs_cong_list));
49712198SEiji.Ota@Sun.COM 	rw_init(&rdsv3_cong_monitor_lock, NULL, RW_DRIVER, NULL);
49812198SEiji.Ota@Sun.COM 	mutex_init(&rdsv3_cong_lock, NULL, MUTEX_DRIVER, NULL);
49912198SEiji.Ota@Sun.COM 	avl_create(&rdsv3_cong_tree, rdsv3_cong_compare,
50012198SEiji.Ota@Sun.COM 	    sizeof (struct rdsv3_cong_map), offsetof(struct rdsv3_cong_map,
50112198SEiji.Ota@Sun.COM 	    m_rb_node));
50212198SEiji.Ota@Sun.COM }
503