xref: /openbsd-src/sys/net/pf_lb.c (revision 3374c67d44f9b75b98444cbf63020f777792342e)
1 /*	$OpenBSD: pf_lb.c,v 1.73 2023/01/04 10:31:55 dlg Exp $ */
2 
3 /*
4  * Copyright (c) 2001 Daniel Hartmeier
5  * Copyright (c) 2002 - 2008 Henning Brauer
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  *    - Redistributions of source code must retain the above copyright
13  *      notice, this list of conditions and the following disclaimer.
14  *    - Redistributions in binary form must reproduce the above
15  *      copyright notice, this list of conditions and the following
16  *      disclaimer in the documentation and/or other materials provided
17  *      with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
22  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
23  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
24  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
25  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
26  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
29  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30  * POSSIBILITY OF SUCH DAMAGE.
31  *
32  * Effort sponsored in part by the Defense Advanced Research Projects
33  * Agency (DARPA) and Air Force Research Laboratory, Air Force
34  * Materiel Command, USAF, under agreement number F30602-01-2-0537.
35  *
36  */
37 
38 #include "bpfilter.h"
39 #include "pflog.h"
40 #include "pfsync.h"
41 #include "pflow.h"
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/mbuf.h>
46 #include <sys/filio.h>
47 #include <sys/socket.h>
48 #include <sys/socketvar.h>
49 #include <sys/kernel.h>
50 #include <sys/time.h>
51 #include <sys/pool.h>
52 #include <sys/rwlock.h>
53 #include <sys/syslog.h>
54 #include <sys/stdint.h>
55 
56 #include <crypto/siphash.h>
57 
58 #include <net/if.h>
59 #include <net/bpf.h>
60 #include <net/route.h>
61 
62 #include <netinet/in.h>
63 #include <netinet/ip.h>
64 #include <netinet/in_pcb.h>
65 #include <netinet/ip_var.h>
66 #include <netinet/ip_icmp.h>
67 #include <netinet/icmp_var.h>
68 #include <netinet/tcp.h>
69 #include <netinet/tcp_seq.h>
70 #include <netinet/tcp_timer.h>
71 #include <netinet/udp.h>
72 #include <netinet/udp_var.h>
73 #include <netinet/if_ether.h>
74 
75 #ifdef INET6
76 #include <netinet/ip6.h>
77 #include <netinet/icmp6.h>
78 #endif /* INET6 */
79 
80 #include <net/pfvar.h>
81 #include <net/pfvar_priv.h>
82 
83 #if NPFLOG > 0
84 #include <net/if_pflog.h>
85 #endif	/* NPFLOG > 0 */
86 
87 #if NPFLOW > 0
88 #include <net/if_pflow.h>
89 #endif	/* NPFLOW > 0 */
90 
91 #if NPFSYNC > 0
92 #include <net/if_pfsync.h>
93 #endif /* NPFSYNC > 0 */
94 
95 u_int64_t		 pf_hash(struct pf_addr *, struct pf_addr *,
96 			    struct pf_poolhashkey *, sa_family_t);
97 int			 pf_get_sport(struct pf_pdesc *, struct pf_rule *,
98 			    struct pf_addr *, u_int16_t *, u_int16_t,
99 			    u_int16_t, struct pf_src_node **);
100 int			 pf_map_addr_states_increase(sa_family_t,
101 				struct pf_pool *, struct pf_addr *);
102 int			 pf_get_transaddr_af(struct pf_rule *,
103 			    struct pf_pdesc *, struct pf_src_node **);
104 int			 pf_map_addr_sticky(sa_family_t, struct pf_rule *,
105 			    struct pf_addr *, struct pf_addr *,
106 			    struct pf_src_node **, struct pf_pool *,
107 			    enum pf_sn_types);
108 
109 u_int64_t
110 pf_hash(struct pf_addr *inaddr, struct pf_addr *hash,
111     struct pf_poolhashkey *key, sa_family_t af)
112 {
113 	uint64_t res = 0;
114 #ifdef INET6
115 	union {
116 		uint64_t hash64;
117 		uint32_t hash32[2];
118 	} h;
119 #endif	/* INET6 */
120 
121 	switch (af) {
122 	case AF_INET:
123 		res = SipHash24((SIPHASH_KEY *)key,
124 		    &inaddr->addr32[0], sizeof(inaddr->addr32[0]));
125 		hash->addr32[0] = res;
126 		break;
127 #ifdef INET6
128 	case AF_INET6:
129 		res = SipHash24((SIPHASH_KEY *)key, &inaddr->addr32[0],
130 		    4 * sizeof(inaddr->addr32[0]));
131 		h.hash64 = res;
132 		hash->addr32[0] = h.hash32[0];
133 		hash->addr32[1] = h.hash32[1];
134 		/*
135 		 * siphash isn't big enough, but flipping it around is
136 		 * good enough here.
137 		 */
138 		hash->addr32[2] = ~h.hash32[1];
139 		hash->addr32[3] = ~h.hash32[0];
140 		break;
141 #endif /* INET6 */
142 	default:
143 		unhandled_af(af);
144 	}
145 	return (res);
146 }
147 
148 int
149 pf_get_sport(struct pf_pdesc *pd, struct pf_rule *r,
150     struct pf_addr *naddr, u_int16_t *nport, u_int16_t low, u_int16_t high,
151     struct pf_src_node **sn)
152 {
153 	struct pf_state_key_cmp	key;
154 	struct pf_addr		init_addr;
155 	u_int16_t		cut;
156 	int			dir = (pd->dir == PF_IN) ? PF_OUT : PF_IN;
157 	int			sidx = pd->sidx;
158 	int			didx = pd->didx;
159 
160 	memset(&init_addr, 0, sizeof(init_addr));
161 	if (pf_map_addr(pd->naf, r, &pd->nsaddr, naddr, &init_addr, sn, &r->nat,
162 	    PF_SN_NAT))
163 		return (1);
164 
165 	if (pd->proto == IPPROTO_ICMP) {
166 		if (pd->ndport == htons(ICMP_ECHO)) {
167 			low = 1;
168 			high = 65535;
169 		} else
170 			return (0);	/* Don't try to modify non-echo ICMP */
171 	}
172 #ifdef INET6
173 	if (pd->proto == IPPROTO_ICMPV6) {
174 		if (pd->ndport == htons(ICMP6_ECHO_REQUEST)) {
175 			low = 1;
176 			high = 65535;
177 		} else
178 			return (0);	/* Don't try to modify non-echo ICMP */
179 	}
180 #endif /* INET6 */
181 
182 	do {
183 		key.af = pd->naf;
184 		key.proto = pd->proto;
185 		key.rdomain = pd->rdomain;
186 		pf_addrcpy(&key.addr[didx], &pd->ndaddr, key.af);
187 		pf_addrcpy(&key.addr[sidx], naddr, key.af);
188 		key.port[didx] = pd->ndport;
189 
190 		/*
191 		 * port search; start random, step;
192 		 * similar 2 portloop in in_pcbbind
193 		 */
194 		if (!(pd->proto == IPPROTO_TCP || pd->proto == IPPROTO_UDP ||
195 		    pd->proto == IPPROTO_ICMP || pd->proto == IPPROTO_ICMPV6)) {
196 			/* XXX bug: icmp states dont use the id on both
197 			 * XXX sides (traceroute -I through nat) */
198 			key.port[sidx] = pd->nsport;
199 			if (pf_find_state_all(&key, dir, NULL) == NULL) {
200 				*nport = pd->nsport;
201 				return (0);
202 			}
203 		} else if (low == 0 && high == 0) {
204 			key.port[sidx] = pd->nsport;
205 			if (pf_find_state_all(&key, dir, NULL) == NULL) {
206 				*nport = pd->nsport;
207 				return (0);
208 			}
209 		} else if (low == high) {
210 			key.port[sidx] = htons(low);
211 			if (pf_find_state_all(&key, dir, NULL) == NULL) {
212 				*nport = htons(low);
213 				return (0);
214 			}
215 		} else {
216 			u_int32_t tmp;
217 
218 			if (low > high) {
219 				tmp = low;
220 				low = high;
221 				high = tmp;
222 			}
223 			/* low < high */
224 			cut = arc4random_uniform(1 + high - low) + low;
225 			/* low <= cut <= high */
226 			for (tmp = cut; tmp <= high && tmp <= 0xffff; ++tmp) {
227 				key.port[sidx] = htons(tmp);
228 				if (pf_find_state_all(&key, dir, NULL) ==
229 				    NULL && !in_baddynamic(tmp, pd->proto)) {
230 					*nport = htons(tmp);
231 					return (0);
232 				}
233 			}
234 			tmp = cut;
235 			for (tmp -= 1; tmp >= low && tmp <= 0xffff; --tmp) {
236 				key.port[sidx] = htons(tmp);
237 				if (pf_find_state_all(&key, dir, NULL) ==
238 				    NULL && !in_baddynamic(tmp, pd->proto)) {
239 					*nport = htons(tmp);
240 					return (0);
241 				}
242 			}
243 		}
244 
245 		switch (r->nat.opts & PF_POOL_TYPEMASK) {
246 		case PF_POOL_RANDOM:
247 		case PF_POOL_ROUNDROBIN:
248 		case PF_POOL_LEASTSTATES:
249 			/*
250 			 * pick a different source address since we're out
251 			 * of free port choices for the current one.
252 			 */
253 			if (pf_map_addr(pd->naf, r, &pd->nsaddr, naddr,
254 			    &init_addr, sn, &r->nat, PF_SN_NAT))
255 				return (1);
256 			break;
257 		case PF_POOL_NONE:
258 		case PF_POOL_SRCHASH:
259 		case PF_POOL_BITMASK:
260 		default:
261 			return (1);
262 		}
263 	} while (! PF_AEQ(&init_addr, naddr, pd->naf) );
264 	return (1);					/* none available */
265 }
266 
267 int
268 pf_map_addr_sticky(sa_family_t af, struct pf_rule *r, struct pf_addr *saddr,
269     struct pf_addr *naddr, struct pf_src_node **sns, struct pf_pool *rpool,
270     enum pf_sn_types type)
271 {
272 	struct pf_addr		*raddr, *rmask, *cached;
273 	struct pf_state		*s;
274 	struct pf_src_node	 k;
275 	int			 valid;
276 
277 	k.af = af;
278 	k.type = type;
279 	pf_addrcpy(&k.addr, saddr, af);
280 	k.rule.ptr = r;
281 	pf_status.scounters[SCNT_SRC_NODE_SEARCH]++;
282 	sns[type] = RB_FIND(pf_src_tree, &tree_src_tracking, &k);
283 	if (sns[type] == NULL)
284 		return (-1);
285 
286 	/* check if the cached entry is still valid */
287 	cached = &(sns[type])->raddr;
288 	valid = 0;
289 	if (PF_AZERO(cached, af)) {
290 		valid = 1;
291 	} else if (rpool->addr.type == PF_ADDR_DYNIFTL) {
292 		if (pfr_kentry_byaddr(rpool->addr.p.dyn->pfid_kt, cached,
293 		    af, 0))
294 			valid = 1;
295 	} else if (rpool->addr.type == PF_ADDR_TABLE) {
296 		if (pfr_kentry_byaddr(rpool->addr.p.tbl, cached, af, 0))
297 			valid = 1;
298 	} else if (rpool->addr.type != PF_ADDR_NOROUTE) {
299 		raddr = &rpool->addr.v.a.addr;
300 		rmask = &rpool->addr.v.a.mask;
301 		valid = pf_match_addr(0, raddr, rmask, cached, af);
302 	}
303 	if (!valid) {
304 		if (pf_status.debug >= LOG_DEBUG) {
305 			log(LOG_DEBUG, "pf: pf_map_addr: "
306 			    "stale src tracking (%u) ", type);
307 			pf_print_host(&k.addr, 0, af);
308 			addlog(" to ");
309 			pf_print_host(cached, 0, af);
310 			addlog("\n");
311 		}
312 		if (sns[type]->states != 0) {
313 			/* XXX expensive */
314 			RBT_FOREACH(s, pf_state_tree_id, &tree_id)
315 				pf_state_rm_src_node(s, sns[type]);
316 		}
317 		sns[type]->expire = 1;
318 		pf_remove_src_node(sns[type]);
319 		sns[type] = NULL;
320 		return (-1);
321 	}
322 
323 
324 	if (!PF_AZERO(cached, af)) {
325 		pf_addrcpy(naddr, cached, af);
326 		if ((rpool->opts & PF_POOL_TYPEMASK) == PF_POOL_LEASTSTATES &&
327 		    pf_map_addr_states_increase(af, rpool, cached) == -1)
328 			return (-1);
329 	}
330 	if (pf_status.debug >= LOG_DEBUG) {
331 		log(LOG_DEBUG, "pf: pf_map_addr: "
332 		    "src tracking (%u) maps ", type);
333 		pf_print_host(&k.addr, 0, af);
334 		addlog(" to ");
335 		pf_print_host(naddr, 0, af);
336 		addlog("\n");
337 	}
338 
339 	if (sns[type]->kif != NULL)
340 		rpool->kif = sns[type]->kif;
341 
342 	return (0);
343 }
344 
345 uint32_t
346 pf_rand_addr(uint32_t mask)
347 {
348 	uint32_t addr;
349 
350 	mask = ~ntohl(mask);
351 	addr = arc4random_uniform(mask + 1);
352 
353 	return (htonl(addr));
354 }
355 
356 int
357 pf_map_addr(sa_family_t af, struct pf_rule *r, struct pf_addr *saddr,
358     struct pf_addr *naddr, struct pf_addr *init_addr, struct pf_src_node **sns,
359     struct pf_pool *rpool, enum pf_sn_types type)
360 {
361 	struct pf_addr		 hash;
362 	struct pf_addr		 faddr;
363 	struct pf_addr		*raddr = &rpool->addr.v.a.addr;
364 	struct pf_addr		*rmask = &rpool->addr.v.a.mask;
365 	struct pfr_ktable	*kt;
366 	struct pfi_kif		*kif;
367 	u_int64_t		 states;
368 	u_int16_t		 weight;
369 	u_int64_t		 load;
370 	u_int64_t		 cload;
371 	u_int64_t		 hashidx;
372 	int			 cnt;
373 
374 	if (sns[type] == NULL && rpool->opts & PF_POOL_STICKYADDR &&
375 	    (rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_NONE &&
376 	    pf_map_addr_sticky(af, r, saddr, naddr, sns, rpool, type) == 0)
377 		return (0);
378 
379 	if (rpool->addr.type == PF_ADDR_NOROUTE)
380 		return (1);
381 	if (rpool->addr.type == PF_ADDR_DYNIFTL) {
382 		switch (af) {
383 		case AF_INET:
384 			if (rpool->addr.p.dyn->pfid_acnt4 < 1 &&
385 			    !PF_POOL_DYNTYPE(rpool->opts))
386 				return (1);
387 			raddr = &rpool->addr.p.dyn->pfid_addr4;
388 			rmask = &rpool->addr.p.dyn->pfid_mask4;
389 			break;
390 #ifdef INET6
391 		case AF_INET6:
392 			if (rpool->addr.p.dyn->pfid_acnt6 < 1 &&
393 			    !PF_POOL_DYNTYPE(rpool->opts))
394 				return (1);
395 			raddr = &rpool->addr.p.dyn->pfid_addr6;
396 			rmask = &rpool->addr.p.dyn->pfid_mask6;
397 			break;
398 #endif /* INET6 */
399 		default:
400 			unhandled_af(af);
401 		}
402 	} else if (rpool->addr.type == PF_ADDR_TABLE) {
403 		if (!PF_POOL_DYNTYPE(rpool->opts))
404 			return (1); /* unsupported */
405 	} else {
406 		raddr = &rpool->addr.v.a.addr;
407 		rmask = &rpool->addr.v.a.mask;
408 	}
409 
410 	switch (rpool->opts & PF_POOL_TYPEMASK) {
411 	case PF_POOL_NONE:
412 		pf_addrcpy(naddr, raddr, af);
413 		break;
414 	case PF_POOL_BITMASK:
415 		pf_poolmask(naddr, raddr, rmask, saddr, af);
416 		break;
417 	case PF_POOL_RANDOM:
418 		if (rpool->addr.type == PF_ADDR_TABLE ||
419 		    rpool->addr.type == PF_ADDR_DYNIFTL) {
420 			if (rpool->addr.type == PF_ADDR_TABLE)
421 				kt = rpool->addr.p.tbl;
422 			else
423 				kt = rpool->addr.p.dyn->pfid_kt;
424 			kt = pfr_ktable_select_active(kt);
425 			if (kt == NULL)
426 				return (1);
427 
428 			cnt = kt->pfrkt_cnt;
429 			if (cnt == 0)
430 				rpool->tblidx = 0;
431 			else
432 				rpool->tblidx = (int)arc4random_uniform(cnt);
433 			memset(&rpool->counter, 0, sizeof(rpool->counter));
434 			if (pfr_pool_get(rpool, &raddr, &rmask, af))
435 				return (1);
436 			pf_addrcpy(naddr, &rpool->counter, af);
437 		} else if (init_addr != NULL && PF_AZERO(init_addr, af)) {
438 			switch (af) {
439 			case AF_INET:
440 				rpool->counter.addr32[0] = pf_rand_addr(
441 				    rmask->addr32[0]);
442 				break;
443 #ifdef INET6
444 			case AF_INET6:
445 				if (rmask->addr32[3] != 0xffffffff)
446 					rpool->counter.addr32[3] = pf_rand_addr(
447 					    rmask->addr32[3]);
448 				else
449 					break;
450 				if (rmask->addr32[2] != 0xffffffff)
451 					rpool->counter.addr32[2] = pf_rand_addr(
452 					    rmask->addr32[2]);
453 				else
454 					break;
455 				if (rmask->addr32[1] != 0xffffffff)
456 					rpool->counter.addr32[1] = pf_rand_addr(
457 					    rmask->addr32[1]);
458 				else
459 					break;
460 				if (rmask->addr32[0] != 0xffffffff)
461 					rpool->counter.addr32[0] = pf_rand_addr(
462 					    rmask->addr32[0]);
463 				break;
464 #endif /* INET6 */
465 			default:
466 				unhandled_af(af);
467 			}
468 			pf_poolmask(naddr, raddr, rmask, &rpool->counter, af);
469 			pf_addrcpy(init_addr, naddr, af);
470 
471 		} else {
472 			pf_addr_inc(&rpool->counter, af);
473 			pf_poolmask(naddr, raddr, rmask, &rpool->counter, af);
474 		}
475 		break;
476 	case PF_POOL_SRCHASH:
477 		hashidx = pf_hash(saddr, &hash, &rpool->key, af);
478 
479 		if (rpool->addr.type == PF_ADDR_TABLE ||
480 		    rpool->addr.type == PF_ADDR_DYNIFTL) {
481 			if (rpool->addr.type == PF_ADDR_TABLE)
482 				kt = rpool->addr.p.tbl;
483 			else
484 				kt = rpool->addr.p.dyn->pfid_kt;
485 			kt = pfr_ktable_select_active(kt);
486 			if (kt == NULL)
487 				return (1);
488 
489 			cnt = kt->pfrkt_cnt;
490 			if (cnt == 0)
491 				rpool->tblidx = 0;
492 			else
493 				rpool->tblidx = (int)(hashidx % cnt);
494 			memset(&rpool->counter, 0, sizeof(rpool->counter));
495 			if (pfr_pool_get(rpool, &raddr, &rmask, af))
496 				return (1);
497 			pf_addrcpy(naddr, &rpool->counter, af);
498 		} else {
499 			pf_poolmask(naddr, raddr, rmask, &hash, af);
500 		}
501 		break;
502 	case PF_POOL_ROUNDROBIN:
503 		if (rpool->addr.type == PF_ADDR_TABLE ||
504 		    rpool->addr.type == PF_ADDR_DYNIFTL) {
505 			if (pfr_pool_get(rpool, &raddr, &rmask, af)) {
506 				/*
507 				 * reset counter in case its value
508 				 * has been removed from the pool.
509 				 */
510 				memset(&rpool->counter, 0,
511 				    sizeof(rpool->counter));
512 				if (pfr_pool_get(rpool, &raddr, &rmask, af))
513 					return (1);
514 			}
515 		} else if (PF_AZERO(&rpool->counter, af)) {
516 			/*
517 			 * fall back to POOL_NONE if there is a single host
518 			 * address in pool.
519 			 */
520 			if (af == AF_INET &&
521 			    rmask->addr32[0] == INADDR_BROADCAST) {
522 				pf_addrcpy(naddr, raddr, af);
523 				break;
524 			}
525 #ifdef INET6
526 			if (af == AF_INET6 &&
527 			    IN6_ARE_ADDR_EQUAL(&rmask->v6, &in6mask128)) {
528 				pf_addrcpy(naddr, raddr, af);
529 				break;
530 			}
531 #endif
532 		} else if (pf_match_addr(0, raddr, rmask, &rpool->counter, af))
533 			return (1);
534 
535 		/* iterate over table if it contains entries which are weighted */
536 		if ((rpool->addr.type == PF_ADDR_TABLE &&
537 		    rpool->addr.p.tbl->pfrkt_refcntcost > 0) ||
538 		    (rpool->addr.type == PF_ADDR_DYNIFTL &&
539 		    rpool->addr.p.dyn->pfid_kt->pfrkt_refcntcost > 0)) {
540 			do {
541 				if (rpool->addr.type == PF_ADDR_TABLE ||
542 				    rpool->addr.type == PF_ADDR_DYNIFTL) {
543 					if (pfr_pool_get(rpool,
544 					    &raddr, &rmask, af))
545 						return (1);
546 				} else {
547 					log(LOG_ERR, "pf: pf_map_addr: "
548 					    "weighted RR failure");
549 					return (1);
550 				}
551 				if (rpool->weight >= rpool->curweight)
552 					break;
553 				pf_addr_inc(&rpool->counter, af);
554 			} while (1);
555 
556 			weight = rpool->weight;
557 		}
558 
559 		pf_poolmask(naddr, raddr, rmask, &rpool->counter, af);
560 		if (init_addr != NULL && PF_AZERO(init_addr, af))
561 			pf_addrcpy(init_addr, &rpool->counter, af);
562 		pf_addr_inc(&rpool->counter, af);
563 		break;
564 	case PF_POOL_LEASTSTATES:
565 		/* retrieve an address first */
566 		if (rpool->addr.type == PF_ADDR_TABLE ||
567 		    rpool->addr.type == PF_ADDR_DYNIFTL) {
568 			if (pfr_pool_get(rpool, &raddr, &rmask, af)) {
569 				/* see PF_POOL_ROUNDROBIN */
570 				memset(&rpool->counter, 0,
571 				    sizeof(rpool->counter));
572 				if (pfr_pool_get(rpool, &raddr, &rmask, af))
573 					return (1);
574 			}
575 		} else if (pf_match_addr(0, raddr, rmask, &rpool->counter, af))
576 			return (1);
577 
578 		states = rpool->states;
579 		weight = rpool->weight;
580 		kif = rpool->kif;
581 
582 		if ((rpool->addr.type == PF_ADDR_TABLE &&
583 		    rpool->addr.p.tbl->pfrkt_refcntcost > 0) ||
584 		    (rpool->addr.type == PF_ADDR_DYNIFTL &&
585 		    rpool->addr.p.dyn->pfid_kt->pfrkt_refcntcost > 0))
586 			load = ((UINT16_MAX * rpool->states) / rpool->weight);
587 		else
588 			load = states;
589 
590 		pf_addrcpy(&faddr, &rpool->counter, af);
591 
592 		pf_addrcpy(naddr, &rpool->counter, af);
593 		if (init_addr != NULL && PF_AZERO(init_addr, af))
594 			pf_addrcpy(init_addr, naddr, af);
595 
596 		/*
597 		 * iterate *once* over whole table and find destination with
598 		 * least connection
599 		 */
600 		do  {
601 			pf_addr_inc(&rpool->counter, af);
602 			if (rpool->addr.type == PF_ADDR_TABLE ||
603 			    rpool->addr.type == PF_ADDR_DYNIFTL) {
604 				if (pfr_pool_get(rpool, &raddr, &rmask, af))
605 					return (1);
606 			} else if (pf_match_addr(0, raddr, rmask,
607 			    &rpool->counter, af))
608 				return (1);
609 
610 			if ((rpool->addr.type == PF_ADDR_TABLE &&
611 			    rpool->addr.p.tbl->pfrkt_refcntcost > 0) ||
612 			    (rpool->addr.type == PF_ADDR_DYNIFTL &&
613 			    rpool->addr.p.dyn->pfid_kt->pfrkt_refcntcost > 0))
614 				cload = ((UINT16_MAX * rpool->states)
615 					/ rpool->weight);
616 			else
617 				cload = rpool->states;
618 
619 			/* find lc minimum */
620 			if (cload < load) {
621 				states = rpool->states;
622 				weight = rpool->weight;
623 				kif = rpool->kif;
624 				load = cload;
625 
626 				pf_addrcpy(naddr, &rpool->counter, af);
627 				if (init_addr != NULL &&
628 				    PF_AZERO(init_addr, af))
629 				    pf_addrcpy(init_addr, naddr, af);
630 			}
631 		} while (pf_match_addr(1, &faddr, rmask, &rpool->counter, af) &&
632 		    (states > 0));
633 
634 		if (pf_map_addr_states_increase(af, rpool, naddr) == -1)
635 			return (1);
636 		/* revert the kif which was set by pfr_pool_get() */
637 		rpool->kif = kif;
638 		break;
639 	}
640 
641 	if (rpool->opts & PF_POOL_STICKYADDR) {
642 		if (sns[type] != NULL) {
643 			pf_remove_src_node(sns[type]);
644 			sns[type] = NULL;
645 		}
646 		if (pf_insert_src_node(&sns[type], r, type, af, saddr, naddr,
647 		    rpool->kif))
648 			return (1);
649 	}
650 
651 	if (pf_status.debug >= LOG_INFO &&
652 	    (rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_NONE) {
653 		log(LOG_INFO, "pf: pf_map_addr: selected address ");
654 		pf_print_host(naddr, 0, af);
655 		if ((rpool->opts & PF_POOL_TYPEMASK) ==
656 		    PF_POOL_LEASTSTATES)
657 			addlog(" with state count %llu", states);
658 		if ((rpool->addr.type == PF_ADDR_TABLE &&
659 		    rpool->addr.p.tbl->pfrkt_refcntcost > 0) ||
660 		    (rpool->addr.type == PF_ADDR_DYNIFTL &&
661 		    rpool->addr.p.dyn->pfid_kt->pfrkt_refcntcost > 0))
662 			addlog(" with weight %u", weight);
663 		addlog("\n");
664 	}
665 
666 	return (0);
667 }
668 
669 int
670 pf_map_addr_states_increase(sa_family_t af, struct pf_pool *rpool,
671     struct pf_addr *naddr)
672 {
673 	if (rpool->addr.type == PF_ADDR_TABLE) {
674 		if (pfr_states_increase(rpool->addr.p.tbl,
675 		    naddr, af) == -1) {
676 			if (pf_status.debug >= LOG_DEBUG) {
677 				log(LOG_DEBUG,
678 				    "pf: pf_map_addr_states_increase: "
679 				    "selected address ");
680 				pf_print_host(naddr, 0, af);
681 				addlog(". Failed to increase count!\n");
682 			}
683 			return (-1);
684 		}
685 	} else if (rpool->addr.type == PF_ADDR_DYNIFTL) {
686 		if (pfr_states_increase(rpool->addr.p.dyn->pfid_kt,
687 		    naddr, af) == -1) {
688 			if (pf_status.debug >= LOG_DEBUG) {
689 				log(LOG_DEBUG,
690 				    "pf: pf_map_addr_states_increase: "
691 				    "selected address ");
692 				pf_print_host(naddr, 0, af);
693 				addlog(". Failed to increase count!\n");
694 			}
695 			return (-1);
696 		}
697 	}
698 	return (0);
699 }
700 
701 int
702 pf_get_transaddr(struct pf_rule *r, struct pf_pdesc *pd,
703     struct pf_src_node **sns, struct pf_rule **nr)
704 {
705 	struct pf_addr	naddr;
706 	u_int16_t	nport;
707 
708 #ifdef INET6
709 	if (pd->af != pd->naf)
710 		return (pf_get_transaddr_af(r, pd, sns));
711 #endif /* INET6 */
712 
713 	if (r->nat.addr.type != PF_ADDR_NONE) {
714 		/* XXX is this right? what if rtable is changed at the same
715 		 * XXX time? where do I need to figure out the sport? */
716 		nport = 0;
717 		if (pf_get_sport(pd, r, &naddr, &nport,
718 		    r->nat.proxy_port[0], r->nat.proxy_port[1], sns)) {
719 			DPFPRINTF(LOG_NOTICE,
720 			    "pf: NAT proxy port allocation (%u-%u) failed",
721 			    r->nat.proxy_port[0],
722 			    r->nat.proxy_port[1]);
723 			return (-1);
724 		}
725 		*nr = r;
726 		pf_addrcpy(&pd->nsaddr, &naddr, pd->af);
727 		pd->nsport = nport;
728 	}
729 	if (r->rdr.addr.type != PF_ADDR_NONE) {
730 		if (pf_map_addr(pd->af, r, &pd->nsaddr, &naddr, NULL, sns,
731 		    &r->rdr, PF_SN_RDR))
732 			return (-1);
733 		if ((r->rdr.opts & PF_POOL_TYPEMASK) == PF_POOL_BITMASK)
734 			pf_poolmask(&naddr, &naddr,  &r->rdr.addr.v.a.mask,
735 			    &pd->ndaddr, pd->af);
736 
737 		nport = 0;
738 		if (r->rdr.proxy_port[1]) {
739 			u_int32_t	tmp_nport;
740 			u_int16_t	div;
741 
742 			div = r->rdr.proxy_port[1] - r->rdr.proxy_port[0] + 1;
743 			div = (div == 0) ? 1 : div;
744 
745 			tmp_nport = ((ntohs(pd->ndport) - ntohs(r->dst.port[0])) % div) +
746 			    r->rdr.proxy_port[0];
747 
748 			/* wrap around if necessary */
749 			if (tmp_nport > 65535)
750 				tmp_nport -= 65535;
751 			nport = htons((u_int16_t)tmp_nport);
752 		} else if (r->rdr.proxy_port[0])
753 			nport = htons(r->rdr.proxy_port[0]);
754 		*nr = r;
755 		pf_addrcpy(&pd->ndaddr, &naddr, pd->af);
756 		if (nport)
757 			pd->ndport = nport;
758 	}
759 
760 	return (0);
761 }
762 
#ifdef INET6
/*
 * Address family translation (af-to) for rule `r'.
 *
 * The source address and port come from the rule's nat pool via
 * pf_get_sport(); a missing nat pool is a panic.  ICMP/ICMPv6 echo
 * request and reply type codes are rewritten to the codes of the target
 * family.  The destination is either mapped through the rdr pool or
 * derived from the original destination with inet_nat64()/inet_nat46().
 * On success pd->nsaddr, pd->ndaddr and the ports hold the translated
 * values in the new address family.
 *
 * Returns 0 on success and -1 when no address or port could be obtained.
 */
int
pf_get_transaddr_af(struct pf_rule *r, struct pf_pdesc *pd,
    struct pf_src_node **sns)
{
	struct pf_addr	 ndaddr, nsaddr, naddr;
	u_int16_t	*typep;
	u_int16_t	 icmptype;
	u_int16_t	 nport = 0;
	int		 prefixlen = 96;

	if (pf_status.debug >= LOG_INFO) {
		log(LOG_INFO, "pf: af-to %s %s, ",
		    pd->naf == AF_INET ? "inet" : "inet6",
		    r->rdr.addr.type == PF_ADDR_NONE ? "nat" : "rdr");
		pf_print_host(&pd->nsaddr, pd->nsport, pd->af);
		addlog(" -> ");
		pf_print_host(&pd->ndaddr, pd->ndport, pd->af);
		addlog("\n");
	}

	if (r->nat.addr.type == PF_ADDR_NONE)
		panic("pf_get_transaddr_af: no nat pool for source address");

	/* get source address and port */
	if (pf_get_sport(pd, r, &nsaddr, &nport,
	    r->nat.proxy_port[0], r->nat.proxy_port[1], sns)) {
		DPFPRINTF(LOG_NOTICE,
		    "pf: af-to NAT proxy port allocation (%u-%u) failed",
		    r->nat.proxy_port[0],
		    r->nat.proxy_port[1]);
		return (-1);
	}
	pd->nsport = nport;

	/*
	 * Rewrite echo request/reply type codes for the new family.  The
	 * field that is rewritten is ndport for inbound packets and nsport
	 * otherwise.
	 */
	typep = (pd->dir == PF_IN) ? &pd->ndport : &pd->nsport;
	if (pd->proto == IPPROTO_ICMPV6 && pd->naf == AF_INET) {
		icmptype = ntohs(*typep);
		switch (icmptype) {
		case ICMP6_ECHO_REQUEST:
			icmptype = ICMP_ECHO;
			break;
		case ICMP6_ECHO_REPLY:
			icmptype = ICMP_ECHOREPLY;
			break;
		default:
			break;
		}
		*typep = htons(icmptype);
	} else if (pd->proto == IPPROTO_ICMP && pd->naf == AF_INET6) {
		icmptype = ntohs(*typep);
		switch (icmptype) {
		case ICMP_ECHO:
			icmptype = ICMP6_ECHO_REQUEST;
			break;
		case ICMP_ECHOREPLY:
			icmptype = ICMP6_ECHO_REPLY;
			break;
		default:
			break;
		}
		*typep = htons(icmptype);
	}

	/* get the destination address and port */
	if (r->rdr.addr.type != PF_ADDR_NONE) {
		if (pf_map_addr(pd->naf, r, &nsaddr, &naddr, NULL, sns,
		    &r->rdr, PF_SN_RDR))
			return (-1);
		if (r->rdr.proxy_port[0])
			pd->ndport = htons(r->rdr.proxy_port[0]);

		if (pd->naf == AF_INET) {
			/* The prefix is the IPv4 rdr address */
			prefixlen = in_mask2len(
			    (struct in_addr *)&r->rdr.addr.v.a.mask);
			inet_nat46(pd->naf, &pd->ndaddr, &ndaddr, &naddr,
			    prefixlen);
		} else {
			/* The prefix is the IPv6 rdr address */
			prefixlen = in6_mask2len(
			    (struct in6_addr *)&r->rdr.addr.v.a.mask, NULL);
			inet_nat64(pd->naf, &pd->ndaddr, &ndaddr, &naddr,
			    prefixlen);
		}
	} else if (pd->naf == AF_INET) {
		/* The prefix is the IPv6 dst address */
		prefixlen = in6_mask2len(
		    (struct in6_addr *)&r->dst.addr.v.a.mask, NULL);
		if (prefixlen < 32)
			prefixlen = 96;
		inet_nat64(pd->naf, &pd->ndaddr, &ndaddr, &pd->ndaddr,
		    prefixlen);
	} else {
		/*
		 * The prefix is the IPv6 nat address
		 * (that was stored in pd->nsaddr)
		 */
		prefixlen = in6_mask2len(
		    (struct in6_addr *)&r->nat.addr.v.a.mask, NULL);
		if (prefixlen > 96)
			prefixlen = 96;
		inet_nat64(pd->naf, &pd->ndaddr, &ndaddr, &nsaddr,
		    prefixlen);
	}

	/* publish the translated addresses in the new family */
	pf_addrcpy(&pd->nsaddr, &nsaddr, pd->naf);
	pf_addrcpy(&pd->ndaddr, &ndaddr, pd->naf);

	if (pf_status.debug >= LOG_INFO) {
		log(LOG_INFO, "pf: af-to %s %s done, prefixlen %d, ",
		    pd->naf == AF_INET ? "inet" : "inet6",
		    r->rdr.addr.type == PF_ADDR_NONE ? "nat" : "rdr",
		    prefixlen);
		pf_print_host(&pd->nsaddr, pd->nsport, pd->naf);
		addlog(" -> ");
		pf_print_host(&pd->ndaddr, pd->ndport, pd->naf);
		addlog("\n");
	}

	return (0);
}
#endif /* INET6 */
894 
895 int
896 pf_postprocess_addr(struct pf_state *cur)
897 {
898 	struct pf_rule		*nr;
899 	struct pf_state_key	*sks;
900 	struct pf_pool		 rpool;
901 	struct pf_addr		 lookup_addr;
902 	int			 slbcount = -1;
903 
904 	nr = cur->natrule.ptr;
905 
906 	if (nr == NULL)
907 		return (0);
908 
909 	/* decrease counter */
910 
911 	sks = cur->key[PF_SK_STACK];
912 
913 	/* check for outgoing or ingoing balancing */
914 	if (nr->rt == PF_ROUTETO)
915 		lookup_addr = cur->rt_addr;
916 	else if (sks != NULL)
917 		lookup_addr = sks->addr[1];
918 	else {
919 		if (pf_status.debug >= LOG_DEBUG) {
920 			log(LOG_DEBUG, "pf: %s: unable to obtain address",
921 			    __func__);
922 		}
923 		return (1);
924 	}
925 
926 	/* check for appropriate pool */
927 	if (nr->rdr.addr.type != PF_ADDR_NONE)
928 		rpool = nr->rdr;
929 	else if (nr->nat.addr.type != PF_ADDR_NONE)
930 		rpool = nr->nat;
931 	else if (nr->route.addr.type != PF_ADDR_NONE)
932 		rpool = nr->route;
933 	else
934 		return (0);
935 
936 	if (((rpool.opts & PF_POOL_TYPEMASK) != PF_POOL_LEASTSTATES))
937 		return (0);
938 
939 	if (rpool.addr.type == PF_ADDR_TABLE) {
940 		if ((slbcount = pfr_states_decrease(
941 		    rpool.addr.p.tbl,
942 		    &lookup_addr, sks->af)) == -1) {
943 			if (pf_status.debug >= LOG_DEBUG) {
944 				log(LOG_DEBUG, "pf: %s: selected address ",
945 				    __func__);
946 				pf_print_host(&lookup_addr,
947 				    sks->port[0], sks->af);
948 				addlog(". Failed to "
949 				    "decrease count!\n");
950 			}
951 			return (1);
952 		}
953 	} else if (rpool.addr.type == PF_ADDR_DYNIFTL) {
954 		if ((slbcount = pfr_states_decrease(
955 		    rpool.addr.p.dyn->pfid_kt,
956 		    &lookup_addr, sks->af)) == -1) {
957 			if (pf_status.debug >= LOG_DEBUG) {
958 				log(LOG_DEBUG, "pf: %s: selected address ",
959 				    __func__);
960 				pf_print_host(&lookup_addr,
961 				    sks->port[0], sks->af);
962 				addlog(". Failed to "
963 				    "decrease count!\n");
964 			}
965 			return (1);
966 		}
967 	}
968 	if (slbcount > -1) {
969 		if (pf_status.debug >= LOG_INFO) {
970 			log(LOG_INFO, "pf: %s: selected address ", __func__);
971 			pf_print_host(&lookup_addr, sks->port[0],
972 			    sks->af);
973 			addlog(" decreased state count to %u\n",
974 			    slbcount);
975 		}
976 	}
977 	return (0);
978 }
979