xref: /openbsd-src/usr.sbin/bgpd/session.c (revision 5a38ef86d0b61900239c7913d24a05e7b88a58f0)
1 /*	$OpenBSD: session.c,v 1.424 2021/09/03 07:48:24 claudio Exp $ */
2 
3 /*
4  * Copyright (c) 2003, 2004, 2005 Henning Brauer <henning@openbsd.org>
5  * Copyright (c) 2017 Peter van Dijk <peter.van.dijk@powerdns.com>
6  *
7  * Permission to use, copy, modify, and distribute this software for any
8  * purpose with or without fee is hereby granted, provided that the above
9  * copyright notice and this permission notice appear in all copies.
10  *
11  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18  */
19 
20 #include <sys/types.h>
21 
22 #include <sys/mman.h>
23 #include <sys/socket.h>
24 #include <sys/time.h>
25 #include <sys/resource.h>
26 #include <sys/un.h>
27 #include <netinet/in.h>
28 #include <netinet/ip.h>
29 #include <netinet/tcp.h>
30 #include <arpa/inet.h>
31 #include <limits.h>
32 
33 #include <err.h>
34 #include <errno.h>
35 #include <fcntl.h>
36 #include <ifaddrs.h>
37 #include <poll.h>
38 #include <pwd.h>
39 #include <signal.h>
40 #include <stdio.h>
41 #include <stdlib.h>
42 #include <string.h>
43 #include <syslog.h>
44 #include <unistd.h>
45 
46 #include "bgpd.h"
47 #include "session.h"
48 #include "log.h"
49 
/*
 * Fixed slots at the start of the pollfd array that session_main() builds
 * on every loop iteration: the pipe to the parent, the two pipes to the
 * RDE and the two control sockets.  The listening sockets start at
 * PFD_LISTENERS_START; peers, mrt dumps and control connections follow.
 */
#define PFD_PIPE_MAIN		0
#define PFD_PIPE_ROUTE		1
#define PFD_PIPE_ROUTE_CTL	2
#define PFD_SOCK_CTL		3
#define PFD_SOCK_RCTL		4
#define PFD_LISTENERS_START	5
56 
/*
 * Prototypes for the helpers of the session engine.  Several of them are
 * defined further down in this file.
 */
void	session_sighdlr(int);
int	setup_listeners(u_int *);
void	init_peer(struct peer *);
void	start_timer_holdtime(struct peer *);
void	start_timer_keepalive(struct peer *);
void	session_close_connection(struct peer *);
void	change_state(struct peer *, enum session_state, enum session_events);
int	session_setup_socket(struct peer *);
void	session_accept(int);
int	session_connect(struct peer *);
void	session_tcp_established(struct peer *);
void	session_capa_ann_none(struct peer *);
int	session_capa_add(struct ibuf *, u_int8_t, u_int8_t);
int	session_capa_add_mp(struct ibuf *, u_int8_t);
int	session_capa_add_afi(struct peer *, struct ibuf *, u_int8_t, u_int8_t);
struct bgp_msg	*session_newmsg(enum msg_type, u_int16_t);
int	session_sendmsg(struct bgp_msg *, struct peer *);
void	session_open(struct peer *);
void	session_keepalive(struct peer *);
void	session_update(u_int32_t, void *, size_t);
void	session_notification(struct peer *, u_int8_t, u_int8_t, void *,
	    ssize_t);
void	session_rrefresh(struct peer *, u_int8_t, u_int8_t);
int	session_graceful_restart(struct peer *);
int	session_graceful_stop(struct peer *);
int	session_dispatch_msg(struct pollfd *, struct peer *);
void	session_process_msg(struct peer *);
int	parse_header(struct peer *, u_char *, u_int16_t *, u_int8_t *);
int	parse_open(struct peer *);
int	parse_update(struct peer *);
int	parse_rrefresh(struct peer *);
int	parse_notification(struct peer *);
int	parse_capabilities(struct peer *, u_char *, u_int16_t, u_int32_t *);
int	capa_neg_calc(struct peer *);
void	session_dispatch_imsg(struct imsgbuf *, int, u_int *);
void	session_up(struct peer *);
void	session_down(struct peer *);
int	imsg_rde(int, u_int32_t, void *, u_int16_t);
void	session_demote(struct peer *, int);
void	merge_peers(struct bgpd_config *, struct bgpd_config *);

int		 la_cmp(struct listen_addr *, struct listen_addr *);
void		 session_template_clone(struct peer *, struct sockaddr *,
		    u_int32_t, u_int32_t);
int		 session_match_mask(struct peer *, struct bgpd_addr *);
102 
/*
 * conf is the configuration the session engine runs with (peers and
 * listen addresses are taken from it).
 * NOTE(review): nconf presumably holds a configuration that is being
 * loaded during a reload -- confirm against the imsg handling code.
 */
static struct bgpd_config	*conf, *nconf;
/* imsg channels: two to the RDE, one to the parent process */
static struct imsgbuf		*ibuf_rde;
static struct imsgbuf		*ibuf_rde_ctl;
static struct imsgbuf		*ibuf_main;

struct bgpd_sysdep	 sysdep;
/* set by session_sighdlr(); ends the main loop in session_main() */
volatile sig_atomic_t	 session_quit;
/* while non-zero session_main() skips its peer init/delete pass */
int			 pending_reconf;
/* control sockets, handed to control_accept() with 0 resp. 1 */
int			 csock = -1, rcsock = -1;
/* number of initialized peers, see init_peer() and session_main() */
u_int			 peer_cnt;

struct mrt_head		 mrthead;
/*
 * Monotonic time at which accept4() last failed with ENFILE/EMFILE;
 * while non-zero the listeners and control sockets are not polled.
 */
time_t			 pauseaccept;
116 
117 static inline int
118 peer_compare(const struct peer *a, const struct peer *b)
119 {
120 	return a->conf.id - b->conf.id;
121 }
122 
123 RB_GENERATE(peer_head, peer, entry, peer_compare);
124 
125 void
126 session_sighdlr(int sig)
127 {
128 	switch (sig) {
129 	case SIGINT:
130 	case SIGTERM:
131 		session_quit = 1;
132 		break;
133 	}
134 }
135 
136 int
137 setup_listeners(u_int *la_cnt)
138 {
139 	int			 ttl = 255;
140 	struct listen_addr	*la;
141 	u_int			 cnt = 0;
142 
143 	TAILQ_FOREACH(la, conf->listen_addrs, entry) {
144 		la->reconf = RECONF_NONE;
145 		cnt++;
146 
147 		if (la->flags & LISTENER_LISTENING)
148 			continue;
149 
150 		if (la->fd == -1) {
151 			log_warn("cannot establish listener on %s: invalid fd",
152 			    log_sockaddr((struct sockaddr *)&la->sa,
153 			    la->sa_len));
154 			continue;
155 		}
156 
157 		if (tcp_md5_prep_listener(la, &conf->peers) == -1)
158 			fatal("tcp_md5_prep_listener");
159 
160 		/* set ttl to 255 so that ttl-security works */
161 		if (la->sa.ss_family == AF_INET && setsockopt(la->fd,
162 		    IPPROTO_IP, IP_TTL, &ttl, sizeof(ttl)) == -1) {
163 			log_warn("setup_listeners setsockopt TTL");
164 			continue;
165 		}
166 		if (la->sa.ss_family == AF_INET6 && setsockopt(la->fd,
167 		    IPPROTO_IPV6, IPV6_UNICAST_HOPS, &ttl, sizeof(ttl)) == -1) {
168 			log_warn("setup_listeners setsockopt hoplimit");
169 			continue;
170 		}
171 
172 		if (listen(la->fd, MAX_BACKLOG)) {
173 			close(la->fd);
174 			fatal("listen");
175 		}
176 
177 		la->flags |= LISTENER_LISTENING;
178 
179 		log_info("listening on %s",
180 		    log_sockaddr((struct sockaddr *)&la->sa, la->sa_len));
181 	}
182 
183 	*la_cnt = cnt;
184 
185 	return (0);
186 }
187 
/*
 * Entry point and main loop of the session engine process.
 *
 * After setting up logging, the function chroots into the home directory
 * of BGPD_USER, drops privileges, pledges and installs the signal
 * handlers.  It then loops until session_quit is set: peers are
 * initialized, re-initialized or removed, the pollfd array is rebuilt,
 * due timers are fed into bgp_fsm(), and after poll(2) the pipes, control
 * sockets, listeners, peers and mrt dumps are serviced.  When the loop
 * ends all peers, mrt entries and pipes are torn down and the process
 * calls exit(0), so the function does not return.
 *
 * debug:   passed to log_init()
 * verbose: passed to log_setverbose()
 */
void
session_main(int debug, int verbose)
{
	int			 timeout;
	unsigned int		 i, j, idx_peers, idx_listeners, idx_mrts;
	u_int			 pfd_elms = 0, peer_l_elms = 0, mrt_l_elms = 0;
	u_int			 listener_cnt, ctl_cnt, mrt_cnt;
	u_int			 new_cnt;
	struct passwd		*pw;
	struct peer		*p, **peer_l = NULL, *next;
	struct mrt		*m, *xm, **mrt_l = NULL;
	struct pollfd		*pfd = NULL;
	struct listen_addr	*la;
	void			*newp;
	time_t			 now;
	short			 events;

	log_init(debug, LOG_DAEMON);
	log_setverbose(verbose);

	log_procinit(log_procnames[PROC_SE]);

	if ((pw = getpwnam(BGPD_USER)) == NULL)
		fatal(NULL);

	/* confine the process to the home directory of BGPD_USER */
	if (chroot(pw->pw_dir) == -1)
		fatal("chroot");
	if (chdir("/") == -1)
		fatal("chdir(\"/\")");

	setproctitle("session engine");

	if (setgroups(1, &pw->pw_gid) ||
	    setresgid(pw->pw_gid, pw->pw_gid, pw->pw_gid) ||
	    setresuid(pw->pw_uid, pw->pw_uid, pw->pw_uid))
		fatal("can't drop privileges");

	if (pledge("stdio inet recvfd", NULL) == -1)
		fatal("pledge");

	/* only SIGTERM and SIGINT are handled, the rest is ignored */
	signal(SIGTERM, session_sighdlr);
	signal(SIGINT, session_sighdlr);
	signal(SIGPIPE, SIG_IGN);
	signal(SIGHUP, SIG_IGN);
	signal(SIGALRM, SIG_IGN);
	signal(SIGUSR1, SIG_IGN);

	if ((ibuf_main = malloc(sizeof(struct imsgbuf))) == NULL)
		fatal(NULL);
	imsg_init(ibuf_main, 3);

	LIST_INIT(&mrthead);
	listener_cnt = 0;
	peer_cnt = 0;
	ctl_cnt = 0;

	conf = new_config();
	log_info("session engine ready");

	while (session_quit == 0) {
		/* check for peers to be initialized or deleted */
		if (!pending_reconf) {
			RB_FOREACH_SAFE(p, peer_head, &conf->peers, next) {
				/* cloned peer that idled out? */
				if (p->template && (p->state == STATE_IDLE ||
				    p->state == STATE_ACTIVE) &&
				    getmonotime() - p->stats.last_updown >=
				    INTERVAL_HOLD_CLONED)
					p->reconf_action = RECONF_DELETE;

				/* new peer that needs init? */
				if (p->state == STATE_NONE)
					init_peer(p);

				/* reinit due? */
				if (p->reconf_action == RECONF_REINIT) {
					session_stop(p, ERR_CEASE_ADMIN_RESET);
					if (!p->conf.down)
						timer_set(&p->timers,
						    Timer_IdleHold, 0);
				}

				/* deletion due? */
				if (p->reconf_action == RECONF_DELETE) {
					if (p->demoted)
						session_demote(p, -1);
					p->conf.demote_group[0] = 0;
					session_stop(p, ERR_CEASE_PEER_UNCONF);
					timer_remove_all(&p->timers);
					tcp_md5_del_listener(conf, p);
					log_peer_warnx(&p->conf, "removed");
					RB_REMOVE(peer_head, &conf->peers, p);
					free(p);
					peer_cnt--;
					continue;
				}
				p->reconf_action = RECONF_NONE;
			}
		}

		/* peer_l maps the peer slots of pfd back to their peers */
		if (peer_cnt > peer_l_elms) {
			if ((newp = reallocarray(peer_l, peer_cnt,
			    sizeof(struct peer *))) == NULL) {
				/* panic for now  */
				log_warn("could not resize peer_l from %u -> %u"
				    " entries", peer_l_elms, peer_cnt);
				fatalx("exiting");
			}
			peer_l = newp;
			peer_l_elms = peer_cnt;
		}

		/*
		 * drop mrt entries marked for removal and count those
		 * that have output queued
		 */
		mrt_cnt = 0;
		for (m = LIST_FIRST(&mrthead); m != NULL; m = xm) {
			xm = LIST_NEXT(m, entry);
			if (m->state == MRT_STATE_REMOVE) {
				mrt_clean(m);
				LIST_REMOVE(m, entry);
				free(m);
				continue;
			}
			if (m->wbuf.queued)
				mrt_cnt++;
		}

		if (mrt_cnt > mrt_l_elms) {
			if ((newp = reallocarray(mrt_l, mrt_cnt,
			    sizeof(struct mrt *))) == NULL) {
				/* panic for now  */
				log_warn("could not resize mrt_l from %u -> %u"
				    " entries", mrt_l_elms, mrt_cnt);
				fatalx("exiting");
			}
			mrt_l = newp;
			mrt_l_elms = mrt_cnt;
		}

		/* the pollfd array only ever grows */
		new_cnt = PFD_LISTENERS_START + listener_cnt + peer_cnt +
		    ctl_cnt + mrt_cnt;
		if (new_cnt > pfd_elms) {
			if ((newp = reallocarray(pfd, new_cnt,
			    sizeof(struct pollfd))) == NULL) {
				/* panic for now  */
				log_warn("could not resize pfd from %u -> %u"
				    " entries", pfd_elms, new_cnt);
				fatalx("exiting");
			}
			pfd = newp;
			pfd_elms = new_cnt;
		}

		bzero(pfd, sizeof(struct pollfd) * pfd_elms);

		set_pollfd(&pfd[PFD_PIPE_MAIN], ibuf_main);
		set_pollfd(&pfd[PFD_PIPE_ROUTE], ibuf_rde);
		set_pollfd(&pfd[PFD_PIPE_ROUTE_CTL], ibuf_rde_ctl);

		/* while accepting is paused the accepting sockets are masked */
		if (pauseaccept == 0) {
			pfd[PFD_SOCK_CTL].fd = csock;
			pfd[PFD_SOCK_CTL].events = POLLIN;
			pfd[PFD_SOCK_RCTL].fd = rcsock;
			pfd[PFD_SOCK_RCTL].events = POLLIN;
		} else {
			pfd[PFD_SOCK_CTL].fd = -1;
			pfd[PFD_SOCK_RCTL].fd = -1;
		}

		i = PFD_LISTENERS_START;
		TAILQ_FOREACH(la, conf->listen_addrs, entry) {
			if (pauseaccept == 0) {
				pfd[i].fd = la->fd;
				pfd[i].events = POLLIN;
			} else
				pfd[i].fd = -1;
			i++;
		}
		idx_listeners = i;
		timeout = 240;	/* loop every 240s at least */

		now = getmonotime();
		RB_FOREACH(p, peer_head, &conf->peers) {
			time_t	nextaction;
			struct timer *pt;

			/* check timers */
			if ((pt = timer_nextisdue(&p->timers, now)) != NULL) {
				switch (pt->type) {
				case Timer_Hold:
					bgp_fsm(p, EVNT_TIMER_HOLDTIME);
					break;
				case Timer_SendHold:
					bgp_fsm(p, EVNT_TIMER_SENDHOLD);
					break;
				case Timer_ConnectRetry:
					bgp_fsm(p, EVNT_TIMER_CONNRETRY);
					break;
				case Timer_Keepalive:
					bgp_fsm(p, EVNT_TIMER_KEEPALIVE);
					break;
				case Timer_IdleHold:
					bgp_fsm(p, EVNT_START);
					break;
				case Timer_IdleHoldReset:
					p->IdleHoldTime =
					    INTERVAL_IDLE_HOLD_INITIAL;
					p->errcnt = 0;
					timer_stop(&p->timers,
					    Timer_IdleHoldReset);
					break;
				case Timer_CarpUndemote:
					timer_stop(&p->timers,
					    Timer_CarpUndemote);
					if (p->demoted &&
					    p->state == STATE_ESTABLISHED)
						session_demote(p, -1);
					break;
				case Timer_RestartTimeout:
					timer_stop(&p->timers,
					    Timer_RestartTimeout);
					session_graceful_stop(p);
					break;
				default:
					fatalx("King Bula lost in time");
				}
			}
			/* wake up in time for the next timer of this peer */
			if ((nextaction = timer_nextduein(&p->timers,
			    now)) != -1 && nextaction < timeout)
				timeout = nextaction;

			/* are we waiting for a write? */
			events = POLLIN;
			if (p->wbuf.queued > 0 || p->state == STATE_CONNECT)
				events |= POLLOUT;
			/* is there still work to do? */
			if (p->rpending && p->rbuf && p->rbuf->wpos)
				timeout = 0;

			/* poll events */
			if (p->fd != -1 && events != 0) {
				pfd[i].fd = p->fd;
				pfd[i].events = events;
				peer_l[i - idx_listeners] = p;
				i++;
			}
		}

		idx_peers = i;

		LIST_FOREACH(m, &mrthead, entry)
			if (m->wbuf.queued) {
				pfd[i].fd = m->wbuf.fd;
				pfd[i].events = POLLOUT;
				mrt_l[i - idx_peers] = m;
				i++;
			}

		idx_mrts = i;

		/* control connections take the remaining slots */
		i += control_fill_pfds(pfd + i, pfd_elms -i);

		if (i > pfd_elms)
			fatalx("poll pfd overflow");

		if (pauseaccept && timeout > 1)
			timeout = 1;
		if (timeout < 0)
			timeout = 0;
		/* timeout is kept in seconds, poll(2) takes milliseconds */
		if (poll(pfd, i, timeout * 1000) == -1)
			if (errno != EINTR)
				fatal("poll error");

		/*
		 * If we previously saw fd exhaustion, we stop accept()
		 * for 1 second to throttle the accept() loop.
		 */
		if (pauseaccept && getmonotime() > pauseaccept + 1)
			pauseaccept = 0;

		/* losing the parent ends the session engine */
		if (handle_pollfd(&pfd[PFD_PIPE_MAIN], ibuf_main) == -1) {
			log_warnx("SE: Lost connection to parent");
			session_quit = 1;
			continue;
		} else
			session_dispatch_imsg(ibuf_main, PFD_PIPE_MAIN,
			    &listener_cnt);

		if (handle_pollfd(&pfd[PFD_PIPE_ROUTE], ibuf_rde) == -1) {
			log_warnx("SE: Lost connection to RDE");
			msgbuf_clear(&ibuf_rde->w);
			free(ibuf_rde);
			ibuf_rde = NULL;
		} else
			session_dispatch_imsg(ibuf_rde, PFD_PIPE_ROUTE,
			    &listener_cnt);

		if (handle_pollfd(&pfd[PFD_PIPE_ROUTE_CTL], ibuf_rde_ctl) ==
		    -1) {
			log_warnx("SE: Lost connection to RDE control");
			msgbuf_clear(&ibuf_rde_ctl->w);
			free(ibuf_rde_ctl);
			ibuf_rde_ctl = NULL;
		} else
			session_dispatch_imsg(ibuf_rde_ctl, PFD_PIPE_ROUTE_CTL,
			    &listener_cnt);

		if (pfd[PFD_SOCK_CTL].revents & POLLIN)
			ctl_cnt += control_accept(csock, 0);

		if (pfd[PFD_SOCK_RCTL].revents & POLLIN)
			ctl_cnt += control_accept(rcsock, 1);

		/*
		 * j walks the rest of pfd in the order it was filled:
		 * listeners, peers, mrt dumps, control connections
		 */
		for (j = PFD_LISTENERS_START; j < idx_listeners; j++)
			if (pfd[j].revents & POLLIN)
				session_accept(pfd[j].fd);

		for (; j < idx_peers; j++)
			session_dispatch_msg(&pfd[j],
			    peer_l[j - idx_listeners]);

		/* process whatever sits in the read buffers of the peers */
		RB_FOREACH(p, peer_head, &conf->peers)
			if (p->rbuf && p->rbuf->wpos)
				session_process_msg(p);

		for (; j < idx_mrts; j++)
			if (pfd[j].revents & POLLOUT)
				mrt_write(mrt_l[j - idx_peers]);

		for (; j < i; j++)
			ctl_cnt -= control_dispatch_msg(&pfd[j], &conf->peers);
	}

	/* shutdown: stop and free all peers */
	RB_FOREACH_SAFE(p, peer_head, &conf->peers, next) {
		RB_REMOVE(peer_head, &conf->peers, p);
		strlcpy(p->conf.reason,
		    "bgpd shutting down",
		    sizeof(p->conf.reason));
		session_stop(p, ERR_CEASE_ADMIN_DOWN);
		timer_remove_all(&p->timers);
		free(p);
	}

	while ((m = LIST_FIRST(&mrthead)) != NULL) {
		mrt_clean(m);
		LIST_REMOVE(m, entry);
		free(m);
	}

	free_config(conf);
	free(peer_l);
	free(mrt_l);
	free(pfd);

	/* close pipes */
	if (ibuf_rde) {
		msgbuf_write(&ibuf_rde->w);
		msgbuf_clear(&ibuf_rde->w);
		close(ibuf_rde->fd);
		free(ibuf_rde);
	}
	if (ibuf_rde_ctl) {
		msgbuf_clear(&ibuf_rde_ctl->w);
		close(ibuf_rde_ctl->fd);
		free(ibuf_rde_ctl);
	}
	msgbuf_write(&ibuf_main->w);
	msgbuf_clear(&ibuf_main->w);
	close(ibuf_main->fd);
	free(ibuf_main);

	control_shutdown(csock);
	control_shutdown(rcsock);
	log_info("session engine exiting");
	exit(0);
}
562 
563 void
564 init_peer(struct peer *p)
565 {
566 	TAILQ_INIT(&p->timers);
567 	p->fd = p->wbuf.fd = -1;
568 
569 	if (p->conf.if_depend[0])
570 		imsg_compose(ibuf_main, IMSG_IFINFO, 0, 0, -1,
571 		    p->conf.if_depend, sizeof(p->conf.if_depend));
572 	else
573 		p->depend_ok = 1;
574 
575 	peer_cnt++;
576 
577 	change_state(p, STATE_IDLE, EVNT_NONE);
578 	if (p->conf.down)
579 		timer_stop(&p->timers, Timer_IdleHold); /* no autostart */
580 	else
581 		timer_set(&p->timers, Timer_IdleHold, 0); /* start ASAP */
582 
583 	/*
584 	 * on startup, demote if requested.
585 	 * do not handle new peers. they must reach ESTABLISHED beforehands.
586 	 * peers added at runtime have reconf_action set to RECONF_REINIT.
587 	 */
588 	if (p->reconf_action != RECONF_REINIT && p->conf.demote_group[0])
589 		session_demote(p, +1);
590 }
591 
592 void
593 bgp_fsm(struct peer *peer, enum session_events event)
594 {
595 	switch (peer->state) {
596 	case STATE_NONE:
597 		/* nothing */
598 		break;
599 	case STATE_IDLE:
600 		switch (event) {
601 		case EVNT_START:
602 			timer_stop(&peer->timers, Timer_Hold);
603 			timer_stop(&peer->timers, Timer_SendHold);
604 			timer_stop(&peer->timers, Timer_Keepalive);
605 			timer_stop(&peer->timers, Timer_IdleHold);
606 
607 			/* allocate read buffer */
608 			peer->rbuf = calloc(1, sizeof(struct ibuf_read));
609 			if (peer->rbuf == NULL)
610 				fatal(NULL);
611 
612 			/* init write buffer */
613 			msgbuf_init(&peer->wbuf);
614 
615 			peer->stats.last_sent_errcode = 0;
616 			peer->stats.last_sent_suberr = 0;
617 			peer->stats.last_rcvd_errcode = 0;
618 			peer->stats.last_rcvd_suberr = 0;
619 
620 			if (!peer->depend_ok)
621 				timer_stop(&peer->timers, Timer_ConnectRetry);
622 			else if (peer->passive || peer->conf.passive ||
623 			    peer->conf.template) {
624 				change_state(peer, STATE_ACTIVE, event);
625 				timer_stop(&peer->timers, Timer_ConnectRetry);
626 			} else {
627 				change_state(peer, STATE_CONNECT, event);
628 				timer_set(&peer->timers, Timer_ConnectRetry,
629 				    conf->connectretry);
630 				session_connect(peer);
631 			}
632 			peer->passive = 0;
633 			break;
634 		default:
635 			/* ignore */
636 			break;
637 		}
638 		break;
639 	case STATE_CONNECT:
640 		switch (event) {
641 		case EVNT_START:
642 			/* ignore */
643 			break;
644 		case EVNT_CON_OPEN:
645 			session_tcp_established(peer);
646 			session_open(peer);
647 			timer_stop(&peer->timers, Timer_ConnectRetry);
648 			peer->holdtime = INTERVAL_HOLD_INITIAL;
649 			start_timer_holdtime(peer);
650 			change_state(peer, STATE_OPENSENT, event);
651 			break;
652 		case EVNT_CON_OPENFAIL:
653 			timer_set(&peer->timers, Timer_ConnectRetry,
654 			    conf->connectretry);
655 			session_close_connection(peer);
656 			change_state(peer, STATE_ACTIVE, event);
657 			break;
658 		case EVNT_TIMER_CONNRETRY:
659 			timer_set(&peer->timers, Timer_ConnectRetry,
660 			    conf->connectretry);
661 			session_connect(peer);
662 			break;
663 		default:
664 			change_state(peer, STATE_IDLE, event);
665 			break;
666 		}
667 		break;
668 	case STATE_ACTIVE:
669 		switch (event) {
670 		case EVNT_START:
671 			/* ignore */
672 			break;
673 		case EVNT_CON_OPEN:
674 			session_tcp_established(peer);
675 			session_open(peer);
676 			timer_stop(&peer->timers, Timer_ConnectRetry);
677 			peer->holdtime = INTERVAL_HOLD_INITIAL;
678 			start_timer_holdtime(peer);
679 			change_state(peer, STATE_OPENSENT, event);
680 			break;
681 		case EVNT_CON_OPENFAIL:
682 			timer_set(&peer->timers, Timer_ConnectRetry,
683 			    conf->connectretry);
684 			session_close_connection(peer);
685 			change_state(peer, STATE_ACTIVE, event);
686 			break;
687 		case EVNT_TIMER_CONNRETRY:
688 			timer_set(&peer->timers, Timer_ConnectRetry,
689 			    peer->holdtime);
690 			change_state(peer, STATE_CONNECT, event);
691 			session_connect(peer);
692 			break;
693 		default:
694 			change_state(peer, STATE_IDLE, event);
695 			break;
696 		}
697 		break;
698 	case STATE_OPENSENT:
699 		switch (event) {
700 		case EVNT_START:
701 			/* ignore */
702 			break;
703 		case EVNT_STOP:
704 			change_state(peer, STATE_IDLE, event);
705 			break;
706 		case EVNT_CON_CLOSED:
707 			session_close_connection(peer);
708 			timer_set(&peer->timers, Timer_ConnectRetry,
709 			    conf->connectretry);
710 			change_state(peer, STATE_ACTIVE, event);
711 			break;
712 		case EVNT_CON_FATAL:
713 			change_state(peer, STATE_IDLE, event);
714 			break;
715 		case EVNT_TIMER_HOLDTIME:
716 		case EVNT_TIMER_SENDHOLD:
717 			session_notification(peer, ERR_HOLDTIMEREXPIRED,
718 			    0, NULL, 0);
719 			change_state(peer, STATE_IDLE, event);
720 			break;
721 		case EVNT_RCVD_OPEN:
722 			/* parse_open calls change_state itself on failure */
723 			if (parse_open(peer))
724 				break;
725 			session_keepalive(peer);
726 			change_state(peer, STATE_OPENCONFIRM, event);
727 			break;
728 		case EVNT_RCVD_NOTIFICATION:
729 			if (parse_notification(peer)) {
730 				change_state(peer, STATE_IDLE, event);
731 				/* don't punish, capa negotiation */
732 				timer_set(&peer->timers, Timer_IdleHold, 0);
733 				peer->IdleHoldTime /= 2;
734 			} else
735 				change_state(peer, STATE_IDLE, event);
736 			break;
737 		default:
738 			session_notification(peer,
739 			    ERR_FSM, ERR_FSM_UNEX_OPENSENT, NULL, 0);
740 			change_state(peer, STATE_IDLE, event);
741 			break;
742 		}
743 		break;
744 	case STATE_OPENCONFIRM:
745 		switch (event) {
746 		case EVNT_START:
747 			/* ignore */
748 			break;
749 		case EVNT_STOP:
750 			change_state(peer, STATE_IDLE, event);
751 			break;
752 		case EVNT_CON_CLOSED:
753 		case EVNT_CON_FATAL:
754 			change_state(peer, STATE_IDLE, event);
755 			break;
756 		case EVNT_TIMER_HOLDTIME:
757 		case EVNT_TIMER_SENDHOLD:
758 			session_notification(peer, ERR_HOLDTIMEREXPIRED,
759 			    0, NULL, 0);
760 			change_state(peer, STATE_IDLE, event);
761 			break;
762 		case EVNT_TIMER_KEEPALIVE:
763 			session_keepalive(peer);
764 			break;
765 		case EVNT_RCVD_KEEPALIVE:
766 			start_timer_holdtime(peer);
767 			change_state(peer, STATE_ESTABLISHED, event);
768 			break;
769 		case EVNT_RCVD_NOTIFICATION:
770 			parse_notification(peer);
771 			change_state(peer, STATE_IDLE, event);
772 			break;
773 		default:
774 			session_notification(peer,
775 			    ERR_FSM, ERR_FSM_UNEX_OPENCONFIRM, NULL, 0);
776 			change_state(peer, STATE_IDLE, event);
777 			break;
778 		}
779 		break;
780 	case STATE_ESTABLISHED:
781 		switch (event) {
782 		case EVNT_START:
783 			/* ignore */
784 			break;
785 		case EVNT_STOP:
786 			change_state(peer, STATE_IDLE, event);
787 			break;
788 		case EVNT_CON_CLOSED:
789 		case EVNT_CON_FATAL:
790 			change_state(peer, STATE_IDLE, event);
791 			break;
792 		case EVNT_TIMER_HOLDTIME:
793 		case EVNT_TIMER_SENDHOLD:
794 			session_notification(peer, ERR_HOLDTIMEREXPIRED,
795 			    0, NULL, 0);
796 			change_state(peer, STATE_IDLE, event);
797 			break;
798 		case EVNT_TIMER_KEEPALIVE:
799 			session_keepalive(peer);
800 			break;
801 		case EVNT_RCVD_KEEPALIVE:
802 			start_timer_holdtime(peer);
803 			break;
804 		case EVNT_RCVD_UPDATE:
805 			start_timer_holdtime(peer);
806 			if (parse_update(peer))
807 				change_state(peer, STATE_IDLE, event);
808 			else
809 				start_timer_holdtime(peer);
810 			break;
811 		case EVNT_RCVD_NOTIFICATION:
812 			parse_notification(peer);
813 			change_state(peer, STATE_IDLE, event);
814 			break;
815 		default:
816 			session_notification(peer,
817 			    ERR_FSM, ERR_FSM_UNEX_ESTABLISHED, NULL, 0);
818 			change_state(peer, STATE_IDLE, event);
819 			break;
820 		}
821 		break;
822 	}
823 }
824 
825 void
826 start_timer_holdtime(struct peer *peer)
827 {
828 	if (peer->holdtime > 0)
829 		timer_set(&peer->timers, Timer_Hold, peer->holdtime);
830 	else
831 		timer_stop(&peer->timers, Timer_Hold);
832 }
833 
834 void
835 start_timer_keepalive(struct peer *peer)
836 {
837 	if (peer->holdtime > 0)
838 		timer_set(&peer->timers, Timer_Keepalive, peer->holdtime / 3);
839 	else
840 		timer_stop(&peer->timers, Timer_Keepalive);
841 }
842 
843 void
844 session_close_connection(struct peer *peer)
845 {
846 	if (peer->fd != -1) {
847 		close(peer->fd);
848 		pauseaccept = 0;
849 	}
850 	peer->fd = peer->wbuf.fd = -1;
851 }
852 
/*
 * Move peer to the given state because of event.
 *
 * The switch performs the work tied to entering the new state; during
 * that work peer->state still holds the state being left, which several
 * branches test.  Afterwards the transition is logged, written to the
 * mrt dumps of type MRT_ALL_IN/MRT_ALL_OUT that match the peer, and
 * peer->prev_state and peer->state are updated.
 *
 * peer:  the neighbor changing state
 * state: the state to enter
 * event: the event that caused the transition
 */
void
change_state(struct peer *peer, enum session_state state,
    enum session_events event)
{
	struct mrt	*mrt;

	switch (state) {
	case STATE_IDLE:
		/* carp demotion first. new peers handled in init_peer */
		if (peer->state == STATE_ESTABLISHED &&
		    peer->conf.demote_group[0] && !peer->demoted)
			session_demote(peer, +1);

		/*
		 * try to write out what's buffered (maybe a notification),
		 * don't bother if it fails
		 */
		if (peer->state >= STATE_OPENSENT && peer->wbuf.queued)
			msgbuf_write(&peer->wbuf);

		/*
		 * we must start the timer for the next EVNT_START
		 * if we are coming here due to an error and the
		 * session was not established successfully before, the
		 * starttimerinterval needs to be exponentially increased
		 */
		if (peer->IdleHoldTime == 0)
			peer->IdleHoldTime = INTERVAL_IDLE_HOLD_INITIAL;
		peer->holdtime = INTERVAL_HOLD_INITIAL;
		timer_stop(&peer->timers, Timer_ConnectRetry);
		timer_stop(&peer->timers, Timer_Keepalive);
		timer_stop(&peer->timers, Timer_Hold);
		timer_stop(&peer->timers, Timer_SendHold);
		timer_stop(&peer->timers, Timer_IdleHold);
		timer_stop(&peer->timers, Timer_IdleHoldReset);
		/* drop the connection and everything buffered for it */
		session_close_connection(peer);
		msgbuf_clear(&peer->wbuf);
		free(peer->rbuf);
		peer->rbuf = NULL;
		peer->rpending = 0;
		bzero(&peer->capa.peer, sizeof(peer->capa.peer));
		if (!peer->template)
			imsg_compose(ibuf_main, IMSG_PFKEY_RELOAD,
			    peer->conf.id, 0, -1, NULL, 0);

		/* an explicit stop does not schedule a restart */
		if (event != EVNT_STOP) {
			timer_set(&peer->timers, Timer_IdleHold,
			    peer->IdleHoldTime);
			if (event != EVNT_NONE &&
			    peer->IdleHoldTime < MAX_IDLE_HOLD/2)
				peer->IdleHoldTime *= 2;
		}
		if (peer->state == STATE_ESTABLISHED) {
			if (peer->capa.neg.grestart.restart == 2 &&
			    (event == EVNT_CON_CLOSED ||
			    event == EVNT_CON_FATAL)) {
				/* don't punish graceful restart */
				timer_set(&peer->timers, Timer_IdleHold, 0);
				peer->IdleHoldTime /= 2;
				session_graceful_restart(peer);
			} else
				session_down(peer);
		}
		if (peer->state == STATE_NONE ||
		    peer->state == STATE_ESTABLISHED) {
			/* initialize capability negotiation structures */
			memcpy(&peer->capa.ann, &peer->conf.capabilities,
			    sizeof(peer->capa.ann));
			if (!peer->conf.announce_capa)
				session_capa_ann_none(peer);
		}
		break;
	case STATE_CONNECT:
		if (peer->state == STATE_ESTABLISHED &&
		    peer->capa.neg.grestart.restart == 2) {
			/* do the graceful restart dance */
			session_graceful_restart(peer);
			peer->holdtime = INTERVAL_HOLD_INITIAL;
			timer_stop(&peer->timers, Timer_ConnectRetry);
			timer_stop(&peer->timers, Timer_Keepalive);
			timer_stop(&peer->timers, Timer_Hold);
			timer_stop(&peer->timers, Timer_SendHold);
			timer_stop(&peer->timers, Timer_IdleHold);
			timer_stop(&peer->timers, Timer_IdleHoldReset);
			session_close_connection(peer);
			msgbuf_clear(&peer->wbuf);
			bzero(&peer->capa.peer, sizeof(peer->capa.peer));
		}
		break;
	case STATE_ACTIVE:
		if (!peer->template)
			imsg_compose(ibuf_main, IMSG_PFKEY_RELOAD,
			    peer->conf.id, 0, -1, NULL, 0);
		break;
	case STATE_OPENSENT:
		break;
	case STATE_OPENCONFIRM:
		break;
	case STATE_ESTABLISHED:
		timer_set(&peer->timers, Timer_IdleHoldReset,
		    peer->IdleHoldTime);
		if (peer->demoted)
			timer_set(&peer->timers, Timer_CarpUndemote,
			    INTERVAL_HOLD_DEMOTED);
		session_up(peer);
		break;
	default:		/* should not happen: unknown state */
		break;
	}

	log_statechange(peer, state, event);
	LIST_FOREACH(mrt, &mrthead, entry) {
		if (!(mrt->type == MRT_ALL_IN || mrt->type == MRT_ALL_OUT))
			continue;
		/* dump for all peers, this peer or the group of this peer */
		if ((mrt->peer_id == 0 && mrt->group_id == 0) ||
		    mrt->peer_id == peer->conf.id || (mrt->group_id != 0 &&
		    mrt->group_id == peer->conf.groupid))
			mrt_dump_state(mrt, peer->state, state, peer);
	}
	peer->prev_state = peer->state;
	peer->state = state;
}
975 
976 void
977 session_accept(int listenfd)
978 {
979 	int			 connfd;
980 	socklen_t		 len;
981 	struct sockaddr_storage	 cliaddr;
982 	struct peer		*p = NULL;
983 
984 	len = sizeof(cliaddr);
985 	if ((connfd = accept4(listenfd,
986 	    (struct sockaddr *)&cliaddr, &len,
987 	    SOCK_CLOEXEC | SOCK_NONBLOCK)) == -1) {
988 		if (errno == ENFILE || errno == EMFILE)
989 			pauseaccept = getmonotime();
990 		else if (errno != EWOULDBLOCK && errno != EINTR &&
991 		    errno != ECONNABORTED)
992 			log_warn("accept");
993 		return;
994 	}
995 
996 	p = getpeerbyip(conf, (struct sockaddr *)&cliaddr);
997 
998 	if (p != NULL && p->state == STATE_IDLE && p->errcnt < 2) {
999 		if (timer_running(&p->timers, Timer_IdleHold, NULL)) {
1000 			/* fast reconnect after clear */
1001 			p->passive = 1;
1002 			bgp_fsm(p, EVNT_START);
1003 		}
1004 	}
1005 
1006 	if (p != NULL &&
1007 	    (p->state == STATE_CONNECT || p->state == STATE_ACTIVE)) {
1008 		if (p->fd != -1) {
1009 			if (p->state == STATE_CONNECT)
1010 				session_close_connection(p);
1011 			else {
1012 				close(connfd);
1013 				return;
1014 			}
1015 		}
1016 
1017 open:
1018 		if (p->conf.auth.method != AUTH_NONE && sysdep.no_pfkey) {
1019 			log_peer_warnx(&p->conf,
1020 			    "ipsec or md5sig configured but not available");
1021 			close(connfd);
1022 			return;
1023 		}
1024 
1025 		if (tcp_md5_check(connfd, p) == -1) {
1026 			close(connfd);
1027 			return;
1028 		}
1029 		p->fd = p->wbuf.fd = connfd;
1030 		if (session_setup_socket(p)) {
1031 			close(connfd);
1032 			return;
1033 		}
1034 		bgp_fsm(p, EVNT_CON_OPEN);
1035 		return;
1036 	} else if (p != NULL && p->state == STATE_ESTABLISHED &&
1037 	    p->capa.neg.grestart.restart == 2) {
1038 		/* first do the graceful restart dance */
1039 		change_state(p, STATE_CONNECT, EVNT_CON_CLOSED);
1040 		/* then do part of the open dance */
1041 		goto open;
1042 	} else {
1043 		log_conn_attempt(p, (struct sockaddr *)&cliaddr, len);
1044 		close(connfd);
1045 	}
1046 }
1047 
1048 int
1049 session_connect(struct peer *peer)
1050 {
1051 	struct sockaddr		*sa;
1052 	struct bgpd_addr	*bind_addr = NULL;
1053 	socklen_t		 sa_len;
1054 
1055 	/*
1056 	 * we do not need the overcomplicated collision detection RFC 1771
1057 	 * describes; we simply make sure there is only ever one concurrent
1058 	 * tcp connection per peer.
1059 	 */
1060 	if (peer->fd != -1)
1061 		return (-1);
1062 
1063 	if ((peer->fd = socket(aid2af(peer->conf.remote_addr.aid),
1064 	    SOCK_STREAM | SOCK_CLOEXEC | SOCK_NONBLOCK, IPPROTO_TCP)) == -1) {
1065 		log_peer_warn(&peer->conf, "session_connect socket");
1066 		bgp_fsm(peer, EVNT_CON_OPENFAIL);
1067 		return (-1);
1068 	}
1069 
1070 	if (peer->conf.auth.method != AUTH_NONE && sysdep.no_pfkey) {
1071 		log_peer_warnx(&peer->conf,
1072 		    "ipsec or md5sig configured but not available");
1073 		bgp_fsm(peer, EVNT_CON_OPENFAIL);
1074 		return (-1);
1075 	}
1076 
1077 	tcp_md5_set(peer->fd, peer);
1078 	peer->wbuf.fd = peer->fd;
1079 
1080 	/* if local-address is set we need to bind() */
1081 	switch (peer->conf.remote_addr.aid) {
1082 	case AID_INET:
1083 		bind_addr = &peer->conf.local_addr_v4;
1084 		break;
1085 	case AID_INET6:
1086 		bind_addr = &peer->conf.local_addr_v6;
1087 		break;
1088 	}
1089 	if ((sa = addr2sa(bind_addr, 0, &sa_len)) != NULL) {
1090 		if (bind(peer->fd, sa, sa_len) == -1) {
1091 			log_peer_warn(&peer->conf, "session_connect bind");
1092 			bgp_fsm(peer, EVNT_CON_OPENFAIL);
1093 			return (-1);
1094 		}
1095 	}
1096 
1097 	if (session_setup_socket(peer)) {
1098 		bgp_fsm(peer, EVNT_CON_OPENFAIL);
1099 		return (-1);
1100 	}
1101 
1102 	sa = addr2sa(&peer->conf.remote_addr, BGP_PORT, &sa_len);
1103 	if (connect(peer->fd, sa, sa_len) == -1) {
1104 		if (errno != EINPROGRESS) {
1105 			if (errno != peer->lasterr)
1106 				log_peer_warn(&peer->conf, "connect");
1107 			peer->lasterr = errno;
1108 			bgp_fsm(peer, EVNT_CON_OPENFAIL);
1109 			return (-1);
1110 		}
1111 	} else
1112 		bgp_fsm(peer, EVNT_CON_OPEN);
1113 
1114 	return (0);
1115 }
1116 
1117 int
1118 session_setup_socket(struct peer *p)
1119 {
1120 	int	ttl = p->conf.distance;
1121 	int	pre = IPTOS_PREC_INTERNETCONTROL;
1122 	int	nodelay = 1;
1123 	int	bsize;
1124 
1125 	switch (p->conf.remote_addr.aid) {
1126 	case AID_INET:
1127 		/* set precedence, see RFC 1771 appendix 5 */
1128 		if (setsockopt(p->fd, IPPROTO_IP, IP_TOS, &pre, sizeof(pre)) ==
1129 		    -1) {
1130 			log_peer_warn(&p->conf,
1131 			    "session_setup_socket setsockopt TOS");
1132 			return (-1);
1133 		}
1134 
1135 		if (p->conf.ebgp) {
1136 			/*
1137 			 * set TTL to foreign router's distance
1138 			 * 1=direct n=multihop with ttlsec, we always use 255
1139 			 */
1140 			if (p->conf.ttlsec) {
1141 				ttl = 256 - p->conf.distance;
1142 				if (setsockopt(p->fd, IPPROTO_IP, IP_MINTTL,
1143 				    &ttl, sizeof(ttl)) == -1) {
1144 					log_peer_warn(&p->conf,
1145 					    "session_setup_socket: "
1146 					    "setsockopt MINTTL");
1147 					return (-1);
1148 				}
1149 				ttl = 255;
1150 			}
1151 
1152 			if (setsockopt(p->fd, IPPROTO_IP, IP_TTL, &ttl,
1153 			    sizeof(ttl)) == -1) {
1154 				log_peer_warn(&p->conf,
1155 				    "session_setup_socket setsockopt TTL");
1156 				return (-1);
1157 			}
1158 		}
1159 		break;
1160 	case AID_INET6:
1161 		if (p->conf.ebgp) {
1162 			/*
1163 			 * set hoplimit to foreign router's distance
1164 			 * 1=direct n=multihop with ttlsec, we always use 255
1165 			 */
1166 			if (p->conf.ttlsec) {
1167 				ttl = 256 - p->conf.distance;
1168 				if (setsockopt(p->fd, IPPROTO_IPV6,
1169 				    IPV6_MINHOPCOUNT, &ttl, sizeof(ttl))
1170 				    == -1) {
1171 					log_peer_warn(&p->conf,
1172 					    "session_setup_socket: "
1173 					    "setsockopt MINHOPCOUNT");
1174 					return (-1);
1175 				}
1176 				ttl = 255;
1177 			}
1178 			if (setsockopt(p->fd, IPPROTO_IPV6, IPV6_UNICAST_HOPS,
1179 			    &ttl, sizeof(ttl)) == -1) {
1180 				log_peer_warn(&p->conf,
1181 				    "session_setup_socket setsockopt hoplimit");
1182 				return (-1);
1183 			}
1184 		}
1185 		break;
1186 	}
1187 
1188 	/* set TCP_NODELAY */
1189 	if (setsockopt(p->fd, IPPROTO_TCP, TCP_NODELAY, &nodelay,
1190 	    sizeof(nodelay)) == -1) {
1191 		log_peer_warn(&p->conf,
1192 		    "session_setup_socket setsockopt TCP_NODELAY");
1193 		return (-1);
1194 	}
1195 
1196 	/* only increase bufsize (and thus window) if md5 or ipsec is in use */
1197 	if (p->conf.auth.method != AUTH_NONE) {
1198 		/* try to increase bufsize. no biggie if it fails */
1199 		bsize = 65535;
1200 		while (bsize > 8192 &&
1201 		    setsockopt(p->fd, SOL_SOCKET, SO_RCVBUF, &bsize,
1202 		    sizeof(bsize)) == -1 && errno != EINVAL)
1203 			bsize /= 2;
1204 		bsize = 65535;
1205 		while (bsize > 8192 &&
1206 		    setsockopt(p->fd, SOL_SOCKET, SO_SNDBUF, &bsize,
1207 		    sizeof(bsize)) == -1 && errno != EINVAL)
1208 			bsize /= 2;
1209 	}
1210 
1211 	return (0);
1212 }
1213 
1214 /* compare two sockaddrs by converting them into bgpd_addr */
1215 static int
1216 sa_cmp(struct sockaddr *a, struct sockaddr *b)
1217 {
1218 	struct bgpd_addr ba, bb;
1219 
1220 	sa2addr(a, &ba, NULL);
1221 	sa2addr(b, &bb, NULL);
1222 
1223 	return (memcmp(&ba, &bb, sizeof(ba)) == 0);
1224 }
1225 
/*
 * Look up an address of the other address family on the interface that
 * carries the local session address sa.
 *
 * For an IPv6 session the first IPv4 address found on that interface is
 * stored in alt; for an IPv4 session the first IPv6 address that is
 * neither link-local nor site-local is used.  alt is not written if the
 * interface or a suitable address cannot be found.
 */
static void
get_alternate_addr(struct sockaddr *sa, struct bgpd_addr *alt)
{
	struct ifaddrs	*ifap, *ifa, *match;

	if (getifaddrs(&ifap) == -1)
		fatal("getifaddrs");

	/*
	 * Find the interface entry holding the session address.
	 * sa_cmp() yields non-zero when both addresses are equal.
	 */
	for (match = ifap; match != NULL; match = match->ifa_next)
		if (match->ifa_addr != NULL &&
		    sa_cmp(sa, match->ifa_addr) != 0)
			break;

	if (match == NULL) {
		log_warnx("%s: local address not found", __func__);
		/* do not leak the interface list on the early exit */
		freeifaddrs(ifap);
		return;
	}

	switch (sa->sa_family) {
	case AF_INET6:
		for (ifa = ifap; ifa != NULL; ifa = ifa->ifa_next) {
			if (ifa->ifa_addr != NULL &&
			    ifa->ifa_addr->sa_family == AF_INET &&
			    strcmp(ifa->ifa_name, match->ifa_name) == 0) {
				sa2addr(ifa->ifa_addr, alt, NULL);
				break;
			}
		}
		break;
	case AF_INET:
		for (ifa = ifap; ifa != NULL; ifa = ifa->ifa_next) {
			if (ifa->ifa_addr != NULL &&
			    ifa->ifa_addr->sa_family == AF_INET6 &&
			    strcmp(ifa->ifa_name, match->ifa_name) == 0) {
				struct sockaddr_in6 *s =
				    (struct sockaddr_in6 *)ifa->ifa_addr;

				/* only accept global scope addresses */
				if (IN6_IS_ADDR_LINKLOCAL(&s->sin6_addr) ||
				    IN6_IS_ADDR_SITELOCAL(&s->sin6_addr))
					continue;
				sa2addr(ifa->ifa_addr, alt, NULL);
				break;
			}
		}
		break;
	default:
		log_warnx("%s: unsupported address family %d", __func__,
		    sa->sa_family);
		break;
	}

	freeifaddrs(ifap);
}
1280 
1281 void
1282 session_tcp_established(struct peer *peer)
1283 {
1284 	struct sockaddr_storage	ss;
1285 	socklen_t		len;
1286 
1287 	len = sizeof(ss);
1288 	if (getsockname(peer->fd, (struct sockaddr *)&ss, &len) == -1)
1289 		log_warn("getsockname");
1290 	sa2addr((struct sockaddr *)&ss, &peer->local, &peer->local_port);
1291 	get_alternate_addr((struct sockaddr *)&ss, &peer->local_alt);
1292 	len = sizeof(ss);
1293 	if (getpeername(peer->fd, (struct sockaddr *)&ss, &len) == -1)
1294 		log_warn("getpeername");
1295 	sa2addr((struct sockaddr *)&ss, &peer->remote, &peer->remote_port);
1296 }
1297 
1298 void
1299 session_capa_ann_none(struct peer *peer)
1300 {
1301 	bzero(&peer->capa.ann, sizeof(peer->capa.ann));
1302 }
1303 
/*
 * Append a capability header, i.e. the code octet followed by the length
 * octet, to opb.  Returns the sum of the ibuf_add() results; callers treat
 * any non-zero value as failure.
 */
int
session_capa_add(struct ibuf *opb, u_int8_t capa_code, u_int8_t capa_len)
{
	int	rv;

	rv = ibuf_add(opb, &capa_code, sizeof(capa_code));
	rv += ibuf_add(opb, &capa_len, sizeof(capa_len));

	return (rv);
}
1313 
/*
 * Append the 4 byte multiprotocol capability value for aid to buf:
 * AFI in network byte order, one zero octet and the SAFI.
 * An aid without AFI/SAFI mapping is fatal.  Returns the sum of the
 * ibuf_add() results; callers treat any non-zero value as failure.
 */
int
session_capa_add_mp(struct ibuf *buf, u_int8_t aid)
{
	u_int16_t	net_afi;
	u_int8_t	safi, reserved = 0;
	int		rv;

	if (aid2afi(aid, &net_afi, &safi) == -1)
		fatalx("session_capa_add_mp: bad afi/safi pair");
	net_afi = htons(net_afi);

	rv = ibuf_add(buf, &net_afi, sizeof(net_afi));
	rv += ibuf_add(buf, &reserved, sizeof(reserved));
	rv += ibuf_add(buf, &safi, sizeof(safi));

	return (rv);
}
1330 
/*
 * Append one <AFI, SAFI, flags> tuple for aid to b; the AFI is written in
 * network byte order.
 *
 * Returns 1 if aid has no AFI/SAFI mapping, otherwise the sum of the
 * ibuf_add() results (any non-zero value is treated as failure by the
 * caller).  The peer argument is not used.
 */
int
session_capa_add_afi(struct peer *p, struct ibuf *b, u_int8_t aid,
    u_int8_t flags)
{
	int		errs = 0;	/* signed, like the return type */
	u_int16_t	afi;
	u_int8_t	safi;

	if (aid2afi(aid, &afi, &safi)) {
		/* not an errno failure, so do not append strerror() */
		log_warnx("session_capa_add_afi: bad AID");
		return (1);
	}

	afi = htons(afi);
	errs += ibuf_add(b, &afi, sizeof(afi));
	errs += ibuf_add(b, &safi, sizeof(safi));
	errs += ibuf_add(b, &flags, sizeof(flags));

	return (errs);
}
1351 
1352 struct bgp_msg *
1353 session_newmsg(enum msg_type msgtype, u_int16_t len)
1354 {
1355 	struct bgp_msg		*msg;
1356 	struct msg_header	 hdr;
1357 	struct ibuf		*buf;
1358 	int			 errs = 0;
1359 
1360 	memset(&hdr.marker, 0xff, sizeof(hdr.marker));
1361 	hdr.len = htons(len);
1362 	hdr.type = msgtype;
1363 
1364 	if ((buf = ibuf_open(len)) == NULL)
1365 		return (NULL);
1366 
1367 	errs += ibuf_add(buf, &hdr.marker, sizeof(hdr.marker));
1368 	errs += ibuf_add(buf, &hdr.len, sizeof(hdr.len));
1369 	errs += ibuf_add(buf, &hdr.type, sizeof(hdr.type));
1370 
1371 	if (errs || (msg = calloc(1, sizeof(*msg))) == NULL) {
1372 		ibuf_free(buf);
1373 		return (NULL);
1374 	}
1375 
1376 	msg->buf = buf;
1377 	msg->type = msgtype;
1378 	msg->len = len;
1379 
1380 	return (msg);
1381 }
1382 
1383 int
1384 session_sendmsg(struct bgp_msg *msg, struct peer *p)
1385 {
1386 	struct mrt		*mrt;
1387 
1388 	LIST_FOREACH(mrt, &mrthead, entry) {
1389 		if (!(mrt->type == MRT_ALL_OUT || (msg->type == UPDATE &&
1390 		    mrt->type == MRT_UPDATE_OUT)))
1391 			continue;
1392 		if ((mrt->peer_id == 0 && mrt->group_id == 0) ||
1393 		    mrt->peer_id == p->conf.id || (mrt->group_id != 0 &&
1394 		    mrt->group_id == p->conf.groupid))
1395 			mrt_dump_bgp_msg(mrt, msg->buf->buf, msg->len, p,
1396 			    msg->type);
1397 	}
1398 
1399 	ibuf_close(&p->wbuf, msg->buf);
1400 	if (!p->throttled && p->wbuf.queued > SESS_MSG_HIGH_MARK) {
1401 		if (imsg_rde(IMSG_XOFF, p->conf.id, NULL, 0) == -1)
1402 			log_peer_warn(&p->conf, "imsg_compose XOFF");
1403 		else
1404 			p->throttled = 1;
1405 	}
1406 
1407 	free(msg);
1408 	return (0);
1409 }
1410 
1411 void
1412 session_open(struct peer *p)
1413 {
1414 	struct bgp_msg		*buf;
1415 	struct ibuf		*opb;
1416 	struct msg_open		 msg;
1417 	u_int16_t		 len, optparamlen = 0;
1418 	u_int8_t		 i, op_type;
1419 	int			 errs = 0, extlen = 0;
1420 	int			 mpcapa = 0;
1421 
1422 
1423 	if ((opb = ibuf_dynamic(0, UINT16_MAX - 3)) == NULL) {
1424 		bgp_fsm(p, EVNT_CON_FATAL);
1425 		return;
1426 	}
1427 
1428 	/* multiprotocol extensions, RFC 4760 */
1429 	for (i = 0; i < AID_MAX; i++)
1430 		if (p->capa.ann.mp[i]) {	/* 4 bytes data */
1431 			errs += session_capa_add(opb, CAPA_MP, 4);
1432 			errs += session_capa_add_mp(opb, i);
1433 			mpcapa++;
1434 		}
1435 
1436 	/* route refresh, RFC 2918 */
1437 	if (p->capa.ann.refresh)	/* no data */
1438 		errs += session_capa_add(opb, CAPA_REFRESH, 0);
1439 
1440 	/* graceful restart and End-of-RIB marker, RFC 4724 */
1441 	if (p->capa.ann.grestart.restart) {
1442 		int		rst = 0;
1443 		u_int16_t	hdr = 0;
1444 
1445 		for (i = 0; i < AID_MAX; i++) {
1446 			if (p->capa.neg.grestart.flags[i] & CAPA_GR_RESTARTING)
1447 				rst++;
1448 		}
1449 
1450 		/* Only set the R-flag if no graceful restart is ongoing */
1451 		if (!rst)
1452 			hdr |= CAPA_GR_R_FLAG;
1453 		hdr = htons(hdr);
1454 
1455 		errs += session_capa_add(opb, CAPA_RESTART, sizeof(hdr));
1456 		errs += ibuf_add(opb, &hdr, sizeof(hdr));
1457 	}
1458 
1459 	/* 4-bytes AS numbers, RFC6793 */
1460 	if (p->capa.ann.as4byte) {	/* 4 bytes data */
1461 		u_int32_t	nas;
1462 
1463 		nas = htonl(p->conf.local_as);
1464 		errs += session_capa_add(opb, CAPA_AS4BYTE, sizeof(nas));
1465 		errs += ibuf_add(opb, &nas, sizeof(nas));
1466 	}
1467 
1468 	/* advertisement of multiple paths, RFC7911 */
1469 	if (p->capa.ann.add_path[0]) {	/* variable */
1470 		u_int8_t	aplen;
1471 
1472 		if (mpcapa)
1473 			aplen = 4 * mpcapa;
1474 		else	/* AID_INET */
1475 			aplen = 4;
1476 		errs += session_capa_add(opb, CAPA_ADD_PATH, aplen);
1477 		if (mpcapa) {
1478 			for (i = AID_MIN; i < AID_MAX; i++) {
1479 				if (p->capa.ann.mp[i]) {
1480 					errs += session_capa_add_afi(p, opb,
1481 					    i, p->capa.ann.add_path[i]);
1482 				}
1483 			}
1484 		} else {	/* AID_INET */
1485 			errs += session_capa_add_afi(p, opb, AID_INET,
1486 			    p->capa.ann.add_path[AID_INET]);
1487 		}
1488 	}
1489 
1490 	/* enhanced route-refresh, RFC7313 */
1491 	if (p->capa.ann.enhanced_rr)	/* no data */
1492 		errs += session_capa_add(opb, CAPA_ENHANCED_RR, 0);
1493 
1494 	optparamlen = ibuf_size(opb);
1495 	if (optparamlen == 0) {
1496 		/* nothing */
1497 	} else if (optparamlen + 2 >= 255) {
1498 		/* RFC9072: 2 byte lenght instead of 1 + 3 byte extra header */
1499 		optparamlen += sizeof(op_type) + 2 + 3;
1500 		msg.optparamlen = 255;
1501 		extlen = 1;
1502 	} else {
1503 		optparamlen += sizeof(op_type) + 1;
1504 		msg.optparamlen = optparamlen;
1505 	}
1506 
1507 	len = MSGSIZE_OPEN_MIN + optparamlen;
1508 	if (errs || (buf = session_newmsg(OPEN, len)) == NULL) {
1509 		ibuf_free(opb);
1510 		bgp_fsm(p, EVNT_CON_FATAL);
1511 		return;
1512 	}
1513 
1514 	msg.version = 4;
1515 	msg.myas = htons(p->conf.local_short_as);
1516 	if (p->conf.holdtime)
1517 		msg.holdtime = htons(p->conf.holdtime);
1518 	else
1519 		msg.holdtime = htons(conf->holdtime);
1520 	msg.bgpid = conf->bgpid;	/* is already in network byte order */
1521 
1522 	errs += ibuf_add(buf->buf, &msg.version, sizeof(msg.version));
1523 	errs += ibuf_add(buf->buf, &msg.myas, sizeof(msg.myas));
1524 	errs += ibuf_add(buf->buf, &msg.holdtime, sizeof(msg.holdtime));
1525 	errs += ibuf_add(buf->buf, &msg.bgpid, sizeof(msg.bgpid));
1526 	errs += ibuf_add(buf->buf, &msg.optparamlen, 1);
1527 
1528 	if (extlen) {
1529 		/* write RFC9072 extra header */
1530 		u_int16_t op_extlen = htons(optparamlen - 3);
1531 		op_type = OPT_PARAM_EXT_LEN;
1532 		errs += ibuf_add(buf->buf, &op_type, 1);
1533 		errs += ibuf_add(buf->buf, &op_extlen, 2);
1534 	}
1535 
1536 	if (optparamlen) {
1537 		op_type = OPT_PARAM_CAPABILITIES;
1538 		errs += ibuf_add(buf->buf, &op_type, sizeof(op_type));
1539 
1540 		optparamlen = ibuf_size(opb);
1541 		if (extlen) {
1542 			/* RFC9072: 2-byte extended length */
1543 			u_int16_t op_extlen = htons(optparamlen);
1544 			errs += ibuf_add(buf->buf, &op_extlen, 2);
1545 		} else {
1546 			u_int8_t op_len = optparamlen;
1547 			errs += ibuf_add(buf->buf, &op_len, 1);
1548 		}
1549 		errs += ibuf_add(buf->buf, opb->buf, ibuf_size(opb));
1550 	}
1551 
1552 	ibuf_free(opb);
1553 
1554 	if (errs) {
1555 		ibuf_free(buf->buf);
1556 		free(buf);
1557 		bgp_fsm(p, EVNT_CON_FATAL);
1558 		return;
1559 	}
1560 
1561 	if (session_sendmsg(buf, p) == -1) {
1562 		bgp_fsm(p, EVNT_CON_FATAL);
1563 		return;
1564 	}
1565 
1566 	p->stats.msg_sent_open++;
1567 }
1568 
1569 void
1570 session_keepalive(struct peer *p)
1571 {
1572 	struct bgp_msg		*buf;
1573 
1574 	if ((buf = session_newmsg(KEEPALIVE, MSGSIZE_KEEPALIVE)) == NULL ||
1575 	    session_sendmsg(buf, p) == -1) {
1576 		bgp_fsm(p, EVNT_CON_FATAL);
1577 		return;
1578 	}
1579 
1580 	start_timer_keepalive(p);
1581 	p->stats.msg_sent_keepalive++;
1582 }
1583 
1584 void
1585 session_update(u_int32_t peerid, void *data, size_t datalen)
1586 {
1587 	struct peer		*p;
1588 	struct bgp_msg		*buf;
1589 
1590 	if ((p = getpeerbyid(conf, peerid)) == NULL) {
1591 		log_warnx("no such peer: id=%u", peerid);
1592 		return;
1593 	}
1594 
1595 	if (p->state != STATE_ESTABLISHED)
1596 		return;
1597 
1598 	if ((buf = session_newmsg(UPDATE, MSGSIZE_HEADER + datalen)) == NULL) {
1599 		bgp_fsm(p, EVNT_CON_FATAL);
1600 		return;
1601 	}
1602 
1603 	if (ibuf_add(buf->buf, data, datalen)) {
1604 		ibuf_free(buf->buf);
1605 		free(buf);
1606 		bgp_fsm(p, EVNT_CON_FATAL);
1607 		return;
1608 	}
1609 
1610 	if (session_sendmsg(buf, p) == -1) {
1611 		bgp_fsm(p, EVNT_CON_FATAL);
1612 		return;
1613 	}
1614 
1615 	start_timer_keepalive(p);
1616 	p->stats.msg_sent_update++;
1617 }
1618 
1619 void
1620 session_notification(struct peer *p, u_int8_t errcode, u_int8_t subcode,
1621     void *data, ssize_t datalen)
1622 {
1623 	struct bgp_msg		*buf;
1624 	int			 errs = 0;
1625 
1626 	if (p->stats.last_sent_errcode)	/* some notification already sent */
1627 		return;
1628 
1629 	log_notification(p, errcode, subcode, data, datalen, "sending");
1630 
1631 	/* cap to maximum size */
1632 	if (datalen > MAX_PKTSIZE - MSGSIZE_NOTIFICATION_MIN) {
1633 		log_peer_warnx(&p->conf,
1634 		    "oversized notification, data trunkated");
1635 		datalen = MAX_PKTSIZE - MSGSIZE_NOTIFICATION_MIN;
1636 	}
1637 
1638 	if ((buf = session_newmsg(NOTIFICATION,
1639 	    MSGSIZE_NOTIFICATION_MIN + datalen)) == NULL) {
1640 		bgp_fsm(p, EVNT_CON_FATAL);
1641 		return;
1642 	}
1643 
1644 	errs += ibuf_add(buf->buf, &errcode, sizeof(errcode));
1645 	errs += ibuf_add(buf->buf, &subcode, sizeof(subcode));
1646 
1647 	if (datalen > 0)
1648 		errs += ibuf_add(buf->buf, data, datalen);
1649 
1650 	if (errs) {
1651 		ibuf_free(buf->buf);
1652 		free(buf);
1653 		bgp_fsm(p, EVNT_CON_FATAL);
1654 		return;
1655 	}
1656 
1657 	if (session_sendmsg(buf, p) == -1) {
1658 		bgp_fsm(p, EVNT_CON_FATAL);
1659 		return;
1660 	}
1661 
1662 	p->stats.msg_sent_notification++;
1663 	p->stats.last_sent_errcode = errcode;
1664 	p->stats.last_sent_suberr = subcode;
1665 }
1666 
1667 int
1668 session_neighbor_rrefresh(struct peer *p)
1669 {
1670 	u_int8_t	i;
1671 
1672 	if (!(p->capa.neg.refresh || p->capa.neg.enhanced_rr))
1673 		return (-1);
1674 
1675 	for (i = 0; i < AID_MAX; i++) {
1676 		if (p->capa.neg.mp[i] != 0)
1677 			session_rrefresh(p, i, ROUTE_REFRESH_REQUEST);
1678 	}
1679 
1680 	return (0);
1681 }
1682 
1683 void
1684 session_rrefresh(struct peer *p, u_int8_t aid, u_int8_t subtype)
1685 {
1686 	struct bgp_msg		*buf;
1687 	int			 errs = 0;
1688 	u_int16_t		 afi;
1689 	u_int8_t		 safi;
1690 
1691 	switch (subtype) {
1692 	case ROUTE_REFRESH_REQUEST:
1693 		p->stats.refresh_sent_req++;
1694 		break;
1695 	case ROUTE_REFRESH_BEGIN_RR:
1696 	case ROUTE_REFRESH_END_RR:
1697 		/* requires enhanced route refresh */
1698 		if (!p->capa.neg.enhanced_rr)
1699 			return;
1700 		if (subtype == ROUTE_REFRESH_BEGIN_RR)
1701 			p->stats.refresh_sent_borr++;
1702 		else
1703 			p->stats.refresh_sent_eorr++;
1704 		break;
1705 	default:
1706 		fatalx("session_rrefresh: bad subtype %d", subtype);
1707 	}
1708 
1709 	if (aid2afi(aid, &afi, &safi) == -1)
1710 		fatalx("session_rrefresh: bad afi/safi pair");
1711 
1712 	if ((buf = session_newmsg(RREFRESH, MSGSIZE_RREFRESH)) == NULL) {
1713 		bgp_fsm(p, EVNT_CON_FATAL);
1714 		return;
1715 	}
1716 
1717 	afi = htons(afi);
1718 	errs += ibuf_add(buf->buf, &afi, sizeof(afi));
1719 	errs += ibuf_add(buf->buf, &subtype, sizeof(subtype));
1720 	errs += ibuf_add(buf->buf, &safi, sizeof(safi));
1721 
1722 	if (errs) {
1723 		ibuf_free(buf->buf);
1724 		free(buf);
1725 		bgp_fsm(p, EVNT_CON_FATAL);
1726 		return;
1727 	}
1728 
1729 	if (session_sendmsg(buf, p) == -1) {
1730 		bgp_fsm(p, EVNT_CON_FATAL);
1731 		return;
1732 	}
1733 
1734 	p->stats.msg_sent_rrefresh++;
1735 }
1736 
1737 int
1738 session_graceful_restart(struct peer *p)
1739 {
1740 	u_int8_t	i;
1741 
1742 	timer_set(&p->timers, Timer_RestartTimeout,
1743 	    p->capa.neg.grestart.timeout);
1744 
1745 	for (i = 0; i < AID_MAX; i++) {
1746 		if (p->capa.neg.grestart.flags[i] & CAPA_GR_PRESENT) {
1747 			if (imsg_rde(IMSG_SESSION_STALE, p->conf.id,
1748 			    &i, sizeof(i)) == -1)
1749 				return (-1);
1750 			log_peer_warnx(&p->conf,
1751 			    "graceful restart of %s, keeping routes",
1752 			    aid2str(i));
1753 			p->capa.neg.grestart.flags[i] |= CAPA_GR_RESTARTING;
1754 		} else if (p->capa.neg.mp[i]) {
1755 			if (imsg_rde(IMSG_SESSION_FLUSH, p->conf.id,
1756 			    &i, sizeof(i)) == -1)
1757 				return (-1);
1758 			log_peer_warnx(&p->conf,
1759 			    "graceful restart of %s, flushing routes",
1760 			    aid2str(i));
1761 		}
1762 	}
1763 	return (0);
1764 }
1765 
1766 int
1767 session_graceful_stop(struct peer *p)
1768 {
1769 	u_int8_t	i;
1770 
1771 	for (i = 0; i < AID_MAX; i++) {
1772 		/*
1773 		 * Only flush if the peer is restarting and the timeout fired.
1774 		 * In all other cases the session was already flushed when the
1775 		 * session went down or when the new open message was parsed.
1776 		 */
1777 		if (p->capa.neg.grestart.flags[i] & CAPA_GR_RESTARTING) {
1778 			log_peer_warnx(&p->conf, "graceful restart of %s, "
1779 			    "time-out, flushing", aid2str(i));
1780 			if (imsg_rde(IMSG_SESSION_FLUSH, p->conf.id,
1781 			    &i, sizeof(i)) == -1)
1782 				return (-1);
1783 		}
1784 		p->capa.neg.grestart.flags[i] &= ~CAPA_GR_RESTARTING;
1785 	}
1786 	return (0);
1787 }
1788 
1789 int
1790 session_dispatch_msg(struct pollfd *pfd, struct peer *p)
1791 {
1792 	ssize_t		n;
1793 	socklen_t	len;
1794 	int		error;
1795 
1796 	if (p->state == STATE_CONNECT) {
1797 		if (pfd->revents & POLLOUT) {
1798 			if (pfd->revents & POLLIN) {
1799 				/* error occurred */
1800 				len = sizeof(error);
1801 				if (getsockopt(pfd->fd, SOL_SOCKET, SO_ERROR,
1802 				    &error, &len) == -1 || error) {
1803 					if (error)
1804 						errno = error;
1805 					if (errno != p->lasterr) {
1806 						log_peer_warn(&p->conf,
1807 						    "socket error");
1808 						p->lasterr = errno;
1809 					}
1810 					bgp_fsm(p, EVNT_CON_OPENFAIL);
1811 					return (1);
1812 				}
1813 			}
1814 			bgp_fsm(p, EVNT_CON_OPEN);
1815 			return (1);
1816 		}
1817 		if (pfd->revents & POLLHUP) {
1818 			bgp_fsm(p, EVNT_CON_OPENFAIL);
1819 			return (1);
1820 		}
1821 		if (pfd->revents & (POLLERR|POLLNVAL)) {
1822 			bgp_fsm(p, EVNT_CON_FATAL);
1823 			return (1);
1824 		}
1825 		return (0);
1826 	}
1827 
1828 	if (pfd->revents & POLLHUP) {
1829 		bgp_fsm(p, EVNT_CON_CLOSED);
1830 		return (1);
1831 	}
1832 	if (pfd->revents & (POLLERR|POLLNVAL)) {
1833 		bgp_fsm(p, EVNT_CON_FATAL);
1834 		return (1);
1835 	}
1836 
1837 	if (pfd->revents & POLLOUT && p->wbuf.queued) {
1838 		if ((error = msgbuf_write(&p->wbuf)) <= 0 && errno != EAGAIN) {
1839 			if (error == 0)
1840 				log_peer_warnx(&p->conf, "Connection closed");
1841 			else if (error == -1)
1842 				log_peer_warn(&p->conf, "write error");
1843 			bgp_fsm(p, EVNT_CON_FATAL);
1844 			return (1);
1845 		}
1846 		p->stats.last_write = getmonotime();
1847 		if (p->holdtime > 0)
1848 			timer_set(&p->timers, Timer_SendHold,
1849 			    p->holdtime < INTERVAL_HOLD ? INTERVAL_HOLD :
1850 			    p->holdtime);
1851 		if (p->throttled && p->wbuf.queued < SESS_MSG_LOW_MARK) {
1852 			if (imsg_rde(IMSG_XON, p->conf.id, NULL, 0) == -1)
1853 				log_peer_warn(&p->conf, "imsg_compose XON");
1854 			else
1855 				p->throttled = 0;
1856 		}
1857 		if (!(pfd->revents & POLLIN))
1858 			return (1);
1859 	}
1860 
1861 	if (p->rbuf && pfd->revents & POLLIN) {
1862 		if ((n = read(p->fd, p->rbuf->buf + p->rbuf->wpos,
1863 		    sizeof(p->rbuf->buf) - p->rbuf->wpos)) == -1) {
1864 			if (errno != EINTR && errno != EAGAIN) {
1865 				log_peer_warn(&p->conf, "read error");
1866 				bgp_fsm(p, EVNT_CON_FATAL);
1867 			}
1868 			return (1);
1869 		}
1870 		if (n == 0) {	/* connection closed */
1871 			bgp_fsm(p, EVNT_CON_CLOSED);
1872 			return (1);
1873 		}
1874 
1875 		p->rbuf->wpos += n;
1876 		p->stats.last_read = getmonotime();
1877 		return (1);
1878 	}
1879 	return (0);
1880 }
1881 
1882 void
1883 session_process_msg(struct peer *p)
1884 {
1885 	struct mrt	*mrt;
1886 	ssize_t		rpos, av, left;
1887 	int		processed = 0;
1888 	u_int16_t	msglen;
1889 	u_int8_t	msgtype;
1890 
1891 	rpos = 0;
1892 	av = p->rbuf->wpos;
1893 	p->rpending = 0;
1894 
1895 	/*
1896 	 * session might drop to IDLE -> buffers deallocated
1897 	 * we MUST check rbuf != NULL before use
1898 	 */
1899 	for (;;) {
1900 		if (p->rbuf == NULL)
1901 			return;
1902 		if (rpos + MSGSIZE_HEADER > av)
1903 			break;
1904 		if (parse_header(p, p->rbuf->buf + rpos, &msglen,
1905 		    &msgtype) == -1)
1906 			return;
1907 		if (rpos + msglen > av)
1908 			break;
1909 		p->rbuf->rptr = p->rbuf->buf + rpos;
1910 
1911 		/* dump to MRT as soon as we have a full packet */
1912 		LIST_FOREACH(mrt, &mrthead, entry) {
1913 			if (!(mrt->type == MRT_ALL_IN || (msgtype == UPDATE &&
1914 			    mrt->type == MRT_UPDATE_IN)))
1915 				continue;
1916 			if ((mrt->peer_id == 0 && mrt->group_id == 0) ||
1917 			    mrt->peer_id == p->conf.id || (mrt->group_id != 0 &&
1918 			    mrt->group_id == p->conf.groupid))
1919 				mrt_dump_bgp_msg(mrt, p->rbuf->rptr, msglen, p,
1920 				    msgtype);
1921 		}
1922 
1923 		switch (msgtype) {
1924 		case OPEN:
1925 			bgp_fsm(p, EVNT_RCVD_OPEN);
1926 			p->stats.msg_rcvd_open++;
1927 			break;
1928 		case UPDATE:
1929 			bgp_fsm(p, EVNT_RCVD_UPDATE);
1930 			p->stats.msg_rcvd_update++;
1931 			break;
1932 		case NOTIFICATION:
1933 			bgp_fsm(p, EVNT_RCVD_NOTIFICATION);
1934 			p->stats.msg_rcvd_notification++;
1935 			break;
1936 		case KEEPALIVE:
1937 			bgp_fsm(p, EVNT_RCVD_KEEPALIVE);
1938 			p->stats.msg_rcvd_keepalive++;
1939 			break;
1940 		case RREFRESH:
1941 			parse_rrefresh(p);
1942 			p->stats.msg_rcvd_rrefresh++;
1943 			break;
1944 		default:	/* cannot happen */
1945 			session_notification(p, ERR_HEADER, ERR_HDR_TYPE,
1946 			    &msgtype, 1);
1947 			log_warnx("received message with unknown type %u",
1948 			    msgtype);
1949 			bgp_fsm(p, EVNT_CON_FATAL);
1950 		}
1951 		rpos += msglen;
1952 		if (++processed > MSG_PROCESS_LIMIT) {
1953 			p->rpending = 1;
1954 			break;
1955 		}
1956 	}
1957 
1958 	if (rpos < av) {
1959 		left = av - rpos;
1960 		memmove(&p->rbuf->buf, p->rbuf->buf + rpos, left);
1961 		p->rbuf->wpos = left;
1962 	} else
1963 		p->rbuf->wpos = 0;
1964 }
1965 
1966 int
1967 parse_header(struct peer *peer, u_char *data, u_int16_t *len, u_int8_t *type)
1968 {
1969 	u_char			*p;
1970 	u_int16_t		 olen;
1971 	static const u_int8_t	 marker[MSGSIZE_HEADER_MARKER] = { 0xff, 0xff,
1972 				    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1973 				    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
1974 
1975 	/* caller MUST make sure we are getting 19 bytes! */
1976 	p = data;
1977 	if (memcmp(p, marker, sizeof(marker))) {
1978 		log_peer_warnx(&peer->conf, "sync error");
1979 		session_notification(peer, ERR_HEADER, ERR_HDR_SYNC, NULL, 0);
1980 		bgp_fsm(peer, EVNT_CON_FATAL);
1981 		return (-1);
1982 	}
1983 	p += MSGSIZE_HEADER_MARKER;
1984 
1985 	memcpy(&olen, p, 2);
1986 	*len = ntohs(olen);
1987 	p += 2;
1988 	memcpy(type, p, 1);
1989 
1990 	if (*len < MSGSIZE_HEADER || *len > MAX_PKTSIZE) {
1991 		log_peer_warnx(&peer->conf,
1992 		    "received message: illegal length: %u byte", *len);
1993 		session_notification(peer, ERR_HEADER, ERR_HDR_LEN,
1994 		    &olen, sizeof(olen));
1995 		bgp_fsm(peer, EVNT_CON_FATAL);
1996 		return (-1);
1997 	}
1998 
1999 	switch (*type) {
2000 	case OPEN:
2001 		if (*len < MSGSIZE_OPEN_MIN) {
2002 			log_peer_warnx(&peer->conf,
2003 			    "received OPEN: illegal len: %u byte", *len);
2004 			session_notification(peer, ERR_HEADER, ERR_HDR_LEN,
2005 			    &olen, sizeof(olen));
2006 			bgp_fsm(peer, EVNT_CON_FATAL);
2007 			return (-1);
2008 		}
2009 		break;
2010 	case NOTIFICATION:
2011 		if (*len < MSGSIZE_NOTIFICATION_MIN) {
2012 			log_peer_warnx(&peer->conf,
2013 			    "received NOTIFICATION: illegal len: %u byte",
2014 			    *len);
2015 			session_notification(peer, ERR_HEADER, ERR_HDR_LEN,
2016 			    &olen, sizeof(olen));
2017 			bgp_fsm(peer, EVNT_CON_FATAL);
2018 			return (-1);
2019 		}
2020 		break;
2021 	case UPDATE:
2022 		if (*len < MSGSIZE_UPDATE_MIN) {
2023 			log_peer_warnx(&peer->conf,
2024 			    "received UPDATE: illegal len: %u byte", *len);
2025 			session_notification(peer, ERR_HEADER, ERR_HDR_LEN,
2026 			    &olen, sizeof(olen));
2027 			bgp_fsm(peer, EVNT_CON_FATAL);
2028 			return (-1);
2029 		}
2030 		break;
2031 	case KEEPALIVE:
2032 		if (*len != MSGSIZE_KEEPALIVE) {
2033 			log_peer_warnx(&peer->conf,
2034 			    "received KEEPALIVE: illegal len: %u byte", *len);
2035 			session_notification(peer, ERR_HEADER, ERR_HDR_LEN,
2036 			    &olen, sizeof(olen));
2037 			bgp_fsm(peer, EVNT_CON_FATAL);
2038 			return (-1);
2039 		}
2040 		break;
2041 	case RREFRESH:
2042 		if (*len < MSGSIZE_RREFRESH_MIN) {
2043 			log_peer_warnx(&peer->conf,
2044 			    "received RREFRESH: illegal len: %u byte", *len);
2045 			session_notification(peer, ERR_HEADER, ERR_HDR_LEN,
2046 			    &olen, sizeof(olen));
2047 			bgp_fsm(peer, EVNT_CON_FATAL);
2048 			return (-1);
2049 		}
2050 		break;
2051 	default:
2052 		log_peer_warnx(&peer->conf,
2053 		    "received msg with unknown type %u", *type);
2054 		session_notification(peer, ERR_HEADER, ERR_HDR_TYPE,
2055 		    type, 1);
2056 		bgp_fsm(peer, EVNT_CON_FATAL);
2057 		return (-1);
2058 	}
2059 	return (0);
2060 }
2061 
2062 int
2063 parse_open(struct peer *peer)
2064 {
2065 	u_char		*p, *op_val;
2066 	u_int8_t	 version, rversion;
2067 	u_int16_t	 short_as, msglen;
2068 	u_int16_t	 holdtime, oholdtime, myholdtime;
2069 	u_int32_t	 as, bgpid;
2070 	u_int16_t	 optparamlen, extlen, plen, op_len;
2071 	u_int8_t	 op_type;
2072 
2073 	p = peer->rbuf->rptr;
2074 	p += MSGSIZE_HEADER_MARKER;
2075 	memcpy(&msglen, p, sizeof(msglen));
2076 	msglen = ntohs(msglen);
2077 
2078 	p = peer->rbuf->rptr;
2079 	p += MSGSIZE_HEADER;	/* header is already checked */
2080 
2081 	memcpy(&version, p, sizeof(version));
2082 	p += sizeof(version);
2083 
2084 	if (version != BGP_VERSION) {
2085 		log_peer_warnx(&peer->conf,
2086 		    "peer wants unrecognized version %u", version);
2087 		if (version > BGP_VERSION)
2088 			rversion = version - BGP_VERSION;
2089 		else
2090 			rversion = BGP_VERSION;
2091 		session_notification(peer, ERR_OPEN, ERR_OPEN_VERSION,
2092 		    &rversion, sizeof(rversion));
2093 		change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
2094 		return (-1);
2095 	}
2096 
2097 	memcpy(&short_as, p, sizeof(short_as));
2098 	p += sizeof(short_as);
2099 	as = peer->short_as = ntohs(short_as);
2100 	if (as == 0) {
2101 		log_peer_warnx(&peer->conf,
2102 		    "peer requests unacceptable AS %u", as);
2103 		session_notification(peer, ERR_OPEN, ERR_OPEN_AS,
2104 		    NULL, 0);
2105 		change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
2106 		return (-1);
2107 	}
2108 
2109 	memcpy(&oholdtime, p, sizeof(oholdtime));
2110 	p += sizeof(oholdtime);
2111 
2112 	holdtime = ntohs(oholdtime);
2113 	if (holdtime && holdtime < peer->conf.min_holdtime) {
2114 		log_peer_warnx(&peer->conf,
2115 		    "peer requests unacceptable holdtime %u", holdtime);
2116 		session_notification(peer, ERR_OPEN, ERR_OPEN_HOLDTIME,
2117 		    NULL, 0);
2118 		change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
2119 		return (-1);
2120 	}
2121 
2122 	myholdtime = peer->conf.holdtime;
2123 	if (!myholdtime)
2124 		myholdtime = conf->holdtime;
2125 	if (holdtime < myholdtime)
2126 		peer->holdtime = holdtime;
2127 	else
2128 		peer->holdtime = myholdtime;
2129 
2130 	memcpy(&bgpid, p, sizeof(bgpid));
2131 	p += sizeof(bgpid);
2132 
2133 	/* check bgpid for validity - just disallow 0 */
2134 	if (ntohl(bgpid) == 0) {
2135 		log_peer_warnx(&peer->conf, "peer BGPID %u unacceptable",
2136 		    ntohl(bgpid));
2137 		session_notification(peer, ERR_OPEN, ERR_OPEN_BGPID,
2138 		    NULL, 0);
2139 		change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
2140 		return (-1);
2141 	}
2142 	peer->remote_bgpid = bgpid;
2143 
2144 	extlen = 0;
2145 	optparamlen = *p++;
2146 
2147 	if (optparamlen == 0) {
2148 		if (msglen != MSGSIZE_OPEN_MIN) {
2149 bad_len:
2150 			log_peer_warnx(&peer->conf,
2151 			    "corrupt OPEN message received: length mismatch");
2152 			session_notification(peer, ERR_OPEN, 0, NULL, 0);
2153 			change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
2154 			return (-1);
2155 		}
2156 	} else {
2157 		if (msglen < MSGSIZE_OPEN_MIN + 1)
2158 			goto bad_len;
2159 
2160 		op_type = *p;
2161 		if (op_type == OPT_PARAM_EXT_LEN) {
2162 			p++;
2163 			memcpy(&optparamlen, p, sizeof(optparamlen));
2164 			optparamlen = ntohs(optparamlen);
2165 			p += sizeof(optparamlen);
2166 			extlen = 1;
2167 		}
2168 
2169 		/* RFC9020 encoding has 3 extra bytes */
2170 		if (optparamlen + 3 * extlen != msglen - MSGSIZE_OPEN_MIN)
2171 			goto bad_len;
2172 	}
2173 
2174 	plen = optparamlen;
2175 	while (plen > 0) {
2176 		if (plen < 2 + extlen)
2177 			goto bad_len;
2178 
2179 		memcpy(&op_type, p, sizeof(op_type));
2180 		p += sizeof(op_type);
2181 		plen -= sizeof(op_type);
2182 		if (!extlen) {
2183 			op_len = *p++;
2184 			plen--;
2185 		} else {
2186 			memcpy(&op_len, p, sizeof(op_len));
2187 			op_len = ntohs(op_len);
2188 			p += sizeof(op_len);
2189 			plen -= sizeof(op_len);
2190 		}
2191 		if (op_len > 0) {
2192 			if (plen < op_len)
2193 				goto bad_len;
2194 			op_val = p;
2195 			p += op_len;
2196 			plen -= op_len;
2197 		} else
2198 			op_val = NULL;
2199 
2200 		switch (op_type) {
2201 		case OPT_PARAM_CAPABILITIES:		/* RFC 3392 */
2202 			if (parse_capabilities(peer, op_val, op_len,
2203 			    &as) == -1) {
2204 				session_notification(peer, ERR_OPEN, 0,
2205 				    NULL, 0);
2206 				change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
2207 				return (-1);
2208 			}
2209 			break;
2210 		case OPT_PARAM_AUTH:			/* deprecated */
2211 		default:
2212 			/*
2213 			 * unsupported type
2214 			 * the RFCs tell us to leave the data section empty
2215 			 * and notify the peer with ERR_OPEN, ERR_OPEN_OPT.
2216 			 * How the peer should know _which_ optional parameter
2217 			 * we don't support is beyond me.
2218 			 */
2219 			log_peer_warnx(&peer->conf,
2220 			    "received OPEN message with unsupported optional "
2221 			    "parameter: type %u", op_type);
2222 			session_notification(peer, ERR_OPEN, ERR_OPEN_OPT,
2223 				NULL, 0);
2224 			change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
2225 			/* no punish */
2226 			timer_set(&peer->timers, Timer_IdleHold, 0);
2227 			peer->IdleHoldTime /= 2;
2228 			return (-1);
2229 		}
2230 	}
2231 
2232 	/* if remote-as is zero and it's a cloned neighbor, accept any */
2233 	if (peer->template && !peer->conf.remote_as && as != AS_TRANS) {
2234 		peer->conf.remote_as = as;
2235 		peer->conf.ebgp = (peer->conf.remote_as != peer->conf.local_as);
2236 		if (!peer->conf.ebgp)
2237 			/* force enforce_as off for iBGP sessions */
2238 			peer->conf.enforce_as = ENFORCE_AS_OFF;
2239 	}
2240 
2241 	if (peer->conf.remote_as != as) {
2242 		log_peer_warnx(&peer->conf, "peer sent wrong AS %s",
2243 		    log_as(as));
2244 		session_notification(peer, ERR_OPEN, ERR_OPEN_AS, NULL, 0);
2245 		change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
2246 		return (-1);
2247 	}
2248 
2249 	/* on iBGP sessions check for bgpid collision */
2250 	if (!peer->conf.ebgp && peer->remote_bgpid == conf->bgpid) {
2251 		log_peer_warnx(&peer->conf, "peer BGPID %u conflicts with ours",
2252 		    ntohl(bgpid));
2253 		session_notification(peer, ERR_OPEN, ERR_OPEN_BGPID,
2254 		    NULL, 0);
2255 		change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
2256 		return (-1);
2257 	}
2258 
2259 	if (capa_neg_calc(peer) == -1) {
2260 		log_peer_warnx(&peer->conf,
2261 		    "capability negotiation calculation failed");
2262 		session_notification(peer, ERR_OPEN, 0, NULL, 0);
2263 		change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
2264 		return (-1);
2265 	}
2266 
2267 	return (0);
2268 }
2269 
2270 int
2271 parse_update(struct peer *peer)
2272 {
2273 	u_char		*p;
2274 	u_int16_t	 datalen;
2275 
2276 	/*
2277 	 * we pass the message verbatim to the rde.
2278 	 * in case of errors the whole session is reset with a
2279 	 * notification anyway, we only need to know the peer
2280 	 */
2281 	p = peer->rbuf->rptr;
2282 	p += MSGSIZE_HEADER_MARKER;
2283 	memcpy(&datalen, p, sizeof(datalen));
2284 	datalen = ntohs(datalen);
2285 
2286 	p = peer->rbuf->rptr;
2287 	p += MSGSIZE_HEADER;	/* header is already checked */
2288 	datalen -= MSGSIZE_HEADER;
2289 
2290 	if (imsg_rde(IMSG_UPDATE, peer->conf.id, p, datalen) == -1)
2291 		return (-1);
2292 
2293 	return (0);
2294 }
2295 
2296 int
2297 parse_rrefresh(struct peer *peer)
2298 {
2299 	struct route_refresh rr;
2300 	u_int16_t afi, datalen;
2301 	u_int8_t aid, safi, subtype;
2302 	u_char *p;
2303 
2304 	p = peer->rbuf->rptr;
2305 	p += MSGSIZE_HEADER_MARKER;
2306 	memcpy(&datalen, p, sizeof(datalen));
2307 	datalen = ntohs(datalen);
2308 
2309 	p = peer->rbuf->rptr;
2310 	p += MSGSIZE_HEADER;	/* header is already checked */
2311 
2312 	/*
2313 	 * We could check if we actually announced the capability but
2314 	 * as long as the message is correctly encoded we don't care.
2315 	 */
2316 
2317 	/* afi, 2 byte */
2318 	memcpy(&afi, p, sizeof(afi));
2319 	afi = ntohs(afi);
2320 	p += 2;
2321 	/* subtype, 1 byte */
2322 	subtype = *p;
2323 	p += 1;
2324 	/* safi, 1 byte */
2325 	safi = *p;
2326 
2327 	/* check subtype if peer announced enhanced route refresh */
2328 	if (peer->capa.neg.enhanced_rr) {
2329 		switch (subtype) {
2330 		case ROUTE_REFRESH_REQUEST:
2331 			/* no ORF support, so no oversized RREFRESH msgs */
2332 			if (datalen != MSGSIZE_RREFRESH) {
2333 				log_peer_warnx(&peer->conf,
2334 				    "received RREFRESH: illegal len: %u byte",
2335 				    datalen);
2336 				datalen = htons(datalen);
2337 				session_notification(peer, ERR_HEADER,
2338 				    ERR_HDR_LEN, &datalen, sizeof(datalen));
2339 				bgp_fsm(peer, EVNT_CON_FATAL);
2340 				return (-1);
2341 			}
2342 			peer->stats.refresh_rcvd_req++;
2343 			break;
2344 		case ROUTE_REFRESH_BEGIN_RR:
2345 		case ROUTE_REFRESH_END_RR:
2346 			/* special handling for RFC7313 */
2347 			if (datalen != MSGSIZE_RREFRESH) {
2348 				log_peer_warnx(&peer->conf,
2349 				    "received RREFRESH: illegal len: %u byte",
2350 				    datalen);
2351 				p = peer->rbuf->rptr;
2352 				p += MSGSIZE_HEADER;
2353 				datalen -= MSGSIZE_HEADER;
2354 				session_notification(peer, ERR_RREFRESH,
2355 				    ERR_RR_INV_LEN, p, datalen);
2356 				bgp_fsm(peer, EVNT_CON_FATAL);
2357 				return (-1);
2358 			}
2359 			if (subtype == ROUTE_REFRESH_BEGIN_RR)
2360 				peer->stats.refresh_rcvd_borr++;
2361 			else
2362 				peer->stats.refresh_rcvd_eorr++;
2363 			break;
2364 		default:
2365 			log_peer_warnx(&peer->conf, "peer sent bad refresh, "
2366 			    "bad subtype %d", subtype);
2367 			return (0);
2368 		}
2369 	} else {
2370 		/* force subtype to default */
2371 		subtype = ROUTE_REFRESH_REQUEST;
2372 		peer->stats.refresh_rcvd_req++;
2373 	}
2374 
2375 	/* afi/safi unchecked -	unrecognized values will be ignored anyway */
2376 	if (afi2aid(afi, safi, &aid) == -1) {
2377 		log_peer_warnx(&peer->conf, "peer sent bad refresh, "
2378 		    "invalid afi/safi pair");
2379 		return (0);
2380 	}
2381 
2382 	if (!peer->capa.neg.refresh && !peer->capa.neg.enhanced_rr) {
2383 		log_peer_warnx(&peer->conf, "peer sent unexpected refresh");
2384 		return (0);
2385 	}
2386 
2387 	rr.aid = aid;
2388 	rr.subtype = subtype;
2389 
2390 	if (imsg_rde(IMSG_REFRESH, peer->conf.id, &rr, sizeof(rr)) == -1)
2391 		return (-1);
2392 
2393 	return (0);
2394 }
2395 
2396 int
2397 parse_notification(struct peer *peer)
2398 {
2399 	u_char		*p;
2400 	u_int16_t	 datalen;
2401 	u_int8_t	 errcode;
2402 	u_int8_t	 subcode;
2403 	u_int8_t	 capa_code;
2404 	u_int8_t	 capa_len;
2405 	size_t		 reason_len;
2406 	u_int8_t	 i;
2407 
2408 	/* just log */
2409 	p = peer->rbuf->rptr;
2410 	p += MSGSIZE_HEADER_MARKER;
2411 	memcpy(&datalen, p, sizeof(datalen));
2412 	datalen = ntohs(datalen);
2413 
2414 	p = peer->rbuf->rptr;
2415 	p += MSGSIZE_HEADER;	/* header is already checked */
2416 	datalen -= MSGSIZE_HEADER;
2417 
2418 	memcpy(&errcode, p, sizeof(errcode));
2419 	p += sizeof(errcode);
2420 	datalen -= sizeof(errcode);
2421 
2422 	memcpy(&subcode, p, sizeof(subcode));
2423 	p += sizeof(subcode);
2424 	datalen -= sizeof(subcode);
2425 
2426 	log_notification(peer, errcode, subcode, p, datalen, "received");
2427 	peer->errcnt++;
2428 	peer->stats.last_rcvd_errcode = errcode;
2429 	peer->stats.last_rcvd_suberr = subcode;
2430 
2431 	if (errcode == ERR_OPEN && subcode == ERR_OPEN_CAPA) {
2432 		if (datalen == 0) {	/* zebra likes to send those.. humbug */
2433 			log_peer_warnx(&peer->conf, "received \"unsupported "
2434 			    "capability\" notification without data part, "
2435 			    "disabling capability announcements altogether");
2436 			session_capa_ann_none(peer);
2437 		}
2438 
2439 		while (datalen > 0) {
2440 			if (datalen < 2) {
2441 				log_peer_warnx(&peer->conf,
2442 				    "parse_notification: "
2443 				    "expect len >= 2, len is %u", datalen);
2444 				return (-1);
2445 			}
2446 			memcpy(&capa_code, p, sizeof(capa_code));
2447 			p += sizeof(capa_code);
2448 			datalen -= sizeof(capa_code);
2449 			memcpy(&capa_len, p, sizeof(capa_len));
2450 			p += sizeof(capa_len);
2451 			datalen -= sizeof(capa_len);
2452 			if (datalen < capa_len) {
2453 				log_peer_warnx(&peer->conf,
2454 				    "parse_notification: capa_len %u exceeds "
2455 				    "remaining msg length %u", capa_len,
2456 				    datalen);
2457 				return (-1);
2458 			}
2459 			p += capa_len;
2460 			datalen -= capa_len;
2461 			switch (capa_code) {
2462 			case CAPA_MP:
2463 				for (i = 0; i < AID_MAX; i++)
2464 					peer->capa.ann.mp[i] = 0;
2465 				log_peer_warnx(&peer->conf,
2466 				    "disabling multiprotocol capability");
2467 				break;
2468 			case CAPA_REFRESH:
2469 				peer->capa.ann.refresh = 0;
2470 				log_peer_warnx(&peer->conf,
2471 				    "disabling route refresh capability");
2472 				break;
2473 			case CAPA_RESTART:
2474 				peer->capa.ann.grestart.restart = 0;
2475 				log_peer_warnx(&peer->conf,
2476 				    "disabling restart capability");
2477 				break;
2478 			case CAPA_AS4BYTE:
2479 				peer->capa.ann.as4byte = 0;
2480 				log_peer_warnx(&peer->conf,
2481 				    "disabling 4-byte AS num capability");
2482 				break;
2483 			case CAPA_ADD_PATH:
2484 				memset(peer->capa.ann.add_path, 0,
2485 				    sizeof(peer->capa.ann.add_path));
2486 				log_peer_warnx(&peer->conf,
2487 				    "disabling ADD-PATH capability");
2488 				break;
2489 			case CAPA_ENHANCED_RR:
2490 				peer->capa.ann.enhanced_rr = 0;
2491 				log_peer_warnx(&peer->conf,
2492 				    "disabling enhanced route refresh "
2493 				    "capability");
2494 				break;
2495 			default:	/* should not happen... */
2496 				log_peer_warnx(&peer->conf, "received "
2497 				    "\"unsupported capability\" notification "
2498 				    "for unknown capability %u, disabling "
2499 				    "capability announcements altogether",
2500 				    capa_code);
2501 				session_capa_ann_none(peer);
2502 				break;
2503 			}
2504 		}
2505 
2506 		return (1);
2507 	}
2508 
2509 	if (errcode == ERR_OPEN && subcode == ERR_OPEN_OPT) {
2510 		session_capa_ann_none(peer);
2511 		return (1);
2512 	}
2513 
2514 	if (errcode == ERR_CEASE &&
2515 	    (subcode == ERR_CEASE_ADMIN_DOWN ||
2516 	     subcode == ERR_CEASE_ADMIN_RESET)) {
2517 		if (datalen > 1) {
2518 			reason_len = *p++;
2519 			datalen--;
2520 			if (datalen < reason_len) {
2521 			    log_peer_warnx(&peer->conf,
2522 				"received truncated shutdown reason");
2523 			    return (0);
2524 			}
2525 			if (reason_len > REASON_LEN - 1) {
2526 			    log_peer_warnx(&peer->conf,
2527 				"received overly long shutdown reason");
2528 			    return (0);
2529 			}
2530 			memcpy(peer->stats.last_reason, p, reason_len);
2531 			peer->stats.last_reason[reason_len] = '\0';
2532 			log_peer_warnx(&peer->conf,
2533 			    "received shutdown reason: \"%s\"",
2534 			    log_reason(peer->stats.last_reason));
2535 			p += reason_len;
2536 			datalen -= reason_len;
2537 		}
2538 	}
2539 
2540 	return (0);
2541 }
2542 
2543 int
2544 parse_capabilities(struct peer *peer, u_char *d, u_int16_t dlen, u_int32_t *as)
2545 {
2546 	u_char		*capa_val;
2547 	u_int32_t	 remote_as;
2548 	u_int16_t	 len;
2549 	u_int16_t	 afi;
2550 	u_int16_t	 gr_header;
2551 	u_int8_t	 safi;
2552 	u_int8_t	 aid;
2553 	u_int8_t	 flags;
2554 	u_int8_t	 capa_code;
2555 	u_int8_t	 capa_len;
2556 	u_int8_t	 i;
2557 
2558 	len = dlen;
2559 	while (len > 0) {
2560 		if (len < 2) {
2561 			log_peer_warnx(&peer->conf, "Bad capabilities attr "
2562 			    "length: %u, too short", len);
2563 			return (-1);
2564 		}
2565 		memcpy(&capa_code, d, sizeof(capa_code));
2566 		d += sizeof(capa_code);
2567 		len -= sizeof(capa_code);
2568 		memcpy(&capa_len, d, sizeof(capa_len));
2569 		d += sizeof(capa_len);
2570 		len -= sizeof(capa_len);
2571 		if (capa_len > 0) {
2572 			if (len < capa_len) {
2573 				log_peer_warnx(&peer->conf,
2574 				    "Bad capabilities attr length: "
2575 				    "len %u smaller than capa_len %u",
2576 				    len, capa_len);
2577 				return (-1);
2578 			}
2579 			capa_val = d;
2580 			d += capa_len;
2581 			len -= capa_len;
2582 		} else
2583 			capa_val = NULL;
2584 
2585 		switch (capa_code) {
2586 		case CAPA_MP:			/* RFC 4760 */
2587 			if (capa_len != 4) {
2588 				log_peer_warnx(&peer->conf,
2589 				    "Bad multi protocol capability length: "
2590 				    "%u", capa_len);
2591 				break;
2592 			}
2593 			memcpy(&afi, capa_val, sizeof(afi));
2594 			afi = ntohs(afi);
2595 			memcpy(&safi, capa_val + 3, sizeof(safi));
2596 			if (afi2aid(afi, safi, &aid) == -1) {
2597 				log_peer_warnx(&peer->conf,
2598 				    "Received multi protocol capability: "
2599 				    " unknown AFI %u, safi %u pair",
2600 				    afi, safi);
2601 				break;
2602 			}
2603 			peer->capa.peer.mp[aid] = 1;
2604 			break;
2605 		case CAPA_REFRESH:
2606 			peer->capa.peer.refresh = 1;
2607 			break;
2608 		case CAPA_RESTART:
2609 			if (capa_len == 2) {
2610 				/* peer only supports EoR marker */
2611 				peer->capa.peer.grestart.restart = 1;
2612 				peer->capa.peer.grestart.timeout = 0;
2613 				break;
2614 			} else if (capa_len % 4 != 2) {
2615 				log_peer_warnx(&peer->conf,
2616 				    "Bad graceful restart capability length: "
2617 				    "%u", capa_len);
2618 				peer->capa.peer.grestart.restart = 0;
2619 				peer->capa.peer.grestart.timeout = 0;
2620 				break;
2621 			}
2622 
2623 			memcpy(&gr_header, capa_val, sizeof(gr_header));
2624 			gr_header = ntohs(gr_header);
2625 			peer->capa.peer.grestart.timeout =
2626 			    gr_header & CAPA_GR_TIMEMASK;
2627 			if (peer->capa.peer.grestart.timeout == 0) {
2628 				log_peer_warnx(&peer->conf, "Received "
2629 				    "graceful restart timeout is zero");
2630 				peer->capa.peer.grestart.restart = 0;
2631 				break;
2632 			}
2633 
2634 			for (i = 2; i <= capa_len - 4; i += 4) {
2635 				memcpy(&afi, capa_val + i, sizeof(afi));
2636 				afi = ntohs(afi);
2637 				safi = capa_val[i + 2];
2638 				flags = capa_val[i + 3];
2639 				if (afi2aid(afi, safi, &aid) == -1) {
2640 					log_peer_warnx(&peer->conf,
2641 					    "Received graceful restart capa: "
2642 					    " unknown AFI %u, safi %u pair",
2643 					    afi, safi);
2644 					continue;
2645 				}
2646 				peer->capa.peer.grestart.flags[aid] |=
2647 				    CAPA_GR_PRESENT;
2648 				if (flags & CAPA_GR_F_FLAG)
2649 					peer->capa.peer.grestart.flags[aid] |=
2650 					    CAPA_GR_FORWARD;
2651 				if (gr_header & CAPA_GR_R_FLAG)
2652 					peer->capa.peer.grestart.flags[aid] |=
2653 					    CAPA_GR_RESTART;
2654 				peer->capa.peer.grestart.restart = 2;
2655 			}
2656 			break;
2657 		case CAPA_AS4BYTE:
2658 			if (capa_len != 4) {
2659 				log_peer_warnx(&peer->conf,
2660 				    "Bad AS4BYTE capability length: "
2661 				    "%u", capa_len);
2662 				peer->capa.peer.as4byte = 0;
2663 				break;
2664 			}
2665 			memcpy(&remote_as, capa_val, sizeof(remote_as));
2666 			*as = ntohl(remote_as);
2667 			if (*as == 0) {
2668 				log_peer_warnx(&peer->conf,
2669 				    "peer requests unacceptable AS %u", *as);
2670 				session_notification(peer, ERR_OPEN,
2671 				    ERR_OPEN_AS, NULL, 0);
2672 				change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
2673 				return (-1);
2674 			}
2675 			peer->capa.peer.as4byte = 1;
2676 			break;
2677 		case CAPA_ADD_PATH:
2678 			if (capa_len % 4 != 0) {
2679 				log_peer_warnx(&peer->conf,
2680 				    "Bad ADD-PATH capability length: "
2681 				    "%u", capa_len);
2682 				memset(peer->capa.peer.add_path, 0,
2683 				    sizeof(peer->capa.peer.add_path));
2684 				break;
2685 			}
2686 			for (i = 0; i <= capa_len - 4; i += 4) {
2687 				memcpy(&afi, capa_val + i, sizeof(afi));
2688 				afi = ntohs(afi);
2689 				safi = capa_val[i + 2];
2690 				flags = capa_val[i + 3];
2691 				if (afi2aid(afi, safi, &aid) == -1) {
2692 					log_peer_warnx(&peer->conf,
2693 					    "Received ADD-PATH capa: "
2694 					    " unknown AFI %u, safi %u pair",
2695 					    afi, safi);
2696 					memset(peer->capa.peer.add_path, 0,
2697 					    sizeof(peer->capa.peer.add_path));
2698 					break;
2699 				}
2700 				if (flags & ~CAPA_AP_BIDIR) {
2701 					log_peer_warnx(&peer->conf,
2702 					    "Received ADD-PATH capa: "
2703 					    " bad flags %x", flags);
2704 					memset(peer->capa.peer.add_path, 0,
2705 					    sizeof(peer->capa.peer.add_path));
2706 					break;
2707 				}
2708 				peer->capa.peer.add_path[aid] = flags;
2709 			}
2710 			break;
2711 		case CAPA_ENHANCED_RR:
2712 			peer->capa.peer.enhanced_rr = 1;
2713 			break;
2714 		default:
2715 			break;
2716 		}
2717 	}
2718 
2719 	return (0);
2720 }
2721 
2722 int
2723 capa_neg_calc(struct peer *p)
2724 {
2725 	u_int8_t	i, hasmp = 0;
2726 
2727 	/* a capability is accepted only if both sides announced it */
2728 
2729 	p->capa.neg.refresh =
2730 	    (p->capa.ann.refresh && p->capa.peer.refresh) != 0;
2731 	p->capa.neg.enhanced_rr =
2732 	    (p->capa.ann.enhanced_rr && p->capa.peer.enhanced_rr) != 0;
2733 
2734 	p->capa.neg.as4byte =
2735 	    (p->capa.ann.as4byte && p->capa.peer.as4byte) != 0;
2736 
2737 	/* MP: both side must agree on the AFI,SAFI pair */
2738 	for (i = 0; i < AID_MAX; i++) {
2739 		if (p->capa.ann.mp[i] && p->capa.peer.mp[i])
2740 			p->capa.neg.mp[i] = 1;
2741 		else
2742 			p->capa.neg.mp[i] = 0;
2743 		if (p->capa.ann.mp[i])
2744 			hasmp = 1;
2745 	}
2746 	/* if no MP capability present default to IPv4 unicast mode */
2747 	if (!hasmp)
2748 		p->capa.neg.mp[AID_INET] = 1;
2749 
2750 	/*
2751 	 * graceful restart: the peer capabilities are of interest here.
2752 	 * It is necessary to compare the new values with the previous ones
2753 	 * and act acordingly. AFI/SAFI that are not part in the MP capability
2754 	 * are treated as not being present.
2755 	 * Also make sure that a flush happens if the session stopped
2756 	 * supporting graceful restart.
2757 	 */
2758 
2759 	for (i = 0; i < AID_MAX; i++) {
2760 		int8_t	negflags;
2761 
2762 		/* disable GR if the AFI/SAFI is not present */
2763 		if ((p->capa.peer.grestart.flags[i] & CAPA_GR_PRESENT &&
2764 		    p->capa.neg.mp[i] == 0))
2765 			p->capa.peer.grestart.flags[i] = 0;	/* disable */
2766 		/* look at current GR state and decide what to do */
2767 		negflags = p->capa.neg.grestart.flags[i];
2768 		p->capa.neg.grestart.flags[i] = p->capa.peer.grestart.flags[i];
2769 		if (negflags & CAPA_GR_RESTARTING) {
2770 			if (p->capa.ann.grestart.restart != 0 &&
2771 			    p->capa.peer.grestart.flags[i] & CAPA_GR_FORWARD) {
2772 				p->capa.neg.grestart.flags[i] |=
2773 				    CAPA_GR_RESTARTING;
2774 			} else {
2775 				if (imsg_rde(IMSG_SESSION_FLUSH, p->conf.id,
2776 				    &i, sizeof(i)) == -1)
2777 					return (-1);
2778 				log_peer_warnx(&p->conf, "graceful restart of "
2779 				    "%s, not restarted, flushing", aid2str(i));
2780 			}
2781 		}
2782 	}
2783 	p->capa.neg.grestart.timeout = p->capa.peer.grestart.timeout;
2784 	p->capa.neg.grestart.restart = p->capa.peer.grestart.restart;
2785 	if (p->capa.ann.grestart.restart == 0)
2786 		p->capa.neg.grestart.restart = 0;
2787 
2788 
2789 	/*
2790 	 * ADD-PATH: set only those bits where both sides agree.
2791 	 * For this compare our send bit with the recv bit from the peer
2792 	 * and vice versa.
2793 	 * The flags are stored from this systems view point.
2794 	 */
2795 	memset(p->capa.neg.add_path, 0, sizeof(p->capa.neg.add_path));
2796 	if (p->capa.ann.add_path[0]) {
2797 		for (i = AID_MIN; i < AID_MAX; i++) {
2798 			if ((p->capa.ann.add_path[i] & CAPA_AP_RECV) &&
2799 			    (p->capa.peer.add_path[i] & CAPA_AP_SEND)) {
2800 				p->capa.neg.add_path[i] |= CAPA_AP_RECV;
2801 				p->capa.neg.add_path[0] |= CAPA_AP_RECV;
2802 			}
2803 			if ((p->capa.ann.add_path[i] & CAPA_AP_SEND) &&
2804 			    (p->capa.peer.add_path[i] & CAPA_AP_RECV)) {
2805 				p->capa.neg.add_path[i] |= CAPA_AP_SEND;
2806 				p->capa.neg.add_path[0] |= CAPA_AP_SEND;
2807 			}
2808 		}
2809 	}
2810 
2811 	return (0);
2812 }
2813 
2814 void
2815 session_dispatch_imsg(struct imsgbuf *ibuf, int idx, u_int *listener_cnt)
2816 {
2817 	struct imsg		 imsg;
2818 	struct mrt		 xmrt;
2819 	struct route_refresh	 rr;
2820 	struct mrt		*mrt;
2821 	struct imsgbuf		*i;
2822 	struct peer		*p;
2823 	struct listen_addr	*la, *nla;
2824 	struct kif		*kif;
2825 	u_char			*data;
2826 	int			 n, fd, depend_ok, restricted;
2827 	u_int16_t		 t;
2828 	u_int8_t		 aid, errcode, subcode;
2829 
2830 	while (ibuf) {
2831 		if ((n = imsg_get(ibuf, &imsg)) == -1)
2832 			fatal("session_dispatch_imsg: imsg_get error");
2833 
2834 		if (n == 0)
2835 			break;
2836 
2837 		switch (imsg.hdr.type) {
2838 		case IMSG_SOCKET_CONN:
2839 		case IMSG_SOCKET_CONN_CTL:
2840 			if (idx != PFD_PIPE_MAIN)
2841 				fatalx("reconf request not from parent");
2842 			if ((fd = imsg.fd) == -1) {
2843 				log_warnx("expected to receive imsg fd to "
2844 				    "RDE but didn't receive any");
2845 				break;
2846 			}
2847 			if ((i = malloc(sizeof(struct imsgbuf))) == NULL)
2848 				fatal(NULL);
2849 			imsg_init(i, fd);
2850 			if (imsg.hdr.type == IMSG_SOCKET_CONN) {
2851 				if (ibuf_rde) {
2852 					log_warnx("Unexpected imsg connection "
2853 					    "to RDE received");
2854 					msgbuf_clear(&ibuf_rde->w);
2855 					free(ibuf_rde);
2856 				}
2857 				ibuf_rde = i;
2858 			} else {
2859 				if (ibuf_rde_ctl) {
2860 					log_warnx("Unexpected imsg ctl "
2861 					    "connection to RDE received");
2862 					msgbuf_clear(&ibuf_rde_ctl->w);
2863 					free(ibuf_rde_ctl);
2864 				}
2865 				ibuf_rde_ctl = i;
2866 			}
2867 			break;
2868 		case IMSG_RECONF_CONF:
2869 			if (idx != PFD_PIPE_MAIN)
2870 				fatalx("reconf request not from parent");
2871 			nconf = new_config();
2872 
2873 			copy_config(nconf, imsg.data);
2874 			pending_reconf = 1;
2875 			break;
2876 		case IMSG_RECONF_PEER:
2877 			if (idx != PFD_PIPE_MAIN)
2878 				fatalx("reconf request not from parent");
2879 			if ((p = calloc(1, sizeof(struct peer))) == NULL)
2880 				fatal("new_peer");
2881 			memcpy(&p->conf, imsg.data, sizeof(struct peer_config));
2882 			p->state = p->prev_state = STATE_NONE;
2883 			p->reconf_action = RECONF_REINIT;
2884 			if (RB_INSERT(peer_head, &nconf->peers, p) != NULL)
2885 				fatalx("%s: peer tree is corrupt", __func__);
2886 			break;
2887 		case IMSG_RECONF_LISTENER:
2888 			if (idx != PFD_PIPE_MAIN)
2889 				fatalx("reconf request not from parent");
2890 			if (nconf == NULL)
2891 				fatalx("IMSG_RECONF_LISTENER but no config");
2892 			nla = imsg.data;
2893 			TAILQ_FOREACH(la, conf->listen_addrs, entry)
2894 				if (!la_cmp(la, nla))
2895 					break;
2896 
2897 			if (la == NULL) {
2898 				if (nla->reconf != RECONF_REINIT)
2899 					fatalx("king bula sez: "
2900 					    "expected REINIT");
2901 
2902 				if ((nla->fd = imsg.fd) == -1)
2903 					log_warnx("expected to receive fd for "
2904 					    "%s but didn't receive any",
2905 					    log_sockaddr((struct sockaddr *)
2906 					    &nla->sa, nla->sa_len));
2907 
2908 				la = calloc(1, sizeof(struct listen_addr));
2909 				if (la == NULL)
2910 					fatal(NULL);
2911 				memcpy(&la->sa, &nla->sa, sizeof(la->sa));
2912 				la->flags = nla->flags;
2913 				la->fd = nla->fd;
2914 				la->reconf = RECONF_REINIT;
2915 				TAILQ_INSERT_TAIL(nconf->listen_addrs, la,
2916 				    entry);
2917 			} else {
2918 				if (nla->reconf != RECONF_KEEP)
2919 					fatalx("king bula sez: expected KEEP");
2920 				la->reconf = RECONF_KEEP;
2921 			}
2922 
2923 			break;
2924 		case IMSG_RECONF_CTRL:
2925 			if (idx != PFD_PIPE_MAIN)
2926 				fatalx("reconf request not from parent");
2927 			if (imsg.hdr.len != IMSG_HEADER_SIZE +
2928 			    sizeof(restricted))
2929 				fatalx("IFINFO imsg with wrong len");
2930 			memcpy(&restricted, imsg.data, sizeof(restricted));
2931 			if (imsg.fd == -1) {
2932 				log_warnx("expected to receive fd for control "
2933 				    "socket but didn't receive any");
2934 				break;
2935 			}
2936 			if (restricted) {
2937 				control_shutdown(rcsock);
2938 				rcsock = imsg.fd;
2939 			} else {
2940 				control_shutdown(csock);
2941 				csock = imsg.fd;
2942 			}
2943 			break;
2944 		case IMSG_RECONF_DRAIN:
2945 			switch (idx) {
2946 			case PFD_PIPE_ROUTE:
2947 				if (nconf != NULL)
2948 					fatalx("got unexpected %s from RDE",
2949 					    "IMSG_RECONF_DONE");
2950 				imsg_compose(ibuf_main, IMSG_RECONF_DONE, 0, 0,
2951 				    -1, NULL, 0);
2952 				break;
2953 			case PFD_PIPE_MAIN:
2954 				if (nconf == NULL)
2955 					fatalx("got unexpected %s from parent",
2956 					    "IMSG_RECONF_DONE");
2957 				imsg_compose(ibuf_main, IMSG_RECONF_DRAIN, 0, 0,
2958 				    -1, NULL, 0);
2959 				break;
2960 			default:
2961 				fatalx("reconf request not from parent or RDE");
2962 			}
2963 			break;
2964 		case IMSG_RECONF_DONE:
2965 			if (idx != PFD_PIPE_MAIN)
2966 				fatalx("reconf request not from parent");
2967 			if (nconf == NULL)
2968 				fatalx("got IMSG_RECONF_DONE but no config");
2969 			copy_config(conf, nconf);
2970 			merge_peers(conf, nconf);
2971 
2972 			/* delete old listeners */
2973 			for (la = TAILQ_FIRST(conf->listen_addrs); la != NULL;
2974 			    la = nla) {
2975 				nla = TAILQ_NEXT(la, entry);
2976 				if (la->reconf == RECONF_NONE) {
2977 					log_info("not listening on %s any more",
2978 					    log_sockaddr((struct sockaddr *)
2979 					    &la->sa, la->sa_len));
2980 					TAILQ_REMOVE(conf->listen_addrs, la,
2981 					    entry);
2982 					close(la->fd);
2983 					free(la);
2984 				}
2985 			}
2986 
2987 			/* add new listeners */
2988 			TAILQ_CONCAT(conf->listen_addrs, nconf->listen_addrs,
2989 			    entry);
2990 
2991 			setup_listeners(listener_cnt);
2992 			free_config(nconf);
2993 			nconf = NULL;
2994 			pending_reconf = 0;
2995 			log_info("SE reconfigured");
2996 			/*
2997 			 * IMSG_RECONF_DONE is sent when the RDE drained
2998 			 * the peer config sent in merge_peers().
2999 			 */
3000 			break;
3001 		case IMSG_IFINFO:
3002 			if (idx != PFD_PIPE_MAIN)
3003 				fatalx("IFINFO message not from parent");
3004 			if (imsg.hdr.len != IMSG_HEADER_SIZE +
3005 			    sizeof(struct kif))
3006 				fatalx("IFINFO imsg with wrong len");
3007 			kif = imsg.data;
3008 			depend_ok = kif->depend_state;
3009 
3010 			RB_FOREACH(p, peer_head, &conf->peers)
3011 				if (!strcmp(p->conf.if_depend, kif->ifname)) {
3012 					if (depend_ok && !p->depend_ok) {
3013 						p->depend_ok = depend_ok;
3014 						bgp_fsm(p, EVNT_START);
3015 					} else if (!depend_ok && p->depend_ok) {
3016 						p->depend_ok = depend_ok;
3017 						session_stop(p,
3018 						    ERR_CEASE_OTHER_CHANGE);
3019 					}
3020 				}
3021 			break;
3022 		case IMSG_MRT_OPEN:
3023 		case IMSG_MRT_REOPEN:
3024 			if (imsg.hdr.len > IMSG_HEADER_SIZE +
3025 			    sizeof(struct mrt)) {
3026 				log_warnx("wrong imsg len");
3027 				break;
3028 			}
3029 
3030 			memcpy(&xmrt, imsg.data, sizeof(struct mrt));
3031 			if ((xmrt.wbuf.fd = imsg.fd) == -1)
3032 				log_warnx("expected to receive fd for mrt dump "
3033 				    "but didn't receive any");
3034 
3035 			mrt = mrt_get(&mrthead, &xmrt);
3036 			if (mrt == NULL) {
3037 				/* new dump */
3038 				mrt = calloc(1, sizeof(struct mrt));
3039 				if (mrt == NULL)
3040 					fatal("session_dispatch_imsg");
3041 				memcpy(mrt, &xmrt, sizeof(struct mrt));
3042 				TAILQ_INIT(&mrt->wbuf.bufs);
3043 				LIST_INSERT_HEAD(&mrthead, mrt, entry);
3044 			} else {
3045 				/* old dump reopened */
3046 				close(mrt->wbuf.fd);
3047 				mrt->wbuf.fd = xmrt.wbuf.fd;
3048 			}
3049 			break;
3050 		case IMSG_MRT_CLOSE:
3051 			if (imsg.hdr.len > IMSG_HEADER_SIZE +
3052 			    sizeof(struct mrt)) {
3053 				log_warnx("wrong imsg len");
3054 				break;
3055 			}
3056 
3057 			memcpy(&xmrt, imsg.data, sizeof(struct mrt));
3058 			mrt = mrt_get(&mrthead, &xmrt);
3059 			if (mrt != NULL)
3060 				mrt_done(mrt);
3061 			break;
3062 		case IMSG_CTL_KROUTE:
3063 		case IMSG_CTL_KROUTE_ADDR:
3064 		case IMSG_CTL_SHOW_NEXTHOP:
3065 		case IMSG_CTL_SHOW_INTERFACE:
3066 		case IMSG_CTL_SHOW_FIB_TABLES:
3067 		case IMSG_CTL_SHOW_RTR:
3068 		case IMSG_CTL_SHOW_TIMER:
3069 			if (idx != PFD_PIPE_MAIN)
3070 				fatalx("ctl kroute request not from parent");
3071 			control_imsg_relay(&imsg);
3072 			break;
3073 		case IMSG_CTL_SHOW_RIB:
3074 		case IMSG_CTL_SHOW_RIB_PREFIX:
3075 		case IMSG_CTL_SHOW_RIB_COMMUNITIES:
3076 		case IMSG_CTL_SHOW_RIB_ATTR:
3077 		case IMSG_CTL_SHOW_RIB_MEM:
3078 		case IMSG_CTL_SHOW_RIB_HASH:
3079 		case IMSG_CTL_SHOW_NETWORK:
3080 		case IMSG_CTL_SHOW_NEIGHBOR:
3081 		case IMSG_CTL_SHOW_SET:
3082 			if (idx != PFD_PIPE_ROUTE_CTL)
3083 				fatalx("ctl rib request not from RDE");
3084 			control_imsg_relay(&imsg);
3085 			break;
3086 		case IMSG_CTL_END:
3087 		case IMSG_CTL_RESULT:
3088 			control_imsg_relay(&imsg);
3089 			break;
3090 		case IMSG_UPDATE:
3091 			if (idx != PFD_PIPE_ROUTE)
3092 				fatalx("update request not from RDE");
3093 			if (imsg.hdr.len > IMSG_HEADER_SIZE +
3094 			    MAX_PKTSIZE - MSGSIZE_HEADER ||
3095 			    imsg.hdr.len < IMSG_HEADER_SIZE +
3096 			    MSGSIZE_UPDATE_MIN - MSGSIZE_HEADER)
3097 				log_warnx("RDE sent invalid update");
3098 			else
3099 				session_update(imsg.hdr.peerid, imsg.data,
3100 				    imsg.hdr.len - IMSG_HEADER_SIZE);
3101 			break;
3102 		case IMSG_UPDATE_ERR:
3103 			if (idx != PFD_PIPE_ROUTE)
3104 				fatalx("update request not from RDE");
3105 			if (imsg.hdr.len < IMSG_HEADER_SIZE + 2) {
3106 				log_warnx("RDE sent invalid notification");
3107 				break;
3108 			}
3109 			if ((p = getpeerbyid(conf, imsg.hdr.peerid)) == NULL) {
3110 				log_warnx("no such peer: id=%u",
3111 				    imsg.hdr.peerid);
3112 				break;
3113 			}
3114 			data = imsg.data;
3115 			errcode = *data++;
3116 			subcode = *data++;
3117 
3118 			if (imsg.hdr.len == IMSG_HEADER_SIZE + 2)
3119 				data = NULL;
3120 
3121 			session_notification(p, errcode, subcode,
3122 			    data, imsg.hdr.len - IMSG_HEADER_SIZE - 2);
3123 			switch (errcode) {
3124 			case ERR_CEASE:
3125 				switch (subcode) {
3126 				case ERR_CEASE_MAX_PREFIX:
3127 				case ERR_CEASE_MAX_SENT_PREFIX:
3128 					t = p->conf.max_out_prefix_restart;
3129 					if (subcode == ERR_CEASE_MAX_PREFIX)
3130 						t = p->conf.max_prefix_restart;
3131 
3132 					bgp_fsm(p, EVNT_STOP);
3133 					if (t)
3134 						timer_set(&p->timers,
3135 						    Timer_IdleHold, 60 * t);
3136 					break;
3137 				default:
3138 					bgp_fsm(p, EVNT_CON_FATAL);
3139 					break;
3140 				}
3141 				break;
3142 			default:
3143 				bgp_fsm(p, EVNT_CON_FATAL);
3144 				break;
3145 			}
3146 			break;
3147 		case IMSG_REFRESH:
3148 			if (idx != PFD_PIPE_ROUTE)
3149 				fatalx("route refresh request not from RDE");
3150 			if (imsg.hdr.len < IMSG_HEADER_SIZE + sizeof(rr)) {
3151 				log_warnx("RDE sent invalid refresh msg");
3152 				break;
3153 			}
3154 			if ((p = getpeerbyid(conf, imsg.hdr.peerid)) == NULL) {
3155 				log_warnx("no such peer: id=%u",
3156 				    imsg.hdr.peerid);
3157 				break;
3158 			}
3159 			memcpy(&rr, imsg.data, sizeof(rr));
3160 			if (rr.aid >= AID_MAX)
3161 				fatalx("IMSG_REFRESH: bad AID");
3162 			session_rrefresh(p, rr.aid, rr.subtype);
3163 			break;
3164 		case IMSG_SESSION_RESTARTED:
3165 			if (idx != PFD_PIPE_ROUTE)
3166 				fatalx("update request not from RDE");
3167 			if (imsg.hdr.len < IMSG_HEADER_SIZE + sizeof(aid)) {
3168 				log_warnx("RDE sent invalid restart msg");
3169 				break;
3170 			}
3171 			if ((p = getpeerbyid(conf, imsg.hdr.peerid)) == NULL) {
3172 				log_warnx("no such peer: id=%u",
3173 				    imsg.hdr.peerid);
3174 				break;
3175 			}
3176 			memcpy(&aid, imsg.data, sizeof(aid));
3177 			if (aid >= AID_MAX)
3178 				fatalx("IMSG_SESSION_RESTARTED: bad AID");
3179 			if (p->capa.neg.grestart.flags[aid] &
3180 			    CAPA_GR_RESTARTING) {
3181 				log_peer_warnx(&p->conf,
3182 				    "graceful restart of %s finished",
3183 				    aid2str(aid));
3184 				p->capa.neg.grestart.flags[aid] &=
3185 				    ~CAPA_GR_RESTARTING;
3186 				timer_stop(&p->timers, Timer_RestartTimeout);
3187 
3188 				/* signal back to RDE to cleanup stale routes */
3189 				if (imsg_rde(IMSG_SESSION_RESTARTED,
3190 				    imsg.hdr.peerid, &aid, sizeof(aid)) == -1)
3191 					fatal("imsg_compose: "
3192 					    "IMSG_SESSION_RESTARTED");
3193 			}
3194 			break;
3195 		case IMSG_SESSION_DOWN:
3196 			if (idx != PFD_PIPE_ROUTE)
3197 				fatalx("update request not from RDE");
3198 			if ((p = getpeerbyid(conf, imsg.hdr.peerid)) == NULL) {
3199 				log_warnx("no such peer: id=%u",
3200 				    imsg.hdr.peerid);
3201 				break;
3202 			}
3203 			session_stop(p, ERR_CEASE_ADMIN_DOWN);
3204 			break;
3205 		default:
3206 			break;
3207 		}
3208 		imsg_free(&imsg);
3209 	}
3210 }
3211 
3212 int
3213 la_cmp(struct listen_addr *a, struct listen_addr *b)
3214 {
3215 	struct sockaddr_in	*in_a, *in_b;
3216 	struct sockaddr_in6	*in6_a, *in6_b;
3217 
3218 	if (a->sa.ss_family != b->sa.ss_family)
3219 		return (1);
3220 
3221 	switch (a->sa.ss_family) {
3222 	case AF_INET:
3223 		in_a = (struct sockaddr_in *)&a->sa;
3224 		in_b = (struct sockaddr_in *)&b->sa;
3225 		if (in_a->sin_addr.s_addr != in_b->sin_addr.s_addr)
3226 			return (1);
3227 		if (in_a->sin_port != in_b->sin_port)
3228 			return (1);
3229 		break;
3230 	case AF_INET6:
3231 		in6_a = (struct sockaddr_in6 *)&a->sa;
3232 		in6_b = (struct sockaddr_in6 *)&b->sa;
3233 		if (bcmp(&in6_a->sin6_addr, &in6_b->sin6_addr,
3234 		    sizeof(struct in6_addr)))
3235 			return (1);
3236 		if (in6_a->sin6_port != in6_b->sin6_port)
3237 			return (1);
3238 		break;
3239 	default:
3240 		fatal("king bula sez: unknown address family");
3241 		/* NOTREACHED */
3242 	}
3243 
3244 	return (0);
3245 }
3246 
3247 struct peer *
3248 getpeerbydesc(struct bgpd_config *c, const char *descr)
3249 {
3250 	struct peer	*p, *res = NULL;
3251 	int		 match = 0;
3252 
3253 	RB_FOREACH(p, peer_head, &c->peers)
3254 		if (!strcmp(p->conf.descr, descr)) {
3255 			res = p;
3256 			match++;
3257 		}
3258 
3259 	if (match > 1)
3260 		log_info("neighbor description \"%s\" not unique, request "
3261 		    "aborted", descr);
3262 
3263 	if (match == 1)
3264 		return (res);
3265 	else
3266 		return (NULL);
3267 }
3268 
3269 struct peer *
3270 getpeerbyip(struct bgpd_config *c, struct sockaddr *ip)
3271 {
3272 	struct bgpd_addr addr;
3273 	struct peer	*p, *newpeer, *loose = NULL;
3274 	u_int32_t	 id;
3275 
3276 	sa2addr(ip, &addr, NULL);
3277 
3278 	/* we might want a more effective way to find peers by IP */
3279 	RB_FOREACH(p, peer_head, &c->peers)
3280 		if (!p->conf.template &&
3281 		    !memcmp(&addr, &p->conf.remote_addr, sizeof(addr)))
3282 			return (p);
3283 
3284 	/* try template matching */
3285 	RB_FOREACH(p, peer_head, &c->peers)
3286 		if (p->conf.template &&
3287 		    p->conf.remote_addr.aid == addr.aid &&
3288 		    session_match_mask(p, &addr))
3289 			if (loose == NULL || loose->conf.remote_masklen <
3290 			    p->conf.remote_masklen)
3291 				loose = p;
3292 
3293 	if (loose != NULL) {
3294 		/* clone */
3295 		if ((newpeer = malloc(sizeof(struct peer))) == NULL)
3296 			fatal(NULL);
3297 		memcpy(newpeer, loose, sizeof(struct peer));
3298 		for (id = PEER_ID_DYN_MAX; id > PEER_ID_STATIC_MAX; id--) {
3299 			if (getpeerbyid(c, id) == NULL)	/* we found a free id */
3300 				break;
3301 		}
3302 		newpeer->template = loose;
3303 		session_template_clone(newpeer, ip, id, 0);
3304 		newpeer->state = newpeer->prev_state = STATE_NONE;
3305 		newpeer->reconf_action = RECONF_KEEP;
3306 		newpeer->rbuf = NULL;
3307 		newpeer->rpending = 0;
3308 		init_peer(newpeer);
3309 		bgp_fsm(newpeer, EVNT_START);
3310 		if (RB_INSERT(peer_head, &c->peers, newpeer) != NULL)
3311 			fatalx("%s: peer tree is corrupt", __func__);
3312 		return (newpeer);
3313 	}
3314 
3315 	return (NULL);
3316 }
3317 
3318 struct peer *
3319 getpeerbyid(struct bgpd_config *c, u_int32_t peerid)
3320 {
3321 	static struct peer lookup;
3322 
3323 	lookup.conf.id = peerid;
3324 
3325 	return RB_FIND(peer_head, &c->peers, &lookup);
3326 }
3327 
3328 int
3329 peer_matched(struct peer *p, struct ctl_neighbor *n)
3330 {
3331 	char *s;
3332 
3333 	if (n && n->addr.aid) {
3334 		if (memcmp(&p->conf.remote_addr, &n->addr,
3335 		    sizeof(p->conf.remote_addr)))
3336 			return 0;
3337 	} else if (n && n->descr[0]) {
3338 		s = n->is_group ? p->conf.group : p->conf.descr;
3339 		if (strcmp(s, n->descr))
3340 			return 0;
3341 	}
3342 	return 1;
3343 }
3344 
3345 void
3346 session_template_clone(struct peer *p, struct sockaddr *ip, u_int32_t id,
3347     u_int32_t as)
3348 {
3349 	struct bgpd_addr	remote_addr;
3350 
3351 	if (ip)
3352 		sa2addr(ip, &remote_addr, NULL);
3353 	else
3354 		memcpy(&remote_addr, &p->conf.remote_addr, sizeof(remote_addr));
3355 
3356 	memcpy(&p->conf, &p->template->conf, sizeof(struct peer_config));
3357 
3358 	p->conf.id = id;
3359 
3360 	if (as) {
3361 		p->conf.remote_as = as;
3362 		p->conf.ebgp = (p->conf.remote_as != p->conf.local_as);
3363 		if (!p->conf.ebgp)
3364 			/* force enforce_as off for iBGP sessions */
3365 			p->conf.enforce_as = ENFORCE_AS_OFF;
3366 	}
3367 
3368 	memcpy(&p->conf.remote_addr, &remote_addr, sizeof(remote_addr));
3369 	switch (p->conf.remote_addr.aid) {
3370 	case AID_INET:
3371 		p->conf.remote_masklen = 32;
3372 		break;
3373 	case AID_INET6:
3374 		p->conf.remote_masklen = 128;
3375 		break;
3376 	}
3377 	p->conf.template = 0;
3378 }
3379 
3380 int
3381 session_match_mask(struct peer *p, struct bgpd_addr *a)
3382 {
3383 	struct in_addr	 v4masked;
3384 	struct in6_addr	 v6masked;
3385 
3386 	switch (p->conf.remote_addr.aid) {
3387 	case AID_INET:
3388 		inet4applymask(&v4masked, &a->v4, p->conf.remote_masklen);
3389 		if (p->conf.remote_addr.v4.s_addr == v4masked.s_addr)
3390 			return (1);
3391 		return (0);
3392 	case AID_INET6:
3393 		inet6applymask(&v6masked, &a->v6, p->conf.remote_masklen);
3394 
3395 		if (memcmp(&v6masked, &p->conf.remote_addr.v6,
3396 		    sizeof(v6masked)) == 0)
3397 			return (1);
3398 		return (0);
3399 	}
3400 	return (0);
3401 }
3402 
3403 void
3404 session_down(struct peer *peer)
3405 {
3406 	bzero(&peer->capa.neg, sizeof(peer->capa.neg));
3407 	peer->stats.last_updown = getmonotime();
3408 	/*
3409 	 * session_down is called in the exit code path so check
3410 	 * if the RDE is still around, if not there is no need to
3411 	 * send the message.
3412 	 */
3413 	if (ibuf_rde == NULL)
3414 		return;
3415 	if (imsg_rde(IMSG_SESSION_DOWN, peer->conf.id, NULL, 0) == -1)
3416 		fatalx("imsg_compose error");
3417 }
3418 
3419 void
3420 session_up(struct peer *p)
3421 {
3422 	struct session_up	 sup;
3423 
3424 	if (imsg_rde(IMSG_SESSION_ADD, p->conf.id,
3425 	    &p->conf, sizeof(p->conf)) == -1)
3426 		fatalx("imsg_compose error");
3427 
3428 	if (p->local.aid == AID_INET) {
3429 		sup.local_v4_addr = p->local;
3430 		sup.local_v6_addr = p->local_alt;
3431 	} else {
3432 		sup.local_v6_addr = p->local;
3433 		sup.local_v4_addr = p->local_alt;
3434 	}
3435 	sup.remote_addr = p->remote;
3436 
3437 	sup.remote_bgpid = p->remote_bgpid;
3438 	sup.short_as = p->short_as;
3439 	memcpy(&sup.capa, &p->capa.neg, sizeof(sup.capa));
3440 	p->stats.last_updown = getmonotime();
3441 	if (imsg_rde(IMSG_SESSION_UP, p->conf.id, &sup, sizeof(sup)) == -1)
3442 		fatalx("imsg_compose error");
3443 }
3444 
3445 int
3446 imsg_ctl_parent(int type, u_int32_t peerid, pid_t pid, void *data,
3447     u_int16_t datalen)
3448 {
3449 	return (imsg_compose(ibuf_main, type, peerid, pid, -1, data, datalen));
3450 }
3451 
3452 int
3453 imsg_ctl_rde(int type, pid_t pid, void *data, u_int16_t datalen)
3454 {
3455 	if (ibuf_rde_ctl == NULL) {
3456 		log_warnx("Can't send message %u to RDE, ctl pipe closed",
3457 		    type);
3458 		return (0);
3459 	}
3460 	/*
3461 	 * Use control socket to talk to RDE to bypass the queue of the
3462 	 * regular imsg socket.
3463 	 */
3464 	return (imsg_compose(ibuf_rde_ctl, type, 0, pid, -1, data, datalen));
3465 }
3466 
3467 int
3468 imsg_rde(int type, uint32_t peerid, void *data, u_int16_t datalen)
3469 {
3470 	if (ibuf_rde == NULL) {
3471 		log_warnx("Can't send message %u to RDE, pipe closed", type);
3472 		return (0);
3473 	}
3474 
3475 	return (imsg_compose(ibuf_rde, type, peerid, 0, -1, data, datalen));
3476 }
3477 
3478 void
3479 session_demote(struct peer *p, int level)
3480 {
3481 	struct demote_msg	msg;
3482 
3483 	strlcpy(msg.demote_group, p->conf.demote_group,
3484 	    sizeof(msg.demote_group));
3485 	msg.level = level;
3486 	if (imsg_compose(ibuf_main, IMSG_DEMOTE, p->conf.id, 0, -1,
3487 	    &msg, sizeof(msg)) == -1)
3488 		fatalx("imsg_compose error");
3489 
3490 	p->demoted += level;
3491 }
3492 
3493 void
3494 session_stop(struct peer *peer, u_int8_t subcode)
3495 {
3496 	char data[REASON_LEN];
3497 	size_t datalen;
3498 	size_t reason_len;
3499 	char *communication;
3500 
3501 	datalen = 0;
3502 	communication = peer->conf.reason;
3503 
3504 	if ((subcode == ERR_CEASE_ADMIN_DOWN ||
3505 	    subcode == ERR_CEASE_ADMIN_RESET)
3506 	    && communication && *communication) {
3507 		reason_len = strlen(communication);
3508 		if (reason_len > REASON_LEN - 1) {
3509 		    log_peer_warnx(&peer->conf,
3510 			"trying to send overly long shutdown reason");
3511 		} else {
3512 			data[0] = reason_len;
3513 			datalen = reason_len + sizeof(data[0]);
3514 			memcpy(data + 1, communication, reason_len);
3515 		}
3516 	}
3517 	switch (peer->state) {
3518 	case STATE_OPENSENT:
3519 	case STATE_OPENCONFIRM:
3520 	case STATE_ESTABLISHED:
3521 		session_notification(peer, ERR_CEASE, subcode, data, datalen);
3522 		break;
3523 	default:
3524 		/* session not open, no need to send notification */
3525 		break;
3526 	}
3527 	bgp_fsm(peer, EVNT_STOP);
3528 }
3529 
3530 void
3531 merge_peers(struct bgpd_config *c, struct bgpd_config *nc)
3532 {
3533 	struct peer *p, *np, *next;
3534 
3535 	RB_FOREACH(p, peer_head, &c->peers) {
3536 		/* templates are handled specially */
3537 		if (p->template != NULL)
3538 			continue;
3539 		np = getpeerbyid(nc, p->conf.id);
3540 		if (np == NULL) {
3541 			p->reconf_action = RECONF_DELETE;
3542 			continue;
3543 		}
3544 
3545 		/* peer no longer uses TCP MD5SIG so deconfigure */
3546 		if (p->conf.auth.method == AUTH_MD5SIG &&
3547 		    np->conf.auth.method != AUTH_MD5SIG)
3548 			tcp_md5_del_listener(c, p);
3549 		else if (np->conf.auth.method == AUTH_MD5SIG)
3550 			tcp_md5_add_listener(c, np);
3551 
3552 		memcpy(&p->conf, &np->conf, sizeof(p->conf));
3553 		RB_REMOVE(peer_head, &nc->peers, np);
3554 		free(np);
3555 
3556 		p->reconf_action = RECONF_KEEP;
3557 
3558 		/* had demotion, is demoted, demote removed? */
3559 		if (p->demoted && !p->conf.demote_group[0])
3560 			session_demote(p, -1);
3561 
3562 		/* if session is not open then refresh pfkey data */
3563 		if (p->state < STATE_OPENSENT && !p->template)
3564 			imsg_compose(ibuf_main, IMSG_PFKEY_RELOAD,
3565 			    p->conf.id, 0, -1, NULL, 0);
3566 
3567 		/* sync the RDE in case we keep the peer */
3568 		if (imsg_rde(IMSG_SESSION_ADD, p->conf.id,
3569 		    &p->conf, sizeof(struct peer_config)) == -1)
3570 			fatalx("imsg_compose error");
3571 
3572 		/* apply the config to all clones of a template */
3573 		if (p->conf.template) {
3574 			struct peer *xp;
3575 			RB_FOREACH(xp, peer_head, &c->peers) {
3576 				if (xp->template != p)
3577 					continue;
3578 				session_template_clone(xp, NULL, xp->conf.id,
3579 				    xp->conf.remote_as);
3580 				if (imsg_rde(IMSG_SESSION_ADD, xp->conf.id,
3581 				    &xp->conf, sizeof(xp->conf)) == -1)
3582 					fatalx("imsg_compose error");
3583 			}
3584 		}
3585 	}
3586 
3587 	if (imsg_rde(IMSG_RECONF_DRAIN, 0, NULL, 0) == -1)
3588 		fatalx("imsg_compose error");
3589 
3590 	/* pfkeys of new peers already loaded by the parent process */
3591 	RB_FOREACH_SAFE(np, peer_head, &nc->peers, next) {
3592 		RB_REMOVE(peer_head, &nc->peers, np);
3593 		if (RB_INSERT(peer_head, &c->peers, np) != NULL)
3594 			fatalx("%s: peer tree is corrupt", __func__);
3595 		if (np->conf.auth.method == AUTH_MD5SIG)
3596 			tcp_md5_add_listener(c, np);
3597 	}
3598 }
3599