1/*	$OpenBSD: session.c,v 1.480 2024/06/10 12:51:25 claudio Exp $ */
2
3/*
4 * Copyright (c) 2003, 2004, 2005 Henning Brauer <henning@openbsd.org>
5 * Copyright (c) 2017 Peter van Dijk <peter.van.dijk@powerdns.com>
6 *
7 * Permission to use, copy, modify, and distribute this software for any
8 * purpose with or without fee is hereby granted, provided that the above
9 * copyright notice and this permission notice appear in all copies.
10 *
11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18 */
19
20#include <sys/types.h>
21
22#include <sys/mman.h>
23#include <sys/socket.h>
24#include <sys/time.h>
25#include <sys/resource.h>
26#include <sys/un.h>
27#include <netinet/in.h>
28#include <netinet/ip.h>
29#include <netinet/tcp.h>
30#include <arpa/inet.h>
31#include <limits.h>
32
33#include <err.h>
34#include <errno.h>
35#include <fcntl.h>
36#include <ifaddrs.h>
37#include <poll.h>
38#include <pwd.h>
39#include <signal.h>
40#include <stdio.h>
41#include <stdlib.h>
42#include <string.h>
43#include <syslog.h>
44#include <unistd.h>
45
46#include "bgpd.h"
47#include "session.h"
48#include "log.h"
49
/*
 * Fixed slots at the start of the pollfd array built in session_main():
 * the three imsg pipes and the two control sockets.
 * The listener sockets are placed from PFD_LISTENERS_START on; peer, mrt
 * and control connection descriptors are appended after them.
 */
#define PFD_PIPE_MAIN		0
#define PFD_PIPE_ROUTE		1
#define PFD_PIPE_ROUTE_CTL	2
#define PFD_SOCK_CTL		3
#define PFD_SOCK_RCTL		4
#define PFD_LISTENERS_START	5
56
/*
 * Forward declarations of functions used in this file.
 * NOTE(review): only part of the definitions is visible in this part of
 * the file; the remaining ones are presumably defined further down.
 */
void	session_sighdlr(int);
int	setup_listeners(u_int *);
void	init_peer(struct peer *);
void	start_timer_holdtime(struct peer *);
void	start_timer_sendholdtime(struct peer *);
void	start_timer_keepalive(struct peer *);
void	session_close_connection(struct peer *);
void	change_state(struct peer *, enum session_state, enum session_events);
int	session_setup_socket(struct peer *);
void	session_accept(int);
int	session_connect(struct peer *);
void	session_tcp_established(struct peer *);
int	session_capa_add(struct ibuf *, uint8_t, uint8_t);
int	session_capa_add_mp(struct ibuf *, uint8_t);
int	session_capa_add_afi(struct ibuf *, uint8_t, uint8_t);
struct bgp_msg	*session_newmsg(enum msg_type, uint16_t);
int	session_sendmsg(struct bgp_msg *, struct peer *);
void	session_open(struct peer *);
void	session_keepalive(struct peer *);
void	session_update(uint32_t, struct ibuf *);
void	session_notification(struct peer *, uint8_t, uint8_t, struct ibuf *);
void	session_notification_data(struct peer *, uint8_t, uint8_t, void *,
	    size_t);
void	session_rrefresh(struct peer *, uint8_t, uint8_t);
int	session_graceful_restart(struct peer *);
int	session_graceful_stop(struct peer *);
int	session_dispatch_msg(struct pollfd *, struct peer *);
void	session_process_msg(struct peer *);
int	parse_header(struct peer *, u_char *, uint16_t *, uint8_t *);
int	parse_open(struct peer *);
int	parse_update(struct peer *);
int	parse_rrefresh(struct peer *);
void	parse_notification(struct peer *);
int	parse_capabilities(struct peer *, struct ibuf *, uint32_t *);
int	capa_neg_calc(struct peer *);
void	session_dispatch_imsg(struct imsgbuf *, int, u_int *);
void	session_up(struct peer *);
void	session_down(struct peer *);
int	imsg_rde(int, uint32_t, void *, uint16_t);
void	session_demote(struct peer *, int);
void	merge_peers(struct bgpd_config *, struct bgpd_config *);

int		 la_cmp(struct listen_addr *, struct listen_addr *);
void		 session_template_clone(struct peer *, struct sockaddr *,
		    uint32_t, uint32_t);
int		 session_match_mask(struct peer *, struct bgpd_addr *);
103
/*
 * conf is the configuration the session engine runs with.
 * NOTE(review): nconf is not used in the code visible here; presumably it
 * holds the new configuration while a reload is in progress -- confirm.
 */
static struct bgpd_config	*conf, *nconf;
/* imsg channels to the RDE, the RDE control channel and the parent */
static struct imsgbuf		*ibuf_rde;
static struct imsgbuf		*ibuf_rde_ctl;
static struct imsgbuf		*ibuf_main;

struct bgpd_sysdep	 sysdep;
/* set by session_sighdlr() on SIGINT/SIGTERM, ends the main loop */
volatile sig_atomic_t	 session_quit;
/* while non-zero the main loop neither initialises nor deletes peers */
int			 pending_reconf;
/* control sockets handed to control_accept(); -1 until assigned */
int			 csock = -1, rcsock = -1;
/* number of peers that went through init_peer() and are not yet removed */
u_int			 peer_cnt;

struct mrt_head		 mrthead;
/* time of the last accept() failure due to fd exhaustion, 0 if none */
time_t			 pauseaccept;

/* all-ones marker; presumably the BGP message header marker -- confirm */
static const uint8_t	 marker[MSGSIZE_HEADER_MARKER] = {
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
};
122
123static inline int
124peer_compare(const struct peer *a, const struct peer *b)
125{
126	return a->conf.id - b->conf.id;
127}
128
129RB_GENERATE(peer_head, peer, entry, peer_compare);
130
131void
132session_sighdlr(int sig)
133{
134	switch (sig) {
135	case SIGINT:
136	case SIGTERM:
137		session_quit = 1;
138		break;
139	}
140}
141
142int
143setup_listeners(u_int *la_cnt)
144{
145	int			 ttl = 255;
146	struct listen_addr	*la;
147	u_int			 cnt = 0;
148
149	TAILQ_FOREACH(la, conf->listen_addrs, entry) {
150		la->reconf = RECONF_NONE;
151		cnt++;
152
153		if (la->flags & LISTENER_LISTENING)
154			continue;
155
156		if (la->fd == -1) {
157			log_warn("cannot establish listener on %s: invalid fd",
158			    log_sockaddr((struct sockaddr *)&la->sa,
159			    la->sa_len));
160			continue;
161		}
162
163		if (tcp_md5_prep_listener(la, &conf->peers) == -1)
164			fatal("tcp_md5_prep_listener");
165
166		/* set ttl to 255 so that ttl-security works */
167		if (la->sa.ss_family == AF_INET && setsockopt(la->fd,
168		    IPPROTO_IP, IP_TTL, &ttl, sizeof(ttl)) == -1) {
169			log_warn("setup_listeners setsockopt TTL");
170			continue;
171		}
172		if (la->sa.ss_family == AF_INET6 && setsockopt(la->fd,
173		    IPPROTO_IPV6, IPV6_UNICAST_HOPS, &ttl, sizeof(ttl)) == -1) {
174			log_warn("setup_listeners setsockopt hoplimit");
175			continue;
176		}
177
178		if (listen(la->fd, MAX_BACKLOG)) {
179			close(la->fd);
180			fatal("listen");
181		}
182
183		la->flags |= LISTENER_LISTENING;
184
185		log_info("listening on %s",
186		    log_sockaddr((struct sockaddr *)&la->sa, la->sa_len));
187	}
188
189	*la_cnt = cnt;
190
191	return (0);
192}
193
/*
 * Main function of the session engine process.
 *
 * Sets up logging, chroots into the home directory of BGPD_USER, drops
 * privileges and pledges, then runs the poll(2) loop until session_quit
 * is set. Every iteration
 *  - initialises new peers and removes the ones marked for deletion
 *    (skipped while a reconfiguration is pending),
 *  - rebuilds the pollfd array: imsg pipes, control sockets, listeners,
 *    peers, mrt dumps with queued data and control connections,
 *  - handles at most one due timer per peer and derives the poll timeout,
 *  - polls and dispatches the resulting events.
 * After the loop all peers, mrt entries and pipes are torn down and the
 * process exits; this function does not return.
 *
 * debug and verbose are passed on to log_init() and log_setverbose().
 */
void
session_main(int debug, int verbose)
{
	int			 timeout;
	unsigned int		 i, j, idx_peers, idx_listeners, idx_mrts;
	u_int			 pfd_elms = 0, peer_l_elms = 0, mrt_l_elms = 0;
	u_int			 listener_cnt, ctl_cnt, mrt_cnt;
	u_int			 new_cnt;
	struct passwd		*pw;
	struct peer		*p, **peer_l = NULL, *next;
	struct mrt		*m, *xm, **mrt_l = NULL;
	struct pollfd		*pfd = NULL;
	struct listen_addr	*la;
	void			*newp;
	time_t			 now;
	short			 events;

	log_init(debug, LOG_DAEMON);
	log_setverbose(verbose);

	log_procinit(log_procnames[PROC_SE]);

	if ((pw = getpwnam(BGPD_USER)) == NULL)
		fatal(NULL);

	if (chroot(pw->pw_dir) == -1)
		fatal("chroot");
	if (chdir("/") == -1)
		fatal("chdir(\"/\")");

	setproctitle("session engine");

	if (setgroups(1, &pw->pw_gid) ||
	    setresgid(pw->pw_gid, pw->pw_gid, pw->pw_gid) ||
	    setresuid(pw->pw_uid, pw->pw_uid, pw->pw_uid))
		fatal("can't drop privileges");

	if (pledge("stdio inet recvfd", NULL) == -1)
		fatal("pledge");

	signal(SIGTERM, session_sighdlr);
	signal(SIGINT, session_sighdlr);
	signal(SIGPIPE, SIG_IGN);
	signal(SIGHUP, SIG_IGN);
	signal(SIGALRM, SIG_IGN);
	signal(SIGUSR1, SIG_IGN);

	/* the channel to the parent is expected on fd 3 */
	if ((ibuf_main = malloc(sizeof(struct imsgbuf))) == NULL)
		fatal(NULL);
	imsg_init(ibuf_main, 3);

	LIST_INIT(&mrthead);
	listener_cnt = 0;
	peer_cnt = 0;
	ctl_cnt = 0;

	conf = new_config();
	log_info("session engine ready");

	while (session_quit == 0) {
		/* check for peers to be initialized or deleted */
		if (!pending_reconf) {
			RB_FOREACH_SAFE(p, peer_head, &conf->peers, next) {
				/* cloned peer that idled out? */
				if (p->template && (p->state == STATE_IDLE ||
				    p->state == STATE_ACTIVE) &&
				    getmonotime() - p->stats.last_updown >=
				    INTERVAL_HOLD_CLONED)
					p->reconf_action = RECONF_DELETE;

				/* new peer that needs init? */
				if (p->state == STATE_NONE)
					init_peer(p);

				/* deletion due? */
				if (p->reconf_action == RECONF_DELETE) {
					if (p->demoted)
						session_demote(p, -1);
					p->conf.demote_group[0] = 0;
					session_stop(p, ERR_CEASE_PEER_UNCONF,
					    NULL);
					timer_remove_all(&p->timers);
					tcp_md5_del_listener(conf, p);
					RB_REMOVE(peer_head, &conf->peers, p);
					log_peer_warnx(&p->conf, "removed");
					free(p);
					peer_cnt--;
					continue;
				}
				p->reconf_action = RECONF_NONE;
			}
		}

		/* peer_l maps the peer slots of pfd back to their peer */
		if (peer_cnt > peer_l_elms) {
			if ((newp = reallocarray(peer_l, peer_cnt,
			    sizeof(struct peer *))) == NULL) {
				/* panic for now */
				log_warn("could not resize peer_l from %u -> %u"
				    " entries", peer_l_elms, peer_cnt);
				fatalx("exiting");
			}
			peer_l = newp;
			peer_l_elms = peer_cnt;
		}

		/* reap removed mrt entries, count those with queued data */
		mrt_cnt = 0;
		for (m = LIST_FIRST(&mrthead); m != NULL; m = xm) {
			xm = LIST_NEXT(m, entry);
			if (m->state == MRT_STATE_REMOVE) {
				mrt_clean(m);
				LIST_REMOVE(m, entry);
				free(m);
				continue;
			}
			if (m->wbuf.queued)
				mrt_cnt++;
		}

		/* mrt_l maps the mrt slots of pfd back to their mrt entry */
		if (mrt_cnt > mrt_l_elms) {
			if ((newp = reallocarray(mrt_l, mrt_cnt,
			    sizeof(struct mrt *))) == NULL) {
				/* panic for now */
				log_warn("could not resize mrt_l from %u -> %u"
				    " entries", mrt_l_elms, mrt_cnt);
				fatalx("exiting");
			}
			mrt_l = newp;
			mrt_l_elms = mrt_cnt;
		}

		/* the arrays only ever grow, they are never shrunk */
		new_cnt = PFD_LISTENERS_START + listener_cnt + peer_cnt +
		    ctl_cnt + mrt_cnt;
		if (new_cnt > pfd_elms) {
			if ((newp = reallocarray(pfd, new_cnt,
			    sizeof(struct pollfd))) == NULL) {
				/* panic for now */
				log_warn("could not resize pfd from %u -> %u"
				    " entries", pfd_elms, new_cnt);
				fatalx("exiting");
			}
			pfd = newp;
			pfd_elms = new_cnt;
		}

		memset(pfd, 0, sizeof(struct pollfd) * pfd_elms);

		set_pollfd(&pfd[PFD_PIPE_MAIN], ibuf_main);
		set_pollfd(&pfd[PFD_PIPE_ROUTE], ibuf_rde);
		set_pollfd(&pfd[PFD_PIPE_ROUTE_CTL], ibuf_rde_ctl);

		/* while accept is paused no accepting socket is polled */
		if (pauseaccept == 0) {
			pfd[PFD_SOCK_CTL].fd = csock;
			pfd[PFD_SOCK_CTL].events = POLLIN;
			pfd[PFD_SOCK_RCTL].fd = rcsock;
			pfd[PFD_SOCK_RCTL].events = POLLIN;
		} else {
			pfd[PFD_SOCK_CTL].fd = -1;
			pfd[PFD_SOCK_RCTL].fd = -1;
		}

		i = PFD_LISTENERS_START;
		TAILQ_FOREACH(la, conf->listen_addrs, entry) {
			if (pauseaccept == 0) {
				pfd[i].fd = la->fd;
				pfd[i].events = POLLIN;
			} else
				pfd[i].fd = -1;
			i++;
		}
		idx_listeners = i;
		timeout = 240;	/* loop every 240s at least */

		now = getmonotime();
		RB_FOREACH(p, peer_head, &conf->peers) {
			time_t	nextaction;
			struct timer *pt;

			/* check timers */
			if ((pt = timer_nextisdue(&p->timers, now)) != NULL) {
				switch (pt->type) {
				case Timer_Hold:
					bgp_fsm(p, EVNT_TIMER_HOLDTIME);
					break;
				case Timer_SendHold:
					bgp_fsm(p, EVNT_TIMER_SENDHOLD);
					break;
				case Timer_ConnectRetry:
					bgp_fsm(p, EVNT_TIMER_CONNRETRY);
					break;
				case Timer_Keepalive:
					bgp_fsm(p, EVNT_TIMER_KEEPALIVE);
					break;
				case Timer_IdleHold:
					bgp_fsm(p, EVNT_START);
					break;
				case Timer_IdleHoldReset:
					p->IdleHoldTime =
					    INTERVAL_IDLE_HOLD_INITIAL;
					p->errcnt = 0;
					timer_stop(&p->timers,
					    Timer_IdleHoldReset);
					break;
				case Timer_CarpUndemote:
					timer_stop(&p->timers,
					    Timer_CarpUndemote);
					if (p->demoted &&
					    p->state == STATE_ESTABLISHED)
						session_demote(p, -1);
					break;
				case Timer_RestartTimeout:
					timer_stop(&p->timers,
					    Timer_RestartTimeout);
					session_graceful_stop(p);
					break;
				default:
					fatalx("King Bula lost in time");
				}
			}
			/* wake up in time for the next timer of this peer */
			if ((nextaction = timer_nextduein(&p->timers,
			    now)) != -1 && nextaction < timeout)
				timeout = nextaction;

			/* are we waiting for a write? */
			events = POLLIN;
			if (p->wbuf.queued > 0 || p->state == STATE_CONNECT)
				events |= POLLOUT;
			/* is there still work to do? */
			if (p->rpending && p->rbuf && p->rbuf->wpos)
				timeout = 0;

			/* poll events */
			if (p->fd != -1 && events != 0) {
				pfd[i].fd = p->fd;
				pfd[i].events = events;
				peer_l[i - idx_listeners] = p;
				i++;
			}
		}

		idx_peers = i;

		LIST_FOREACH(m, &mrthead, entry)
			if (m->wbuf.queued) {
				pfd[i].fd = m->wbuf.fd;
				pfd[i].events = POLLOUT;
				mrt_l[i - idx_peers] = m;
				i++;
			}

		idx_mrts = i;

		/* the control connections take the remaining slots */
		i += control_fill_pfds(pfd + i, pfd_elms -i);

		if (i > pfd_elms)
			fatalx("poll pfd overflow");

		if (pauseaccept && timeout > 1)
			timeout = 1;
		if (timeout < 0)
			timeout = 0;
		if (poll(pfd, i, timeout * 1000) == -1) {
			if (errno == EINTR)
				continue;
			fatal("poll error");
		}

		/*
		 * If we previously saw fd exhaustion, we stop accept()
		 * for 1 second to throttle the accept() loop.
		 */
		if (pauseaccept && getmonotime() > pauseaccept + 1)
			pauseaccept = 0;

		/* losing the parent ends the session engine */
		if (handle_pollfd(&pfd[PFD_PIPE_MAIN], ibuf_main) == -1) {
			log_warnx("SE: Lost connection to parent");
			session_quit = 1;
			continue;
		} else
			session_dispatch_imsg(ibuf_main, PFD_PIPE_MAIN,
			    &listener_cnt);

		if (handle_pollfd(&pfd[PFD_PIPE_ROUTE], ibuf_rde) == -1) {
			log_warnx("SE: Lost connection to RDE");
			msgbuf_clear(&ibuf_rde->w);
			free(ibuf_rde);
			ibuf_rde = NULL;
		} else
			session_dispatch_imsg(ibuf_rde, PFD_PIPE_ROUTE,
			    &listener_cnt);

		if (handle_pollfd(&pfd[PFD_PIPE_ROUTE_CTL], ibuf_rde_ctl) ==
		    -1) {
			log_warnx("SE: Lost connection to RDE control");
			msgbuf_clear(&ibuf_rde_ctl->w);
			free(ibuf_rde_ctl);
			ibuf_rde_ctl = NULL;
		} else
			session_dispatch_imsg(ibuf_rde_ctl, PFD_PIPE_ROUTE_CTL,
			    &listener_cnt);

		if (pfd[PFD_SOCK_CTL].revents & POLLIN)
			ctl_cnt += control_accept(csock, 0);

		if (pfd[PFD_SOCK_RCTL].revents & POLLIN)
			ctl_cnt += control_accept(rcsock, 1);

		/*
		 * j keeps running across the following loops: it walks pfd
		 * section by section in the order the array was filled.
		 */
		for (j = PFD_LISTENERS_START; j < idx_listeners; j++)
			if (pfd[j].revents & POLLIN)
				session_accept(pfd[j].fd);

		for (; j < idx_peers; j++)
			session_dispatch_msg(&pfd[j],
			    peer_l[j - idx_listeners]);

		/* process whatever sits in the read buffers of the peers */
		RB_FOREACH(p, peer_head, &conf->peers)
			if (p->rbuf && p->rbuf->wpos)
				session_process_msg(p);

		for (; j < idx_mrts; j++)
			if (pfd[j].revents & POLLOUT)
				mrt_write(mrt_l[j - idx_peers]);

		for (; j < i; j++)
			ctl_cnt -= control_dispatch_msg(&pfd[j], &conf->peers);
	}

	/* shutdown: stop and free all peers */
	RB_FOREACH_SAFE(p, peer_head, &conf->peers, next) {
		session_stop(p, ERR_CEASE_ADMIN_DOWN, "bgpd shutting down");
		timer_remove_all(&p->timers);
		tcp_md5_del_listener(conf, p);
		RB_REMOVE(peer_head, &conf->peers, p);
		free(p);
	}

	while ((m = LIST_FIRST(&mrthead)) != NULL) {
		mrt_clean(m);
		LIST_REMOVE(m, entry);
		free(m);
	}

	free_config(conf);
	free(peer_l);
	free(mrt_l);
	free(pfd);

	/* close pipes */
	if (ibuf_rde) {
		msgbuf_write(&ibuf_rde->w);
		msgbuf_clear(&ibuf_rde->w);
		close(ibuf_rde->fd);
		free(ibuf_rde);
	}
	if (ibuf_rde_ctl) {
		msgbuf_clear(&ibuf_rde_ctl->w);
		close(ibuf_rde_ctl->fd);
		free(ibuf_rde_ctl);
	}
	msgbuf_write(&ibuf_main->w);
	msgbuf_clear(&ibuf_main->w);
	close(ibuf_main->fd);
	free(ibuf_main);

	control_shutdown(csock);
	control_shutdown(rcsock);
	log_info("session engine exiting");
	exit(0);
}
561
562void
563init_peer(struct peer *p)
564{
565	TAILQ_INIT(&p->timers);
566	p->fd = p->wbuf.fd = -1;
567
568	if (p->conf.if_depend[0])
569		imsg_compose(ibuf_main, IMSG_SESSION_DEPENDON, 0, 0, -1,
570		    p->conf.if_depend, sizeof(p->conf.if_depend));
571	else
572		p->depend_ok = 1;
573
574	peer_cnt++;
575
576	change_state(p, STATE_IDLE, EVNT_NONE);
577	if (p->conf.down)
578		timer_stop(&p->timers, Timer_IdleHold); /* no autostart */
579	else
580		timer_set(&p->timers, Timer_IdleHold, SESSION_CLEAR_DELAY);
581
582	p->stats.last_updown = getmonotime();
583
584	/*
585	 * on startup, demote if requested.
586	 * do not handle new peers. they must reach ESTABLISHED beforehand.
587	 * peers added at runtime have reconf_action set to RECONF_REINIT.
588	 */
589	if (p->reconf_action != RECONF_REINIT && p->conf.demote_group[0])
590		session_demote(p, +1);
591}
592
/*
 * BGP finite state machine: handle event for peer depending on the state
 * the peer is currently in.
 *
 * State transitions are carried out by change_state(); timers are armed
 * or stopped here as the individual transitions require.
 * Events that have no explicit case are ignored in STATE_IDLE, drop the
 * peer to STATE_IDLE in STATE_CONNECT and STATE_ACTIVE, and additionally
 * send an FSM error notification before going to STATE_IDLE in
 * STATE_OPENSENT, STATE_OPENCONFIRM and STATE_ESTABLISHED.
 */
void
bgp_fsm(struct peer *peer, enum session_events event)
{
	switch (peer->state) {
	case STATE_NONE:
		/* nothing */
		break;
	case STATE_IDLE:
		switch (event) {
		case EVNT_START:
			timer_stop(&peer->timers, Timer_Hold);
			timer_stop(&peer->timers, Timer_SendHold);
			timer_stop(&peer->timers, Timer_Keepalive);
			timer_stop(&peer->timers, Timer_IdleHold);

			/* allocate read buffer */
			peer->rbuf = calloc(1, sizeof(struct ibuf_read));
			if (peer->rbuf == NULL)
				fatal(NULL);

			/* init write buffer */
			msgbuf_init(&peer->wbuf);

			/*
			 * stay idle while the dependency is not met, wait
			 * for the remote side if passive or a template,
			 * connect actively otherwise
			 */
			if (!peer->depend_ok)
				timer_stop(&peer->timers, Timer_ConnectRetry);
			else if (peer->passive || peer->conf.passive ||
			    peer->conf.template) {
				change_state(peer, STATE_ACTIVE, event);
				timer_stop(&peer->timers, Timer_ConnectRetry);
			} else {
				change_state(peer, STATE_CONNECT, event);
				timer_set(&peer->timers, Timer_ConnectRetry,
				    conf->connectretry);
				session_connect(peer);
			}
			/* peer->passive only applies to this one start */
			peer->passive = 0;
			break;
		case EVNT_STOP:
			timer_stop(&peer->timers, Timer_IdleHold);
			break;
		default:
			/* ignore */
			break;
		}
		break;
	case STATE_CONNECT:
		switch (event) {
		case EVNT_START:
			/* ignore */
			break;
		case EVNT_CON_OPEN:
			session_tcp_established(peer);
			session_open(peer);
			timer_stop(&peer->timers, Timer_ConnectRetry);
			peer->holdtime = INTERVAL_HOLD_INITIAL;
			start_timer_holdtime(peer);
			change_state(peer, STATE_OPENSENT, event);
			break;
		case EVNT_CON_OPENFAIL:
			timer_set(&peer->timers, Timer_ConnectRetry,
			    conf->connectretry);
			session_close_connection(peer);
			change_state(peer, STATE_ACTIVE, event);
			break;
		case EVNT_TIMER_CONNRETRY:
			timer_set(&peer->timers, Timer_ConnectRetry,
			    conf->connectretry);
			session_connect(peer);
			break;
		default:
			change_state(peer, STATE_IDLE, event);
			break;
		}
		break;
	case STATE_ACTIVE:
		switch (event) {
		case EVNT_START:
			/* ignore */
			break;
		case EVNT_CON_OPEN:
			session_tcp_established(peer);
			session_open(peer);
			timer_stop(&peer->timers, Timer_ConnectRetry);
			peer->holdtime = INTERVAL_HOLD_INITIAL;
			start_timer_holdtime(peer);
			change_state(peer, STATE_OPENSENT, event);
			break;
		case EVNT_CON_OPENFAIL:
			timer_set(&peer->timers, Timer_ConnectRetry,
			    conf->connectretry);
			session_close_connection(peer);
			change_state(peer, STATE_ACTIVE, event);
			break;
		case EVNT_TIMER_CONNRETRY:
			/*
			 * NOTE(review): unlike the other ConnectRetry timers
			 * this one is armed with peer->holdtime, not with
			 * conf->connectretry -- confirm this is intended.
			 */
			timer_set(&peer->timers, Timer_ConnectRetry,
			    peer->holdtime);
			change_state(peer, STATE_CONNECT, event);
			session_connect(peer);
			break;
		default:
			change_state(peer, STATE_IDLE, event);
			break;
		}
		break;
	case STATE_OPENSENT:
		switch (event) {
		case EVNT_START:
			/* ignore */
			break;
		case EVNT_STOP:
			change_state(peer, STATE_IDLE, event);
			break;
		case EVNT_CON_CLOSED:
			session_close_connection(peer);
			timer_set(&peer->timers, Timer_ConnectRetry,
			    conf->connectretry);
			change_state(peer, STATE_ACTIVE, event);
			break;
		case EVNT_CON_FATAL:
			change_state(peer, STATE_IDLE, event);
			break;
		case EVNT_TIMER_HOLDTIME:
			session_notification(peer, ERR_HOLDTIMEREXPIRED,
			    0, NULL);
			change_state(peer, STATE_IDLE, event);
			break;
		case EVNT_TIMER_SENDHOLD:
			session_notification(peer, ERR_SENDHOLDTIMEREXPIRED,
			    0, NULL);
			change_state(peer, STATE_IDLE, event);
			break;
		case EVNT_RCVD_OPEN:
			/* parse_open calls change_state itself on failure */
			if (parse_open(peer))
				break;
			session_keepalive(peer);
			change_state(peer, STATE_OPENCONFIRM, event);
			break;
		case EVNT_RCVD_NOTIFICATION:
			parse_notification(peer);
			break;
		default:
			session_notification(peer,
			    ERR_FSM, ERR_FSM_UNEX_OPENSENT, NULL);
			change_state(peer, STATE_IDLE, event);
			break;
		}
		break;
	case STATE_OPENCONFIRM:
		switch (event) {
		case EVNT_START:
			/* ignore */
			break;
		case EVNT_STOP:
			change_state(peer, STATE_IDLE, event);
			break;
		case EVNT_CON_CLOSED:
		case EVNT_CON_FATAL:
			change_state(peer, STATE_IDLE, event);
			break;
		case EVNT_TIMER_HOLDTIME:
			session_notification(peer, ERR_HOLDTIMEREXPIRED,
			    0, NULL);
			change_state(peer, STATE_IDLE, event);
			break;
		case EVNT_TIMER_SENDHOLD:
			session_notification(peer, ERR_SENDHOLDTIMEREXPIRED,
			    0, NULL);
			change_state(peer, STATE_IDLE, event);
			break;
		case EVNT_TIMER_KEEPALIVE:
			session_keepalive(peer);
			break;
		case EVNT_RCVD_KEEPALIVE:
			start_timer_holdtime(peer);
			change_state(peer, STATE_ESTABLISHED, event);
			break;
		case EVNT_RCVD_NOTIFICATION:
			parse_notification(peer);
			break;
		default:
			session_notification(peer,
			    ERR_FSM, ERR_FSM_UNEX_OPENCONFIRM, NULL);
			change_state(peer, STATE_IDLE, event);
			break;
		}
		break;
	case STATE_ESTABLISHED:
		switch (event) {
		case EVNT_START:
			/* ignore */
			break;
		case EVNT_STOP:
			change_state(peer, STATE_IDLE, event);
			break;
		case EVNT_CON_CLOSED:
		case EVNT_CON_FATAL:
			change_state(peer, STATE_IDLE, event);
			break;
		case EVNT_TIMER_HOLDTIME:
			session_notification(peer, ERR_HOLDTIMEREXPIRED,
			    0, NULL);
			change_state(peer, STATE_IDLE, event);
			break;
		case EVNT_TIMER_SENDHOLD:
			session_notification(peer, ERR_SENDHOLDTIMEREXPIRED,
			    0, NULL);
			change_state(peer, STATE_IDLE, event);
			break;
		case EVNT_TIMER_KEEPALIVE:
			session_keepalive(peer);
			break;
		case EVNT_RCVD_KEEPALIVE:
			start_timer_holdtime(peer);
			break;
		case EVNT_RCVD_UPDATE:
			/* the hold timer is restarted before and after parsing */
			start_timer_holdtime(peer);
			if (parse_update(peer))
				change_state(peer, STATE_IDLE, event);
			else
				start_timer_holdtime(peer);
			break;
		case EVNT_RCVD_NOTIFICATION:
			parse_notification(peer);
			break;
		default:
			session_notification(peer,
			    ERR_FSM, ERR_FSM_UNEX_ESTABLISHED, NULL);
			change_state(peer, STATE_IDLE, event);
			break;
		}
		break;
	}
}
827
828void
829start_timer_holdtime(struct peer *peer)
830{
831	if (peer->holdtime > 0)
832		timer_set(&peer->timers, Timer_Hold, peer->holdtime);
833	else
834		timer_stop(&peer->timers, Timer_Hold);
835}
836
837void
838start_timer_sendholdtime(struct peer *peer)
839{
840	uint16_t holdtime = INTERVAL_HOLD;
841
842	if (peer->holdtime > INTERVAL_HOLD)
843		holdtime = peer->holdtime;
844
845	if (peer->holdtime > 0)
846		timer_set(&peer->timers, Timer_SendHold, holdtime);
847	else
848		timer_stop(&peer->timers, Timer_SendHold);
849}
850
851void
852start_timer_keepalive(struct peer *peer)
853{
854	if (peer->holdtime > 0)
855		timer_set(&peer->timers, Timer_Keepalive, peer->holdtime / 3);
856	else
857		timer_stop(&peer->timers, Timer_Keepalive);
858}
859
860void
861session_close_connection(struct peer *peer)
862{
863	if (peer->fd != -1) {
864		close(peer->fd);
865		pauseaccept = 0;
866	}
867	peer->fd = peer->wbuf.fd = -1;
868}
869
/*
 * Move peer into the given state and do the work tied to entering it.
 *
 * STATE_IDLE: tear the session down -- flush and clear the write buffer,
 * stop the session timers, close the connection, free the read buffer
 * and reset the capabilities received from the peer. Unless the event is
 * EVNT_STOP the IdleHold timer is armed for the next start.
 * STATE_CONNECT: only does work when coming from STATE_ESTABLISHED with
 * graceful restart negotiated; the connection is then torn down.
 * STATE_ESTABLISHED: arm the IdleHoldReset timer and call session_up().
 *
 * Afterwards the transition is logged, written to the matching mrt dumps
 * and peer->prev_state / peer->state are updated. Note that peer->state
 * still holds the old state while the code above runs.
 */
void
change_state(struct peer *peer, enum session_state state,
    enum session_events event)
{
	struct mrt	*mrt;

	switch (state) {
	case STATE_IDLE:
		/* carp demotion first. new peers handled in init_peer */
		if (peer->state == STATE_ESTABLISHED &&
		    peer->conf.demote_group[0] && !peer->demoted)
			session_demote(peer, +1);

		/*
		 * try to write out what's buffered (maybe a notification),
		 * don't bother if it fails
		 */
		if (peer->state >= STATE_OPENSENT && peer->wbuf.queued)
			msgbuf_write(&peer->wbuf);

		/*
		 * we must start the timer for the next EVNT_START
		 * if we are coming here due to an error and the
		 * session was not established successfully before, the
		 * starttimerinterval needs to be exponentially increased
		 */
		if (peer->IdleHoldTime == 0)
			peer->IdleHoldTime = INTERVAL_IDLE_HOLD_INITIAL;
		peer->holdtime = INTERVAL_HOLD_INITIAL;
		timer_stop(&peer->timers, Timer_ConnectRetry);
		timer_stop(&peer->timers, Timer_Keepalive);
		timer_stop(&peer->timers, Timer_Hold);
		timer_stop(&peer->timers, Timer_SendHold);
		timer_stop(&peer->timers, Timer_IdleHold);
		timer_stop(&peer->timers, Timer_IdleHoldReset);
		session_close_connection(peer);
		msgbuf_clear(&peer->wbuf);
		free(peer->rbuf);
		peer->rbuf = NULL;
		peer->rpending = 0;
		memset(&peer->capa.peer, 0, sizeof(peer->capa.peer));
		if (!peer->template)
			imsg_compose(ibuf_main, IMSG_PFKEY_RELOAD,
			    peer->conf.id, 0, -1, NULL, 0);

		/* a stopped peer is not restarted automatically */
		if (event != EVNT_STOP) {
			timer_set(&peer->timers, Timer_IdleHold,
			    peer->IdleHoldTime);
			/* back off: double the delay for the start after next */
			if (event != EVNT_NONE &&
			    peer->IdleHoldTime < MAX_IDLE_HOLD/2)
				peer->IdleHoldTime *= 2;
		}
		if (peer->state == STATE_ESTABLISHED) {
			if (peer->capa.neg.grestart.restart == 2 &&
			    (event == EVNT_CON_CLOSED ||
			    event == EVNT_CON_FATAL)) {
				/* don't punish graceful restart */
				timer_set(&peer->timers, Timer_IdleHold, 0);
				peer->IdleHoldTime /= 2;
				session_graceful_restart(peer);
			} else
				session_down(peer);
		}
		if (peer->state == STATE_NONE ||
		    peer->state == STATE_ESTABLISHED) {
			/* initialize capability negotiation structures */
			memcpy(&peer->capa.ann, &peer->conf.capabilities,
			    sizeof(peer->capa.ann));
		}
		break;
	case STATE_CONNECT:
		if (peer->state == STATE_ESTABLISHED &&
		    peer->capa.neg.grestart.restart == 2) {
			/* do the graceful restart dance */
			session_graceful_restart(peer);
			peer->holdtime = INTERVAL_HOLD_INITIAL;
			timer_stop(&peer->timers, Timer_ConnectRetry);
			timer_stop(&peer->timers, Timer_Keepalive);
			timer_stop(&peer->timers, Timer_Hold);
			timer_stop(&peer->timers, Timer_SendHold);
			timer_stop(&peer->timers, Timer_IdleHold);
			timer_stop(&peer->timers, Timer_IdleHoldReset);
			session_close_connection(peer);
			msgbuf_clear(&peer->wbuf);
			memset(&peer->capa.peer, 0, sizeof(peer->capa.peer));
		}
		break;
	case STATE_ACTIVE:
		if (!peer->template)
			imsg_compose(ibuf_main, IMSG_PFKEY_RELOAD,
			    peer->conf.id, 0, -1, NULL, 0);
		break;
	case STATE_OPENSENT:
		break;
	case STATE_OPENCONFIRM:
		break;
	case STATE_ESTABLISHED:
		timer_set(&peer->timers, Timer_IdleHoldReset,
		    peer->IdleHoldTime);
		if (peer->demoted)
			timer_set(&peer->timers, Timer_CarpUndemote,
			    INTERVAL_HOLD_DEMOTED);
		session_up(peer);
		break;
	default:		/* something seriously fucked */
		break;
	}

	log_statechange(peer, state, event);
	/*
	 * dump the state change to every MRT_ALL_IN/MRT_ALL_OUT dump that
	 * is unfiltered or matches the peer by id or group id
	 */
	LIST_FOREACH(mrt, &mrthead, entry) {
		if (!(mrt->type == MRT_ALL_IN || mrt->type == MRT_ALL_OUT))
			continue;
		if ((mrt->peer_id == 0 && mrt->group_id == 0) ||
		    mrt->peer_id == peer->conf.id || (mrt->group_id != 0 &&
		    mrt->group_id == peer->conf.groupid))
			mrt_dump_state(mrt, peer->state, state, peer);
	}
	peer->prev_state = peer->state;
	peer->state = state;
}
990
991void
992session_accept(int listenfd)
993{
994	int			 connfd;
995	socklen_t		 len;
996	struct sockaddr_storage	 cliaddr;
997	struct peer		*p = NULL;
998
999	len = sizeof(cliaddr);
1000	if ((connfd = accept4(listenfd,
1001	    (struct sockaddr *)&cliaddr, &len,
1002	    SOCK_CLOEXEC | SOCK_NONBLOCK)) == -1) {
1003		if (errno == ENFILE || errno == EMFILE)
1004			pauseaccept = getmonotime();
1005		else if (errno != EWOULDBLOCK && errno != EINTR &&
1006		    errno != ECONNABORTED)
1007			log_warn("accept");
1008		return;
1009	}
1010
1011	p = getpeerbyip(conf, (struct sockaddr *)&cliaddr);
1012
1013	if (p != NULL && p->state == STATE_IDLE && p->errcnt < 2) {
1014		if (timer_running(&p->timers, Timer_IdleHold, NULL)) {
1015			/* fast reconnect after clear */
1016			p->passive = 1;
1017			bgp_fsm(p, EVNT_START);
1018		}
1019	}
1020
1021	if (p != NULL &&
1022	    (p->state == STATE_CONNECT || p->state == STATE_ACTIVE)) {
1023		if (p->fd != -1) {
1024			if (p->state == STATE_CONNECT)
1025				session_close_connection(p);
1026			else {
1027				close(connfd);
1028				return;
1029			}
1030		}
1031
1032open:
1033		if (p->conf.auth.method != AUTH_NONE && sysdep.no_pfkey) {
1034			log_peer_warnx(&p->conf,
1035			    "ipsec or md5sig configured but not available");
1036			close(connfd);
1037			return;
1038		}
1039
1040		if (tcp_md5_check(connfd, p) == -1) {
1041			close(connfd);
1042			return;
1043		}
1044		p->fd = p->wbuf.fd = connfd;
1045		if (session_setup_socket(p)) {
1046			close(connfd);
1047			return;
1048		}
1049		bgp_fsm(p, EVNT_CON_OPEN);
1050		return;
1051	} else if (p != NULL && p->state == STATE_ESTABLISHED &&
1052	    p->capa.neg.grestart.restart == 2) {
1053		/* first do the graceful restart dance */
1054		change_state(p, STATE_CONNECT, EVNT_CON_CLOSED);
1055		/* then do part of the open dance */
1056		goto open;
1057	} else {
1058		log_conn_attempt(p, (struct sockaddr *)&cliaddr, len);
1059		close(connfd);
1060	}
1061}
1062
1063int
1064session_connect(struct peer *peer)
1065{
1066	struct sockaddr		*sa;
1067	struct bgpd_addr	*bind_addr = NULL;
1068	socklen_t		 sa_len;
1069
1070	/*
1071	 * we do not need the overcomplicated collision detection RFC 1771
1072	 * describes; we simply make sure there is only ever one concurrent
1073	 * tcp connection per peer.
1074	 */
1075	if (peer->fd != -1)
1076		return (-1);
1077
1078	if ((peer->fd = socket(aid2af(peer->conf.remote_addr.aid),
1079	    SOCK_STREAM | SOCK_CLOEXEC | SOCK_NONBLOCK, IPPROTO_TCP)) == -1) {
1080		log_peer_warn(&peer->conf, "session_connect socket");
1081		bgp_fsm(peer, EVNT_CON_OPENFAIL);
1082		return (-1);
1083	}
1084
1085	if (peer->conf.auth.method != AUTH_NONE && sysdep.no_pfkey) {
1086		log_peer_warnx(&peer->conf,
1087		    "ipsec or md5sig configured but not available");
1088		bgp_fsm(peer, EVNT_CON_OPENFAIL);
1089		return (-1);
1090	}
1091
1092	tcp_md5_set(peer->fd, peer);
1093	peer->wbuf.fd = peer->fd;
1094
1095	/* if local-address is set we need to bind() */
1096	switch (peer->conf.remote_addr.aid) {
1097	case AID_INET:
1098		bind_addr = &peer->conf.local_addr_v4;
1099		break;
1100	case AID_INET6:
1101		bind_addr = &peer->conf.local_addr_v6;
1102		break;
1103	}
1104	if ((sa = addr2sa(bind_addr, 0, &sa_len)) != NULL) {
1105		if (bind(peer->fd, sa, sa_len) == -1) {
1106			log_peer_warn(&peer->conf, "session_connect bind");
1107			bgp_fsm(peer, EVNT_CON_OPENFAIL);
1108			return (-1);
1109		}
1110	}
1111
1112	if (session_setup_socket(peer)) {
1113		bgp_fsm(peer, EVNT_CON_OPENFAIL);
1114		return (-1);
1115	}
1116
1117	sa = addr2sa(&peer->conf.remote_addr, peer->conf.remote_port, &sa_len);
1118	if (connect(peer->fd, sa, sa_len) == -1) {
1119		if (errno != EINPROGRESS) {
1120			if (errno != peer->lasterr)
1121				log_peer_warn(&peer->conf, "connect");
1122			peer->lasterr = errno;
1123			bgp_fsm(peer, EVNT_CON_OPENFAIL);
1124			return (-1);
1125		}
1126	} else
1127		bgp_fsm(peer, EVNT_CON_OPEN);
1128
1129	return (0);
1130}
1131
1132int
1133session_setup_socket(struct peer *p)
1134{
1135	int	ttl = p->conf.distance;
1136	int	pre = IPTOS_PREC_INTERNETCONTROL;
1137	int	nodelay = 1;
1138	int	bsize;
1139
1140	switch (p->conf.remote_addr.aid) {
1141	case AID_INET:
1142		/* set precedence, see RFC 1771 appendix 5 */
1143		if (setsockopt(p->fd, IPPROTO_IP, IP_TOS, &pre, sizeof(pre)) ==
1144		    -1) {
1145			log_peer_warn(&p->conf,
1146			    "session_setup_socket setsockopt TOS");
1147			return (-1);
1148		}
1149
1150		if (p->conf.ebgp) {
1151			/*
1152			 * set TTL to foreign router's distance
1153			 * 1=direct n=multihop with ttlsec, we always use 255
1154			 */
1155			if (p->conf.ttlsec) {
1156				ttl = 256 - p->conf.distance;
1157				if (setsockopt(p->fd, IPPROTO_IP, IP_MINTTL,
1158				    &ttl, sizeof(ttl)) == -1) {
1159					log_peer_warn(&p->conf,
1160					    "session_setup_socket: "
1161					    "setsockopt MINTTL");
1162					return (-1);
1163				}
1164				ttl = 255;
1165			}
1166
1167			if (setsockopt(p->fd, IPPROTO_IP, IP_TTL, &ttl,
1168			    sizeof(ttl)) == -1) {
1169				log_peer_warn(&p->conf,
1170				    "session_setup_socket setsockopt TTL");
1171				return (-1);
1172			}
1173		}
1174		break;
1175	case AID_INET6:
1176		if (setsockopt(p->fd, IPPROTO_IPV6, IPV6_TCLASS, &pre,
1177		    sizeof(pre)) == -1) {
1178			log_peer_warn(&p->conf, "session_setup_socket "
1179			    "setsockopt TCLASS");
1180			return (-1);
1181		}
1182
1183		if (p->conf.ebgp) {
1184			/*
1185			 * set hoplimit to foreign router's distance
1186			 * 1=direct n=multihop with ttlsec, we always use 255
1187			 */
1188			if (p->conf.ttlsec) {
1189				ttl = 256 - p->conf.distance;
1190				if (setsockopt(p->fd, IPPROTO_IPV6,
1191				    IPV6_MINHOPCOUNT, &ttl, sizeof(ttl))
1192				    == -1) {
1193					log_peer_warn(&p->conf,
1194					    "session_setup_socket: "
1195					    "setsockopt MINHOPCOUNT");
1196					return (-1);
1197				}
1198				ttl = 255;
1199			}
1200			if (setsockopt(p->fd, IPPROTO_IPV6, IPV6_UNICAST_HOPS,
1201			    &ttl, sizeof(ttl)) == -1) {
1202				log_peer_warn(&p->conf,
1203				    "session_setup_socket setsockopt hoplimit");
1204				return (-1);
1205			}
1206		}
1207		break;
1208	}
1209
1210	/* set TCP_NODELAY */
1211	if (setsockopt(p->fd, IPPROTO_TCP, TCP_NODELAY, &nodelay,
1212	    sizeof(nodelay)) == -1) {
1213		log_peer_warn(&p->conf,
1214		    "session_setup_socket setsockopt TCP_NODELAY");
1215		return (-1);
1216	}
1217
1218	/* limit bufsize. no biggie if it fails */
1219	bsize = 65535;
1220	setsockopt(p->fd, SOL_SOCKET, SO_RCVBUF, &bsize, sizeof(bsize));
1221	setsockopt(p->fd, SOL_SOCKET, SO_SNDBUF, &bsize, sizeof(bsize));
1222
1223	return (0);
1224}
1225
1226/*
1227 * compare the bgpd_addr with the sockaddr by converting the latter into
1228 * a bgpd_addr. Return true if the two are equal, including any scope
1229 */
1230static int
1231sa_equal(struct bgpd_addr *ba, struct sockaddr *b)
1232{
1233	struct bgpd_addr bb;
1234
1235	sa2addr(b, &bb, NULL);
1236	return (memcmp(ba, &bb, sizeof(*ba)) == 0);
1237}
1238
1239static void
1240get_alternate_addr(struct bgpd_addr *local, struct bgpd_addr *remote,
1241    struct bgpd_addr *alt, unsigned int *scope)
1242{
1243	struct ifaddrs	*ifap, *ifa, *match;
1244	int connected = 0;
1245	u_int8_t plen;
1246
1247	if (getifaddrs(&ifap) == -1)
1248		fatal("getifaddrs");
1249
1250	for (match = ifap; match != NULL; match = match->ifa_next) {
1251		if (match->ifa_addr == NULL)
1252			continue;
1253		if (match->ifa_addr->sa_family != AF_INET &&
1254		    match->ifa_addr->sa_family != AF_INET6)
1255			continue;
1256		if (sa_equal(local, match->ifa_addr)) {
1257			if (remote->aid == AID_INET6 &&
1258			    IN6_IS_ADDR_LINKLOCAL(&remote->v6)) {
1259				/* IPv6 LLA are by definition connected */
1260				connected = 1;
1261			} else if (match->ifa_flags & IFF_POINTOPOINT &&
1262			    match->ifa_dstaddr != NULL) {
1263				if (sa_equal(remote, match->ifa_dstaddr))
1264					connected = 1;
1265			} else if (match->ifa_netmask != NULL) {
1266				plen = mask2prefixlen(
1267				    match->ifa_addr->sa_family,
1268				    match->ifa_netmask);
1269				if (prefix_compare(local, remote, plen) == 0)
1270					connected = 1;
1271			}
1272			break;
1273		}
1274	}
1275
1276	if (match == NULL) {
1277		log_warnx("%s: local address not found", __func__);
1278		return;
1279	}
1280	if (connected)
1281		*scope = if_nametoindex(match->ifa_name);
1282	else
1283		*scope = 0;
1284
1285	switch (local->aid) {
1286	case AID_INET6:
1287		for (ifa = ifap; ifa != NULL; ifa = ifa->ifa_next) {
1288			if (ifa->ifa_addr != NULL &&
1289			    ifa->ifa_addr->sa_family == AF_INET &&
1290			    strcmp(ifa->ifa_name, match->ifa_name) == 0) {
1291				sa2addr(ifa->ifa_addr, alt, NULL);
1292				break;
1293			}
1294		}
1295		break;
1296	case AID_INET:
1297		for (ifa = ifap; ifa != NULL; ifa = ifa->ifa_next) {
1298			if (ifa->ifa_addr != NULL &&
1299			    ifa->ifa_addr->sa_family == AF_INET6 &&
1300			    strcmp(ifa->ifa_name, match->ifa_name) == 0) {
1301				struct sockaddr_in6 *s =
1302				    (struct sockaddr_in6 *)ifa->ifa_addr;
1303
1304				/* only accept global scope addresses */
1305				if (IN6_IS_ADDR_LINKLOCAL(&s->sin6_addr) ||
1306				    IN6_IS_ADDR_SITELOCAL(&s->sin6_addr))
1307					continue;
1308				sa2addr(ifa->ifa_addr, alt, NULL);
1309				break;
1310			}
1311		}
1312		break;
1313	default:
1314		log_warnx("%s: unsupported address family %s", __func__,
1315		    aid2str(local->aid));
1316		break;
1317	}
1318
1319	freeifaddrs(ifap);
1320}
1321
1322void
1323session_tcp_established(struct peer *peer)
1324{
1325	struct sockaddr_storage	ss;
1326	socklen_t		len;
1327
1328	len = sizeof(ss);
1329	if (getsockname(peer->fd, (struct sockaddr *)&ss, &len) == -1)
1330		log_warn("getsockname");
1331	sa2addr((struct sockaddr *)&ss, &peer->local, &peer->local_port);
1332	len = sizeof(ss);
1333	if (getpeername(peer->fd, (struct sockaddr *)&ss, &len) == -1)
1334		log_warn("getpeername");
1335	sa2addr((struct sockaddr *)&ss, &peer->remote, &peer->remote_port);
1336
1337	get_alternate_addr(&peer->local, &peer->remote, &peer->local_alt,
1338	    &peer->if_scope);
1339}
1340
/*
 * Append a capability header (code and length byte) to opb.
 * Returns the sum of the ibuf_add_n8() results, i.e. 0 on success and a
 * non-zero value if at least one of the two writes failed.
 */
int
session_capa_add(struct ibuf *opb, uint8_t capa_code, uint8_t capa_len)
{
	int	failed;

	failed = ibuf_add_n8(opb, capa_code);
	failed += ibuf_add_n8(opb, capa_len);

	return (failed);
}
1350
/*
 * Append the 4 byte payload of a multiprotocol capability for `aid` to
 * buf: 16-bit AFI, one zero byte and the SAFI.
 *
 * Returns 0 on success, -1 if `aid` cannot be mapped to an AFI/SAFI pair
 * and otherwise the (non-zero) sum of the failed ibuf writes.
 */
int
session_capa_add_mp(struct ibuf *buf, uint8_t aid)
{
	uint16_t		 afi;
	uint8_t			 safi;
	int			 errs = 0;

	if (aid2afi(aid, &afi, &safi) == -1) {
		/*
		 * No system call failed here, so use the errno-less
		 * variant instead of appending an unrelated strerror().
		 */
		log_warnx("%s: bad AID", __func__);
		return (-1);
	}

	errs += ibuf_add_n16(buf, afi);
	errs += ibuf_add_zero(buf, 1);
	errs += ibuf_add_n8(buf, safi);

	return (errs);
}
1369
/*
 * Append an <AFI, SAFI, flags> tuple for `aid` to b (16-bit AFI followed
 * by the SAFI byte and the flags byte).
 *
 * Returns 0 on success, -1 if `aid` cannot be mapped to an AFI/SAFI pair
 * and otherwise the (non-zero) sum of the failed ibuf writes.
 */
int
session_capa_add_afi(struct ibuf *b, uint8_t aid, uint8_t flags)
{
	/* signed, like the return type and the values being summed up */
	int		errs = 0;
	uint16_t	afi;
	uint8_t		safi;

	if (aid2afi(aid, &afi, &safi)) {
		/*
		 * No system call failed here, so use the errno-less
		 * variant instead of appending an unrelated strerror().
		 */
		log_warnx("%s: bad AID", __func__);
		return (-1);
	}

	errs += ibuf_add_n16(b, afi);
	errs += ibuf_add_n8(b, safi);
	errs += ibuf_add_n8(b, flags);

	return (errs);
}
1388
1389struct bgp_msg *
1390session_newmsg(enum msg_type msgtype, uint16_t len)
1391{
1392	struct bgp_msg		*msg;
1393	struct ibuf		*buf;
1394	int			 errs = 0;
1395
1396	if ((buf = ibuf_open(len)) == NULL)
1397		return (NULL);
1398
1399	errs += ibuf_add(buf, marker, sizeof(marker));
1400	errs += ibuf_add_n16(buf, len);
1401	errs += ibuf_add_n8(buf, msgtype);
1402
1403	if (errs || (msg = calloc(1, sizeof(*msg))) == NULL) {
1404		ibuf_free(buf);
1405		return (NULL);
1406	}
1407
1408	msg->buf = buf;
1409	msg->type = msgtype;
1410	msg->len = len;
1411
1412	return (msg);
1413}
1414
1415int
1416session_sendmsg(struct bgp_msg *msg, struct peer *p)
1417{
1418	struct mrt		*mrt;
1419
1420	LIST_FOREACH(mrt, &mrthead, entry) {
1421		if (!(mrt->type == MRT_ALL_OUT || (msg->type == UPDATE &&
1422		    mrt->type == MRT_UPDATE_OUT)))
1423			continue;
1424		if ((mrt->peer_id == 0 && mrt->group_id == 0) ||
1425		    mrt->peer_id == p->conf.id || (mrt->group_id != 0 &&
1426		    mrt->group_id == p->conf.groupid))
1427			mrt_dump_bgp_msg(mrt, ibuf_data(msg->buf), msg->len, p,
1428			    msg->type);
1429	}
1430
1431	ibuf_close(&p->wbuf, msg->buf);
1432	if (!p->throttled && p->wbuf.queued > SESS_MSG_HIGH_MARK) {
1433		if (imsg_rde(IMSG_XOFF, p->conf.id, NULL, 0) == -1)
1434			log_peer_warn(&p->conf, "imsg_compose XOFF");
1435		else
1436			p->throttled = 1;
1437	}
1438
1439	free(msg);
1440	return (0);
1441}
1442
1443/*
1444 * Translate between internal roles and the value expected by RFC 9234.
1445 */
1446static uint8_t
1447role2capa(enum role role)
1448{
1449	switch (role) {
1450	case ROLE_CUSTOMER:
1451		return CAPA_ROLE_CUSTOMER;
1452	case ROLE_PROVIDER:
1453		return CAPA_ROLE_PROVIDER;
1454	case ROLE_RS:
1455		return CAPA_ROLE_RS;
1456	case ROLE_RS_CLIENT:
1457		return CAPA_ROLE_RS_CLIENT;
1458	case ROLE_PEER:
1459		return CAPA_ROLE_PEER;
1460	default:
1461		fatalx("Unsupported role for role capability");
1462	}
1463}
1464
1465static enum role
1466capa2role(uint8_t val)
1467{
1468	switch (val) {
1469	case CAPA_ROLE_PROVIDER:
1470		return ROLE_PROVIDER;
1471	case CAPA_ROLE_RS:
1472		return ROLE_RS;
1473	case CAPA_ROLE_RS_CLIENT:
1474		return ROLE_RS_CLIENT;
1475	case CAPA_ROLE_CUSTOMER:
1476		return ROLE_CUSTOMER;
1477	case CAPA_ROLE_PEER:
1478		return ROLE_PEER;
1479	default:
1480		return ROLE_NONE;
1481	}
1482}
1483
/*
 * Build and queue the BGP OPEN message for peer p.
 *
 * All capabilities the peer configuration announces are first collected
 * in a temporary buffer (opb).  That buffer is then wrapped into the
 * optional parameter section of the OPEN message, using the RFC 9072
 * extended length encoding when the regular one byte length is not
 * sufficient.  Any allocation or buffer error results in EVNT_CON_FATAL
 * being raised on the peer; on success the sent-OPEN counter is bumped.
 */
void
session_open(struct peer *p)
{
	struct bgp_msg		*buf;
	struct ibuf		*opb;
	size_t			 len, optparamlen;
	uint16_t		 holdtime;
	uint8_t			 i;
	int			 errs = 0, extlen = 0;
	int			 mpcapa = 0;


	/* temporary, dynamically growing buffer holding the capabilities */
	if ((opb = ibuf_dynamic(0, UINT16_MAX - 3)) == NULL) {
		bgp_fsm(p, EVNT_CON_FATAL);
		return;
	}

	/* multiprotocol extensions, RFC 4760 */
	for (i = AID_MIN; i < AID_MAX; i++)
		if (p->capa.ann.mp[i]) {	/* 4 bytes data */
			errs += session_capa_add(opb, CAPA_MP, 4);
			errs += session_capa_add_mp(opb, i);
			/* counted: later used by the role and add-path code */
			mpcapa++;
		}

	/* route refresh, RFC 2918 */
	if (p->capa.ann.refresh)	/* no data */
		errs += session_capa_add(opb, CAPA_REFRESH, 0);

	/* BGP open policy, RFC 9234, only for ebgp sessions */
	if (p->conf.ebgp && p->capa.ann.policy &&
	    p->conf.role != ROLE_NONE &&
	    (p->capa.ann.mp[AID_INET] || p->capa.ann.mp[AID_INET6] ||
	    mpcapa == 0)) {
		errs += session_capa_add(opb, CAPA_ROLE, 1);
		errs += ibuf_add_n8(opb, role2capa(p->conf.role));
	}

	/* graceful restart and End-of-RIB marker, RFC 4724 */
	if (p->capa.ann.grestart.restart) {
		int		rst = 0;
		uint16_t	hdr = 0;

		/* count the address families that are currently restarting */
		for (i = AID_MIN; i < AID_MAX; i++) {
			if (p->capa.neg.grestart.flags[i] & CAPA_GR_RESTARTING)
				rst++;
		}

		/* Only set the R-flag if no graceful restart is ongoing */
		if (!rst)
			hdr |= CAPA_GR_R_FLAG;
		errs += session_capa_add(opb, CAPA_RESTART, sizeof(hdr));
		errs += ibuf_add_n16(opb, hdr);
	}

	/* 4-bytes AS numbers, RFC6793 */
	if (p->capa.ann.as4byte) {	/* 4 bytes data */
		errs += session_capa_add(opb, CAPA_AS4BYTE, sizeof(uint32_t));
		errs += ibuf_add_n32(opb, p->conf.local_as);
	}

	/* advertisement of multiple paths, RFC7911 */
	if (p->capa.ann.add_path[AID_MIN]) {	/* variable */
		uint8_t	aplen;

		/* one 4 byte tuple per announced MP capability */
		if (mpcapa)
			aplen = 4 * mpcapa;
		else	/* AID_INET */
			aplen = 4;
		errs += session_capa_add(opb, CAPA_ADD_PATH, aplen);
		if (mpcapa) {
			for (i = AID_MIN; i < AID_MAX; i++) {
				if (p->capa.ann.mp[i]) {
					errs += session_capa_add_afi(opb,
					    i, p->capa.ann.add_path[i] &
					    CAPA_AP_MASK);
				}
			}
		} else {	/* AID_INET */
			errs += session_capa_add_afi(opb, AID_INET,
			    p->capa.ann.add_path[AID_INET] & CAPA_AP_MASK);
		}
	}

	/* enhanced route-refresh, RFC7313 */
	if (p->capa.ann.enhanced_rr)	/* no data */
		errs += session_capa_add(opb, CAPA_ENHANCED_RR, 0);

	/* any failed write while collecting the capabilities is fatal */
	if (errs) {
		ibuf_free(opb);
		bgp_fsm(p, EVNT_CON_FATAL);
		return;
	}

	/*
	 * Work out the total message length and the value of the one byte
	 * "optional parameters length" field of the OPEN message.
	 */
	optparamlen = ibuf_size(opb);
	len = MSGSIZE_OPEN_MIN + optparamlen;
	if (optparamlen == 0) {
		/* nothing */
	} else if (optparamlen + 2 >= 255) {
		/* RFC9072: use 255 as magic size and request extra header */
		optparamlen = 255;
		extlen = 1;
		/* 3 byte OPT_PARAM_EXT_LEN and OPT_PARAM_CAPABILITIES */
		len += 2 * 3;
	} else {
		/* regular capabilities header */
		optparamlen += 2;
		len += 2;
	}

	if ((buf = session_newmsg(OPEN, len)) == NULL) {
		ibuf_free(opb);
		bgp_fsm(p, EVNT_CON_FATAL);
		return;
	}

	/* a per-peer holdtime overrides the global one */
	if (p->conf.holdtime)
		holdtime = p->conf.holdtime;
	else
		holdtime = conf->holdtime;

	/* fixed part: version 4, 2-byte AS, holdtime, BGP ID, opt len */
	errs += ibuf_add_n8(buf->buf, 4);
	errs += ibuf_add_n16(buf->buf, p->conf.local_short_as);
	errs += ibuf_add_n16(buf->buf, holdtime);
	/* is already in network byte order */
	/*
	 * NOTE(review): ibuf_add_n32() normally converts from host to
	 * network byte order; confirm that the comment above still matches
	 * how conf->bgpid is stored.
	 */
	errs += ibuf_add_n32(buf->buf, conf->bgpid);
	errs += ibuf_add_n8(buf->buf, optparamlen);

	if (extlen) {
		/* RFC9072 extra header which spans over the capabilities hdr */
		errs += ibuf_add_n8(buf->buf, OPT_PARAM_EXT_LEN);
		errs += ibuf_add_n16(buf->buf, ibuf_size(opb) + 1 + 2);
	}

	if (optparamlen) {
		errs += ibuf_add_n8(buf->buf, OPT_PARAM_CAPABILITIES);

		if (extlen) {
			/* RFC9072: 2-byte extended length */
			errs += ibuf_add_n16(buf->buf, ibuf_size(opb));
		} else {
			errs += ibuf_add_n8(buf->buf, ibuf_size(opb));
		}
		errs += ibuf_add_buf(buf->buf, opb);
	}

	/* the capabilities have been copied, the scratch buffer can go */
	ibuf_free(opb);

	if (errs) {
		ibuf_free(buf->buf);
		free(buf);
		bgp_fsm(p, EVNT_CON_FATAL);
		return;
	}

	if (session_sendmsg(buf, p) == -1) {
		bgp_fsm(p, EVNT_CON_FATAL);
		return;
	}

	p->stats.msg_sent_open++;
}
1646
1647void
1648session_keepalive(struct peer *p)
1649{
1650	struct bgp_msg		*buf;
1651
1652	if ((buf = session_newmsg(KEEPALIVE, MSGSIZE_KEEPALIVE)) == NULL ||
1653	    session_sendmsg(buf, p) == -1) {
1654		bgp_fsm(p, EVNT_CON_FATAL);
1655		return;
1656	}
1657
1658	start_timer_keepalive(p);
1659	p->stats.msg_sent_keepalive++;
1660}
1661
1662void
1663session_update(uint32_t peerid, struct ibuf *ibuf)
1664{
1665	struct peer		*p;
1666	struct bgp_msg		*buf;
1667
1668	if ((p = getpeerbyid(conf, peerid)) == NULL) {
1669		log_warnx("no such peer: id=%u", peerid);
1670		return;
1671	}
1672
1673	if (p->state != STATE_ESTABLISHED)
1674		return;
1675
1676	if ((buf = session_newmsg(UPDATE, MSGSIZE_HEADER + ibuf_size(ibuf))) ==
1677	    NULL) {
1678		bgp_fsm(p, EVNT_CON_FATAL);
1679		return;
1680	}
1681
1682	if (ibuf_add_buf(buf->buf, ibuf)) {
1683		ibuf_free(buf->buf);
1684		free(buf);
1685		bgp_fsm(p, EVNT_CON_FATAL);
1686		return;
1687	}
1688
1689	if (session_sendmsg(buf, p) == -1) {
1690		bgp_fsm(p, EVNT_CON_FATAL);
1691		return;
1692	}
1693
1694	start_timer_keepalive(p);
1695	p->stats.msg_sent_update++;
1696}
1697
1698void
1699session_notification_data(struct peer *p, uint8_t errcode, uint8_t subcode,
1700    void *data, size_t datalen)
1701{
1702	struct ibuf ibuf;
1703
1704	ibuf_from_buffer(&ibuf, data, datalen);
1705	session_notification(p, errcode, subcode, &ibuf);
1706}
1707
1708void
1709session_notification(struct peer *p, uint8_t errcode, uint8_t subcode,
1710    struct ibuf *ibuf)
1711{
1712	struct bgp_msg		*buf;
1713	int			 errs = 0;
1714	size_t			 datalen = 0;
1715
1716	switch (p->state) {
1717	case STATE_OPENSENT:
1718	case STATE_OPENCONFIRM:
1719	case STATE_ESTABLISHED:
1720		break;
1721	default:
1722		/* session not open, no need to send notification */
1723		log_notification(p, errcode, subcode, ibuf, "dropping");
1724		return;
1725	}
1726
1727	log_notification(p, errcode, subcode, ibuf, "sending");
1728
1729	/* cap to maximum size */
1730	if (ibuf != NULL) {
1731		if (ibuf_size(ibuf) >
1732		    MAX_PKTSIZE - MSGSIZE_NOTIFICATION_MIN) {
1733			log_peer_warnx(&p->conf,
1734			    "oversized notification, data trunkated");
1735			ibuf_truncate(ibuf, MAX_PKTSIZE -
1736			    MSGSIZE_NOTIFICATION_MIN);
1737		}
1738		datalen = ibuf_size(ibuf);
1739	}
1740
1741	if ((buf = session_newmsg(NOTIFICATION,
1742	    MSGSIZE_NOTIFICATION_MIN + datalen)) == NULL) {
1743		bgp_fsm(p, EVNT_CON_FATAL);
1744		return;
1745	}
1746
1747	errs += ibuf_add_n8(buf->buf, errcode);
1748	errs += ibuf_add_n8(buf->buf, subcode);
1749
1750	if (ibuf != NULL)
1751		errs += ibuf_add_buf(buf->buf, ibuf);
1752
1753	if (errs) {
1754		ibuf_free(buf->buf);
1755		free(buf);
1756		bgp_fsm(p, EVNT_CON_FATAL);
1757		return;
1758	}
1759
1760	if (session_sendmsg(buf, p) == -1) {
1761		bgp_fsm(p, EVNT_CON_FATAL);
1762		return;
1763	}
1764
1765	p->stats.msg_sent_notification++;
1766	p->stats.last_sent_errcode = errcode;
1767	p->stats.last_sent_suberr = subcode;
1768}
1769
1770int
1771session_neighbor_rrefresh(struct peer *p)
1772{
1773	uint8_t	i;
1774
1775	if (!(p->capa.neg.refresh || p->capa.neg.enhanced_rr))
1776		return (-1);
1777
1778	for (i = AID_MIN; i < AID_MAX; i++) {
1779		if (p->capa.neg.mp[i] != 0)
1780			session_rrefresh(p, i, ROUTE_REFRESH_REQUEST);
1781	}
1782
1783	return (0);
1784}
1785
1786void
1787session_rrefresh(struct peer *p, uint8_t aid, uint8_t subtype)
1788{
1789	struct bgp_msg		*buf;
1790	int			 errs = 0;
1791	uint16_t		 afi;
1792	uint8_t			 safi;
1793
1794	switch (subtype) {
1795	case ROUTE_REFRESH_REQUEST:
1796		p->stats.refresh_sent_req++;
1797		break;
1798	case ROUTE_REFRESH_BEGIN_RR:
1799	case ROUTE_REFRESH_END_RR:
1800		/* requires enhanced route refresh */
1801		if (!p->capa.neg.enhanced_rr)
1802			return;
1803		if (subtype == ROUTE_REFRESH_BEGIN_RR)
1804			p->stats.refresh_sent_borr++;
1805		else
1806			p->stats.refresh_sent_eorr++;
1807		break;
1808	default:
1809		fatalx("session_rrefresh: bad subtype %d", subtype);
1810	}
1811
1812	if (aid2afi(aid, &afi, &safi) == -1)
1813		fatalx("session_rrefresh: bad afi/safi pair");
1814
1815	if ((buf = session_newmsg(RREFRESH, MSGSIZE_RREFRESH)) == NULL) {
1816		bgp_fsm(p, EVNT_CON_FATAL);
1817		return;
1818	}
1819
1820	errs += ibuf_add_n16(buf->buf, afi);
1821	errs += ibuf_add_n8(buf->buf, subtype);
1822	errs += ibuf_add_n8(buf->buf, safi);
1823
1824	if (errs) {
1825		ibuf_free(buf->buf);
1826		free(buf);
1827		bgp_fsm(p, EVNT_CON_FATAL);
1828		return;
1829	}
1830
1831	if (session_sendmsg(buf, p) == -1) {
1832		bgp_fsm(p, EVNT_CON_FATAL);
1833		return;
1834	}
1835
1836	p->stats.msg_sent_rrefresh++;
1837}
1838
1839int
1840session_graceful_restart(struct peer *p)
1841{
1842	uint8_t	i;
1843
1844	timer_set(&p->timers, Timer_RestartTimeout,
1845	    p->capa.neg.grestart.timeout);
1846
1847	for (i = AID_MIN; i < AID_MAX; i++) {
1848		if (p->capa.neg.grestart.flags[i] & CAPA_GR_PRESENT) {
1849			if (imsg_rde(IMSG_SESSION_STALE, p->conf.id,
1850			    &i, sizeof(i)) == -1)
1851				return (-1);
1852			log_peer_warnx(&p->conf,
1853			    "graceful restart of %s, keeping routes",
1854			    aid2str(i));
1855			p->capa.neg.grestart.flags[i] |= CAPA_GR_RESTARTING;
1856		} else if (p->capa.neg.mp[i]) {
1857			if (imsg_rde(IMSG_SESSION_NOGRACE, p->conf.id,
1858			    &i, sizeof(i)) == -1)
1859				return (-1);
1860			log_peer_warnx(&p->conf,
1861			    "graceful restart of %s, flushing routes",
1862			    aid2str(i));
1863		}
1864	}
1865	return (0);
1866}
1867
1868int
1869session_graceful_stop(struct peer *p)
1870{
1871	uint8_t	i;
1872
1873	for (i = AID_MIN; i < AID_MAX; i++) {
1874		/*
1875		 * Only flush if the peer is restarting and the timeout fired.
1876		 * In all other cases the session was already flushed when the
1877		 * session went down or when the new open message was parsed.
1878		 */
1879		if (p->capa.neg.grestart.flags[i] & CAPA_GR_RESTARTING) {
1880			log_peer_warnx(&p->conf, "graceful restart of %s, "
1881			    "time-out, flushing", aid2str(i));
1882			if (imsg_rde(IMSG_SESSION_FLUSH, p->conf.id,
1883			    &i, sizeof(i)) == -1)
1884				return (-1);
1885		}
1886		p->capa.neg.grestart.flags[i] &= ~CAPA_GR_RESTARTING;
1887	}
1888	return (0);
1889}
1890
1891int
1892session_dispatch_msg(struct pollfd *pfd, struct peer *p)
1893{
1894	ssize_t		n;
1895	socklen_t	len;
1896	int		error;
1897
1898	if (p->state == STATE_CONNECT) {
1899		if (pfd->revents & POLLOUT) {
1900			if (pfd->revents & POLLIN) {
1901				/* error occurred */
1902				len = sizeof(error);
1903				if (getsockopt(pfd->fd, SOL_SOCKET, SO_ERROR,
1904				    &error, &len) == -1 || error) {
1905					if (error)
1906						errno = error;
1907					if (errno != p->lasterr) {
1908						log_peer_warn(&p->conf,
1909						    "socket error");
1910						p->lasterr = errno;
1911					}
1912					bgp_fsm(p, EVNT_CON_OPENFAIL);
1913					return (1);
1914				}
1915			}
1916			bgp_fsm(p, EVNT_CON_OPEN);
1917			return (1);
1918		}
1919		if (pfd->revents & POLLHUP) {
1920			bgp_fsm(p, EVNT_CON_OPENFAIL);
1921			return (1);
1922		}
1923		if (pfd->revents & (POLLERR|POLLNVAL)) {
1924			bgp_fsm(p, EVNT_CON_FATAL);
1925			return (1);
1926		}
1927		return (0);
1928	}
1929
1930	if (pfd->revents & POLLHUP) {
1931		bgp_fsm(p, EVNT_CON_CLOSED);
1932		return (1);
1933	}
1934	if (pfd->revents & (POLLERR|POLLNVAL)) {
1935		bgp_fsm(p, EVNT_CON_FATAL);
1936		return (1);
1937	}
1938
1939	if (pfd->revents & POLLOUT && p->wbuf.queued) {
1940		if ((error = msgbuf_write(&p->wbuf)) <= 0 && errno != EAGAIN) {
1941			if (error == 0)
1942				log_peer_warnx(&p->conf, "Connection closed");
1943			else if (error == -1)
1944				log_peer_warn(&p->conf, "write error");
1945			bgp_fsm(p, EVNT_CON_FATAL);
1946			return (1);
1947		}
1948		p->stats.last_write = getmonotime();
1949		start_timer_sendholdtime(p);
1950		if (p->throttled && p->wbuf.queued < SESS_MSG_LOW_MARK) {
1951			if (imsg_rde(IMSG_XON, p->conf.id, NULL, 0) == -1)
1952				log_peer_warn(&p->conf, "imsg_compose XON");
1953			else
1954				p->throttled = 0;
1955		}
1956		if (!(pfd->revents & POLLIN))
1957			return (1);
1958	}
1959
1960	if (p->rbuf && pfd->revents & POLLIN) {
1961		if ((n = read(p->fd, p->rbuf->buf + p->rbuf->wpos,
1962		    sizeof(p->rbuf->buf) - p->rbuf->wpos)) == -1) {
1963			if (errno != EINTR && errno != EAGAIN) {
1964				log_peer_warn(&p->conf, "read error");
1965				bgp_fsm(p, EVNT_CON_FATAL);
1966			}
1967			return (1);
1968		}
1969		if (n == 0) {	/* connection closed */
1970			bgp_fsm(p, EVNT_CON_CLOSED);
1971			return (1);
1972		}
1973
1974		p->rbuf->wpos += n;
1975		p->stats.last_read = getmonotime();
1976		return (1);
1977	}
1978	return (0);
1979}
1980
/*
 * Process the complete BGP messages found in the peer's read buffer.
 *
 * Starting at the beginning of the buffer, each message header is
 * validated with parse_header(); once a whole message is available it is
 * dumped to the matching MRT dumps and dispatched by type.  Processing
 * stops when the remaining data does not hold a complete message or after
 * more than MSG_PROCESS_LIMIT messages, in which case p->rpending is set.
 * Unprocessed bytes are moved to the start of the buffer.
 */
void
session_process_msg(struct peer *p)
{
	struct mrt	*mrt;
	ssize_t		rpos, av, left;
	int		processed = 0;
	uint16_t	msglen;
	uint8_t		msgtype;

	rpos = 0;			/* read offset into the buffer */
	av = p->rbuf->wpos;		/* number of bytes available */
	p->rpending = 0;

	/*
	 * session might drop to IDLE -> buffers deallocated
	 * we MUST check rbuf != NULL before use
	 */
	for (;;) {
		if (p->rbuf == NULL)
			return;
		/* not even a full header left */
		if (rpos + MSGSIZE_HEADER > av)
			break;
		if (parse_header(p, p->rbuf->buf + rpos, &msglen,
		    &msgtype) == -1)
			return;
		/* message not yet completely received */
		if (rpos + msglen > av)
			break;
		p->rbuf->rptr = p->rbuf->buf + rpos;

		/* dump to MRT as soon as we have a full packet */
		LIST_FOREACH(mrt, &mrthead, entry) {
			if (!(mrt->type == MRT_ALL_IN || (msgtype == UPDATE &&
			    mrt->type == MRT_UPDATE_IN)))
				continue;
			if ((mrt->peer_id == 0 && mrt->group_id == 0) ||
			    mrt->peer_id == p->conf.id || (mrt->group_id != 0 &&
			    mrt->group_id == p->conf.groupid))
				mrt_dump_bgp_msg(mrt, p->rbuf->rptr, msglen, p,
				    msgtype);
		}

		switch (msgtype) {
		case OPEN:
			bgp_fsm(p, EVNT_RCVD_OPEN);
			p->stats.msg_rcvd_open++;
			break;
		case UPDATE:
			bgp_fsm(p, EVNT_RCVD_UPDATE);
			p->stats.msg_rcvd_update++;
			break;
		case NOTIFICATION:
			bgp_fsm(p, EVNT_RCVD_NOTIFICATION);
			p->stats.msg_rcvd_notification++;
			break;
		case KEEPALIVE:
			bgp_fsm(p, EVNT_RCVD_KEEPALIVE);
			p->stats.msg_rcvd_keepalive++;
			break;
		case RREFRESH:
			parse_rrefresh(p);
			p->stats.msg_rcvd_rrefresh++;
			break;
		default:	/* cannot happen */
			session_notification_data(p, ERR_HEADER, ERR_HDR_TYPE,
			    &msgtype, 1);
			log_warnx("received message with unknown type %u",
			    msgtype);
			bgp_fsm(p, EVNT_CON_FATAL);
		}
		rpos += msglen;
		/* yield after a batch, remember that more is pending */
		if (++processed > MSG_PROCESS_LIMIT) {
			p->rpending = 1;
			break;
		}
	}

	if (p->rbuf == NULL)
		return;
	/* keep the unprocessed remainder at the start of the buffer */
	if (rpos < av) {
		left = av - rpos;
		memmove(&p->rbuf->buf, p->rbuf->buf + rpos, left);
		p->rbuf->wpos = left;
	} else
		p->rbuf->wpos = 0;
}
2066
2067int
2068parse_header(struct peer *peer, u_char *data, uint16_t *len, uint8_t *type)
2069{
2070	u_char			*p;
2071	uint16_t		 olen;
2072
2073	/* caller MUST make sure we are getting 19 bytes! */
2074	p = data;
2075	if (memcmp(p, marker, sizeof(marker))) {
2076		log_peer_warnx(&peer->conf, "sync error");
2077		session_notification(peer, ERR_HEADER, ERR_HDR_SYNC, NULL);
2078		bgp_fsm(peer, EVNT_CON_FATAL);
2079		return (-1);
2080	}
2081	p += MSGSIZE_HEADER_MARKER;
2082
2083	memcpy(&olen, p, 2);
2084	*len = ntohs(olen);
2085	p += 2;
2086	memcpy(type, p, 1);
2087
2088	if (*len < MSGSIZE_HEADER || *len > MAX_PKTSIZE) {
2089		log_peer_warnx(&peer->conf,
2090		    "received message: illegal length: %u byte", *len);
2091		session_notification_data(peer, ERR_HEADER, ERR_HDR_LEN,
2092		    &olen, sizeof(olen));
2093		bgp_fsm(peer, EVNT_CON_FATAL);
2094		return (-1);
2095	}
2096
2097	switch (*type) {
2098	case OPEN:
2099		if (*len < MSGSIZE_OPEN_MIN) {
2100			log_peer_warnx(&peer->conf,
2101			    "received OPEN: illegal len: %u byte", *len);
2102			session_notification_data(peer, ERR_HEADER, ERR_HDR_LEN,
2103			    &olen, sizeof(olen));
2104			bgp_fsm(peer, EVNT_CON_FATAL);
2105			return (-1);
2106		}
2107		break;
2108	case NOTIFICATION:
2109		if (*len < MSGSIZE_NOTIFICATION_MIN) {
2110			log_peer_warnx(&peer->conf,
2111			    "received NOTIFICATION: illegal len: %u byte",
2112			    *len);
2113			session_notification_data(peer, ERR_HEADER, ERR_HDR_LEN,
2114			    &olen, sizeof(olen));
2115			bgp_fsm(peer, EVNT_CON_FATAL);
2116			return (-1);
2117		}
2118		break;
2119	case UPDATE:
2120		if (*len < MSGSIZE_UPDATE_MIN) {
2121			log_peer_warnx(&peer->conf,
2122			    "received UPDATE: illegal len: %u byte", *len);
2123			session_notification_data(peer, ERR_HEADER, ERR_HDR_LEN,
2124			    &olen, sizeof(olen));
2125			bgp_fsm(peer, EVNT_CON_FATAL);
2126			return (-1);
2127		}
2128		break;
2129	case KEEPALIVE:
2130		if (*len != MSGSIZE_KEEPALIVE) {
2131			log_peer_warnx(&peer->conf,
2132			    "received KEEPALIVE: illegal len: %u byte", *len);
2133			session_notification_data(peer, ERR_HEADER, ERR_HDR_LEN,
2134			    &olen, sizeof(olen));
2135			bgp_fsm(peer, EVNT_CON_FATAL);
2136			return (-1);
2137		}
2138		break;
2139	case RREFRESH:
2140		if (*len < MSGSIZE_RREFRESH_MIN) {
2141			log_peer_warnx(&peer->conf,
2142			    "received RREFRESH: illegal len: %u byte", *len);
2143			session_notification_data(peer, ERR_HEADER, ERR_HDR_LEN,
2144			    &olen, sizeof(olen));
2145			bgp_fsm(peer, EVNT_CON_FATAL);
2146			return (-1);
2147		}
2148		break;
2149	default:
2150		log_peer_warnx(&peer->conf,
2151		    "received msg with unknown type %u", *type);
2152		session_notification_data(peer, ERR_HEADER, ERR_HDR_TYPE,
2153		    type, 1);
2154		bgp_fsm(peer, EVNT_CON_FATAL);
2155		return (-1);
2156	}
2157	return (0);
2158}
2159
2160int
2161parse_open(struct peer *peer)
2162{
2163	struct ibuf	 ibuf;
2164	u_char		*p;
2165	uint8_t		 version, rversion;
2166	uint16_t	 short_as, msglen;
2167	uint16_t	 holdtime, myholdtime;
2168	uint32_t	 as, bgpid;
2169	uint8_t		 optparamlen;
2170
2171	p = peer->rbuf->rptr;
2172	p += MSGSIZE_HEADER_MARKER;
2173	memcpy(&msglen, p, sizeof(msglen));
2174	msglen = ntohs(msglen);
2175
2176	p = peer->rbuf->rptr;
2177	p += MSGSIZE_HEADER;	/* header is already checked */
2178	msglen -= MSGSIZE_HEADER;
2179
2180	/* XXX */
2181	ibuf_from_buffer(&ibuf, p, msglen);
2182
2183	if (ibuf_get_n8(&ibuf, &version) == -1 ||
2184	    ibuf_get_n16(&ibuf, &short_as) == -1 ||
2185	    ibuf_get_n16(&ibuf, &holdtime) == -1 ||
2186	    ibuf_get_n32(&ibuf, &bgpid) == -1 ||
2187	    ibuf_get_n8(&ibuf, &optparamlen) == -1)
2188		goto bad_len;
2189
2190	if (version != BGP_VERSION) {
2191		log_peer_warnx(&peer->conf,
2192		    "peer wants unrecognized version %u", version);
2193		if (version > BGP_VERSION)
2194			rversion = version - BGP_VERSION;
2195		else
2196			rversion = BGP_VERSION;
2197		session_notification_data(peer, ERR_OPEN, ERR_OPEN_VERSION,
2198		    &rversion, sizeof(rversion));
2199		change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
2200		return (-1);
2201	}
2202
2203	as = peer->short_as = short_as;
2204	if (as == 0) {
2205		log_peer_warnx(&peer->conf,
2206		    "peer requests unacceptable AS %u", as);
2207		session_notification(peer, ERR_OPEN, ERR_OPEN_AS, NULL);
2208		change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
2209		return (-1);
2210	}
2211
2212	if (holdtime && holdtime < peer->conf.min_holdtime) {
2213		log_peer_warnx(&peer->conf,
2214		    "peer requests unacceptable holdtime %u", holdtime);
2215		session_notification(peer, ERR_OPEN, ERR_OPEN_HOLDTIME, NULL);
2216		change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
2217		return (-1);
2218	}
2219
2220	myholdtime = peer->conf.holdtime;
2221	if (!myholdtime)
2222		myholdtime = conf->holdtime;
2223	if (holdtime < myholdtime)
2224		peer->holdtime = holdtime;
2225	else
2226		peer->holdtime = myholdtime;
2227
2228	/* check bgpid for validity - just disallow 0 */
2229	if (bgpid == 0) {
2230		log_peer_warnx(&peer->conf, "peer BGPID 0 unacceptable");
2231		session_notification(peer, ERR_OPEN, ERR_OPEN_BGPID, NULL);
2232		change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
2233		return (-1);
2234	}
2235	peer->remote_bgpid = bgpid;
2236
2237	if (optparamlen != 0) {
2238		struct ibuf oparams, op;
2239		uint8_t ext_type, op_type;
2240		uint16_t ext_len, op_len;
2241
2242		ibuf_from_ibuf(&oparams, &ibuf);
2243
2244		/* check for RFC9072 encoding */
2245		if (ibuf_get_n8(&oparams, &ext_type) == -1)
2246			goto bad_len;
2247		if (ext_type == OPT_PARAM_EXT_LEN) {
2248			if (ibuf_get_n16(&oparams, &ext_len) == -1)
2249				goto bad_len;
2250			/* skip RFC9072 header */
2251			if (ibuf_skip(&ibuf, 3) == -1)
2252				goto bad_len;
2253		} else {
2254			ext_len = optparamlen;
2255			ibuf_rewind(&oparams);
2256		}
2257
2258		if (ibuf_truncate(&oparams, ext_len) == -1 ||
2259		    ibuf_skip(&ibuf, ext_len) == -1)
2260			goto bad_len;
2261
2262		while (ibuf_size(&oparams) > 0) {
2263			if (ibuf_get_n8(&oparams, &op_type) == -1)
2264				goto bad_len;
2265
2266			if (ext_type == OPT_PARAM_EXT_LEN) {
2267				if (ibuf_get_n16(&oparams, &op_len) == -1)
2268					goto bad_len;
2269			} else {
2270				uint8_t tmp;
2271				if (ibuf_get_n8(&oparams, &tmp) == -1)
2272					goto bad_len;
2273				op_len = tmp;
2274			}
2275
2276			if (ibuf_get_ibuf(&oparams, op_len, &op) == -1)
2277				goto bad_len;
2278
2279			switch (op_type) {
2280			case OPT_PARAM_CAPABILITIES:		/* RFC 3392 */
2281				if (parse_capabilities(peer, &op, &as) == -1) {
2282					session_notification(peer, ERR_OPEN, 0,
2283					    NULL);
2284					change_state(peer, STATE_IDLE,
2285					    EVNT_RCVD_OPEN);
2286					return (-1);
2287				}
2288				break;
2289			case OPT_PARAM_AUTH:			/* deprecated */
2290			default:
2291				/*
2292				 * unsupported type
2293				 * the RFCs tell us to leave the data section
2294				 * empty and notify the peer with ERR_OPEN,
2295				 * ERR_OPEN_OPT. How the peer should know
2296				 * _which_ optional parameter we don't support
2297				 * is beyond me.
2298				 */
2299				log_peer_warnx(&peer->conf,
2300				    "received OPEN message with unsupported "
2301				    "optional parameter: type %u", op_type);
2302				session_notification(peer, ERR_OPEN,
2303				    ERR_OPEN_OPT, NULL);
2304				change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
2305				return (-1);
2306			}
2307		}
2308	}
2309
2310	if (ibuf_size(&ibuf) != 0) {
2311 bad_len:
2312		log_peer_warnx(&peer->conf,
2313		    "corrupt OPEN message received: length mismatch");
2314		session_notification(peer, ERR_OPEN, 0, NULL);
2315		change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
2316		return (-1);
2317	}
2318
2319	/* if remote-as is zero and it's a cloned neighbor, accept any */
2320	if (peer->template && !peer->conf.remote_as && as != AS_TRANS) {
2321		peer->conf.remote_as = as;
2322		peer->conf.ebgp = (peer->conf.remote_as != peer->conf.local_as);
2323		if (!peer->conf.ebgp)
2324			/* force enforce_as off for iBGP sessions */
2325			peer->conf.enforce_as = ENFORCE_AS_OFF;
2326	}
2327
2328	if (peer->conf.remote_as != as) {
2329		log_peer_warnx(&peer->conf, "peer sent wrong AS %s",
2330		    log_as(as));
2331		session_notification(peer, ERR_OPEN, ERR_OPEN_AS, NULL);
2332		change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
2333		return (-1);
2334	}
2335
2336	/* on iBGP sessions check for bgpid collision */
2337	if (!peer->conf.ebgp && peer->remote_bgpid == conf->bgpid) {
2338		struct in_addr ina;
2339		ina.s_addr = htonl(bgpid);
2340		log_peer_warnx(&peer->conf, "peer BGPID %s conflicts with ours",
2341		    inet_ntoa(ina));
2342		session_notification(peer, ERR_OPEN, ERR_OPEN_BGPID, NULL);
2343		change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
2344		return (-1);
2345	}
2346
2347	if (capa_neg_calc(peer) == -1) {
2348		change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
2349		return (-1);
2350	}
2351
2352	return (0);
2353}
2354
2355int
2356parse_update(struct peer *peer)
2357{
2358	u_char		*p;
2359	uint16_t	 datalen;
2360
2361	/*
2362	 * we pass the message verbatim to the rde.
2363	 * in case of errors the whole session is reset with a
2364	 * notification anyway, we only need to know the peer
2365	 */
2366	p = peer->rbuf->rptr;
2367	p += MSGSIZE_HEADER_MARKER;
2368	memcpy(&datalen, p, sizeof(datalen));
2369	datalen = ntohs(datalen);
2370
2371	p = peer->rbuf->rptr;
2372	p += MSGSIZE_HEADER;	/* header is already checked */
2373	datalen -= MSGSIZE_HEADER;
2374
2375	if (imsg_rde(IMSG_UPDATE, peer->conf.id, p, datalen) == -1)
2376		return (-1);
2377
2378	return (0);
2379}
2380
2381int
2382parse_rrefresh(struct peer *peer)
2383{
2384	struct route_refresh rr;
2385	struct ibuf ibuf;
2386	uint16_t afi, datalen;
2387	uint8_t aid, safi, subtype;
2388	u_char *p;
2389
2390	p = peer->rbuf->rptr;
2391	p += MSGSIZE_HEADER_MARKER;
2392	memcpy(&datalen, p, sizeof(datalen));
2393	datalen = ntohs(datalen);
2394
2395	p = peer->rbuf->rptr;
2396	p += MSGSIZE_HEADER;	/* header is already checked */
2397	datalen -= MSGSIZE_HEADER;
2398
2399	/* XXX */
2400	ibuf_from_buffer(&ibuf, p, datalen);
2401
2402	if (ibuf_get_n16(&ibuf, &afi) == -1 ||
2403	    ibuf_get_n8(&ibuf, &subtype) == -1 ||
2404	    ibuf_get_n8(&ibuf, &safi) == -1) {
2405		/* minimum size checked in session_process_msg() */
2406		fatalx("%s: message too small", __func__);
2407	}
2408
2409	/* check subtype if peer announced enhanced route refresh */
2410	if (peer->capa.neg.enhanced_rr) {
2411		switch (subtype) {
2412		case ROUTE_REFRESH_REQUEST:
2413			/* no ORF support, so no oversized RREFRESH msgs */
2414			if (datalen != MSGSIZE_RREFRESH) {
2415				log_peer_warnx(&peer->conf,
2416				    "received RREFRESH: illegal len: %u byte",
2417				    datalen);
2418				datalen = htons(datalen);
2419				session_notification_data(peer, ERR_HEADER,
2420				    ERR_HDR_LEN, &datalen, sizeof(datalen));
2421				bgp_fsm(peer, EVNT_CON_FATAL);
2422				return (-1);
2423			}
2424			peer->stats.refresh_rcvd_req++;
2425			break;
2426		case ROUTE_REFRESH_BEGIN_RR:
2427		case ROUTE_REFRESH_END_RR:
2428			/* special handling for RFC7313 */
2429			if (datalen != MSGSIZE_RREFRESH) {
2430				log_peer_warnx(&peer->conf,
2431				    "received RREFRESH: illegal len: %u byte",
2432				    datalen);
2433				ibuf_rewind(&ibuf);
2434				session_notification(peer, ERR_RREFRESH,
2435				    ERR_RR_INV_LEN, &ibuf);
2436				bgp_fsm(peer, EVNT_CON_FATAL);
2437				return (-1);
2438			}
2439			if (subtype == ROUTE_REFRESH_BEGIN_RR)
2440				peer->stats.refresh_rcvd_borr++;
2441			else
2442				peer->stats.refresh_rcvd_eorr++;
2443			break;
2444		default:
2445			log_peer_warnx(&peer->conf, "peer sent bad refresh, "
2446			    "bad subtype %d", subtype);
2447			return (0);
2448		}
2449	} else {
2450		/* force subtype to default */
2451		subtype = ROUTE_REFRESH_REQUEST;
2452		peer->stats.refresh_rcvd_req++;
2453	}
2454
2455	/* afi/safi unchecked -	unrecognized values will be ignored anyway */
2456	if (afi2aid(afi, safi, &aid) == -1) {
2457		log_peer_warnx(&peer->conf, "peer sent bad refresh, "
2458		    "invalid afi/safi pair");
2459		return (0);
2460	}
2461
2462	if (!peer->capa.neg.refresh && !peer->capa.neg.enhanced_rr) {
2463		log_peer_warnx(&peer->conf, "peer sent unexpected refresh");
2464		return (0);
2465	}
2466
2467	rr.aid = aid;
2468	rr.subtype = subtype;
2469
2470	if (imsg_rde(IMSG_REFRESH, peer->conf.id, &rr, sizeof(rr)) == -1)
2471		return (-1);
2472
2473	return (0);
2474}
2475
2476void
2477parse_notification(struct peer *peer)
2478{
2479	struct ibuf	 ibuf;
2480	u_char		*p;
2481	uint16_t	 datalen;
2482	uint8_t		 errcode, subcode;
2483	uint8_t		 reason_len;
2484
2485	/* just log */
2486	p = peer->rbuf->rptr;
2487	p += MSGSIZE_HEADER_MARKER;
2488	memcpy(&datalen, p, sizeof(datalen));
2489	datalen = ntohs(datalen);
2490
2491	p = peer->rbuf->rptr;
2492	p += MSGSIZE_HEADER;	/* header is already checked */
2493	datalen -= MSGSIZE_HEADER;
2494
2495	/* XXX */
2496	ibuf_from_buffer(&ibuf, p, datalen);
2497
2498	if (ibuf_get_n8(&ibuf, &errcode) == -1 ||
2499	    ibuf_get_n8(&ibuf, &subcode) == -1) {
2500		log_peer_warnx(&peer->conf, "received bad notification");
2501		goto done;
2502	}
2503
2504	peer->errcnt++;
2505	peer->stats.last_rcvd_errcode = errcode;
2506	peer->stats.last_rcvd_suberr = subcode;
2507
2508	log_notification(peer, errcode, subcode, &ibuf, "received");
2509
2510	CTASSERT(sizeof(peer->stats.last_reason) > UINT8_MAX);
2511	memset(peer->stats.last_reason, 0, sizeof(peer->stats.last_reason));
2512	if (errcode == ERR_CEASE &&
2513	    (subcode == ERR_CEASE_ADMIN_DOWN ||
2514	     subcode == ERR_CEASE_ADMIN_RESET)) {
2515		/* check if shutdown reason is included */
2516		if (ibuf_get_n8(&ibuf, &reason_len) != -1 && reason_len != 0) {
2517			if (ibuf_get(&ibuf, peer->stats.last_reason,
2518			    reason_len) == -1)
2519				log_peer_warnx(&peer->conf,
2520				    "received truncated shutdown reason");
2521		}
2522	}
2523
2524done:
2525	change_state(peer, STATE_IDLE, EVNT_RCVD_NOTIFICATION);
2526}
2527
2528int
2529parse_capabilities(struct peer *peer, struct ibuf *buf, uint32_t *as)
2530{
2531	struct ibuf	 capabuf;
2532	uint16_t	 afi, gr_header;
2533	uint8_t		 capa_code, capa_len;
2534	uint8_t		 safi, aid, role, flags;
2535
2536	while (ibuf_size(buf) > 0) {
2537		if (ibuf_get_n8(buf, &capa_code) == -1 ||
2538		    ibuf_get_n8(buf, &capa_len) == -1) {
2539			log_peer_warnx(&peer->conf, "Bad capabilities attr "
2540			    "length: too short");
2541			return (-1);
2542		}
2543		if (ibuf_get_ibuf(buf, capa_len, &capabuf) == -1) {
2544			log_peer_warnx(&peer->conf,
2545			    "Received bad capabilities attr length: "
2546			    "len %zu smaller than capa_len %u",
2547			    ibuf_size(buf), capa_len);
2548			return (-1);
2549		}
2550
2551		switch (capa_code) {
2552		case CAPA_MP:			/* RFC 4760 */
2553			if (capa_len != 4 ||
2554			    ibuf_get_n16(&capabuf, &afi) == -1 ||
2555			    ibuf_skip(&capabuf, 1) == -1 ||
2556			    ibuf_get_n8(&capabuf, &safi) == -1) {
2557				log_peer_warnx(&peer->conf,
2558				    "Received bad multi protocol capability");
2559				break;
2560			}
2561			if (afi2aid(afi, safi, &aid) == -1) {
2562				log_peer_warnx(&peer->conf,
2563				    "Received multi protocol capability: "
2564				    " unknown AFI %u, safi %u pair",
2565				    afi, safi);
2566				peer->capa.peer.mp[AID_UNSPEC] = 1;
2567				break;
2568			}
2569			peer->capa.peer.mp[aid] = 1;
2570			break;
2571		case CAPA_REFRESH:
2572			peer->capa.peer.refresh = 1;
2573			break;
2574		case CAPA_ROLE:
2575			if (capa_len != 1 ||
2576			    ibuf_get_n8(&capabuf, &role) == -1) {
2577				log_peer_warnx(&peer->conf,
2578				    "Received bad role capability");
2579				break;
2580			}
2581			if (!peer->conf.ebgp) {
2582				log_peer_warnx(&peer->conf,
2583				    "Received role capability on iBGP session");
2584				break;
2585			}
2586			peer->capa.peer.policy = 1;
2587			peer->remote_role = capa2role(role);
2588			break;
2589		case CAPA_RESTART:
2590			if (capa_len == 2) {
2591				/* peer only supports EoR marker */
2592				peer->capa.peer.grestart.restart = 1;
2593				peer->capa.peer.grestart.timeout = 0;
2594				break;
2595			} else if (capa_len % 4 != 2) {
2596				log_peer_warnx(&peer->conf,
2597				    "Bad graceful restart capability");
2598				peer->capa.peer.grestart.restart = 0;
2599				peer->capa.peer.grestart.timeout = 0;
2600				break;
2601			}
2602
2603			if (ibuf_get_n16(&capabuf, &gr_header) == -1) {
2604 bad_gr_restart:
2605				log_peer_warnx(&peer->conf,
2606				    "Bad graceful restart capability");
2607				peer->capa.peer.grestart.restart = 0;
2608				peer->capa.peer.grestart.timeout = 0;
2609				break;
2610			}
2611
2612			peer->capa.peer.grestart.timeout =
2613			    gr_header & CAPA_GR_TIMEMASK;
2614			if (peer->capa.peer.grestart.timeout == 0) {
2615				log_peer_warnx(&peer->conf, "Received "
2616				    "graceful restart with zero timeout");
2617				peer->capa.peer.grestart.restart = 0;
2618				break;
2619			}
2620
2621			while (ibuf_size(&capabuf) > 0) {
2622				if (ibuf_get_n16(&capabuf, &afi) == -1 ||
2623				    ibuf_get_n8(&capabuf, &safi) == -1 ||
2624				    ibuf_get_n8(&capabuf, &flags) == -1)
2625					goto bad_gr_restart;
2626				if (afi2aid(afi, safi, &aid) == -1) {
2627					log_peer_warnx(&peer->conf,
2628					    "Received graceful restart capa: "
2629					    " unknown AFI %u, safi %u pair",
2630					    afi, safi);
2631					continue;
2632				}
2633				peer->capa.peer.grestart.flags[aid] |=
2634				    CAPA_GR_PRESENT;
2635				if (flags & CAPA_GR_F_FLAG)
2636					peer->capa.peer.grestart.flags[aid] |=
2637					    CAPA_GR_FORWARD;
2638				if (gr_header & CAPA_GR_R_FLAG)
2639					peer->capa.peer.grestart.flags[aid] |=
2640					    CAPA_GR_RESTART;
2641				peer->capa.peer.grestart.restart = 2;
2642			}
2643			break;
2644		case CAPA_AS4BYTE:
2645			if (capa_len != 4 ||
2646			    ibuf_get_n32(&capabuf, as) == -1) {
2647				log_peer_warnx(&peer->conf,
2648				    "Received bad AS4BYTE capability");
2649				peer->capa.peer.as4byte = 0;
2650				break;
2651			}
2652			if (*as == 0) {
2653				log_peer_warnx(&peer->conf,
2654				    "peer requests unacceptable AS %u", *as);
2655				session_notification(peer, ERR_OPEN,
2656				    ERR_OPEN_AS, NULL);
2657				change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
2658				return (-1);
2659			}
2660			peer->capa.peer.as4byte = 1;
2661			break;
2662		case CAPA_ADD_PATH:
2663			if (capa_len % 4 != 0) {
2664 bad_add_path:
2665				log_peer_warnx(&peer->conf,
2666				    "Received bad ADD-PATH capability");
2667				memset(peer->capa.peer.add_path, 0,
2668				    sizeof(peer->capa.peer.add_path));
2669				break;
2670			}
2671			while (ibuf_size(&capabuf) > 0) {
2672				if (ibuf_get_n16(&capabuf, &afi) == -1 ||
2673				    ibuf_get_n8(&capabuf, &safi) == -1 ||
2674				    ibuf_get_n8(&capabuf, &flags) == -1)
2675					goto bad_add_path;
2676				if (afi2aid(afi, safi, &aid) == -1) {
2677					log_peer_warnx(&peer->conf,
2678					    "Received ADD-PATH capa: "
2679					    " unknown AFI %u, safi %u pair",
2680					    afi, safi);
2681					memset(peer->capa.peer.add_path, 0,
2682					    sizeof(peer->capa.peer.add_path));
2683					break;
2684				}
2685				if (flags & ~CAPA_AP_BIDIR) {
2686					log_peer_warnx(&peer->conf,
2687					    "Received ADD-PATH capa: "
2688					    " bad flags %x", flags);
2689					memset(peer->capa.peer.add_path, 0,
2690					    sizeof(peer->capa.peer.add_path));
2691					break;
2692				}
2693				peer->capa.peer.add_path[aid] = flags;
2694			}
2695			break;
2696		case CAPA_ENHANCED_RR:
2697			peer->capa.peer.enhanced_rr = 1;
2698			break;
2699		default:
2700			break;
2701		}
2702	}
2703
2704	return (0);
2705}
2706
2707int
2708capa_neg_calc(struct peer *p)
2709{
2710	struct ibuf *ebuf;
2711	uint8_t	i, hasmp = 0, capa_code, capa_len, capa_aid = 0;
2712
2713	/* a capability is accepted only if both sides announced it */
2714
2715	p->capa.neg.refresh =
2716	    (p->capa.ann.refresh && p->capa.peer.refresh) != 0;
2717	p->capa.neg.enhanced_rr =
2718	    (p->capa.ann.enhanced_rr && p->capa.peer.enhanced_rr) != 0;
2719	p->capa.neg.as4byte =
2720	    (p->capa.ann.as4byte && p->capa.peer.as4byte) != 0;
2721
2722	/* MP: both side must agree on the AFI,SAFI pair */
2723	if (p->capa.peer.mp[AID_UNSPEC])
2724		hasmp = 1;
2725	for (i = AID_MIN; i < AID_MAX; i++) {
2726		if (p->capa.ann.mp[i] && p->capa.peer.mp[i])
2727			p->capa.neg.mp[i] = 1;
2728		else
2729			p->capa.neg.mp[i] = 0;
2730		if (p->capa.ann.mp[i] || p->capa.peer.mp[i])
2731			hasmp = 1;
2732	}
2733	/* if no MP capability present default to IPv4 unicast mode */
2734	if (!hasmp)
2735		p->capa.neg.mp[AID_INET] = 1;
2736
2737	/*
2738	 * graceful restart: the peer capabilities are of interest here.
2739	 * It is necessary to compare the new values with the previous ones
2740	 * and act accordingly. AFI/SAFI that are not part in the MP capability
2741	 * are treated as not being present.
2742	 * Also make sure that a flush happens if the session stopped
2743	 * supporting graceful restart.
2744	 */
2745
2746	for (i = AID_MIN; i < AID_MAX; i++) {
2747		int8_t	negflags;
2748
2749		/* disable GR if the AFI/SAFI is not present */
2750		if ((p->capa.peer.grestart.flags[i] & CAPA_GR_PRESENT &&
2751		    p->capa.neg.mp[i] == 0))
2752			p->capa.peer.grestart.flags[i] = 0;	/* disable */
2753		/* look at current GR state and decide what to do */
2754		negflags = p->capa.neg.grestart.flags[i];
2755		p->capa.neg.grestart.flags[i] = p->capa.peer.grestart.flags[i];
2756		if (negflags & CAPA_GR_RESTARTING) {
2757			if (p->capa.ann.grestart.restart != 0 &&
2758			    p->capa.peer.grestart.flags[i] & CAPA_GR_FORWARD) {
2759				p->capa.neg.grestart.flags[i] |=
2760				    CAPA_GR_RESTARTING;
2761			} else {
2762				if (imsg_rde(IMSG_SESSION_FLUSH, p->conf.id,
2763				    &i, sizeof(i)) == -1) {
2764					log_peer_warnx(&p->conf,
2765					    "imsg send failed");
2766					return (-1);
2767				}
2768				log_peer_warnx(&p->conf, "graceful restart of "
2769				    "%s, not restarted, flushing", aid2str(i));
2770			}
2771		}
2772	}
2773	p->capa.neg.grestart.timeout = p->capa.peer.grestart.timeout;
2774	p->capa.neg.grestart.restart = p->capa.peer.grestart.restart;
2775	if (p->capa.ann.grestart.restart == 0)
2776		p->capa.neg.grestart.restart = 0;
2777
2778	/*
2779	 * ADD-PATH: set only those bits where both sides agree.
2780	 * For this compare our send bit with the recv bit from the peer
2781	 * and vice versa.
2782	 * The flags are stored from this systems view point.
2783	 * At index 0 the flags are set if any per-AID flag is set.
2784	 */
2785	memset(p->capa.neg.add_path, 0, sizeof(p->capa.neg.add_path));
2786	for (i = AID_MIN; i < AID_MAX; i++) {
2787		if (p->capa.neg.mp[i] == 0)
2788			continue;
2789		if ((p->capa.ann.add_path[i] & CAPA_AP_RECV) &&
2790		    (p->capa.peer.add_path[i] & CAPA_AP_SEND)) {
2791			p->capa.neg.add_path[i] |= CAPA_AP_RECV;
2792			p->capa.neg.add_path[0] |= CAPA_AP_RECV;
2793		}
2794		if ((p->capa.ann.add_path[i] & CAPA_AP_SEND) &&
2795		    (p->capa.peer.add_path[i] & CAPA_AP_RECV)) {
2796			p->capa.neg.add_path[i] |= CAPA_AP_SEND;
2797			p->capa.neg.add_path[0] |= CAPA_AP_SEND;
2798		}
2799	}
2800
2801	/*
2802	 * Open policy: check that the policy is sensible.
2803	 *
2804	 * Make sure that the roles match and set the negotiated capability
2805	 * to the role of the peer. So the RDE can inject the OTC attribute.
2806	 * See RFC 9234, section 4.2.
2807	 * These checks should only happen on ebgp sessions.
2808	 */
2809	if (p->capa.ann.policy != 0 && p->capa.peer.policy != 0 &&
2810	    p->conf.ebgp) {
2811		switch (p->conf.role) {
2812		case ROLE_PROVIDER:
2813			if (p->remote_role != ROLE_CUSTOMER)
2814				goto policyfail;
2815			break;
2816		case ROLE_RS:
2817			if (p->remote_role != ROLE_RS_CLIENT)
2818				goto policyfail;
2819			break;
2820		case ROLE_RS_CLIENT:
2821			if (p->remote_role != ROLE_RS)
2822				goto policyfail;
2823			break;
2824		case ROLE_CUSTOMER:
2825			if (p->remote_role != ROLE_PROVIDER)
2826				goto policyfail;
2827			break;
2828		case ROLE_PEER:
2829			if (p->remote_role != ROLE_PEER)
2830				goto policyfail;
2831			break;
2832		default:
2833 policyfail:
2834			log_peer_warnx(&p->conf, "open policy role mismatch: "
2835			    "our role %s, their role %s",
2836			    log_policy(p->conf.role),
2837			    log_policy(p->remote_role));
2838			session_notification(p, ERR_OPEN, ERR_OPEN_ROLE, NULL);
2839			return (-1);
2840		}
2841		p->capa.neg.policy = 1;
2842	}
2843
2844	/* enforce presence of open policy role capability */
2845	if (p->capa.ann.policy == 2 && p->capa.peer.policy == 0 &&
2846	    p->conf.ebgp) {
2847		log_peer_warnx(&p->conf, "open policy role enforced but "
2848		    "not present");
2849		session_notification(p, ERR_OPEN, ERR_OPEN_ROLE, NULL);
2850		return (-1);
2851	}
2852
2853	/* enforce presence of other capabilities */
2854	if (p->capa.ann.refresh == 2 && p->capa.neg.refresh == 0) {
2855		capa_code = CAPA_REFRESH;
2856		capa_len = 0;
2857		goto fail;
2858	}
2859	if (p->capa.ann.enhanced_rr == 2 && p->capa.neg.enhanced_rr == 0) {
2860		capa_code = CAPA_ENHANCED_RR;
2861		capa_len = 0;
2862		goto fail;
2863	}
2864	if (p->capa.ann.as4byte == 2 && p->capa.neg.as4byte == 0) {
2865		capa_code = CAPA_AS4BYTE;
2866		capa_len = 4;
2867		goto fail;
2868	}
2869	if (p->capa.ann.grestart.restart == 2 &&
2870	    p->capa.neg.grestart.restart == 0) {
2871		capa_code = CAPA_RESTART;
2872		capa_len = 2;
2873		goto fail;
2874	}
2875	for (i = AID_MIN; i < AID_MAX; i++) {
2876		if (p->capa.ann.mp[i] == 2 && p->capa.neg.mp[i] == 0) {
2877			capa_code = CAPA_MP;
2878			capa_len = 4;
2879			capa_aid = i;
2880			goto fail;
2881		}
2882	}
2883
2884	for (i = AID_MIN; i < AID_MAX; i++) {
2885		if (p->capa.neg.mp[i] == 0)
2886			continue;
2887		if ((p->capa.ann.add_path[i] & CAPA_AP_RECV_ENFORCE) &&
2888		    (p->capa.neg.add_path[i] & CAPA_AP_RECV) == 0) {
2889			capa_code = CAPA_ADD_PATH;
2890			capa_len = 4;
2891			capa_aid = i;
2892			goto fail;
2893		}
2894		if ((p->capa.ann.add_path[i] & CAPA_AP_SEND_ENFORCE) &&
2895		    (p->capa.neg.add_path[i] & CAPA_AP_SEND) == 0) {
2896			capa_code = CAPA_ADD_PATH;
2897			capa_len = 4;
2898			capa_aid = i;
2899			goto fail;
2900		}
2901	}
2902
2903	return (0);
2904
2905 fail:
2906	if ((ebuf = ibuf_dynamic(2, 256)) == NULL)
2907		return (-1);
2908	/* best effort, no problem if it fails */
2909	session_capa_add(ebuf, capa_code, capa_len);
2910	if (capa_code == CAPA_MP)
2911		session_capa_add_mp(ebuf, capa_aid);
2912	else if (capa_code == CAPA_ADD_PATH)
2913		session_capa_add_afi(ebuf, capa_aid, 0);
2914	else if (capa_len > 0)
2915		ibuf_add_zero(ebuf, capa_len);
2916
2917	session_notification(p, ERR_OPEN, ERR_OPEN_CAPA, ebuf);
2918	ibuf_free(ebuf);
2919	return (-1);
2920}
2921
2922void
2923session_dispatch_imsg(struct imsgbuf *imsgbuf, int idx, u_int *listener_cnt)
2924{
2925	struct imsg		 imsg;
2926	struct ibuf		 ibuf;
2927	struct mrt		 xmrt;
2928	struct route_refresh	 rr;
2929	struct mrt		*mrt;
2930	struct imsgbuf		*i;
2931	struct peer		*p;
2932	struct listen_addr	*la, *next, nla;
2933	struct session_dependon	 sdon;
2934	struct bgpd_config	 tconf;
2935	size_t			 len;
2936	uint32_t		 peerid;
2937	int			 n, fd, depend_ok, restricted;
2938	uint16_t		 t;
2939	uint8_t			 aid, errcode, subcode;
2940
2941	while (imsgbuf) {
2942		if ((n = imsg_get(imsgbuf, &imsg)) == -1)
2943			fatal("session_dispatch_imsg: imsg_get error");
2944
2945		if (n == 0)
2946			break;
2947
2948		peerid = imsg_get_id(&imsg);
2949		switch (imsg_get_type(&imsg)) {
2950		case IMSG_SOCKET_CONN:
2951		case IMSG_SOCKET_CONN_CTL:
2952			if (idx != PFD_PIPE_MAIN)
2953				fatalx("reconf request not from parent");
2954			if ((fd = imsg_get_fd(&imsg)) == -1) {
2955				log_warnx("expected to receive imsg fd to "
2956				    "RDE but didn't receive any");
2957				break;
2958			}
2959			if ((i = malloc(sizeof(struct imsgbuf))) == NULL)
2960				fatal(NULL);
2961			imsg_init(i, fd);
2962			if (imsg_get_type(&imsg) == IMSG_SOCKET_CONN) {
2963				if (ibuf_rde) {
2964					log_warnx("Unexpected imsg connection "
2965					    "to RDE received");
2966					msgbuf_clear(&ibuf_rde->w);
2967					free(ibuf_rde);
2968				}
2969				ibuf_rde = i;
2970			} else {
2971				if (ibuf_rde_ctl) {
2972					log_warnx("Unexpected imsg ctl "
2973					    "connection to RDE received");
2974					msgbuf_clear(&ibuf_rde_ctl->w);
2975					free(ibuf_rde_ctl);
2976				}
2977				ibuf_rde_ctl = i;
2978			}
2979			break;
2980		case IMSG_RECONF_CONF:
2981			if (idx != PFD_PIPE_MAIN)
2982				fatalx("reconf request not from parent");
2983			if (imsg_get_data(&imsg, &tconf, sizeof(tconf)) == -1)
2984				fatal("imsg_get_data");
2985
2986			nconf = new_config();
2987			copy_config(nconf, &tconf);
2988			pending_reconf = 1;
2989			break;
2990		case IMSG_RECONF_PEER:
2991			if (idx != PFD_PIPE_MAIN)
2992				fatalx("reconf request not from parent");
2993			if ((p = calloc(1, sizeof(struct peer))) == NULL)
2994				fatal("new_peer");
2995			if (imsg_get_data(&imsg, &p->conf, sizeof(p->conf)) ==
2996			    -1)
2997				fatal("imsg_get_data");
2998			p->state = p->prev_state = STATE_NONE;
2999			p->reconf_action = RECONF_REINIT;
3000			if (RB_INSERT(peer_head, &nconf->peers, p) != NULL)
3001				fatalx("%s: peer tree is corrupt", __func__);
3002			break;
3003		case IMSG_RECONF_LISTENER:
3004			if (idx != PFD_PIPE_MAIN)
3005				fatalx("reconf request not from parent");
3006			if (nconf == NULL)
3007				fatalx("IMSG_RECONF_LISTENER but no config");
3008			if (imsg_get_data(&imsg, &nla, sizeof(nla)) == -1)
3009				fatal("imsg_get_data");
3010			TAILQ_FOREACH(la, conf->listen_addrs, entry)
3011				if (!la_cmp(la, &nla))
3012					break;
3013
3014			if (la == NULL) {
3015				if (nla.reconf != RECONF_REINIT)
3016					fatalx("king bula sez: "
3017					    "expected REINIT");
3018
3019				if ((nla.fd = imsg_get_fd(&imsg)) == -1)
3020					log_warnx("expected to receive fd for "
3021					    "%s but didn't receive any",
3022					    log_sockaddr((struct sockaddr *)
3023					    &nla.sa, nla.sa_len));
3024
3025				la = calloc(1, sizeof(struct listen_addr));
3026				if (la == NULL)
3027					fatal(NULL);
3028				memcpy(&la->sa, &nla.sa, sizeof(la->sa));
3029				la->flags = nla.flags;
3030				la->fd = nla.fd;
3031				la->reconf = RECONF_REINIT;
3032				TAILQ_INSERT_TAIL(nconf->listen_addrs, la,
3033				    entry);
3034			} else {
3035				if (nla.reconf != RECONF_KEEP)
3036					fatalx("king bula sez: expected KEEP");
3037				la->reconf = RECONF_KEEP;
3038			}
3039
3040			break;
3041		case IMSG_RECONF_CTRL:
3042			if (idx != PFD_PIPE_MAIN)
3043				fatalx("reconf request not from parent");
3044
3045			if (imsg_get_data(&imsg, &restricted,
3046			    sizeof(restricted)) == -1)
3047				fatal("imsg_get_data");
3048			if ((fd = imsg_get_fd(&imsg)) == -1) {
3049				log_warnx("expected to receive fd for control "
3050				    "socket but didn't receive any");
3051				break;
3052			}
3053			if (restricted) {
3054				control_shutdown(rcsock);
3055				rcsock = fd;
3056			} else {
3057				control_shutdown(csock);
3058				csock = fd;
3059			}
3060			break;
3061		case IMSG_RECONF_DRAIN:
3062			switch (idx) {
3063			case PFD_PIPE_ROUTE:
3064				if (nconf != NULL)
3065					fatalx("got unexpected %s from RDE",
3066					    "IMSG_RECONF_DONE");
3067				imsg_compose(ibuf_main, IMSG_RECONF_DONE, 0, 0,
3068				    -1, NULL, 0);
3069				break;
3070			case PFD_PIPE_MAIN:
3071				if (nconf == NULL)
3072					fatalx("got unexpected %s from parent",
3073					    "IMSG_RECONF_DONE");
3074				imsg_compose(ibuf_main, IMSG_RECONF_DRAIN, 0, 0,
3075				    -1, NULL, 0);
3076				break;
3077			default:
3078				fatalx("reconf request not from parent or RDE");
3079			}
3080			break;
3081		case IMSG_RECONF_DONE:
3082			if (idx != PFD_PIPE_MAIN)
3083				fatalx("reconf request not from parent");
3084			if (nconf == NULL)
3085				fatalx("got IMSG_RECONF_DONE but no config");
3086			copy_config(conf, nconf);
3087			merge_peers(conf, nconf);
3088
3089			/* delete old listeners */
3090			TAILQ_FOREACH_SAFE(la, conf->listen_addrs, entry,
3091			    next) {
3092				if (la->reconf == RECONF_NONE) {
3093					log_info("not listening on %s any more",
3094					    log_sockaddr((struct sockaddr *)
3095					    &la->sa, la->sa_len));
3096					TAILQ_REMOVE(conf->listen_addrs, la,
3097					    entry);
3098					close(la->fd);
3099					free(la);
3100				}
3101			}
3102
3103			/* add new listeners */
3104			TAILQ_CONCAT(conf->listen_addrs, nconf->listen_addrs,
3105			    entry);
3106
3107			setup_listeners(listener_cnt);
3108			free_config(nconf);
3109			nconf = NULL;
3110			pending_reconf = 0;
3111			log_info("SE reconfigured");
3112			/*
3113			 * IMSG_RECONF_DONE is sent when the RDE drained
3114			 * the peer config sent in merge_peers().
3115			 */
3116			break;
3117		case IMSG_SESSION_DEPENDON:
3118			if (idx != PFD_PIPE_MAIN)
3119				fatalx("IFINFO message not from parent");
3120			if (imsg_get_data(&imsg, &sdon, sizeof(sdon)) == -1)
3121				fatalx("DEPENDON imsg with wrong len");
3122			depend_ok = sdon.depend_state;
3123
3124			RB_FOREACH(p, peer_head, &conf->peers)
3125				if (!strcmp(p->conf.if_depend, sdon.ifname)) {
3126					if (depend_ok && !p->depend_ok) {
3127						p->depend_ok = depend_ok;
3128						bgp_fsm(p, EVNT_START);
3129					} else if (!depend_ok && p->depend_ok) {
3130						p->depend_ok = depend_ok;
3131						session_stop(p,
3132						    ERR_CEASE_OTHER_CHANGE,
3133						    NULL);
3134					}
3135				}
3136			break;
3137		case IMSG_MRT_OPEN:
3138		case IMSG_MRT_REOPEN:
3139			if (idx != PFD_PIPE_MAIN)
3140				fatalx("mrt request not from parent");
3141			if (imsg_get_data(&imsg, &xmrt, sizeof(xmrt)) == -1) {
3142				log_warnx("mrt open, wrong imsg len");
3143				break;
3144			}
3145
3146			if ((xmrt.wbuf.fd = imsg_get_fd(&imsg)) == -1) {
3147				log_warnx("expected to receive fd for mrt dump "
3148				    "but didn't receive any");
3149				break;
3150			}
3151
3152			mrt = mrt_get(&mrthead, &xmrt);
3153			if (mrt == NULL) {
3154				/* new dump */
3155				mrt = calloc(1, sizeof(struct mrt));
3156				if (mrt == NULL)
3157					fatal("session_dispatch_imsg");
3158				memcpy(mrt, &xmrt, sizeof(struct mrt));
3159				TAILQ_INIT(&mrt->wbuf.bufs);
3160				LIST_INSERT_HEAD(&mrthead, mrt, entry);
3161			} else {
3162				/* old dump reopened */
3163				close(mrt->wbuf.fd);
3164				mrt->wbuf.fd = xmrt.wbuf.fd;
3165			}
3166			break;
3167		case IMSG_MRT_CLOSE:
3168			if (idx != PFD_PIPE_MAIN)
3169				fatalx("mrt request not from parent");
3170			if (imsg_get_data(&imsg, &xmrt, sizeof(xmrt)) == -1) {
3171				log_warnx("mrt close, wrong imsg len");
3172				break;
3173			}
3174
3175			mrt = mrt_get(&mrthead, &xmrt);
3176			if (mrt != NULL)
3177				mrt_done(mrt);
3178			break;
3179		case IMSG_CTL_KROUTE:
3180		case IMSG_CTL_KROUTE_ADDR:
3181		case IMSG_CTL_SHOW_NEXTHOP:
3182		case IMSG_CTL_SHOW_INTERFACE:
3183		case IMSG_CTL_SHOW_FIB_TABLES:
3184		case IMSG_CTL_SHOW_RTR:
3185		case IMSG_CTL_SHOW_TIMER:
3186			if (idx != PFD_PIPE_MAIN)
3187				fatalx("ctl kroute request not from parent");
3188			control_imsg_relay(&imsg, NULL);
3189			break;
3190		case IMSG_CTL_SHOW_NEIGHBOR:
3191			if (idx != PFD_PIPE_ROUTE_CTL)
3192				fatalx("ctl rib request not from RDE");
3193			p = getpeerbyid(conf, peerid);
3194			control_imsg_relay(&imsg, p);
3195			break;
3196		case IMSG_CTL_SHOW_RIB:
3197		case IMSG_CTL_SHOW_RIB_PREFIX:
3198		case IMSG_CTL_SHOW_RIB_COMMUNITIES:
3199		case IMSG_CTL_SHOW_RIB_ATTR:
3200		case IMSG_CTL_SHOW_RIB_MEM:
3201		case IMSG_CTL_SHOW_NETWORK:
3202		case IMSG_CTL_SHOW_FLOWSPEC:
3203		case IMSG_CTL_SHOW_SET:
3204			if (idx != PFD_PIPE_ROUTE_CTL)
3205				fatalx("ctl rib request not from RDE");
3206			control_imsg_relay(&imsg, NULL);
3207			break;
3208		case IMSG_CTL_END:
3209		case IMSG_CTL_RESULT:
3210			control_imsg_relay(&imsg, NULL);
3211			break;
3212		case IMSG_UPDATE:
3213			if (idx != PFD_PIPE_ROUTE)
3214				fatalx("update request not from RDE");
3215			len = imsg_get_len(&imsg);
3216			if (imsg_get_ibuf(&imsg, &ibuf) == -1 ||
3217			    len > MAX_PKTSIZE - MSGSIZE_HEADER ||
3218			    len < MSGSIZE_UPDATE_MIN - MSGSIZE_HEADER)
3219				log_warnx("RDE sent invalid update");
3220			else
3221				session_update(peerid, &ibuf);
3222			break;
3223		case IMSG_UPDATE_ERR:
3224			if (idx != PFD_PIPE_ROUTE)
3225				fatalx("update request not from RDE");
3226			if ((p = getpeerbyid(conf, peerid)) == NULL) {
3227				log_warnx("no such peer: id=%u", peerid);
3228				break;
3229			}
3230			if (imsg_get_ibuf(&imsg, &ibuf) == -1 ||
3231			    ibuf_get_n8(&ibuf, &errcode) == -1 ||
3232			    ibuf_get_n8(&ibuf, &subcode) == -1) {
3233				log_warnx("RDE sent invalid notification");
3234				break;
3235			}
3236
3237			session_notification(p, errcode, subcode, &ibuf);
3238			switch (errcode) {
3239			case ERR_CEASE:
3240				switch (subcode) {
3241				case ERR_CEASE_MAX_PREFIX:
3242				case ERR_CEASE_MAX_SENT_PREFIX:
3243					t = p->conf.max_out_prefix_restart;
3244					if (subcode == ERR_CEASE_MAX_PREFIX)
3245						t = p->conf.max_prefix_restart;
3246
3247					bgp_fsm(p, EVNT_STOP);
3248					if (t)
3249						timer_set(&p->timers,
3250						    Timer_IdleHold, 60 * t);
3251					break;
3252				default:
3253					bgp_fsm(p, EVNT_CON_FATAL);
3254					break;
3255				}
3256				break;
3257			default:
3258				bgp_fsm(p, EVNT_CON_FATAL);
3259				break;
3260			}
3261			break;
3262		case IMSG_REFRESH:
3263			if (idx != PFD_PIPE_ROUTE)
3264				fatalx("route refresh request not from RDE");
3265			if (imsg_get_data(&imsg, &rr, sizeof(rr)) == -1) {
3266				log_warnx("RDE sent invalid refresh msg");
3267				break;
3268			}
3269			if ((p = getpeerbyid(conf, peerid)) == NULL) {
3270				log_warnx("no such peer: id=%u", peerid);
3271				break;
3272			}
3273			if (rr.aid < AID_MIN || rr.aid >= AID_MAX)
3274				fatalx("IMSG_REFRESH: bad AID");
3275			session_rrefresh(p, rr.aid, rr.subtype);
3276			break;
3277		case IMSG_SESSION_RESTARTED:
3278			if (idx != PFD_PIPE_ROUTE)
3279				fatalx("session restart not from RDE");
3280			if (imsg_get_data(&imsg, &aid, sizeof(aid)) == -1) {
3281				log_warnx("RDE sent invalid restart msg");
3282				break;
3283			}
3284			if ((p = getpeerbyid(conf, peerid)) == NULL) {
3285				log_warnx("no such peer: id=%u", peerid);
3286				break;
3287			}
3288			if (aid < AID_MIN || aid >= AID_MAX)
3289				fatalx("IMSG_SESSION_RESTARTED: bad AID");
3290			if (p->capa.neg.grestart.flags[aid] &
3291			    CAPA_GR_RESTARTING) {
3292				log_peer_warnx(&p->conf,
3293				    "graceful restart of %s finished",
3294				    aid2str(aid));
3295				p->capa.neg.grestart.flags[aid] &=
3296				    ~CAPA_GR_RESTARTING;
3297				timer_stop(&p->timers, Timer_RestartTimeout);
3298
3299				/* signal back to RDE to cleanup stale routes */
3300				if (imsg_rde(IMSG_SESSION_RESTARTED,
3301				    peerid, &aid, sizeof(aid)) == -1)
3302					fatal("imsg_compose: "
3303					    "IMSG_SESSION_RESTARTED");
3304			}
3305			break;
3306		default:
3307			break;
3308		}
3309		imsg_free(&imsg);
3310	}
3311}
3312
3313int
3314la_cmp(struct listen_addr *a, struct listen_addr *b)
3315{
3316	struct sockaddr_in	*in_a, *in_b;
3317	struct sockaddr_in6	*in6_a, *in6_b;
3318
3319	if (a->sa.ss_family != b->sa.ss_family)
3320		return (1);
3321
3322	switch (a->sa.ss_family) {
3323	case AF_INET:
3324		in_a = (struct sockaddr_in *)&a->sa;
3325		in_b = (struct sockaddr_in *)&b->sa;
3326		if (in_a->sin_addr.s_addr != in_b->sin_addr.s_addr)
3327			return (1);
3328		if (in_a->sin_port != in_b->sin_port)
3329			return (1);
3330		break;
3331	case AF_INET6:
3332		in6_a = (struct sockaddr_in6 *)&a->sa;
3333		in6_b = (struct sockaddr_in6 *)&b->sa;
3334		if (memcmp(&in6_a->sin6_addr, &in6_b->sin6_addr,
3335		    sizeof(struct in6_addr)))
3336			return (1);
3337		if (in6_a->sin6_port != in6_b->sin6_port)
3338			return (1);
3339		break;
3340	default:
3341		fatal("king bula sez: unknown address family");
3342		/* NOTREACHED */
3343	}
3344
3345	return (0);
3346}
3347
3348struct peer *
3349getpeerbydesc(struct bgpd_config *c, const char *descr)
3350{
3351	struct peer	*p, *res = NULL;
3352	int		 match = 0;
3353
3354	RB_FOREACH(p, peer_head, &c->peers)
3355		if (!strcmp(p->conf.descr, descr)) {
3356			res = p;
3357			match++;
3358		}
3359
3360	if (match > 1)
3361		log_info("neighbor description \"%s\" not unique, request "
3362		    "aborted", descr);
3363
3364	if (match == 1)
3365		return (res);
3366	else
3367		return (NULL);
3368}
3369
3370struct peer *
3371getpeerbyip(struct bgpd_config *c, struct sockaddr *ip)
3372{
3373	struct bgpd_addr addr;
3374	struct peer	*p, *newpeer, *loose = NULL;
3375	uint32_t	 id;
3376
3377	sa2addr(ip, &addr, NULL);
3378
3379	/* we might want a more effective way to find peers by IP */
3380	RB_FOREACH(p, peer_head, &c->peers)
3381		if (!p->conf.template &&
3382		    !memcmp(&addr, &p->conf.remote_addr, sizeof(addr)))
3383			return (p);
3384
3385	/* try template matching */
3386	RB_FOREACH(p, peer_head, &c->peers)
3387		if (p->conf.template &&
3388		    p->conf.remote_addr.aid == addr.aid &&
3389		    session_match_mask(p, &addr))
3390			if (loose == NULL || loose->conf.remote_masklen <
3391			    p->conf.remote_masklen)
3392				loose = p;
3393
3394	if (loose != NULL) {
3395		/* clone */
3396		if ((newpeer = malloc(sizeof(struct peer))) == NULL)
3397			fatal(NULL);
3398		memcpy(newpeer, loose, sizeof(struct peer));
3399		for (id = PEER_ID_DYN_MAX; id > PEER_ID_STATIC_MAX; id--) {
3400			if (getpeerbyid(c, id) == NULL)	/* we found a free id */
3401				break;
3402		}
3403		newpeer->template = loose;
3404		session_template_clone(newpeer, ip, id, 0);
3405		newpeer->state = newpeer->prev_state = STATE_NONE;
3406		newpeer->reconf_action = RECONF_KEEP;
3407		newpeer->rbuf = NULL;
3408		newpeer->rpending = 0;
3409		init_peer(newpeer);
3410		bgp_fsm(newpeer, EVNT_START);
3411		if (RB_INSERT(peer_head, &c->peers, newpeer) != NULL)
3412			fatalx("%s: peer tree is corrupt", __func__);
3413		return (newpeer);
3414	}
3415
3416	return (NULL);
3417}
3418
3419struct peer *
3420getpeerbyid(struct bgpd_config *c, uint32_t peerid)
3421{
3422	static struct peer lookup;
3423
3424	lookup.conf.id = peerid;
3425
3426	return RB_FIND(peer_head, &c->peers, &lookup);
3427}
3428
3429int
3430peer_matched(struct peer *p, struct ctl_neighbor *n)
3431{
3432	char *s;
3433
3434	if (n && n->addr.aid) {
3435		if (memcmp(&p->conf.remote_addr, &n->addr,
3436		    sizeof(p->conf.remote_addr)))
3437			return 0;
3438	} else if (n && n->descr[0]) {
3439		s = n->is_group ? p->conf.group : p->conf.descr;
3440		/* cannot trust n->descr to be properly terminated */
3441		if (strncmp(s, n->descr, sizeof(n->descr)))
3442			return 0;
3443	}
3444	return 1;
3445}
3446
3447void
3448session_template_clone(struct peer *p, struct sockaddr *ip, uint32_t id,
3449    uint32_t as)
3450{
3451	struct bgpd_addr	remote_addr;
3452
3453	if (ip)
3454		sa2addr(ip, &remote_addr, NULL);
3455	else
3456		memcpy(&remote_addr, &p->conf.remote_addr, sizeof(remote_addr));
3457
3458	memcpy(&p->conf, &p->template->conf, sizeof(struct peer_config));
3459
3460	p->conf.id = id;
3461
3462	if (as) {
3463		p->conf.remote_as = as;
3464		p->conf.ebgp = (p->conf.remote_as != p->conf.local_as);
3465		if (!p->conf.ebgp)
3466			/* force enforce_as off for iBGP sessions */
3467			p->conf.enforce_as = ENFORCE_AS_OFF;
3468	}
3469
3470	memcpy(&p->conf.remote_addr, &remote_addr, sizeof(remote_addr));
3471	switch (p->conf.remote_addr.aid) {
3472	case AID_INET:
3473		p->conf.remote_masklen = 32;
3474		break;
3475	case AID_INET6:
3476		p->conf.remote_masklen = 128;
3477		break;
3478	}
3479	p->conf.template = 0;
3480}
3481
3482int
3483session_match_mask(struct peer *p, struct bgpd_addr *a)
3484{
3485	struct bgpd_addr masked;
3486
3487	applymask(&masked, a, p->conf.remote_masklen);
3488	if (memcmp(&masked, &p->conf.remote_addr, sizeof(masked)) == 0)
3489		return (1);
3490	return (0);
3491}
3492
3493void
3494session_down(struct peer *peer)
3495{
3496	memset(&peer->capa.neg, 0, sizeof(peer->capa.neg));
3497	peer->stats.last_updown = getmonotime();
3498	/*
3499	 * session_down is called in the exit code path so check
3500	 * if the RDE is still around, if not there is no need to
3501	 * send the message.
3502	 */
3503	if (ibuf_rde == NULL)
3504		return;
3505	if (imsg_rde(IMSG_SESSION_DOWN, peer->conf.id, NULL, 0) == -1)
3506		fatalx("imsg_compose error");
3507}
3508
3509void
3510session_up(struct peer *p)
3511{
3512	struct session_up	 sup;
3513
3514	/* clear last errors, now that the session is up */
3515	p->stats.last_sent_errcode = 0;
3516	p->stats.last_sent_suberr = 0;
3517	p->stats.last_rcvd_errcode = 0;
3518	p->stats.last_rcvd_suberr = 0;
3519	memset(p->stats.last_reason, 0, sizeof(p->stats.last_reason));
3520
3521	if (imsg_rde(IMSG_SESSION_ADD, p->conf.id,
3522	    &p->conf, sizeof(p->conf)) == -1)
3523		fatalx("imsg_compose error");
3524
3525	if (p->local.aid == AID_INET) {
3526		sup.local_v4_addr = p->local;
3527		sup.local_v6_addr = p->local_alt;
3528	} else {
3529		sup.local_v6_addr = p->local;
3530		sup.local_v4_addr = p->local_alt;
3531	}
3532	sup.remote_addr = p->remote;
3533	sup.if_scope = p->if_scope;
3534
3535	sup.remote_bgpid = p->remote_bgpid;
3536	sup.short_as = p->short_as;
3537	memcpy(&sup.capa, &p->capa.neg, sizeof(sup.capa));
3538	p->stats.last_updown = getmonotime();
3539	if (imsg_rde(IMSG_SESSION_UP, p->conf.id, &sup, sizeof(sup)) == -1)
3540		fatalx("imsg_compose error");
3541}
3542
3543int
3544imsg_ctl_parent(struct imsg *imsg)
3545{
3546	return imsg_forward(ibuf_main, imsg);
3547}
3548
3549int
3550imsg_ctl_rde(struct imsg *imsg)
3551{
3552	if (ibuf_rde_ctl == NULL)
3553		return (0);
3554	/*
3555	 * Use control socket to talk to RDE to bypass the queue of the
3556	 * regular imsg socket.
3557	 */
3558	return imsg_forward(ibuf_rde_ctl, imsg);
3559}
3560
3561int
3562imsg_ctl_rde_msg(int type, uint32_t peerid, pid_t pid)
3563{
3564	if (ibuf_rde_ctl == NULL)
3565		return (0);
3566
3567	/*
3568	 * Use control socket to talk to RDE to bypass the queue of the
3569	 * regular imsg socket.
3570	 */
3571	return imsg_compose(ibuf_rde_ctl, type, peerid, pid, -1, NULL, 0);
3572}
3573
3574int
3575imsg_rde(int type, uint32_t peerid, void *data, uint16_t datalen)
3576{
3577	if (ibuf_rde == NULL)
3578		return (0);
3579
3580	return imsg_compose(ibuf_rde, type, peerid, 0, -1, data, datalen);
3581}
3582
3583void
3584session_demote(struct peer *p, int level)
3585{
3586	struct demote_msg	msg;
3587
3588	strlcpy(msg.demote_group, p->conf.demote_group,
3589	    sizeof(msg.demote_group));
3590	msg.level = level;
3591	if (imsg_compose(ibuf_main, IMSG_DEMOTE, p->conf.id, 0, -1,
3592	    &msg, sizeof(msg)) == -1)
3593		fatalx("imsg_compose error");
3594
3595	p->demoted += level;
3596}
3597
3598void
3599session_stop(struct peer *peer, uint8_t subcode, const char *reason)
3600{
3601	struct ibuf *ibuf;
3602
3603	if (reason != NULL)
3604		strlcpy(peer->conf.reason, reason, sizeof(peer->conf.reason));
3605
3606	ibuf = ibuf_dynamic(0, REASON_LEN);
3607
3608	if ((subcode == ERR_CEASE_ADMIN_DOWN ||
3609	    subcode == ERR_CEASE_ADMIN_RESET) &&
3610	    reason != NULL && *reason != '\0' &&
3611	    ibuf != NULL) {
3612		if (ibuf_add_n8(ibuf, strlen(reason)) == -1 ||
3613		    ibuf_add(ibuf, reason, strlen(reason))) {
3614			log_peer_warnx(&peer->conf,
3615			    "trying to send overly long shutdown reason");
3616			ibuf_free(ibuf);
3617			ibuf = NULL;
3618		}
3619	}
3620	switch (peer->state) {
3621	case STATE_OPENSENT:
3622	case STATE_OPENCONFIRM:
3623	case STATE_ESTABLISHED:
3624		session_notification(peer, ERR_CEASE, subcode, ibuf);
3625		break;
3626	default:
3627		/* session not open, no need to send notification */
3628		if (subcode >= sizeof(suberr_cease_names) / sizeof(char *) ||
3629		    suberr_cease_names[subcode] == NULL)
3630			log_peer_warnx(&peer->conf, "session stop: %s, "
3631			    "unknown subcode %u", errnames[ERR_CEASE], subcode);
3632		else
3633			log_peer_warnx(&peer->conf, "session stop: %s, %s",
3634			    errnames[ERR_CEASE], suberr_cease_names[subcode]);
3635		break;
3636	}
3637	ibuf_free(ibuf);
3638	bgp_fsm(peer, EVNT_STOP);
3639}
3640
3641void
3642merge_peers(struct bgpd_config *c, struct bgpd_config *nc)
3643{
3644	struct peer *p, *np, *next;
3645
3646	RB_FOREACH(p, peer_head, &c->peers) {
3647		/* templates are handled specially */
3648		if (p->template != NULL)
3649			continue;
3650		np = getpeerbyid(nc, p->conf.id);
3651		if (np == NULL) {
3652			p->reconf_action = RECONF_DELETE;
3653			continue;
3654		}
3655
3656		/* peer no longer uses TCP MD5SIG so deconfigure */
3657		if (p->conf.auth.method == AUTH_MD5SIG &&
3658		    np->conf.auth.method != AUTH_MD5SIG)
3659			tcp_md5_del_listener(c, p);
3660		else if (np->conf.auth.method == AUTH_MD5SIG)
3661			tcp_md5_add_listener(c, np);
3662
3663		memcpy(&p->conf, &np->conf, sizeof(p->conf));
3664		RB_REMOVE(peer_head, &nc->peers, np);
3665		free(np);
3666
3667		p->reconf_action = RECONF_KEEP;
3668
3669		/* had demotion, is demoted, demote removed? */
3670		if (p->demoted && !p->conf.demote_group[0])
3671			session_demote(p, -1);
3672
3673		/* if session is not open then refresh pfkey data */
3674		if (p->state < STATE_OPENSENT && !p->template)
3675			imsg_compose(ibuf_main, IMSG_PFKEY_RELOAD,
3676			    p->conf.id, 0, -1, NULL, 0);
3677
3678		/* sync the RDE in case we keep the peer */
3679		if (imsg_rde(IMSG_SESSION_ADD, p->conf.id,
3680		    &p->conf, sizeof(struct peer_config)) == -1)
3681			fatalx("imsg_compose error");
3682
3683		/* apply the config to all clones of a template */
3684		if (p->conf.template) {
3685			struct peer *xp;
3686			RB_FOREACH(xp, peer_head, &c->peers) {
3687				if (xp->template != p)
3688					continue;
3689				session_template_clone(xp, NULL, xp->conf.id,
3690				    xp->conf.remote_as);
3691				if (imsg_rde(IMSG_SESSION_ADD, xp->conf.id,
3692				    &xp->conf, sizeof(xp->conf)) == -1)
3693					fatalx("imsg_compose error");
3694			}
3695		}
3696	}
3697
3698	if (imsg_rde(IMSG_RECONF_DRAIN, 0, NULL, 0) == -1)
3699		fatalx("imsg_compose error");
3700
3701	/* pfkeys of new peers already loaded by the parent process */
3702	RB_FOREACH_SAFE(np, peer_head, &nc->peers, next) {
3703		RB_REMOVE(peer_head, &nc->peers, np);
3704		if (RB_INSERT(peer_head, &c->peers, np) != NULL)
3705			fatalx("%s: peer tree is corrupt", __func__);
3706		if (np->conf.auth.method == AUTH_MD5SIG)
3707			tcp_md5_add_listener(c, np);
3708	}
3709}
3710