1/*	$OpenBSD: rde_update.c,v 1.168 2024/05/30 08:29:30 claudio Exp $ */
2
3/*
4 * Copyright (c) 2004 Claudio Jeker <claudio@openbsd.org>
5 *
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18#include <sys/types.h>
19#include <sys/queue.h>
20#include <sys/tree.h>
21
22#include <limits.h>
23#include <stdlib.h>
24#include <string.h>
25#include <stdio.h>
26
27#include "bgpd.h"
28#include "rde.h"
29#include "log.h"
30
/*
 * Result of up_process_prefix(): whether a prefix made it into the
 * Adj-RIB-Out and, if not, why it was dropped.
 */
enum up_state {
	UP_OK,		/* prefix added/updated in the Adj-RIB-Out */
	UP_ERR_LIMIT,	/* outbound max-prefix limit hit, session closed */
	UP_FILTERED,	/* denied by output filter or RFC 9234 policy */
	UP_EXCLUDED,	/* rejected by up_test_update() checks */
};
37
/*
 * Well-known communities (RFC 1997) that limit redistribution; they are
 * matched in up_test_update() before a prefix may be announced.
 */
static struct community	comm_no_advertise = {
	.flags = COMMUNITY_TYPE_BASIC,
	.data1 = COMMUNITY_WELLKNOWN,
	.data2 = COMMUNITY_NO_ADVERTISE
};
static struct community	comm_no_export = {
	.flags = COMMUNITY_TYPE_BASIC,
	.data1 = COMMUNITY_WELLKNOWN,
	.data2 = COMMUNITY_NO_EXPORT
};
static struct community	comm_no_expsubconfed = {
	.flags = COMMUNITY_TYPE_BASIC,
	.data1 = COMMUNITY_WELLKNOWN,
	.data2 = COMMUNITY_NO_EXPSUBCONFED
};
53
54static void up_prep_adjout(struct rde_peer *, struct filterstate *, uint8_t);
55
56static int
57up_test_update(struct rde_peer *peer, struct prefix *p)
58{
59	struct rde_aspath	*asp;
60	struct rde_community	*comm;
61	struct rde_peer		*frompeer;
62
63	frompeer = prefix_peer(p);
64	asp = prefix_aspath(p);
65	comm = prefix_communities(p);
66
67	if (asp == NULL || asp->flags & F_ATTR_PARSE_ERR)
68		fatalx("try to send out a botched path");
69	if (asp->flags & (F_ATTR_LOOP | F_ATTR_OTC_LEAK))
70		fatalx("try to send out a looped path");
71
72	if (peer == frompeer)
73		/* Do not send routes back to sender */
74		return (0);
75
76	if (!frompeer->conf.ebgp && !peer->conf.ebgp) {
77		/*
78		 * route reflector redistribution rules:
79		 * 1. if announce is set                -> announce
80		 * 2. from non-client, to non-client    -> no
81		 * 3. from client, to non-client        -> yes
82		 * 4. from non-client, to client        -> yes
83		 * 5. from client, to client            -> yes
84		 */
85		if (frompeer->conf.reflector_client == 0 &&
86		    peer->conf.reflector_client == 0 &&
87		    (asp->flags & F_PREFIX_ANNOUNCED) == 0)
88			/* Do not redistribute updates to ibgp peers */
89			return (0);
90	}
91
92	/* well known communities */
93	if (community_match(comm, &comm_no_advertise, NULL))
94		return (0);
95	if (peer->conf.ebgp) {
96		if (community_match(comm, &comm_no_export, NULL))
97			return (0);
98		if (community_match(comm, &comm_no_expsubconfed, NULL))
99			return (0);
100	}
101
102	return (1);
103}
104
105/* RFC9234 open policy handling */
106static int
107up_enforce_open_policy(struct rde_peer *peer, struct filterstate *state,
108    uint8_t aid)
109{
110	/* only for IPv4 and IPv6 unicast */
111	if (aid != AID_INET && aid != AID_INET6)
112		return 0;
113
114	/*
115	 * do not propagate (consider it filtered) if OTC is present and
116	 * local role is peer, customer or rs-client.
117	 */
118	if (peer->role == ROLE_PEER || peer->role == ROLE_CUSTOMER ||
119	    peer->role == ROLE_RS_CLIENT)
120		if (state->aspath.flags & F_ATTR_OTC)
121			return 1;
122
123	/*
124	 * add OTC attribute if not present towards peers, customers and
125	 * rs-clients (local roles peer, provider, rs).
126	 */
127	if (peer->role == ROLE_PEER || peer->role == ROLE_PROVIDER ||
128	    peer->role == ROLE_RS)
129		if ((state->aspath.flags & F_ATTR_OTC) == 0) {
130			uint32_t tmp;
131
132			tmp = htonl(peer->conf.local_as);
133			if (attr_optadd(&state->aspath,
134			    ATTR_OPTIONAL|ATTR_TRANSITIVE, ATTR_OTC,
135			    &tmp, sizeof(tmp)) == -1)
136				log_peer_warnx(&peer->conf,
137				    "failed to add OTC attribute");
138			state->aspath.flags |= F_ATTR_OTC;
139		}
140
141	return 0;
142}
143
/*
 * Process a single prefix by passing it through the various filter stages
 * and if not filtered out update the Adj-RIB-Out. Returns:
 * - UP_OK if prefix was added
 * - UP_ERR_LIMIT if the peer outbound prefix limit was reached
 * - UP_FILTERED if prefix was filtered out
 * - UP_EXCLUDED if prefix was excluded because of up_test_update()
 *
 * p is the matching Adj-RIB-Out entry, or the magic value (void *)-1
 * to request a fresh lookup (used by the add-path code paths).
 */
static enum up_state
up_process_prefix(struct rde_peer *peer, struct prefix *new, struct prefix *p)
{
	struct filterstate state;
	struct bgpd_addr addr;
	int excluded = 0;

	/*
	 * up_test_update() needs to run before the output filters
	 * else the well known communities won't work properly.
	 * The output filters would not be able to add well known
	 * communities.
	 */
	if (!up_test_update(peer, new))
		excluded = 1;

	rde_filterstate_prep(&state, new);
	pt_getaddr(new->pt, &addr);
	if (rde_filter(peer->out_rules, peer, prefix_peer(new), &addr,
	    new->pt->prefixlen, &state) == ACTION_DENY) {
		rde_filterstate_clean(&state);
		return UP_FILTERED;
	}

	/* Open Policy Check: acts like an output filter */
	if (up_enforce_open_policy(peer, &state, new->pt->aid)) {
		rde_filterstate_clean(&state);
		return UP_FILTERED;
	}

	/* excluded prefixes are reported only after the filter checks */
	if (excluded) {
		rde_filterstate_clean(&state);
		return UP_EXCLUDED;
	}

	/* from here on we know this is an update */
	if (p == (void *)-1)
		p = prefix_adjout_get(peer, new->path_id_tx, new->pt);

	up_prep_adjout(peer, &state, new->pt->aid);
	prefix_adjout_update(p, peer, &state, new->pt, new->path_id_tx);
	rde_filterstate_clean(&state);

	/* max prefix checker outbound */
	if (peer->conf.max_out_prefix &&
	    peer->stats.prefix_out_cnt > peer->conf.max_out_prefix) {
		log_peer_warnx(&peer->conf,
		    "outbound prefix limit reached (>%u/%u)",
		    peer->stats.prefix_out_cnt, peer->conf.max_out_prefix);
		rde_update_err(peer, ERR_CEASE,
		    ERR_CEASE_MAX_SENT_PREFIX, NULL);
		return UP_ERR_LIMIT;
	}

	return UP_OK;
}
208
/*
 * Generate an update for the single best path (non add-path case).
 * Walks the eligible prefixes of re in decision order until one makes
 * it past the filters; if none does, the currently announced prefix
 * (if any) is withdrawn.
 */
void
up_generate_updates(struct rde_peer *peer, struct rib_entry *re)
{
	struct prefix		*new, *p;

	/* currently announced Adj-RIB-Out entry, NULL if none */
	p = prefix_adjout_first(peer, re->prefix);

	new = prefix_best(re);
	while (new != NULL) {
		switch (up_process_prefix(peer, new, p)) {
		case UP_OK:
		case UP_ERR_LIMIT:
			return;
		case UP_FILTERED:
			/* with evaluate all, try the next eligible path */
			if (peer->flags & PEERFLAG_EVALUATE_ALL) {
				new = TAILQ_NEXT(new, entry.list.rib);
				if (new != NULL && prefix_eligible(new))
					continue;
			}
			goto done;
		case UP_EXCLUDED:
			goto done;
		}
	}

done:
	/* withdraw prefix */
	if (p != NULL)
		prefix_adjout_withdraw(p);
}
239
/*
 * Generate updates for the add-path send case. Depending on the
 * peer eval settings prefixes are selected and distributed.
 * This highly depends on the Adj-RIB-Out to handle prefixes with no
 * changes gracefully. It may be possible to improve the API so that
 * less churn is needed.
 */
void
up_generate_addpath(struct rde_peer *peer, struct rib_entry *re)
{
	struct prefix		*head, *new, *p;
	int			maxpaths = 0, extrapaths = 0, extra;
	int			checkmode = 1;

	head = prefix_adjout_first(peer, re->prefix);

	/* mark all paths as stale */
	for (p = head; p != NULL; p = prefix_adjout_next(peer, p))
		p->flags |= PREFIX_FLAG_STALE;

	/* update paths */
	new = prefix_best(re);
	while (new != NULL) {
		/* check limits and stop when a limit is reached */
		if (peer->eval.maxpaths != 0 &&
		    maxpaths >= peer->eval.maxpaths)
			break;
		if (peer->eval.extrapaths != 0 &&
		    extrapaths >= peer->eval.extrapaths)
			break;

		/*
		 * Classify the path: extra stays 1 for paths outside
		 * the set selected by the configured eval mode. Paths
		 * arrive in decision order, so once one falls outside
		 * that set all following ones do too and the mode no
		 * longer needs to be re-checked (checkmode = 0).
		 */
		extra = 1;
		if (checkmode) {
			switch (peer->eval.mode) {
			case ADDPATH_EVAL_BEST:
				if (new->dmetric == PREFIX_DMETRIC_BEST)
					extra = 0;
				else
					checkmode = 0;
				break;
			case ADDPATH_EVAL_ECMP:
				if (new->dmetric == PREFIX_DMETRIC_BEST ||
				    new->dmetric == PREFIX_DMETRIC_ECMP)
					extra = 0;
				else
					checkmode = 0;
				break;
			case ADDPATH_EVAL_AS_WIDE:
				if (new->dmetric == PREFIX_DMETRIC_BEST ||
				    new->dmetric == PREFIX_DMETRIC_ECMP ||
				    new->dmetric == PREFIX_DMETRIC_AS_WIDE)
					extra = 0;
				else
					checkmode = 0;
				break;
			case ADDPATH_EVAL_ALL:
				/* nothing to check */
				checkmode = 0;
				break;
			default:
				fatalx("unknown add-path eval mode");
			}
		}

		switch (up_process_prefix(peer, new, (void *)-1)) {
		case UP_OK:
			maxpaths++;
			extrapaths += extra;
			break;
		case UP_FILTERED:
		case UP_EXCLUDED:
			break;
		case UP_ERR_LIMIT:
			/* just give up */
			return;
		}

		/* only allow valid prefixes */
		new = TAILQ_NEXT(new, entry.list.rib);
		if (new == NULL || !prefix_eligible(new))
			break;
	}

	/* withdraw stale paths */
	for (p = head; p != NULL; p = prefix_adjout_next(peer, p)) {
		if (p->flags & PREFIX_FLAG_STALE)
			prefix_adjout_withdraw(p);
	}
}
329
/*
 * Generate updates for the add-path send all case. Since all prefixes
 * are distributed just remove old and add new.
 * With new == old == NULL a full resync is done: all paths of re are
 * re-added and anything not refreshed is withdrawn as stale.
 */
void
up_generate_addpath_all(struct rde_peer *peer, struct rib_entry *re,
    struct prefix *new, struct prefix *old)
{
	struct prefix		*p, *head = NULL;
	int			all = 0;

	/*
	 * if old and new are NULL then insert all prefixes from best,
	 * clearing old routes in the process
	 */
	if (old == NULL && new == NULL) {
		/* mark all paths as stale */
		head = prefix_adjout_first(peer, re->prefix);
		for (p = head; p != NULL; p = prefix_adjout_next(peer, p))
			p->flags |= PREFIX_FLAG_STALE;

		new = prefix_best(re);
		all = 1;
	}

	if (new != NULL && !prefix_eligible(new)) {
		/* only allow valid prefixes */
		new = NULL;
	}

	if (old != NULL) {
		/* withdraw stale paths */
		p = prefix_adjout_get(peer, old->path_id_tx, old->pt);
		if (p != NULL)
			prefix_adjout_withdraw(p);
	}

	/* add new path (or multiple if all is set) */
	while (new != NULL) {
		switch (up_process_prefix(peer, new, (void *)-1)) {
		case UP_OK:
		case UP_FILTERED:
		case UP_EXCLUDED:
			break;
		case UP_ERR_LIMIT:
			/* just give up */
			return;
		}

		if (!all)
			break;

		/* only allow valid prefixes */
		new = TAILQ_NEXT(new, entry.list.rib);
		if (new == NULL || !prefix_eligible(new))
			break;
	}

	if (all) {
		/* withdraw stale paths */
		for (p = head; p != NULL; p = prefix_adjout_next(peer, p)) {
			if (p->flags & PREFIX_FLAG_STALE)
				prefix_adjout_withdraw(p);
		}
	}
}
396
/*
 * Send a default route for the given address family to the specified
 * peer. The route is locally originated (empty AS path, origin IGP)
 * and still passes through the peer's outbound filters.
 */
void
up_generate_default(struct rde_peer *peer, uint8_t aid)
{
	extern struct rde_peer	*peerself;
	struct filterstate	 state;
	struct rde_aspath	*asp;
	struct prefix		*p;
	struct pt_entry		*pte;
	struct bgpd_addr	 addr;

	/* skip if the address family was not negotiated with the peer */
	if (peer->capa.mp[aid] == 0)
		return;

	rde_filterstate_init(&state);
	asp = &state.aspath;
	asp->aspath = aspath_get(NULL, 0);
	asp->origin = ORIGIN_IGP;
	rde_filterstate_set_vstate(&state, ROA_NOTFOUND, ASPA_NEVER_KNOWN);
	/* the other default values are OK, nexthop is once again NULL */

	/*
	 * XXX apply default overrides. Not yet possible, mainly a parse.y
	 * problem.
	 */
	/* rde_apply_set(asp, peerself, peerself, set, af); */

	/* default route: all-zero address with prefixlen 0 */
	memset(&addr, 0, sizeof(addr));
	addr.aid = aid;
	p = prefix_adjout_lookup(peer, &addr, 0);

	/* outbound filter as usual */
	if (rde_filter(peer->out_rules, peer, peerself, &addr, 0, &state) ==
	    ACTION_DENY) {
		rde_filterstate_clean(&state);
		return;
	}

	up_prep_adjout(peer, &state, addr.aid);
	/* can't use pt_fill here since prefix_adjout_update keeps a ref */
	pte = pt_get(&addr, 0);
	if (pte == NULL)
		pte = pt_add(&addr, 0);
	prefix_adjout_update(p, peer, &state, pte, 0);
	rde_filterstate_clean(&state);

	/* max prefix checker outbound */
	if (peer->conf.max_out_prefix &&
	    peer->stats.prefix_out_cnt > peer->conf.max_out_prefix) {
		log_peer_warnx(&peer->conf,
		    "outbound prefix limit reached (>%u/%u)",
		    peer->stats.prefix_out_cnt, peer->conf.max_out_prefix);
		rde_update_err(peer, ERR_CEASE,
		    ERR_CEASE_MAX_SENT_PREFIX, NULL);
	}
}
453
454static struct bgpd_addr *
455up_get_nexthop(struct rde_peer *peer, struct filterstate *state, uint8_t aid)
456{
457	struct bgpd_addr *peer_local = NULL;
458
459	switch (aid) {
460	case AID_INET:
461	case AID_VPN_IPv4:
462		if (peer->local_v4_addr.aid == AID_INET)
463			peer_local = &peer->local_v4_addr;
464		break;
465	case AID_INET6:
466	case AID_VPN_IPv6:
467		if (peer->local_v6_addr.aid == AID_INET6)
468			peer_local = &peer->local_v6_addr;
469		break;
470	case AID_FLOWSPECv4:
471	case AID_FLOWSPECv6:
472		/* flowspec has no nexthop */
473		return (NULL);
474	default:
475		fatalx("%s, bad AID %s", __func__, aid2str(aid));
476	}
477
478	if (state->nhflags & NEXTHOP_SELF) {
479		/*
480		 * Forcing the nexthop to self is always possible
481		 * and has precedence over other flags.
482		 */
483		return (peer_local);
484	} else if (!peer->conf.ebgp) {
485		/*
486		 * in the ibgp case the nexthop is normally not
487		 * modified unless it points at the peer itself.
488		 */
489		if (state->nexthop == NULL) {
490			/* announced networks without explicit nexthop set */
491			return (peer_local);
492		}
493		/*
494		 * per RFC: if remote peer address is equal to the nexthop set
495		 * the nexthop to our local address. This reduces the risk of
496		 * routing loops. This overrides NEXTHOP_NOMODIFY.
497		 */
498		if (memcmp(&state->nexthop->exit_nexthop,
499		    &peer->remote_addr, sizeof(peer->remote_addr)) == 0) {
500			return (peer_local);
501		}
502		return (&state->nexthop->exit_nexthop);
503	} else if (peer->conf.distance == 1) {
504		/*
505		 * In the ebgp directly connected case never send
506		 * out a nexthop that is outside of the connected
507		 * network of the peer. No matter what flags are
508		 * set. This follows section 5.1.3 of RFC 4271.
509		 * So just check if the nexthop is in the same net
510		 * is enough here.
511		 */
512		if (state->nexthop != NULL &&
513		    state->nexthop->flags & NEXTHOP_CONNECTED &&
514		    prefix_compare(&peer->remote_addr,
515		    &state->nexthop->nexthop_net,
516		    state->nexthop->nexthop_netlen) == 0) {
517			/* nexthop and peer are in the same net */
518			return (&state->nexthop->exit_nexthop);
519		}
520		return (peer_local);
521	} else {
522		/*
523		 * For ebgp multihop make it possible to overrule
524		 * the sent nexthop by setting NEXTHOP_NOMODIFY.
525		 * Similar to the ibgp case there is no same net check
526		 * needed but still ensure that the nexthop is not
527		 * pointing to the peer itself.
528		 */
529		if (state->nhflags & NEXTHOP_NOMODIFY &&
530		    state->nexthop != NULL &&
531		    memcmp(&state->nexthop->exit_nexthop,
532		    &peer->remote_addr, sizeof(peer->remote_addr)) != 0) {
533			/* no modify flag set and nexthop not peer addr */
534			return (&state->nexthop->exit_nexthop);
535		}
536		return (peer_local);
537	}
538}
539
540static void
541up_prep_adjout(struct rde_peer *peer, struct filterstate *state, uint8_t aid)
542{
543	struct bgpd_addr *nexthop;
544	struct nexthop *nh = NULL;
545	u_char *np;
546	uint16_t nl;
547
548	/* prepend local AS number for eBGP sessions. */
549	if (peer->conf.ebgp && (peer->flags & PEERFLAG_TRANS_AS) == 0) {
550		uint32_t prep_as = peer->conf.local_as;
551		np = aspath_prepend(state->aspath.aspath, prep_as, 1, &nl);
552		aspath_put(state->aspath.aspath);
553		state->aspath.aspath = aspath_get(np, nl);
554		free(np);
555	}
556
557	/* update nexthop */
558	nexthop = up_get_nexthop(peer, state, aid);
559	if (nexthop != NULL)
560		nh = nexthop_get(nexthop);
561	nexthop_unref(state->nexthop);
562	state->nexthop = nh;
563	state->nhflags = 0;
564}
565
566
567static int
568up_generate_attr(struct ibuf *buf, struct rde_peer *peer,
569    struct rde_aspath *asp, struct rde_community *comm, struct nexthop *nh,
570    uint8_t aid)
571{
572	struct attr	*oa = NULL, *newaggr = NULL;
573	u_char		*pdata;
574	uint32_t	 tmp32;
575	int		 flags, neednewpath = 0, rv;
576	uint16_t	 plen;
577	uint8_t		 oalen = 0, type;
578
579	if (asp->others_len > 0)
580		oa = asp->others[oalen++];
581
582	/* dump attributes in ascending order */
583	for (type = ATTR_ORIGIN; type < 255; type++) {
584		while (oa && oa->type < type) {
585			if (oalen < asp->others_len)
586				oa = asp->others[oalen++];
587			else
588				oa = NULL;
589		}
590
591		switch (type) {
592		/*
593		 * Attributes stored in rde_aspath
594		 */
595		case ATTR_ORIGIN:
596			if (attr_writebuf(buf, ATTR_WELL_KNOWN,
597			    ATTR_ORIGIN, &asp->origin, 1) == -1)
598				return -1;
599			break;
600		case ATTR_ASPATH:
601			plen = aspath_length(asp->aspath);
602			pdata = aspath_dump(asp->aspath);
603
604			if (!peer_has_as4byte(peer))
605				pdata = aspath_deflate(pdata, &plen,
606				    &neednewpath);
607			rv = attr_writebuf(buf, ATTR_WELL_KNOWN,
608			    ATTR_ASPATH, pdata, plen);
609			if (!peer_has_as4byte(peer))
610				free(pdata);
611
612			if (rv == -1)
613				return -1;
614			break;
615		case ATTR_NEXTHOP:
616			switch (aid) {
617			case AID_INET:
618				if (nh == NULL)
619					return -1;
620				if (attr_writebuf(buf, ATTR_WELL_KNOWN,
621				    ATTR_NEXTHOP, &nh->exit_nexthop.v4,
622				    sizeof(nh->exit_nexthop.v4)) == -1)
623					return -1;
624				break;
625			default:
626				break;
627			}
628			break;
629		case ATTR_MED:
630			/*
631			 * The old MED from other peers MUST not be announced
632			 * to others unless the MED is originating from us or
633			 * the peer is an IBGP one. Only exception are routers
634			 * with "transparent-as yes" set.
635			 */
636			if (asp->flags & F_ATTR_MED && (!peer->conf.ebgp ||
637			    asp->flags & F_ATTR_MED_ANNOUNCE ||
638			    peer->flags & PEERFLAG_TRANS_AS)) {
639				tmp32 = htonl(asp->med);
640				if (attr_writebuf(buf, ATTR_OPTIONAL,
641				    ATTR_MED, &tmp32, 4) == -1)
642					return -1;
643			}
644			break;
645		case ATTR_LOCALPREF:
646			if (!peer->conf.ebgp) {
647				/* local preference, only valid for ibgp */
648				tmp32 = htonl(asp->lpref);
649				if (attr_writebuf(buf, ATTR_WELL_KNOWN,
650				    ATTR_LOCALPREF, &tmp32, 4) == -1)
651					return -1;
652			}
653			break;
654		/*
655		 * Communities are stored in struct rde_community
656		 */
657		case ATTR_COMMUNITIES:
658		case ATTR_EXT_COMMUNITIES:
659		case ATTR_LARGE_COMMUNITIES:
660			if (community_writebuf(comm, type, peer->conf.ebgp,
661			    buf) == -1)
662				return -1;
663			break;
664		/*
665		 * NEW to OLD conversion when sending stuff to a 2byte AS peer
666		 */
667		case ATTR_AS4_PATH:
668			if (neednewpath) {
669				plen = aspath_length(asp->aspath);
670				pdata = aspath_dump(asp->aspath);
671
672				flags = ATTR_OPTIONAL|ATTR_TRANSITIVE;
673				if (!(asp->flags & F_PREFIX_ANNOUNCED))
674					flags |= ATTR_PARTIAL;
675				if (plen != 0)
676					if (attr_writebuf(buf, flags,
677					    ATTR_AS4_PATH, pdata, plen) == -1)
678						return -1;
679			}
680			break;
681		case ATTR_AS4_AGGREGATOR:
682			if (newaggr) {
683				flags = ATTR_OPTIONAL|ATTR_TRANSITIVE;
684				if (!(asp->flags & F_PREFIX_ANNOUNCED))
685					flags |= ATTR_PARTIAL;
686				if (attr_writebuf(buf, flags,
687				    ATTR_AS4_AGGREGATOR, newaggr->data,
688				    newaggr->len) == -1)
689					return -1;
690			}
691			break;
692		/*
693		 * multiprotocol attributes are handled elsewhere
694		 */
695		case ATTR_MP_REACH_NLRI:
696		case ATTR_MP_UNREACH_NLRI:
697			break;
698		/*
699		 * dump all other path attributes. Following rules apply:
700		 *  1. well-known attrs: ATTR_ATOMIC_AGGREGATE and
701		 *     ATTR_AGGREGATOR pass unmodified (enforce flags
702		 *     to correct values). Actually ATTR_AGGREGATOR may be
703		 *     deflated for OLD 2-byte peers.
704		 *  2. non-transitive attrs: don't re-announce to ebgp peers
705		 *  3. transitive known attrs: announce unmodified
706		 *  4. transitive unknown attrs: set partial bit and re-announce
707		 */
708		case ATTR_ATOMIC_AGGREGATE:
709			if (oa == NULL || oa->type != type)
710				break;
711			if (attr_writebuf(buf, ATTR_WELL_KNOWN,
712			    ATTR_ATOMIC_AGGREGATE, NULL, 0) == -1)
713				return -1;
714			break;
715		case ATTR_AGGREGATOR:
716			if (oa == NULL || oa->type != type)
717				break;
718			if ((!(oa->flags & ATTR_TRANSITIVE)) &&
719			    peer->conf.ebgp)
720				break;
721			if (!peer_has_as4byte(peer)) {
722				/* need to deflate the aggregator */
723				uint8_t		t[6];
724				uint16_t	tas;
725
726				if ((!(oa->flags & ATTR_TRANSITIVE)) &&
727				    peer->conf.ebgp)
728					break;
729
730				memcpy(&tmp32, oa->data, sizeof(tmp32));
731				if (ntohl(tmp32) > USHRT_MAX) {
732					tas = htons(AS_TRANS);
733					newaggr = oa;
734				} else
735					tas = htons(ntohl(tmp32));
736
737				memcpy(t, &tas, sizeof(tas));
738				memcpy(t + sizeof(tas),
739				    oa->data + sizeof(tmp32),
740				    oa->len - sizeof(tmp32));
741				if (attr_writebuf(buf, oa->flags,
742				    oa->type, &t, sizeof(t)) == -1)
743					return -1;
744			} else {
745				if (attr_writebuf(buf, oa->flags, oa->type,
746				    oa->data, oa->len) == -1)
747					return -1;
748			}
749			break;
750		case ATTR_ORIGINATOR_ID:
751		case ATTR_CLUSTER_LIST:
752		case ATTR_OTC:
753			if (oa == NULL || oa->type != type)
754				break;
755			if ((!(oa->flags & ATTR_TRANSITIVE)) &&
756			    peer->conf.ebgp)
757				break;
758			if (attr_writebuf(buf, oa->flags, oa->type,
759			    oa->data, oa->len) == -1)
760				return -1;
761			break;
762		default:
763			if (oa == NULL && type >= ATTR_FIRST_UNKNOWN)
764				/* there is no attribute left to dump */
765				return (0);
766
767			if (oa == NULL || oa->type != type)
768				break;
769			/* unknown attribute */
770			if (!(oa->flags & ATTR_TRANSITIVE)) {
771				/*
772				 * RFC 1771:
773				 * Unrecognized non-transitive optional
774				 * attributes must be quietly ignored and
775				 * not passed along to other BGP peers.
776				 */
777				break;
778			}
779			if (attr_writebuf(buf, oa->flags | ATTR_PARTIAL,
780			    oa->type, oa->data, oa->len) == -1)
781				return -1;
782		}
783	}
784	return 0;
785}
786
787/*
788 * Check if the pending element is a EoR marker. If so remove it from the
789 * tree and return 1.
790 */
791int
792up_is_eor(struct rde_peer *peer, uint8_t aid)
793{
794	struct prefix *p;
795
796	p = RB_MIN(prefix_tree, &peer->updates[aid]);
797	if (p != NULL && (p->flags & PREFIX_FLAG_EOR)) {
798		/*
799		 * Need to remove eor from update tree because
800		 * prefix_adjout_destroy() can't handle that.
801		 */
802		RB_REMOVE(prefix_tree, &peer->updates[aid], p);
803		p->flags &= ~PREFIX_FLAG_UPDATE;
804		prefix_adjout_destroy(p);
805		return 1;
806	}
807	return 0;
808}
809
810/* minimal buffer size > withdraw len + attr len + attr hdr + afi/safi */
811#define MIN_UPDATE_LEN	16
812
813static void
814up_prefix_free(struct prefix_tree *prefix_head, struct prefix *p,
815    struct rde_peer *peer, int withdraw)
816{
817	if (withdraw) {
818		/* prefix no longer needed, remove it */
819		prefix_adjout_destroy(p);
820		peer->stats.prefix_sent_withdraw++;
821	} else {
822		/* prefix still in Adj-RIB-Out, keep it */
823		RB_REMOVE(prefix_tree, prefix_head, p);
824		p->flags &= ~PREFIX_FLAG_UPDATE;
825		peer->stats.pending_update--;
826		peer->stats.prefix_sent_update++;
827	}
828}
829
/*
 * Write prefixes to buffer until either there is no more space or
 * the next prefix has no longer the same ASPATH attributes.
 * Returns -1 if no prefix was written else 0.
 */
static int
up_dump_prefix(struct ibuf *buf, struct prefix_tree *prefix_head,
    struct rde_peer *peer, int withdraw)
{
	struct prefix	*p, *np;
	int		 done = 0, has_ap = -1, rv = -1;

	RB_FOREACH_SAFE(p, prefix_tree, prefix_head, np) {
		/* add-path capability is resolved once, on the first prefix */
		if (has_ap == -1)
			has_ap = peer_has_add_path(peer, p->pt->aid,
			    CAPA_AP_SEND);
		/* stop (without error) once the buffer is full */
		if (pt_writebuf(buf, p->pt, withdraw, has_ap, p->path_id_tx) ==
		    -1)
			break;

		/* make sure we only dump prefixes which belong together */
		if (np == NULL ||
		    np->aspath != p->aspath ||
		    np->communities != p->communities ||
		    np->nexthop != p->nexthop ||
		    np->nhflags != p->nhflags ||
		    (np->flags & PREFIX_FLAG_EOR))
			done = 1;

		rv = 0;
		up_prefix_free(prefix_head, p, peer, withdraw);
		if (done)
			break;
	}
	return rv;
}
866
/*
 * Write a MP_REACH_NLRI attribute (RFC 4760) including the NLRI for
 * the pending updates of the given address family to buf.
 * Returns 0 on success, -1 on error (including no prefix written).
 */
static int
up_generate_mp_reach(struct ibuf *buf, struct rde_peer *peer,
    struct nexthop *nh, uint8_t aid)
{
	struct bgpd_addr *nexthop;
	size_t off;
	uint16_t len, afi;
	uint8_t safi;

	/* attribute header, defaulting to extended length one */
	if (ibuf_add_n8(buf, ATTR_OPTIONAL | ATTR_EXTLEN) == -1)
		return -1;
	if (ibuf_add_n8(buf, ATTR_MP_REACH_NLRI) == -1)
		return -1;
	/* remember where the 2-byte length goes, patched in at the end */
	off = ibuf_size(buf);
	if (ibuf_add_zero(buf, sizeof(len)) == -1)
		return -1;

	if (aid2afi(aid, &afi, &safi))
		fatalx("up_generate_mp_reach: bad AID");

	/* AFI + SAFI + NH LEN + NH + Reserved */
	if (ibuf_add_n16(buf, afi) == -1)
		return -1;
	if (ibuf_add_n8(buf, safi) == -1)
		return -1;

	switch (aid) {
	case AID_INET6:
		if (nh == NULL)
			return -1;
		/* NH LEN */
		if (ibuf_add_n8(buf, sizeof(struct in6_addr)) == -1)
			return -1;
		/* write nexthop */
		nexthop = &nh->exit_nexthop;
		if (ibuf_add(buf, &nexthop->v6, sizeof(struct in6_addr)) == -1)
			return -1;
		break;
	case AID_VPN_IPv4:
		if (nh == NULL)
			return -1;
		/* NH LEN: 8-byte RD + IPv4 address */
		if (ibuf_add_n8(buf,
		    sizeof(uint64_t) + sizeof(struct in_addr)) == -1)
			return -1;
		/* write zero rd */
		if (ibuf_add_zero(buf, sizeof(uint64_t)) == -1)
			return -1;
		/* write nexthop */
		nexthop = &nh->exit_nexthop;
		if (ibuf_add(buf, &nexthop->v4, sizeof(struct in_addr)) == -1)
			return -1;
		break;
	case AID_VPN_IPv6:
		if (nh == NULL)
			return -1;
		/* NH LEN: 8-byte RD + IPv6 address */
		if (ibuf_add_n8(buf,
		    sizeof(uint64_t) + sizeof(struct in6_addr)) == -1)
			return -1;
		/* write zero rd */
		if (ibuf_add_zero(buf, sizeof(uint64_t)) == -1)
			return -1;
		/* write nexthop */
		nexthop = &nh->exit_nexthop;
		if (ibuf_add(buf, &nexthop->v6, sizeof(struct in6_addr)) == -1)
			return -1;
		break;
	case AID_FLOWSPECv4:
	case AID_FLOWSPECv6:
		if (ibuf_add_zero(buf, 1) == -1) /* NH LEN MUST be 0 */
			return -1;
		/* no NH */
		break;
	default:
		fatalx("up_generate_mp_reach: unknown AID");
	}

	if (ibuf_add_zero(buf, 1) == -1) /* Reserved must be 0 */
		return -1;

	if (up_dump_prefix(buf, &peer->updates[aid], peer, 0) == -1)
		/* no prefixes written, fail update  */
		return (-1);

	/* update MP_REACH attribute length field */
	len = ibuf_size(buf) - off - sizeof(len);
	if (ibuf_set_n16(buf, off, len) == -1)
		return -1;

	return 0;
}
960
961/*
962 * Generate UPDATE message containing either just withdraws or updates.
963 * UPDATE messages are contructed like this:
964 *
965 *    +-----------------------------------------------------+
966 *    |   Withdrawn Routes Length (2 octets)                |
967 *    +-----------------------------------------------------+
968 *    |   Withdrawn Routes (variable)                       |
969 *    +-----------------------------------------------------+
970 *    |   Total Path Attribute Length (2 octets)            |
971 *    +-----------------------------------------------------+
972 *    |   Path Attributes (variable)                        |
973 *    +-----------------------------------------------------+
974 *    |   Network Layer Reachability Information (variable) |
975 *    +-----------------------------------------------------+
976 *
977 * Multiprotocol messages use MP_REACH_NLRI and MP_UNREACH_NLRI
978 * the latter will be the only path attribute in a message.
979 */
980
/*
 * Write UPDATE message for withdrawn routes. The size of buf limits
 * how may routes can be added. Return 0 on success -1 on error which
 * includes generating an empty withdraw message.
 * For non-IPv4 families the withdraws are carried in a MP_UNREACH_NLRI
 * attribute instead of the plain withdrawn routes section.
 */
int
up_dump_withdraws(struct ibuf *buf, struct rde_peer *peer, uint8_t aid)
{
	size_t off;
	uint16_t afi, len;
	uint8_t safi;

	/* reserve space for the withdrawn routes length field */
	off = ibuf_size(buf);
	if (ibuf_add_zero(buf, sizeof(len)) == -1)
		return -1;

	if (aid != AID_INET) {
		/*
		 * reserve space for 2-byte path attribute length;
		 * off now tracks this field instead (the withdrawn
		 * routes length just written stays 0)
		 */
		off = ibuf_size(buf);
		if (ibuf_add_zero(buf, sizeof(len)) == -1)
			return -1;

		/* attribute header, defaulting to extended length one */
		if (ibuf_add_n8(buf, ATTR_OPTIONAL | ATTR_EXTLEN) == -1)
			return -1;
		if (ibuf_add_n8(buf, ATTR_MP_UNREACH_NLRI) == -1)
			return -1;
		if (ibuf_add_zero(buf, sizeof(len)) == -1)
			return -1;

		/* afi & safi */
		if (aid2afi(aid, &afi, &safi))
			fatalx("up_dump_mp_unreach: bad AID");
		if (ibuf_add_n16(buf, afi) == -1)
			return -1;
		if (ibuf_add_n8(buf, safi) == -1)
			return -1;
	}

	if (up_dump_prefix(buf, &peer->withdraws[aid], peer, 1) == -1)
		return -1;

	/* update length field (either withdrawn routes or attribute length) */
	len = ibuf_size(buf) - off - sizeof(len);
	if (ibuf_set_n16(buf, off, len) == -1)
		return -1;

	if (aid != AID_INET) {
		/* write MP_UNREACH_NLRI attribute length (always extended) */
		len -= 4; /* skip attribute header */
		if (ibuf_set_n16(buf, off + sizeof(len) + 2, len) == -1)
			return -1;
	} else {
		/* no extra attributes so set attribute len to 0 */
		if (ibuf_add_zero(buf, sizeof(len)) == -1)
			return -1;
	}

	return 0;
}
1042
/*
 * Write UPDATE message for changed and added routes. The size of buf limits
 * how may routes can be added. The function first dumps the path attributes
 * and then tries to add as many prefixes using these attributes.
 * Return 0 on success -1 on error which includes producing an empty message.
 */
int
up_dump_update(struct ibuf *buf, struct rde_peer *peer, uint8_t aid)
{
	struct bgpd_addr addr;
	struct prefix *p;
	size_t off;
	uint16_t len;

	/* nothing pending, nothing to send */
	p = RB_MIN(prefix_tree, &peer->updates[aid]);
	if (p == NULL)
		return -1;

	/* withdrawn routes length field is 0 */
	if (ibuf_add_zero(buf, sizeof(len)) == -1)
		return -1;

	/* reserve space for 2-byte path attribute length */
	off = ibuf_size(buf);
	if (ibuf_add_zero(buf, sizeof(len)) == -1)
		return -1;

	if (up_generate_attr(buf, peer, prefix_aspath(p),
	    prefix_communities(p), prefix_nexthop(p), aid) == -1)
		goto fail;

	if (aid != AID_INET) {
		/* write mp attribute including nlri */

		/*
		 * RFC 7606 wants this to be first but then we need
		 * to use multiple buffers with adjusted length to
		 * merge the attributes together in reverse order of
		 * creation.
		 */
		if (up_generate_mp_reach(buf, peer, prefix_nexthop(p), aid) ==
		    -1)
			goto fail;
	}

	/* update attribute length field */
	len = ibuf_size(buf) - off - sizeof(len);
	if (ibuf_set_n16(buf, off, len) == -1)
		return -1;

	if (aid == AID_INET) {
		/* last but not least dump the IPv4 nlri */
		if (up_dump_prefix(buf, &peer->updates[aid], peer, 0) == -1)
			goto fail;
	}

	return 0;

fail:
	/* Not enough space. Drop prefix, it will never fit. */
	pt_getaddr(p->pt, &addr);
	log_peer_warnx(&peer->conf, "dump of path attributes failed, "
	    "prefix %s/%d dropped", log_addr(&addr), p->pt->prefixlen);

	up_prefix_free(&peer->updates[aid], p, peer, 0);
	/* XXX should probably send a withdraw for this prefix */
	return -1;
}
1111