1/*	$OpenBSD: kroute.c,v 1.71 2023/03/08 04:43:13 guenther Exp $ */
2
3/*
4 * Copyright (c) 2015, 2016 Renato Westphal <renato@openbsd.org>
5 * Copyright (c) 2009 Michele Marchetto <michele@openbsd.org>
6 * Copyright (c) 2004 Esben Norby <norby@openbsd.org>
7 * Copyright (c) 2003, 2004 Henning Brauer <henning@openbsd.org>
8 *
9 * Permission to use, copy, modify, and distribute this software for any
10 * purpose with or without fee is hereby granted, provided that the above
11 * copyright notice and this permission notice appear in all copies.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
14 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
15 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
16 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
17 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
18 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
19 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
20 */
21
22#include <sys/types.h>
23#include <sys/socket.h>
24#include <sys/ioctl.h>
25#include <sys/sysctl.h>
26#include <arpa/inet.h>
27#include <net/if_dl.h>
28#include <net/if_types.h>
29#include <net/route.h>
30#include <netmpls/mpls.h>
31#include <errno.h>
32#include <stdlib.h>
33#include <string.h>
34#include <unistd.h>
35#include <limits.h>
36
37#include "ldpd.h"
38#include "log.h"
39
/*
 * Global state of the kernel routing interface.
 *
 * rtseq:    sequence counter; kr_init() starts it at 1 and it is
 *           post-incremented into rtm_seq for every message built.
 * pid:      result of getpid() taken in kr_init().
 * fib_sync: 1 while coupled to the kernel FIB, 0 while decoupled;
 *           see kr_fib_couple() and kr_fib_decouple().
 * fd:       AF_ROUTE socket opened in kr_init().
 * ioctl_fd: AF_INET datagram socket opened in kif_init().
 * ev:       persistent read event registered on fd.
 * rdomain:  routing domain handed to kr_init(); used as rtm_tableid
 *           and to filter interfaces in kif_redistribute().
 */
struct {
	uint32_t		rtseq;
	pid_t			pid;
	int			fib_sync;
	int			fd;
	int			ioctl_fd;
	struct event		ev;
	unsigned int		rdomain;
} kr_state;
49
/*
 * One nexthop of a (prefix, priority) pair.  Nodes are linked on the
 * nexthops list of their kroute_priority; r holds the route itself.
 */
struct kroute_node {
	TAILQ_ENTRY(kroute_node)	 entry;
	struct kroute_priority		*kprio;		/* back pointer */
	struct kroute			 r;
};
55
/*
 * All nexthops of a prefix that share one route priority.  Entries are
 * linked on the priorities list of their kroute_prefix, which
 * kroute_insert() keeps sorted by ascending priority value.
 */
struct kroute_priority {
	TAILQ_ENTRY(kroute_priority)	 entry;
	struct kroute_prefix		*kp;		/* back pointer */
	uint8_t				 priority;
	TAILQ_HEAD(, kroute_node)	 nexthops;
};
62
/*
 * A destination prefix, keyed in the kroute_tree red-black tree by
 * address family, prefix and prefix length (see kroute_compare()).
 * The priorities list holds one kroute_priority per route priority.
 */
struct kroute_prefix {
	RB_ENTRY(kroute_prefix)		 entry;
	int				 af;
	union ldpd_addr			 prefix;
	uint8_t				 prefixlen;
	TAILQ_HEAD(plist, kroute_priority) priorities;
};
/* tree head type and prototypes for the prefix tree */
RB_HEAD(kroute_tree, kroute_prefix);
RB_PROTOTYPE(kroute_tree, kroute_prefix, entry, kroute_compare)
72
/* One address configured on an interface; linked on kif_node.addrs. */
struct kif_addr {
	TAILQ_ENTRY(kif_addr)	 entry;
	struct kaddr		 a;
};
77
/*
 * A kernel interface, keyed in the kif_tree red-black tree by
 * k.ifindex (see kif_compare()).  addrs lists the addresses learned
 * for the interface; kpw, when non-NULL, is the pseudowire that
 * kr_fib_couple()/kr_fib_decouple() install or uninstall for it.
 */
struct kif_node {
	RB_ENTRY(kif_node)	 entry;
	TAILQ_HEAD(, kif_addr)	 addrs;
	struct kif		 k;
	struct kpw		*kpw;
};
/* tree head type and prototypes for the interface tree */
RB_HEAD(kif_tree, kif_node);
RB_PROTOTYPE(kif_tree, kif_node, entry, kif_compare)
86
/* Prototypes for the static (file-local) functions of this file. */
static void		 kr_dispatch_msg(int, short, void *);
static void		 kr_redist_remove(struct kroute *);
static int		 kr_redist_eval(struct kroute *);
static void		 kr_redistribute(struct kroute_prefix *);
static __inline int	 kroute_compare(struct kroute_prefix *,
			    struct kroute_prefix *);
static struct kroute_prefix	*kroute_find_prefix(int, union ldpd_addr *,
			    uint8_t);
static struct kroute_priority	*kroute_find_prio(struct kroute_prefix *,
			    uint8_t);
static struct kroute_node	*kroute_find_gw(struct kroute_priority *,
				    union ldpd_addr *);
static int		 kroute_insert(struct kroute *);
static int		 kroute_uninstall(struct kroute_node *);
static int		 kroute_remove(struct kroute *);
static void		 kroute_clear(void);
static __inline int	 kif_compare(struct kif_node *, struct kif_node *);
static struct kif_node	*kif_find(unsigned short);
static struct kif_node	*kif_insert(unsigned short);
static int		 kif_remove(struct kif_node *);
static struct kif_node	*kif_update(unsigned short, int, struct if_data *,
			    struct sockaddr_dl *, int *);
static struct kroute_priority	*kroute_match(int, union ldpd_addr *);
static uint8_t		 prefixlen_classful(in_addr_t);
static void		 get_rtaddrs(int, struct sockaddr *,
			    struct sockaddr **);
static void		 if_change(unsigned short, int, struct if_data *,
		 	   struct sockaddr_dl *);
static void		 if_newaddr(unsigned short, struct sockaddr *,
			    struct sockaddr *, struct sockaddr *);
static void		 if_deladdr(unsigned short, struct sockaddr *,
			    struct sockaddr *, struct sockaddr *);
static void		 if_announce(void *);
static int		 send_rtmsg(int, int, struct kroute *, int);
static int		 send_rtmsg_v4(int fd, int, struct kroute *, int);
static int		 send_rtmsg_v6(int fd, int, struct kroute *, int);
static int		 fetchtable(void);
static int		 fetchifs(void);
static int		 dispatch_rtmsg(void);
static int		 rtmsg_process(char *, size_t);
static int		 rtmsg_process_route(struct rt_msghdr *,
			    struct sockaddr *[RTAX_MAX]);
static int		 kmpw_install(const char *, struct kpw *);
static int		 kmpw_uninstall(const char *);
131
/* Emit the red-black tree functions for both tree types. */
RB_GENERATE(kroute_tree, kroute_prefix, entry, kroute_compare)
RB_GENERATE(kif_tree, kif_node, entry, kif_compare)

/* Roots of the prefix tree (krt) and of the interface tree (kit). */
static struct kroute_tree	 krt = RB_INITIALIZER(&krt);
static struct kif_tree		 kit = RB_INITIALIZER(&kit);
137
138int
139kif_init(void)
140{
141	if (fetchifs() == -1)
142		return (-1);
143
144	if ((kr_state.ioctl_fd = socket(AF_INET,
145	    SOCK_DGRAM | SOCK_CLOEXEC | SOCK_NONBLOCK, 0)) == -1) {
146		log_warn("%s: ioctl socket", __func__);
147		return (-1);
148	}
149
150	return (0);
151}
152
153int
154kr_init(int fs, unsigned int rdomain)
155{
156	int		opt = 0, rcvbuf, default_rcvbuf;
157	socklen_t	optlen;
158	unsigned int	rtfilter;
159
160	kr_state.fib_sync = fs;
161	kr_state.rdomain = rdomain;
162
163	if ((kr_state.fd = socket(AF_ROUTE,
164	    SOCK_RAW | SOCK_CLOEXEC | SOCK_NONBLOCK, 0)) == -1) {
165		log_warn("%s: socket", __func__);
166		return (-1);
167	}
168
169	/* not interested in my own messages */
170	if (setsockopt(kr_state.fd, SOL_SOCKET, SO_USELOOPBACK,
171	    &opt, sizeof(opt)) == -1)
172		log_warn("%s: setsockopt(SO_USELOOPBACK)", __func__);
173
174	/* filter out unwanted messages */
175	rtfilter = ROUTE_FILTER(RTM_ADD) | ROUTE_FILTER(RTM_GET) |
176	    ROUTE_FILTER(RTM_CHANGE) | ROUTE_FILTER(RTM_DELETE) |
177	    ROUTE_FILTER(RTM_IFINFO) | ROUTE_FILTER(RTM_NEWADDR) |
178	    ROUTE_FILTER(RTM_DELADDR) | ROUTE_FILTER(RTM_IFANNOUNCE);
179
180	if (setsockopt(kr_state.fd, AF_ROUTE, ROUTE_MSGFILTER,
181	    &rtfilter, sizeof(rtfilter)) == -1)
182		log_warn("%s: setsockopt(ROUTE_MSGFILTER)", __func__);
183
184	/* grow receive buffer, don't wanna miss messages */
185	optlen = sizeof(default_rcvbuf);
186	if (getsockopt(kr_state.fd, SOL_SOCKET, SO_RCVBUF,
187	    &default_rcvbuf, &optlen) == -1)
188		log_warn("%s: getsockopt SOL_SOCKET SO_RCVBUF", __func__);
189	else
190		for (rcvbuf = MAX_RTSOCK_BUF;
191		    rcvbuf > default_rcvbuf &&
192		    setsockopt(kr_state.fd, SOL_SOCKET, SO_RCVBUF,
193		    &rcvbuf, sizeof(rcvbuf)) == -1 && errno == ENOBUFS;
194		    rcvbuf /= 2)
195			;	/* nothing */
196
197	kr_state.pid = getpid();
198	kr_state.rtseq = 1;
199
200	if (fetchtable() == -1)
201		return (-1);
202
203	event_set(&kr_state.ev, kr_state.fd, EV_READ | EV_PERSIST,
204	    kr_dispatch_msg, NULL);
205	event_add(&kr_state.ev, NULL);
206
207	return (0);
208}
209
210void
211kif_redistribute(const char *ifname)
212{
213	struct kif_node		*kif;
214	struct kif_addr		*ka;
215
216	RB_FOREACH(kif, kif_tree, &kit) {
217		if (kif->k.rdomain != kr_state.rdomain)
218			continue;
219
220		if (ifname && strcmp(kif->k.ifname, ifname) != 0)
221			continue;
222
223		TAILQ_FOREACH(ka, &kif->addrs, entry)
224			main_imsg_compose_ldpe(IMSG_NEWADDR, 0, &ka->a,
225			    sizeof(ka->a));
226	}
227}
228
229int
230kr_change(struct kroute *kr)
231{
232	struct kroute_prefix	*kp;
233	struct kroute_priority	*kprio;
234	struct kroute_node	*kn;
235	int			 action = RTM_ADD;
236
237	kp = kroute_find_prefix(kr->af, &kr->prefix, kr->prefixlen);
238	if (kp == NULL)
239		goto miss;
240
241	kprio = kroute_find_prio(kp, kr->priority);
242	if (kprio == NULL)
243		goto miss;
244
245	kn = kroute_find_gw(kprio, &kr->nexthop);
246	if (kn == NULL)
247		goto miss;
248
249	if (kn->r.flags & F_LDPD_INSERTED)
250		action = RTM_CHANGE;
251
252	kn->r.local_label = kr->local_label;
253	kn->r.remote_label = kr->remote_label;
254	kn->r.flags = kn->r.flags | F_LDPD_INSERTED;
255
256	/* send update */
257	if (send_rtmsg(kr_state.fd, action, &kn->r, AF_MPLS) == -1)
258		return (-1);
259
260	if (ldp_addrisset(kn->r.af, &kn->r.nexthop) &&
261	    kn->r.remote_label != NO_LABEL) {
262		if (send_rtmsg(kr_state.fd, RTM_CHANGE, &kn->r, kn->r.af) == -1)
263			return (-1);
264	}
265
266	return (0);
267
268 miss:
269	log_warnx("%s: lost FEC %s/%d nexthop %s", __func__,
270	    log_addr(kr->af, &kr->prefix), kr->prefixlen,
271	    log_addr(kr->af, &kr->nexthop));
272	return (-1);
273}
274
275int
276kr_delete(struct kroute *kr)
277{
278	struct kroute_prefix	*kp;
279	struct kroute_priority	*kprio;
280	struct kroute_node	*kn;
281	int			 update = 0;
282
283	kp = kroute_find_prefix(kr->af, &kr->prefix, kr->prefixlen);
284	if (kp == NULL)
285		return (0);
286	kprio = kroute_find_prio(kp, kr->priority);
287	if (kprio == NULL)
288		return (0);
289	kn = kroute_find_gw(kprio, &kr->nexthop);
290	if (kn == NULL)
291		return (0);
292
293	if (!(kn->r.flags & F_LDPD_INSERTED))
294		return (0);
295	if (ldp_addrisset(kn->r.af, &kn->r.nexthop) &&
296	    kn->r.remote_label != NO_LABEL)
297		update = 1;
298
299	/* kill MPLS LSP */
300	if (send_rtmsg(kr_state.fd, RTM_DELETE, &kn->r, AF_MPLS) == -1)
301		return (-1);
302
303	kn->r.flags &= ~F_LDPD_INSERTED;
304	kn->r.local_label = NO_LABEL;
305	kn->r.remote_label = NO_LABEL;
306
307	if (update &&
308	    send_rtmsg(kr_state.fd, RTM_CHANGE, &kn->r, kn->r.af) == -1)
309		return (-1);
310
311	return (0);
312}
313
/*
 * Tear down the kernel interface state: decouple from the FIB first,
 * then drop all routes and finally all interfaces.
 */
void
kr_shutdown(void)
{
	/* decoupling needs the route and interface trees still populated */
	kr_fib_decouple();

	kroute_clear();
	kif_clear();
}
321
322void
323kr_fib_couple(void)
324{
325	struct kroute_prefix	*kp;
326	struct kroute_priority	*kprio;
327	struct kroute_node	*kn;
328	struct kif_node		*kif;
329
330	if (kr_state.fib_sync == 1)	/* already coupled */
331		return;
332
333	kr_state.fib_sync = 1;
334
335	RB_FOREACH(kp, kroute_tree, &krt) {
336		kprio = TAILQ_FIRST(&kp->priorities);
337		if (kprio == NULL)
338			continue;
339
340		TAILQ_FOREACH(kn, &kprio->nexthops, entry) {
341			if (!(kn->r.flags & F_LDPD_INSERTED))
342				continue;
343
344			send_rtmsg(kr_state.fd, RTM_ADD, &kn->r, AF_MPLS);
345
346			if (ldp_addrisset(kn->r.af, &kn->r.nexthop) &&
347			    kn->r.remote_label != NO_LABEL) {
348				send_rtmsg(kr_state.fd, RTM_CHANGE,
349				    &kn->r, kn->r.af);
350			}
351		}
352	}
353
354	RB_FOREACH(kif, kif_tree, &kit)
355		if (kif->kpw)
356			kmpw_install(kif->k.ifname, kif->kpw);
357
358	log_info("kernel routing table coupled");
359}
360
361void
362kr_fib_decouple(void)
363{
364	struct kroute_prefix	*kp;
365	struct kroute_priority	*kprio;
366	struct kroute_node	*kn;
367	uint32_t		 rl;
368	struct kif_node		*kif;
369
370	if (kr_state.fib_sync == 0)	/* already decoupled */
371		return;
372
373	RB_FOREACH(kp, kroute_tree, &krt) {
374		kprio = TAILQ_FIRST(&kp->priorities);
375		if (kprio == NULL)
376			continue;
377
378		TAILQ_FOREACH(kn, &kprio->nexthops, entry) {
379			if (!(kn->r.flags & F_LDPD_INSERTED))
380				continue;
381
382			send_rtmsg(kr_state.fd, RTM_DELETE,
383			    &kn->r, AF_MPLS);
384
385			if (ldp_addrisset(kn->r.af, &kn->r.nexthop) &&
386			    kn->r.remote_label != NO_LABEL) {
387				rl = kn->r.remote_label;
388				kn->r.remote_label = NO_LABEL;
389				send_rtmsg(kr_state.fd, RTM_CHANGE,
390				    &kn->r, kn->r.af);
391				kn->r.remote_label = rl;
392			}
393		}
394	}
395
396	RB_FOREACH(kif, kif_tree, &kit)
397		if (kif->kpw)
398			kmpw_uninstall(kif->k.ifname);
399
400	kr_state.fib_sync = 0;
401	log_info("kernel routing table decoupled");
402}
403
404void
405kr_change_egress_label(int af, int was_implicit)
406{
407	struct kroute_prefix	*kp;
408	struct kroute_priority	*kprio;
409	struct kroute_node	*kn;
410
411	RB_FOREACH(kp, kroute_tree, &krt) {
412		if (kp->af != af)
413			continue;
414
415		TAILQ_FOREACH(kprio, &kp->priorities, entry) {
416			TAILQ_FOREACH(kn, &kprio->nexthops, entry) {
417				if (kn->r.local_label > MPLS_LABEL_RESERVED_MAX)
418					continue;
419
420				if (!was_implicit) {
421					kn->r.local_label = MPLS_LABEL_IMPLNULL;
422					continue;
423				}
424
425				switch (kn->r.af) {
426				case AF_INET:
427					kn->r.local_label = MPLS_LABEL_IPV4NULL;
428					break;
429				case AF_INET6:
430					kn->r.local_label = MPLS_LABEL_IPV6NULL;
431					break;
432				default:
433					break;
434				}
435			}
436		}
437	}
438}
439
/*
 * Event callback for the routing socket.  The arguments are unused;
 * pending messages are handled by dispatch_rtmsg() and a failure there
 * terminates the event loop.
 */
static void
kr_dispatch_msg(int fd, short event, void *bula)
{
	int	rv;

	rv = dispatch_rtmsg();
	if (rv == -1)
		event_loopexit(NULL);
}
446
447void
448kr_show_route(struct imsg *imsg)
449{
450	struct kroute_prefix	*kp;
451	struct kroute_priority	*kprio;
452	struct kroute_node	*kn;
453	int			 flags;
454	struct kroute		 kr;
455
456	switch (imsg->hdr.type) {
457	case IMSG_CTL_KROUTE:
458		if (imsg->hdr.len != IMSG_HEADER_SIZE + sizeof(flags)) {
459			log_warnx("%s: wrong imsg len", __func__);
460			return;
461		}
462		memcpy(&flags, imsg->data, sizeof(flags));
463
464		RB_FOREACH(kp, kroute_tree, &krt)
465			TAILQ_FOREACH(kprio, &kp->priorities, entry)
466				TAILQ_FOREACH(kn, &kprio->nexthops, entry) {
467					if (flags && !(kn->r.flags & flags))
468						continue;
469
470					main_imsg_compose_ldpe(IMSG_CTL_KROUTE,
471					    imsg->hdr.pid, &kn->r,
472					    sizeof(kn->r));
473				}
474		break;
475	case IMSG_CTL_KROUTE_ADDR:
476		if (imsg->hdr.len != IMSG_HEADER_SIZE + sizeof(kr)) {
477			log_warnx("%s: wrong imsg len", __func__);
478			return;
479		}
480		memcpy(&kr, imsg->data, sizeof(kr));
481
482		kprio = kroute_match(kr.af, &kr.prefix);
483		if (kprio == NULL)
484			break;
485
486		TAILQ_FOREACH(kn, &kprio->nexthops, entry)
487			main_imsg_compose_ldpe(IMSG_CTL_KROUTE, imsg->hdr.pid,
488			    &kn->r, sizeof(kn->r));
489		break;
490	default:
491		log_debug("%s: error handling imsg", __func__);
492		break;
493	}
494	main_imsg_compose_ldpe(IMSG_CTL_END, imsg->hdr.pid, NULL, 0);
495}
496
497void
498kr_ifinfo(char *ifname, pid_t pid)
499{
500	struct kif_node	*kif;
501
502	RB_FOREACH(kif, kif_tree, &kit)
503		if (ifname == NULL || !strcmp(ifname, kif->k.ifname)) {
504			main_imsg_compose_ldpe(IMSG_CTL_IFINFO,
505			    pid, &kif->k, sizeof(kif->k));
506		}
507
508	main_imsg_compose_ldpe(IMSG_CTL_END, pid, NULL, 0);
509}
510
511static void
512kr_redist_remove(struct kroute *kr)
513{
514	/* was the route redistributed? */
515	if ((kr->flags & F_REDISTRIBUTED) == 0)
516		return;
517
518	/* remove redistributed flag */
519	kr->flags &= ~F_REDISTRIBUTED;
520	main_imsg_compose_lde(IMSG_NETWORK_DEL, 0, kr, sizeof(*kr));
521}
522
523static int
524kr_redist_eval(struct kroute *kr)
525{
526	/* was the route redistributed? */
527	if (kr->flags & F_REDISTRIBUTED)
528		goto dont_redistribute;
529
530	/* Dynamic routes are not redistributable. */
531	if (kr->flags & F_DYNAMIC)
532		goto dont_redistribute;
533
534	/* filter-out non-redistributable addresses */
535	if (bad_addr(kr->af, &kr->prefix) ||
536	    (kr->af == AF_INET6 && IN6_IS_SCOPE_EMBED(&kr->prefix.v6)))
537		goto dont_redistribute;
538
539	/* do not redistribute the default route */
540	if (kr->prefixlen == 0)
541		goto dont_redistribute;
542
543	/*
544	 * Consider networks with nexthop loopback as not redistributable
545	 * unless it is a reject or blackhole route.
546	 */
547	switch (kr->af) {
548	case AF_INET:
549		if (kr->nexthop.v4.s_addr == htonl(INADDR_LOOPBACK) &&
550		    !(kr->flags & (F_BLACKHOLE|F_REJECT)))
551			goto dont_redistribute;
552		break;
553	case AF_INET6:
554		if (IN6_IS_ADDR_LOOPBACK(&kr->nexthop.v6) &&
555		    !(kr->flags & (F_BLACKHOLE|F_REJECT)))
556			goto dont_redistribute;
557		break;
558	default:
559		log_debug("%s: unexpected address-family", __func__);
560		break;
561	}
562
563	/* prefix should be redistributed */
564	kr->flags |= F_REDISTRIBUTED;
565	main_imsg_compose_lde(IMSG_NETWORK_ADD, 0, kr, sizeof(*kr));
566	return (1);
567
568 dont_redistribute:
569	return (0);
570}
571
572static void
573kr_redistribute(struct kroute_prefix *kp)
574{
575	struct kroute_priority	*kprio;
576	struct kroute_node	*kn;
577
578	TAILQ_FOREACH_REVERSE(kprio, &kp->priorities, plist, entry) {
579		if (kprio == TAILQ_FIRST(&kp->priorities)) {
580			TAILQ_FOREACH(kn, &kprio->nexthops, entry)
581				kr_redist_eval(&kn->r);
582		} else {
583			TAILQ_FOREACH(kn, &kprio->nexthops, entry)
584				kr_redist_remove(&kn->r);
585		}
586	}
587}
588
589/* rb-tree compare */
590static __inline int
591kroute_compare(struct kroute_prefix *a, struct kroute_prefix *b)
592{
593	int		 addrcmp;
594
595	if (a->af < b->af)
596		return (-1);
597	if (a->af > b->af)
598		return (1);
599
600	addrcmp = ldp_addrcmp(a->af, &a->prefix, &b->prefix);
601	if (addrcmp != 0)
602		return (addrcmp);
603
604	if (a->prefixlen < b->prefixlen)
605		return (-1);
606	if (a->prefixlen > b->prefixlen)
607		return (1);
608
609	return (0);
610}
611
612/* tree management */
613static struct kroute_prefix *
614kroute_find_prefix(int af, union ldpd_addr *prefix, uint8_t prefixlen)
615{
616	struct kroute_prefix	 s;
617
618	s.af = af;
619	s.prefix = *prefix;
620	s.prefixlen = prefixlen;
621
622	return (RB_FIND(kroute_tree, &krt, &s));
623}
624
625static struct kroute_priority *
626kroute_find_prio(struct kroute_prefix *kp, uint8_t prio)
627{
628	struct kroute_priority	*kprio;
629
630	/* RTP_ANY here picks the lowest priority node */
631	if (prio == RTP_ANY)
632		return (TAILQ_FIRST(&kp->priorities));
633
634	TAILQ_FOREACH(kprio, &kp->priorities, entry)
635		if (kprio->priority == prio)
636			return (kprio);
637
638	return (NULL);
639}
640
641static struct kroute_node *
642kroute_find_gw(struct kroute_priority *kprio, union ldpd_addr *nh)
643{
644	struct kroute_node	*kn;
645
646	TAILQ_FOREACH(kn, &kprio->nexthops, entry)
647		if (ldp_addrcmp(kprio->kp->af, &kn->r.nexthop, nh) == 0)
648			return (kn);
649
650	return (NULL);
651}
652
653static int
654kroute_insert(struct kroute *kr)
655{
656	struct kroute_prefix	*kp;
657	struct kroute_priority	*kprio, *tmp;
658	struct kroute_node	*kn;
659
660	kp = kroute_find_prefix(kr->af, &kr->prefix, kr->prefixlen);
661	if (kp == NULL) {
662		kp = calloc(1, sizeof((*kp)));
663		if (kp == NULL)
664			fatal(__func__);
665		kp->af = kr->af;
666		kp->prefix = kr->prefix;
667		kp->prefixlen = kr->prefixlen;
668		TAILQ_INIT(&kp->priorities);
669		RB_INSERT(kroute_tree, &krt, kp);
670	}
671
672	kprio = kroute_find_prio(kp, kr->priority);
673	if (kprio == NULL) {
674		kprio = calloc(1, sizeof(*kprio));
675		if (kprio == NULL)
676			fatal(__func__);
677		kprio->kp = kp;
678		kprio->priority = kr->priority;
679		TAILQ_INIT(&kprio->nexthops);
680
681		/* lower priorities first */
682		TAILQ_FOREACH(tmp, &kp->priorities, entry)
683			if (tmp->priority > kprio->priority)
684				break;
685		if (tmp)
686			TAILQ_INSERT_BEFORE(tmp, kprio, entry);
687		else
688			TAILQ_INSERT_TAIL(&kp->priorities, kprio, entry);
689	}
690
691	kn = kroute_find_gw(kprio, &kr->nexthop);
692	if (kn == NULL) {
693		kn = calloc(1, sizeof(*kn));
694		if (kn == NULL)
695			fatal(__func__);
696		kn->kprio = kprio;
697		kn->r = *kr;
698		TAILQ_INSERT_TAIL(&kprio->nexthops, kn, entry);
699	}
700
701	kr_redistribute(kp);
702	return (0);
703}
704
705static int
706kroute_uninstall(struct kroute_node *kn)
707{
708	/* kill MPLS LSP if one was installed */
709	if (kn->r.flags & F_LDPD_INSERTED)
710		if (send_rtmsg(kr_state.fd, RTM_DELETE, &kn->r, AF_MPLS) == -1)
711			return (-1);
712
713	return (0);
714}
715
716static int
717kroute_remove(struct kroute *kr)
718{
719	struct kroute_prefix	*kp;
720	struct kroute_priority	*kprio;
721	struct kroute_node	*kn;
722
723	kp = kroute_find_prefix(kr->af, &kr->prefix, kr->prefixlen);
724	if (kp == NULL)
725		goto notfound;
726	kprio = kroute_find_prio(kp, kr->priority);
727	if (kprio == NULL)
728		goto notfound;
729	kn = kroute_find_gw(kprio, &kr->nexthop);
730	if (kn == NULL)
731		goto notfound;
732
733	kr_redist_remove(&kn->r);
734	kroute_uninstall(kn);
735
736	TAILQ_REMOVE(&kprio->nexthops, kn, entry);
737	free(kn);
738
739	if (TAILQ_EMPTY(&kprio->nexthops)) {
740		TAILQ_REMOVE(&kp->priorities, kprio, entry);
741		free(kprio);
742	}
743
744	if (TAILQ_EMPTY(&kp->priorities)) {
745		if (RB_REMOVE(kroute_tree, &krt, kp) == NULL) {
746			log_warnx("%s failed for %s/%u", __func__,
747			    log_addr(kr->af, &kr->prefix), kp->prefixlen);
748			return (-1);
749		}
750		free(kp);
751	} else
752		kr_redistribute(kp);
753
754	return (0);
755
756 notfound:
757	log_warnx("%s failed to find %s/%u", __func__,
758	    log_addr(kr->af, &kr->prefix), kr->prefixlen);
759	return (-1);
760}
761
762static void
763kroute_clear(void)
764{
765	struct kroute_prefix	*kp;
766	struct kroute_priority	*kprio;
767	struct kroute_node	*kn;
768
769	while ((kp = RB_MIN(kroute_tree, &krt)) != NULL) {
770		while ((kprio = TAILQ_FIRST(&kp->priorities)) != NULL) {
771			while ((kn = TAILQ_FIRST(&kprio->nexthops)) != NULL) {
772				kr_redist_remove(&kn->r);
773				kroute_uninstall(kn);
774				TAILQ_REMOVE(&kprio->nexthops, kn, entry);
775				free(kn);
776			}
777			TAILQ_REMOVE(&kp->priorities, kprio, entry);
778			free(kprio);
779		}
780		RB_REMOVE(kroute_tree, &krt, kp);
781		free(kp);
782	}
783}
784
/*
 * rb-tree compare for kif_tree: nodes are ordered by interface index.
 * The result is b's index minus a's index, so it is zero for equal
 * indices and positive when a's index is the smaller one.
 */
static __inline int
kif_compare(struct kif_node *a, struct kif_node *b)
{
	return (b->k.ifindex - a->k.ifindex);
}
790
791/* tree management */
792static struct kif_node *
793kif_find(unsigned short ifindex)
794{
795	struct kif_node	s;
796
797	memset(&s, 0, sizeof(s));
798	s.k.ifindex = ifindex;
799
800	return (RB_FIND(kif_tree, &kit, &s));
801}
802
803struct kif *
804kif_findname(char *ifname)
805{
806	struct kif_node	*kif;
807
808	RB_FOREACH(kif, kif_tree, &kit)
809		if (!strcmp(ifname, kif->k.ifname))
810			return (&kif->k);
811
812	return (NULL);
813}
814
815static struct kif_node *
816kif_insert(unsigned short ifindex)
817{
818	struct kif_node	*kif;
819
820	if ((kif = calloc(1, sizeof(struct kif_node))) == NULL)
821		return (NULL);
822
823	kif->k.ifindex = ifindex;
824	TAILQ_INIT(&kif->addrs);
825
826	if (RB_INSERT(kif_tree, &kit, kif) != NULL)
827		fatalx("kif_insert: RB_INSERT");
828
829	return (kif);
830}
831
832static int
833kif_remove(struct kif_node *kif)
834{
835	struct kif_addr	*ka;
836
837	if (RB_REMOVE(kif_tree, &kit, kif) == NULL) {
838		log_warnx("RB_REMOVE(kif_tree, &kit, kif)");
839		return (-1);
840	}
841
842	while ((ka = TAILQ_FIRST(&kif->addrs)) != NULL) {
843		main_imsg_compose_ldpe(IMSG_DELADDR, 0, &ka->a, sizeof(ka->a));
844		TAILQ_REMOVE(&kif->addrs, ka, entry);
845		free(ka);
846	}
847	free(kif);
848	return (0);
849}
850
851void
852kif_clear(void)
853{
854	struct kif_node	*kif;
855
856	while ((kif = RB_MIN(kif_tree, &kit)) != NULL)
857		kif_remove(kif);
858}
859
860static struct kif_node *
861kif_update(unsigned short ifindex, int flags, struct if_data *ifd,
862    struct sockaddr_dl *sdl, int *link_old)
863{
864	struct kif_node		*kif;
865
866	if ((kif = kif_find(ifindex)) == NULL) {
867		if ((kif = kif_insert(ifindex)) == NULL)
868			return (NULL);
869	} else
870		*link_old = (kif->k.flags & IFF_UP) &&
871		    LINK_STATE_IS_UP(kif->k.link_state);
872
873	kif->k.flags = flags;
874	kif->k.link_state = ifd->ifi_link_state;
875	if (sdl)
876		memcpy(kif->k.mac, LLADDR(sdl), sizeof(kif->k.mac));
877	kif->k.if_type = ifd->ifi_type;
878	kif->k.baudrate = ifd->ifi_baudrate;
879	kif->k.mtu = ifd->ifi_mtu;
880	kif->k.rdomain = ifd->ifi_rdomain;
881
882	if (sdl && sdl->sdl_family == AF_LINK) {
883		if (sdl->sdl_nlen >= sizeof(kif->k.ifname))
884			memcpy(kif->k.ifname, sdl->sdl_data,
885			    sizeof(kif->k.ifname) - 1);
886		else if (sdl->sdl_nlen > 0)
887			memcpy(kif->k.ifname, sdl->sdl_data,
888			    sdl->sdl_nlen);
889		/* string already terminated via calloc() */
890	}
891
892	return (kif);
893}
894
895static struct kroute_priority *
896kroute_match(int af, union ldpd_addr *key)
897{
898	int			 i, maxprefixlen;
899	struct kroute_prefix	*kp;
900	struct kroute_priority	*kprio;
901	union ldpd_addr		 addr;
902
903	switch (af) {
904	case AF_INET:
905		maxprefixlen = 32;
906		break;
907	case AF_INET6:
908		maxprefixlen = 128;
909		break;
910	default:
911		log_warnx("%s: unknown af", __func__);
912		return (NULL);
913	}
914
915	for (i = maxprefixlen; i >= 0; i--) {
916		ldp_applymask(af, &addr, key, i);
917
918		kp = kroute_find_prefix(af, &addr, i);
919		if (kp == NULL)
920			continue;
921
922		kprio = kroute_find_prio(kp, RTP_ANY);
923		if (kprio != NULL)
924			return (kprio);
925	}
926
927	return (NULL);
928}
929
/* misc */

/*
 * Map an IPv4 address value to the prefix length of its address class:
 * 8 below 0x80000000, 16 below 0xc0000000, 24 below 0xe0000000,
 * 4 below 0xf0000000 and 32 for everything above.
 */
static uint8_t
prefixlen_classful(in_addr_t ina)
{
	uint8_t	len;

	if (ina < 0x80000000U)		/* class A */
		len = 8;
	else if (ina < 0xc0000000U)	/* class B */
		len = 16;
	else if (ina < 0xe0000000U)	/* class C */
		len = 24;
	else if (ina < 0xf0000000U)	/* class D */
		len = 4;
	else				/* class E */
		len = 32;

	return (len);
}
947
/*
 * Round a up to the next multiple of sizeof(long); values that are not
 * positive yield sizeof(long).  The argument is evaluated twice.
 */
#define ROUNDUP(a)							\
	((a) > 0 ? ((((a) - 1) | (sizeof(long) - 1)) + 1) : sizeof(long))
950
951static void
952get_rtaddrs(int addrs, struct sockaddr *sa, struct sockaddr **rti_info)
953{
954	int	i;
955
956	for (i = 0; i < RTAX_MAX; i++) {
957		if (addrs & (1 << i)) {
958			rti_info[i] = sa;
959			sa = (struct sockaddr *)((char *)(sa) +
960			    ROUNDUP(sa->sa_len));
961		} else
962			rti_info[i] = NULL;
963	}
964}
965
966static void
967if_change(unsigned short ifindex, int flags, struct if_data *ifd,
968    struct sockaddr_dl *sdl)
969{
970	struct kif_node		*kif;
971	struct kif_addr		*ka;
972	int			 link_old = 0, link_new;
973
974	kif = kif_update(ifindex, flags, ifd, sdl, &link_old);
975	if (!kif) {
976		log_warn("%s: kif_update(%u)", __func__, ifindex);
977		return;
978	}
979	link_new = (kif->k.flags & IFF_UP) &&
980	    LINK_STATE_IS_UP(kif->k.link_state);
981
982	if (link_new == link_old)
983		return;
984
985	main_imsg_compose_ldpe(IMSG_IFSTATUS, 0, &kif->k, sizeof(struct kif));
986	if (link_new) {
987		TAILQ_FOREACH(ka, &kif->addrs, entry)
988			main_imsg_compose_ldpe(IMSG_NEWADDR, 0, &ka->a,
989			    sizeof(ka->a));
990	} else {
991		TAILQ_FOREACH(ka, &kif->addrs, entry)
992			main_imsg_compose_ldpe(IMSG_DELADDR, 0, &ka->a,
993			    sizeof(ka->a));
994	}
995}
996
997static void
998if_newaddr(unsigned short ifindex, struct sockaddr *ifa, struct sockaddr *mask,
999    struct sockaddr *brd)
1000{
1001	struct kif_node		*kif;
1002	struct sockaddr_in	*ifa4, *mask4, *brd4;
1003	struct sockaddr_in6	*ifa6, *mask6, *brd6;
1004	struct kif_addr		*ka;
1005
1006	if (ifa == NULL)
1007		return;
1008	if ((kif = kif_find(ifindex)) == NULL) {
1009		log_warnx("%s: corresponding if %d not found", __func__,
1010		    ifindex);
1011		return;
1012	}
1013
1014	switch (ifa->sa_family) {
1015	case AF_INET:
1016		ifa4 = (struct sockaddr_in *) ifa;
1017		mask4 = (struct sockaddr_in *) mask;
1018		brd4 = (struct sockaddr_in *) brd;
1019
1020		/* filter out unwanted addresses */
1021		if (bad_addr_v4(ifa4->sin_addr))
1022			return;
1023
1024		if ((ka = calloc(1, sizeof(struct kif_addr))) == NULL)
1025			fatal("if_newaddr");
1026		ka->a.addr.v4 = ifa4->sin_addr;
1027		if (mask4)
1028			ka->a.prefixlen =
1029			    mask2prefixlen(mask4->sin_addr.s_addr);
1030		if (brd4)
1031			ka->a.dstbrd.v4 = brd4->sin_addr;
1032		break;
1033	case AF_INET6:
1034		ifa6 = (struct sockaddr_in6 *) ifa;
1035		mask6 = (struct sockaddr_in6 *) mask;
1036		brd6 = (struct sockaddr_in6 *) brd;
1037
1038		/* We only care about link-local and global-scope. */
1039		if (bad_addr_v6(&ifa6->sin6_addr))
1040			return;
1041
1042		clearscope(&ifa6->sin6_addr);
1043
1044		if ((ka = calloc(1, sizeof(struct kif_addr))) == NULL)
1045			fatal("if_newaddr");
1046		ka->a.addr.v6 = ifa6->sin6_addr;
1047		if (mask6)
1048			ka->a.prefixlen = mask2prefixlen6(mask6);
1049		if (brd6)
1050			ka->a.dstbrd.v6 = brd6->sin6_addr;
1051		break;
1052	default:
1053		return;
1054	}
1055
1056	ka->a.ifindex = ifindex;
1057	ka->a.af = ifa->sa_family;
1058	TAILQ_INSERT_TAIL(&kif->addrs, ka, entry);
1059
1060	/* notify ldpe about new address */
1061	main_imsg_compose_ldpe(IMSG_NEWADDR, 0, &ka->a, sizeof(ka->a));
1062}
1063
1064static void
1065if_deladdr(unsigned short ifindex, struct sockaddr *ifa, struct sockaddr *mask,
1066    struct sockaddr *brd)
1067{
1068	struct kif_node		*kif;
1069	struct sockaddr_in	*ifa4, *mask4, *brd4;
1070	struct sockaddr_in6	*ifa6, *mask6, *brd6;
1071	struct kaddr		 k;
1072	struct kif_addr		*ka, *nka;
1073
1074	if (ifa == NULL)
1075		return;
1076	if ((kif = kif_find(ifindex)) == NULL) {
1077		log_warnx("%s: corresponding if %d not found", __func__,
1078		    ifindex);
1079		return;
1080	}
1081
1082	memset(&k, 0, sizeof(k));
1083	k.af = ifa->sa_family;
1084	switch (ifa->sa_family) {
1085	case AF_INET:
1086		ifa4 = (struct sockaddr_in *) ifa;
1087		mask4 = (struct sockaddr_in *) mask;
1088		brd4 = (struct sockaddr_in *) brd;
1089
1090		/* filter out unwanted addresses */
1091		if (bad_addr_v4(ifa4->sin_addr))
1092			return;
1093
1094		k.addr.v4 = ifa4->sin_addr;
1095		if (mask4)
1096			k.prefixlen = mask2prefixlen(mask4->sin_addr.s_addr);
1097		if (brd4)
1098			k.dstbrd.v4 = brd4->sin_addr;
1099		break;
1100	case AF_INET6:
1101		ifa6 = (struct sockaddr_in6 *) ifa;
1102		mask6 = (struct sockaddr_in6 *) mask;
1103		brd6 = (struct sockaddr_in6 *) brd;
1104
1105		/* We only care about link-local and global-scope. */
1106		if (bad_addr_v6(&ifa6->sin6_addr))
1107			return;
1108
1109		clearscope(&ifa6->sin6_addr);
1110
1111		k.addr.v6 = ifa6->sin6_addr;
1112		if (mask6)
1113			k.prefixlen = mask2prefixlen6(mask6);
1114		if (brd6)
1115			k.dstbrd.v6 = brd6->sin6_addr;
1116		break;
1117	default:
1118		return;
1119	}
1120
1121	for (ka = TAILQ_FIRST(&kif->addrs); ka != NULL; ka = nka) {
1122		nka = TAILQ_NEXT(ka, entry);
1123
1124		if (ka->a.af != k.af ||
1125		    ka->a.prefixlen != k.prefixlen ||
1126		    ldp_addrcmp(ka->a.af, &ka->a.addr, &k.addr))
1127			continue;
1128
1129		/* notify ldpe about removed address */
1130		main_imsg_compose_ldpe(IMSG_DELADDR, 0, &ka->a, sizeof(ka->a));
1131		TAILQ_REMOVE(&kif->addrs, ka, entry);
1132		free(ka);
1133		return;
1134	}
1135}
1136
1137static void
1138if_announce(void *msg)
1139{
1140	struct if_announcemsghdr	*ifan;
1141	struct kif_node			*kif;
1142
1143	ifan = msg;
1144
1145	switch (ifan->ifan_what) {
1146	case IFAN_ARRIVAL:
1147		kif = kif_insert(ifan->ifan_index);
1148		if (kif)
1149			strlcpy(kif->k.ifname, ifan->ifan_name,
1150			    sizeof(kif->k.ifname));
1151		break;
1152	case IFAN_DEPARTURE:
1153		kif = kif_find(ifan->ifan_index);
1154		if (kif)
1155			kif_remove(kif);
1156		break;
1157	}
1158}
1159
1160/* rtsock */
1161static int
1162send_rtmsg(int fd, int action, struct kroute *kr, int family)
1163{
1164	switch (kr->af) {
1165	case AF_INET:
1166		return (send_rtmsg_v4(fd, action, kr, family));
1167	case AF_INET6:
1168		return (send_rtmsg_v6(fd, action, kr, family));
1169	default:
1170		fatalx("send_rtmsg: unknown af");
1171	}
1172}
1173
1174static int
1175send_rtmsg_v4(int fd, int action, struct kroute *kr, int family)
1176{
1177	struct iovec		iov[5];
1178	struct rt_msghdr	hdr;
1179	struct sockaddr_mpls	label_in, label_out;
1180	struct sockaddr_in	dst, mask, nexthop;
1181	int			iovcnt = 0;
1182
1183	if (kr_state.fib_sync == 0)
1184		return (0);
1185
1186	/*
1187	 * Reserved labels (implicit and explicit NULL) should not be added
1188	 * to the FIB.
1189	 */
1190	if (family == AF_MPLS && kr->local_label < MPLS_LABEL_RESERVED_MAX)
1191		return (0);
1192
1193	/* initialize header */
1194	memset(&hdr, 0, sizeof(hdr));
1195	hdr.rtm_version = RTM_VERSION;
1196
1197	hdr.rtm_type = action;
1198	hdr.rtm_flags = RTF_UP;
1199	hdr.rtm_fmask = RTF_MPLS;
1200	hdr.rtm_seq = kr_state.rtseq++;	/* overflow doesn't matter */
1201	hdr.rtm_msglen = sizeof(hdr);
1202	hdr.rtm_hdrlen = sizeof(struct rt_msghdr);
1203	hdr.rtm_priority = kr->priority;
1204	hdr.rtm_tableid = kr_state.rdomain;	/* rtableid */
1205	/* adjust iovec */
1206	iov[iovcnt].iov_base = &hdr;
1207	iov[iovcnt++].iov_len = sizeof(hdr);
1208
1209	if (family == AF_MPLS) {
1210		memset(&label_in, 0, sizeof(label_in));
1211		label_in.smpls_len = sizeof(label_in);
1212		label_in.smpls_family = AF_MPLS;
1213		label_in.smpls_label =
1214		    htonl(kr->local_label << MPLS_LABEL_OFFSET);
1215		/* adjust header */
1216		hdr.rtm_flags |= RTF_MPLS | RTF_MPATH;
1217		hdr.rtm_addrs |= RTA_DST;
1218		hdr.rtm_msglen += sizeof(label_in);
1219		/* adjust iovec */
1220		iov[iovcnt].iov_base = &label_in;
1221		iov[iovcnt++].iov_len = sizeof(label_in);
1222	} else {
1223		memset(&dst, 0, sizeof(dst));
1224		dst.sin_len = sizeof(dst);
1225		dst.sin_family = AF_INET;
1226		dst.sin_addr = kr->prefix.v4;
1227		/* adjust header */
1228		hdr.rtm_addrs |= RTA_DST;
1229		hdr.rtm_msglen += sizeof(dst);
1230		/* adjust iovec */
1231		iov[iovcnt].iov_base = &dst;
1232		iov[iovcnt++].iov_len = sizeof(dst);
1233	}
1234
1235	memset(&nexthop, 0, sizeof(nexthop));
1236	nexthop.sin_len = sizeof(nexthop);
1237	nexthop.sin_family = AF_INET;
1238	nexthop.sin_addr = kr->nexthop.v4;
1239	/* adjust header */
1240	hdr.rtm_flags |= RTF_GATEWAY;
1241	hdr.rtm_addrs |= RTA_GATEWAY;
1242	hdr.rtm_msglen += sizeof(nexthop);
1243	/* adjust iovec */
1244	iov[iovcnt].iov_base = &nexthop;
1245	iov[iovcnt++].iov_len = sizeof(nexthop);
1246
1247	if (family == AF_INET) {
1248		memset(&mask, 0, sizeof(mask));
1249		mask.sin_len = sizeof(mask);
1250		mask.sin_family = AF_INET;
1251		mask.sin_addr.s_addr = prefixlen2mask(kr->prefixlen);
1252		/* adjust header */
1253		hdr.rtm_addrs |= RTA_NETMASK;
1254		hdr.rtm_msglen += sizeof(mask);
1255		/* adjust iovec */
1256		iov[iovcnt].iov_base = &mask;
1257		iov[iovcnt++].iov_len = sizeof(mask);
1258	}
1259
1260	/* If action is RTM_DELETE we have to get rid of MPLS infos */
1261	if (kr->remote_label != NO_LABEL && action != RTM_DELETE) {
1262		memset(&label_out, 0, sizeof(label_out));
1263		label_out.smpls_len = sizeof(label_out);
1264		label_out.smpls_family = AF_MPLS;
1265		label_out.smpls_label =
1266		    htonl(kr->remote_label << MPLS_LABEL_OFFSET);
1267		/* adjust header */
1268		hdr.rtm_addrs |= RTA_SRC;
1269		hdr.rtm_flags |= RTF_MPLS;
1270		hdr.rtm_msglen += sizeof(label_out);
1271		/* adjust iovec */
1272		iov[iovcnt].iov_base = &label_out;
1273		iov[iovcnt++].iov_len = sizeof(label_out);
1274
1275		if (kr->remote_label == MPLS_LABEL_IMPLNULL) {
1276			if (family == AF_MPLS)
1277				hdr.rtm_mpls = MPLS_OP_POP;
1278			else
1279				return (0);
1280		} else {
1281			if (family == AF_MPLS)
1282				hdr.rtm_mpls = MPLS_OP_SWAP;
1283			else
1284				hdr.rtm_mpls = MPLS_OP_PUSH;
1285		}
1286	}
1287
1288 retry:
1289	if (writev(fd, iov, iovcnt) == -1) {
1290		if (errno == ESRCH) {
1291			if (hdr.rtm_type == RTM_CHANGE && family == AF_MPLS) {
1292				hdr.rtm_type = RTM_ADD;
1293				goto retry;
1294			} else if (hdr.rtm_type == RTM_DELETE) {
1295				log_info("route %s/%u vanished before delete",
1296				    inet_ntoa(kr->prefix.v4), kr->prefixlen);
1297				return (-1);
1298			}
1299		}
1300		log_warn("%s action %u, af %s, prefix %s/%u", __func__,
1301		    hdr.rtm_type, af_name(family), inet_ntoa(kr->prefix.v4),
1302		    kr->prefixlen);
1303		return (-1);
1304	}
1305
1306	return (0);
1307}
1308
1309static int
1310send_rtmsg_v6(int fd, int action, struct kroute *kr, int family)
1311{
1312	struct iovec		iov[5];
1313	struct rt_msghdr	hdr;
1314	struct sockaddr_mpls	label_in, label_out;
1315	struct sockaddr_in6	dst, mask, nexthop;
1316	int			iovcnt = 0;
1317
1318	if (kr_state.fib_sync == 0)
1319		return (0);
1320
1321	/*
1322	 * Reserved labels (implicit and explicit NULL) should not be added
1323	 * to the FIB.
1324	 */
1325	if (family == AF_MPLS && kr->local_label < MPLS_LABEL_RESERVED_MAX)
1326		return (0);
1327
1328	/* initialize header */
1329	memset(&hdr, 0, sizeof(hdr));
1330	hdr.rtm_version = RTM_VERSION;
1331
1332	hdr.rtm_type = action;
1333	hdr.rtm_flags = RTF_UP;
1334	hdr.rtm_fmask = RTF_MPLS;
1335	hdr.rtm_seq = kr_state.rtseq++;	/* overflow doesn't matter */
1336	hdr.rtm_msglen = sizeof(hdr);
1337	hdr.rtm_hdrlen = sizeof(struct rt_msghdr);
1338	hdr.rtm_priority = kr->priority;
1339	hdr.rtm_tableid = kr_state.rdomain;	/* rtableid */
1340	/* adjust iovec */
1341	iov[iovcnt].iov_base = &hdr;
1342	iov[iovcnt++].iov_len = sizeof(hdr);
1343
1344	if (family == AF_MPLS) {
1345		memset(&label_in, 0, sizeof(label_in));
1346		label_in.smpls_len = sizeof(label_in);
1347		label_in.smpls_family = AF_MPLS;
1348		label_in.smpls_label =
1349		    htonl(kr->local_label << MPLS_LABEL_OFFSET);
1350		/* adjust header */
1351		hdr.rtm_flags |= RTF_MPLS | RTF_MPATH;
1352		hdr.rtm_addrs |= RTA_DST;
1353		hdr.rtm_msglen += sizeof(label_in);
1354		/* adjust iovec */
1355		iov[iovcnt].iov_base = &label_in;
1356		iov[iovcnt++].iov_len = sizeof(label_in);
1357	} else {
1358		memset(&dst, 0, sizeof(dst));
1359		dst.sin6_len = sizeof(dst);
1360		dst.sin6_family = AF_INET6;
1361		dst.sin6_addr = kr->prefix.v6;
1362		/* adjust header */
1363		hdr.rtm_addrs |= RTA_DST;
1364		hdr.rtm_msglen += ROUNDUP(sizeof(dst));
1365		/* adjust iovec */
1366		iov[iovcnt].iov_base = &dst;
1367		iov[iovcnt++].iov_len = ROUNDUP(sizeof(dst));
1368	}
1369
1370	memset(&nexthop, 0, sizeof(nexthop));
1371	nexthop.sin6_len = sizeof(nexthop);
1372	nexthop.sin6_family = AF_INET6;
1373	nexthop.sin6_addr = kr->nexthop.v6;
1374	nexthop.sin6_scope_id = kr->ifindex;
1375	/*
1376	 * XXX we should set the sin6_scope_id but the kernel
1377	 * XXX does not expect it that way. It must be fiddled
1378	 * XXX into the sin6_addr. Welcome to the typical
1379	 * XXX IPv6 insanity and all without wine bottles.
1380	 */
1381	embedscope(&nexthop);
1382
1383	/* adjust header */
1384	hdr.rtm_flags |= RTF_GATEWAY;
1385	hdr.rtm_addrs |= RTA_GATEWAY;
1386	hdr.rtm_msglen += ROUNDUP(sizeof(nexthop));
1387	/* adjust iovec */
1388	iov[iovcnt].iov_base = &nexthop;
1389	iov[iovcnt++].iov_len = ROUNDUP(sizeof(nexthop));
1390
1391	if (family == AF_INET6) {
1392		memset(&mask, 0, sizeof(mask));
1393		mask.sin6_len = sizeof(mask);
1394		mask.sin6_family = AF_INET6;
1395		mask.sin6_addr = *prefixlen2mask6(kr->prefixlen);
1396		/* adjust header */
1397		if (kr->prefixlen == 128)
1398			hdr.rtm_flags |= RTF_HOST;
1399		hdr.rtm_addrs |= RTA_NETMASK;
1400		hdr.rtm_msglen += ROUNDUP(sizeof(mask));
1401		/* adjust iovec */
1402		iov[iovcnt].iov_base = &mask;
1403		iov[iovcnt++].iov_len = ROUNDUP(sizeof(mask));
1404	}
1405
1406	/* If action is RTM_DELETE we have to get rid of MPLS infos */
1407	if (kr->remote_label != NO_LABEL && action != RTM_DELETE) {
1408		memset(&label_out, 0, sizeof(label_out));
1409		label_out.smpls_len = sizeof(label_out);
1410		label_out.smpls_family = AF_MPLS;
1411		label_out.smpls_label =
1412		    htonl(kr->remote_label << MPLS_LABEL_OFFSET);
1413		/* adjust header */
1414		hdr.rtm_addrs |= RTA_SRC;
1415		hdr.rtm_flags |= RTF_MPLS;
1416		hdr.rtm_msglen += sizeof(label_out);
1417		/* adjust iovec */
1418		iov[iovcnt].iov_base = &label_out;
1419		iov[iovcnt++].iov_len = sizeof(label_out);
1420
1421		if (kr->remote_label == MPLS_LABEL_IMPLNULL) {
1422			if (family == AF_MPLS)
1423				hdr.rtm_mpls = MPLS_OP_POP;
1424			else
1425				return (0);
1426		} else {
1427			if (family == AF_MPLS)
1428				hdr.rtm_mpls = MPLS_OP_SWAP;
1429			else
1430				hdr.rtm_mpls = MPLS_OP_PUSH;
1431		}
1432	}
1433
1434 retry:
1435	if (writev(fd, iov, iovcnt) == -1) {
1436		if (errno == ESRCH) {
1437			if (hdr.rtm_type == RTM_CHANGE && family == AF_MPLS) {
1438				hdr.rtm_type = RTM_ADD;
1439				goto retry;
1440			} else if (hdr.rtm_type == RTM_DELETE) {
1441				log_info("route %s/%u vanished before delete",
1442				    log_addr(kr->af, &kr->prefix),
1443				    kr->prefixlen);
1444				return (-1);
1445			}
1446		}
1447		log_warn("%s action %u, af %s, prefix %s/%u", __func__,
1448		    hdr.rtm_type, af_name(family), log_addr(kr->af,
1449		    &kr->prefix), kr->prefixlen);
1450		return (-1);
1451	}
1452	return (0);
1453}
1454
/*
 * Fetch a NET_RT_DUMP of routing table kr_state.rdomain through sysctl()
 * and pass the resulting buffer to rtmsg_process().
 *
 * Returns the result of rtmsg_process(), or -1 if either sysctl() call or
 * the buffer allocation fails.
 */
static int
fetchtable(void)
{
	int	 mib[7] = { CTL_NET, PF_ROUTE, 0, 0, NET_RT_DUMP, 0, 0 };
	size_t	 need;
	char	*dump;
	int	 ret = -1;

	mib[6] = kr_state.rdomain;	/* rtableid */

	/* first call only reports the size needed for the dump */
	if (sysctl(mib, 7, NULL, &need, NULL, 0) == -1) {
		log_warn("sysctl");
		return (-1);
	}

	dump = malloc(need);
	if (dump == NULL) {
		log_warn(__func__);
		return (-1);
	}

	if (sysctl(mib, 7, dump, &need, NULL, 0) == -1)
		log_warn("sysctl");
	else
		ret = rtmsg_process(dump, need);

	free(dump);
	return (ret);
}
1490
/*
 * Fetch a NET_RT_IFLIST dump through sysctl() and pass the resulting
 * buffer to rtmsg_process().
 *
 * Returns the result of rtmsg_process(), or -1 if either sysctl() call or
 * the buffer allocation fails.
 */
static int
fetchifs(void)
{
	/* mib[3] == 0 is the address family wildcard */
	int	 mib[6] = { CTL_NET, PF_ROUTE, 0, 0, NET_RT_IFLIST, 0 };
	size_t	 need;
	char	*dump;
	int	 ret = -1;

	/* first call only reports the size needed for the dump */
	if (sysctl(mib, 6, NULL, &need, NULL, 0) == -1) {
		log_warn("sysctl");
		return (-1);
	}

	dump = malloc(need);
	if (dump == NULL) {
		log_warn(__func__);
		return (-1);
	}

	if (sysctl(mib, 6, dump, &need, NULL, 0) == -1)
		log_warn("sysctl");
	else
		ret = rtmsg_process(dump, need);

	free(dump);
	return (ret);
}
1525
1526static int
1527dispatch_rtmsg(void)
1528{
1529	char			 buf[RT_BUF_SIZE];
1530	ssize_t			 n;
1531
1532	if ((n = read(kr_state.fd, &buf, sizeof(buf))) == -1) {
1533		if (errno == EAGAIN || errno == EINTR)
1534			return (0);
1535		log_warn("%s: read error", __func__);
1536		return (-1);
1537	}
1538
1539	if (n == 0) {
1540		log_warnx("routing socket closed");
1541		return (-1);
1542	}
1543
1544	return (rtmsg_process(buf, n));
1545}
1546
/*
 * Walk a buffer holding len bytes of back-to-back routing messages and act
 * on each of them.
 *
 * Messages whose rtm_version is not RTM_VERSION are skipped.  Route
 * messages (RTM_ADD, RTM_GET, RTM_CHANGE, RTM_DELETE) that pass the filters
 * below are handed to rtmsg_process_route(); interface and address messages
 * are dispatched to if_change(), if_newaddr(), if_deladdr() and
 * if_announce().  Everything else is ignored.
 *
 * A message that does not fit in the buffer, or that claims a length of
 * zero, is fatal.
 *
 * Returns the final buffer offset, or -1 as soon as rtmsg_process_route()
 * fails.
 */
static int
rtmsg_process(char *buf, size_t len)
{
	struct rt_msghdr	*rtm;
	struct if_msghdr	 ifm;
	struct ifa_msghdr	*ifam;
	struct sockaddr		*sa, *rti_info[RTAX_MAX];
	size_t			 offset;
	char			*next;

	for (offset = 0; offset < len; offset += rtm->rtm_msglen) {
		next = buf + offset;
		rtm = (struct rt_msghdr *)next;
		/* the length field, then the whole message, must fit */
		if (len < offset + sizeof(unsigned short) ||
		    len < offset + rtm->rtm_msglen)
			fatalx("rtmsg_process: partial rtm in buffer");
		/* a zero length would never advance offset */
		if (rtm->rtm_msglen == 0)
			fatalx("rtmsg_process: zero-length rtm in buffer");
		if (rtm->rtm_version != RTM_VERSION)
			continue;

		/* the addresses follow the (type specific) header */
		sa = (struct sockaddr *)(next + rtm->rtm_hdrlen);
		get_rtaddrs(rtm->rtm_addrs, sa, rti_info);

		/* route messages; "continue" skips to the next message */
		switch (rtm->rtm_type) {
		case RTM_ADD:
		case RTM_GET:
		case RTM_CHANGE:
		case RTM_DELETE:
			if (rtm->rtm_errno)		/* failed attempts... */
				continue;

			/* only our own routing table is of interest */
			if (rtm->rtm_tableid != kr_state.rdomain)
				continue;

			/* RTM_GET replies addressed to another process */
			if (rtm->rtm_type == RTM_GET &&
			    rtm->rtm_pid != kr_state.pid)
				continue;

			/* Skip ARP/ND cache and broadcast routes. */
			if (rtm->rtm_flags & (RTF_LLINFO|RTF_BROADCAST))
				continue;

			/* LDP should follow the IGP and ignore BGP routes */
			if (rtm->rtm_priority == RTP_BGP)
				continue;

			if (rtmsg_process_route(rtm, rti_info) == -1)
				return (-1);
		}

		/* interface and address messages */
		switch (rtm->rtm_type) {
		case RTM_IFINFO:
			/* work on a local copy of the interface header */
			memcpy(&ifm, next, sizeof(ifm));
			if_change(ifm.ifm_index, ifm.ifm_flags, &ifm.ifm_data,
			    (struct sockaddr_dl *)rti_info[RTAX_IFP]);
			break;
		case RTM_NEWADDR:
			ifam = (struct ifa_msghdr *)rtm;
			/* nothing to do without any address information */
			if ((ifam->ifam_addrs & (RTA_NETMASK | RTA_IFA |
			    RTA_BRD)) == 0)
				break;

			if_newaddr(ifam->ifam_index,
			    (struct sockaddr *)rti_info[RTAX_IFA],
			    (struct sockaddr *)rti_info[RTAX_NETMASK],
			    (struct sockaddr *)rti_info[RTAX_BRD]);
			break;
		case RTM_DELADDR:
			ifam = (struct ifa_msghdr *)rtm;
			/* nothing to do without any address information */
			if ((ifam->ifam_addrs & (RTA_NETMASK | RTA_IFA |
			    RTA_BRD)) == 0)
				break;

			if_deladdr(ifam->ifam_index,
			    (struct sockaddr *)rti_info[RTAX_IFA],
			    (struct sockaddr *)rti_info[RTAX_NETMASK],
			    (struct sockaddr *)rti_info[RTAX_BRD]);
			break;
		case RTM_IFANNOUNCE:
			if_announce(next);
			break;
		default:
			/* ignore for now */
			break;
		}
	}

	return (offset);
}
1635
1636static int
1637rtmsg_process_route(struct rt_msghdr *rtm, struct sockaddr *rti_info[RTAX_MAX])
1638{
1639	struct sockaddr		*sa;
1640	struct sockaddr_in	*sa_in;
1641	struct sockaddr_in6	*sa_in6;
1642	struct kroute		 kr;
1643	struct kroute_prefix	*kp;
1644	struct kroute_priority	*kprio;
1645	struct kroute_node	*kn;
1646
1647	if ((sa = rti_info[RTAX_DST]) == NULL)
1648		return (-1);
1649
1650	memset(&kr, 0, sizeof(kr));
1651	kr.af = sa->sa_family;
1652	switch (kr.af) {
1653	case AF_INET:
1654		kr.prefix.v4 = ((struct sockaddr_in *)sa)->sin_addr;
1655		sa_in = (struct sockaddr_in *) rti_info[RTAX_NETMASK];
1656		if (sa_in != NULL && sa_in->sin_len != 0)
1657			kr.prefixlen = mask2prefixlen(sa_in->sin_addr.s_addr);
1658		else if (rtm->rtm_flags & RTF_HOST)
1659			kr.prefixlen = 32;
1660		else if (kr.prefix.v4.s_addr == INADDR_ANY)
1661			kr.prefixlen = 0;
1662		else
1663			kr.prefixlen = prefixlen_classful(kr.prefix.v4.s_addr);
1664		break;
1665	case AF_INET6:
1666		kr.prefix.v6 = ((struct sockaddr_in6 *)sa)->sin6_addr;
1667		sa_in6 = (struct sockaddr_in6 *)rti_info[RTAX_NETMASK];
1668		if (sa_in6 != NULL && sa_in6->sin6_len != 0)
1669			kr.prefixlen = mask2prefixlen6(sa_in6);
1670		else if (rtm->rtm_flags & RTF_HOST)
1671			kr.prefixlen = 128;
1672		else if (IN6_IS_ADDR_UNSPECIFIED(&kr.prefix.v6))
1673			kr.prefixlen = 0;
1674		else
1675			fatalx("in6 net addr without netmask");
1676		break;
1677	default:
1678		return (0);
1679	}
1680	kr.ifindex = rtm->rtm_index;
1681	if ((sa = rti_info[RTAX_GATEWAY]) != NULL) {
1682		switch (sa->sa_family) {
1683		case AF_INET:
1684			kr.nexthop.v4 = ((struct sockaddr_in *)sa)->sin_addr;
1685			break;
1686		case AF_INET6:
1687			sa_in6 = (struct sockaddr_in6 *)sa;
1688			recoverscope(sa_in6);
1689			kr.nexthop.v6 = sa_in6->sin6_addr;
1690			if (sa_in6->sin6_scope_id)
1691				kr.ifindex = sa_in6->sin6_scope_id;
1692			break;
1693		case AF_LINK:
1694			kr.flags |= F_CONNECTED;
1695			break;
1696		}
1697	}
1698
1699	if (rtm->rtm_flags & RTF_STATIC)
1700		kr.flags |= F_STATIC;
1701	if (rtm->rtm_flags & RTF_BLACKHOLE)
1702		kr.flags |= F_BLACKHOLE;
1703	if (rtm->rtm_flags & RTF_REJECT)
1704		kr.flags |= F_REJECT;
1705	if (rtm->rtm_flags & RTF_DYNAMIC)
1706		kr.flags |= F_DYNAMIC;
1707	/* routes attached to connected or loopback interfaces */
1708	if (rtm->rtm_flags & RTF_CONNECTED ||
1709	    ldp_addrcmp(kr.af, &kr.prefix, &kr.nexthop) == 0)
1710		kr.flags |= F_CONNECTED;
1711	kr.priority = rtm->rtm_priority;
1712
1713	if (rtm->rtm_type == RTM_CHANGE) {
1714		/*
1715		 * The kernel doesn't allow RTM_CHANGE for multipath routes.
1716		 * If we got this message we know that the route has only one
1717		 * nexthop and we should remove it before installing the same
1718		 * route with the new nexthop.
1719		 */
1720		kp = kroute_find_prefix(kr.af, &kr.prefix, kr.prefixlen);
1721		if (kp) {
1722			kprio = kroute_find_prio(kp, kr.priority);
1723			if (kprio) {
1724				kn = TAILQ_FIRST(&kprio->nexthops);
1725				if (kn)
1726					kroute_remove(&kn->r);
1727			}
1728		}
1729	}
1730
1731	kn = NULL;
1732	kp = kroute_find_prefix(kr.af, &kr.prefix, kr.prefixlen);
1733	if (kp) {
1734		kprio = kroute_find_prio(kp, kr.priority);
1735		if (kprio)
1736			kn = kroute_find_gw(kprio, &kr.nexthop);
1737	}
1738
1739	if (rtm->rtm_type == RTM_DELETE) {
1740		if (kn == NULL)
1741			return (0);
1742		return (kroute_remove(&kr));
1743	}
1744
1745	if (!ldp_addrisset(kr.af, &kr.nexthop) && !(kr.flags & F_CONNECTED)) {
1746		log_warnx("%s: no nexthop for %s/%u", __func__,
1747		    log_addr(kr.af, &kr.prefix), kr.prefixlen);
1748		return (-1);
1749	}
1750
1751	if (kn != NULL) {
1752		/* update route */
1753		kn->r = kr;
1754		kr_redistribute(kp);
1755	} else {
1756		kr.local_label = NO_LABEL;
1757		kr.remote_label = NO_LABEL;
1758		kroute_insert(&kr);
1759	}
1760
1761	return (0);
1762}
1763
1764int
1765kmpw_set(struct kpw *kpw)
1766{
1767	struct kif_node		*kif;
1768
1769	kif = kif_find(kpw->ifindex);
1770	if (kif == NULL) {
1771		log_warnx("%s: failed to find mpw by index (%u)", __func__,
1772		    kpw->ifindex);
1773		return (-1);
1774	}
1775
1776	if (kif->kpw == NULL)
1777		kif->kpw = malloc(sizeof(*kif->kpw));
1778	*kif->kpw = *kpw;
1779
1780	return (kmpw_install(kif->k.ifname, kpw));
1781}
1782
1783int
1784kmpw_unset(struct kpw *kpw)
1785{
1786	struct kif_node		*kif;
1787
1788	kif = kif_find(kpw->ifindex);
1789	if (kif == NULL) {
1790		log_warnx("%s: failed to find mpw by index (%u)", __func__,
1791		    kpw->ifindex);
1792		return (-1);
1793	}
1794
1795	if (kif->kpw == NULL) {
1796		log_warnx("%s: %s is not set", __func__, kif->k.ifname);
1797		return (-1);
1798	}
1799
1800	free(kif->kpw);
1801	kif->kpw = NULL;
1802	return (kmpw_uninstall(kif->k.ifname));
1803}
1804
1805static int
1806kmpw_install(const char *ifname, struct kpw *kpw)
1807{
1808	struct ifreq		 ifr;
1809	struct ifmpwreq		 imr;
1810
1811	memset(&imr, 0, sizeof(imr));
1812	switch (kpw->pw_type) {
1813	case PW_TYPE_ETHERNET:
1814		imr.imr_type = IMR_TYPE_ETHERNET;
1815		break;
1816	case PW_TYPE_ETHERNET_TAGGED:
1817		imr.imr_type = IMR_TYPE_ETHERNET_TAGGED;
1818		break;
1819	default:
1820		log_warnx("%s: unhandled pseudowire type (%#X)", __func__,
1821		    kpw->pw_type);
1822		return (-1);
1823	}
1824
1825	if (kpw->flags & F_PW_CWORD)
1826		imr.imr_flags |= IMR_FLAG_CONTROLWORD;
1827
1828	memcpy(&imr.imr_nexthop, addr2sa(kpw->af, &kpw->nexthop, 0),
1829	    sizeof(imr.imr_nexthop));
1830
1831	imr.imr_lshim.shim_label = kpw->local_label;
1832	imr.imr_rshim.shim_label = kpw->remote_label;
1833
1834	memset(&ifr, 0, sizeof(ifr));
1835	strlcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name));
1836	ifr.ifr_data = (caddr_t) &imr;
1837	if (ioctl(kr_state.ioctl_fd, SIOCSETMPWCFG, &ifr) == -1) {
1838		log_warn("ioctl SIOCSETMPWCFG");
1839		return (-1);
1840	}
1841
1842	return (0);
1843}
1844
1845static int
1846kmpw_uninstall(const char *ifname)
1847{
1848	struct ifreq		 ifr;
1849	struct ifmpwreq		 imr;
1850
1851	memset(&ifr, 0, sizeof(ifr));
1852	memset(&imr, 0, sizeof(imr));
1853	strlcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name));
1854	ifr.ifr_data = (caddr_t) &imr;
1855	if (ioctl(kr_state.ioctl_fd, SIOCSETMPWCFG, &ifr) == -1) {
1856		log_warn("ioctl SIOCSETMPWCFG");
1857		return (-1);
1858	}
1859
1860	return (0);
1861}
1862
1863int
1864kmpw_find(const char *ifname)
1865{
1866	struct ifreq		 ifr;
1867
1868	memset(&ifr, 0, sizeof(ifr));
1869	if (strlcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name)) >=
1870	    sizeof(ifr.ifr_name)) {
1871		errno = ENAMETOOLONG;
1872		return (-1);
1873	}
1874
1875	if (ioctl(kr_state.ioctl_fd, SIOCGPWE3, &ifr) == -1)
1876		return (-1);
1877
1878	if (ifr.ifr_pwe3 != IF_PWE3_ETHERNET) {
1879		errno = EPFNOSUPPORT;
1880 		return (-1);
1881 	}
1882
1883	return (0);
1884}
1885