/*-
 * Copyright (c) 2010-2014 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This material is based upon work partially supported by The
 * NetBSD Foundation under a contract with Mindaugas Rasiukevicius.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * BPF byte-code generation for NPF rules.
 */

#include <sys/cdefs.h>
__RCSID("$NetBSD: npf_bpf_comp.c,v 1.11 2018/09/29 14:41:36 rmind Exp $");

#include <stdlib.h>
#include <stdbool.h>
#include <stddef.h>
#include <string.h>
#include <inttypes.h>
#include <err.h>
#include <assert.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#define	__FAVOR_BSD
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/udp.h>
#include <netinet/tcp.h>
#include <netinet/ip_icmp.h>
#include <netinet/icmp6.h>

#include <net/bpf.h>

#include "npfctl.h"

/*
 * Note: clear X_EQ_L4OFF when register X is invalidated, i.e. when it
 * stores something other than the L4 header offset.  Generally, when
 * BPF_LDX is used.
 */
#define	FETCHED_L3		0x01
#define	CHECKED_L4		0x02
#define	X_EQ_L4OFF		0x04

struct npf_bpf {
	/*
	 * BPF program code, the allocated length (in bytes), the number
	 * of logical blocks and the flags.
	 */
	struct bpf_program	prog;
	size_t			alen;
	u_int			nblocks;
	sa_family_t		af;
	uint32_t		flags;

	/* The current group offset and block number. */
	bool			ingroup;
	u_int			goff;
	u_int			gblock;

	/* BPF marks, allocated length and the real length. */
	uint32_t *		marks;
	size_t			malen;
	size_t			mlen;
};

/*
 * NPF success and failure values to be returned from BPF.
 */
#define	NPF_BPF_SUCCESS		((u_int)-1)
#define	NPF_BPF_FAILURE		0

/*
 * Magic value to indicate the failure path, which is fixed up on completion.
 * Note: this is the longest jump offset in BPF, since the offset is one byte.
 */
#define	JUMP_MAGIC		0xff

/* Reduce re-allocations by expanding in 64 byte blocks. */
#define	ALLOC_MASK		(64 - 1)
#define	ALLOC_ROUND(x)		(((x) + ALLOC_MASK) & ~ALLOC_MASK)

#ifndef IPV6_VERSION
#define	IPV6_VERSION		0x60
#endif

npf_bpf_t *
npfctl_bpf_create(void)
{
	return ecalloc(1, sizeof(npf_bpf_t));
}

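/*
 * fixup_jumps: rewrite the JUMP_MAGIC placeholder offsets within the
 * instruction range [start, end) into real jump offsets relative to
 * 'end'.  If 'swap' is true (used when closing a group), the true and
 * false branch targets are exchanged first.  Bails out if a jump would
 * exceed the one-byte BPF offset limit.
 */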
static void
fixup_jumps(npf_bpf_t *ctx, u_int start, u_int end, bool swap)
{
	struct bpf_program *bp = &ctx->prog;

	for (u_int i = start; i < end; i++) {
		struct bpf_insn *insn = &bp->bf_insns[i];
		const u_int fail_off = end - i;

		if (fail_off >= JUMP_MAGIC) {
			errx(EXIT_FAILURE, "BPF generation error: "
			    "the number of instructions is over the limit");
		}
		if (BPF_CLASS(insn->code) != BPF_JMP) {
			continue;
		}
		if (swap) {
			uint8_t jt = insn->jt;
			insn->jt = insn->jf;
			insn->jf = jt;
		}
		if (insn->jt == JUMP_MAGIC)
			insn->jt = fail_off;
		if (insn->jf == JUMP_MAGIC)
			insn->jf = fail_off;
	}
}

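/*
 * add_insns: append the given BPF instructions to the program,
 * growing the instruction buffer in ALLOC_ROUND-sized steps.
 */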
static void
add_insns(npf_bpf_t *ctx, struct bpf_insn *insns, size_t count)
{
	struct bpf_program *bp = &ctx->prog;
	size_t offset, len, reqlen;

	/* Note: bf_len is the count of instructions. */
	offset = bp->bf_len * sizeof(struct bpf_insn);
	len = count * sizeof(struct bpf_insn);

	/* Ensure the memory buffer for the program. */
	reqlen = ALLOC_ROUND(offset + len);
	if (reqlen > ctx->alen) {
		bp->bf_insns = erealloc(bp->bf_insns, reqlen);
		ctx->alen = reqlen;
	}

	/* Add the code block. */
	memcpy((uint8_t *)bp->bf_insns + offset, insns, len);
	bp->bf_len += count;
}

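/*
 * done_raw_block: append a BPF mark record to the mark list.  Each
 * record is a sequence of 32-bit words: the mark key, the number of
 * argument words that follow, and the arguments themselves.
 */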
static void
done_raw_block(npf_bpf_t *ctx, const uint32_t *m, size_t len)
{
	size_t reqlen, nargs = m[1];

	if ((len / sizeof(uint32_t) - 2) != nargs) {
		errx(EXIT_FAILURE, "invalid BPF block description");
	}
	reqlen = ALLOC_ROUND(ctx->mlen + len);
	if (reqlen > ctx->malen) {
		ctx->marks = erealloc(ctx->marks, reqlen);
		ctx->malen = reqlen;
	}
	memcpy((uint8_t *)ctx->marks + ctx->mlen, m, len);
	ctx->mlen += len;
}

static void
done_block(npf_bpf_t *ctx, const uint32_t *m, size_t len)
{
	done_raw_block(ctx, m, len);
	ctx->nblocks++;
}

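/*
 * npfctl_bpf_complete: finalise the program: append the success and
 * failure return fragment and fix up all jumps to the failure path.
 * Returns NULL if there are no instructions (i.e. optimised out).
 */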
struct bpf_program *
npfctl_bpf_complete(npf_bpf_t *ctx)
{
	struct bpf_program *bp = &ctx->prog;
	const u_int retoff = bp->bf_len;

	/* No instructions (optimised out). */
	if (!bp->bf_len)
		return NULL;

	/* Add the return fragment (success and failure paths). */
	struct bpf_insn insns_ret[] = {
		BPF_STMT(BPF_RET+BPF_K, NPF_BPF_SUCCESS),
		BPF_STMT(BPF_RET+BPF_K, NPF_BPF_FAILURE),
	};
	add_insns(ctx, insns_ret, __arraycount(insns_ret));

	/* Fixup all jumps to the main failure path. */
	fixup_jumps(ctx, 0, retoff, false);

	return &ctx->prog;
}

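/*
 * npfctl_bpf_bmarks: return the accumulated BPF mark words and their
 * total length (in bytes).
 */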
const void *
npfctl_bpf_bmarks(npf_bpf_t *ctx, size_t *len)
{
	*len = ctx->mlen;
	return ctx->marks;
}

void
npfctl_bpf_destroy(npf_bpf_t *ctx)
{
	free(ctx->prog.bf_insns);
	free(ctx->marks);
	free(ctx);
}

/*
 * npfctl_bpf_group: begin a logical group.  Comparisons within the
 * group are combined using logical disjunction (OR).
 */
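/*
 * For example, a group of two blocks roughly compiles to the layout
 * below: a matching block jumps past the group's fall-through failure
 * return, while a non-matching block falls into the next one.
 *
 *	block-1:	... jeq #val1, match, block-2
 *	block-2:	... jeq #val2, match, fail
 *	fail:		ret #0
 *	match:		...
 */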
void
npfctl_bpf_group(npf_bpf_t *ctx)
{
	struct bpf_program *bp = &ctx->prog;

	assert(ctx->goff == 0);
	assert(ctx->gblock == 0);

	ctx->goff = bp->bf_len;
	ctx->gblock = ctx->nblocks;
	ctx->ingroup = true;
}

void
npfctl_bpf_endgroup(npf_bpf_t *ctx, bool invert)
{
	struct bpf_program *bp = &ctx->prog;
	const size_t curoff = bp->bf_len;

	/* If there are no blocks or only one - nothing to do. */
	if (!invert && (ctx->nblocks - ctx->gblock) <= 1) {
		ctx->goff = ctx->gblock = 0;
		return;
	}

	/*
	 * If inverting, then prepend a jump which skips over the failure
	 * return below.  A matching block will then jump onto the failure
	 * return, while a fall-through (no match) skips it.
	 */
	if (invert) {
		struct bpf_insn insns_ret[] = {
			BPF_STMT(BPF_JMP+BPF_JA, 1),
		};
		add_insns(ctx, insns_ret, __arraycount(insns_ret));
	}

	/*
	 * Append a failure return as a fall-through, i.e. taken if there
	 * is no match within the group.
	 */
	struct bpf_insn insns_ret[] = {
		BPF_STMT(BPF_RET+BPF_K, NPF_BPF_FAILURE),
	};
	add_insns(ctx, insns_ret, __arraycount(insns_ret));

	/*
	 * Adjust the jump offsets: on a match, jump outside the group,
	 * i.e. past the current offset.  Otherwise, fall through to the
	 * next instruction, eventually reaching the fall-through code
	 * above if nothing matches.
	 */
	fixup_jumps(ctx, ctx->goff, curoff, true);
	ctx->goff = ctx->gblock = 0;
}

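/*
 * fetch_l3: if not already done, emit the code to check the IP version
 * (or, for AF_UNSPEC, that the version is non-zero).  If the X_EQ_L4OFF
 * flag is requested, also load the L4 header offset into the X register.
 */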
static void
fetch_l3(npf_bpf_t *ctx, sa_family_t af, u_int flags)
{
	u_int ver;

	switch (af) {
	case AF_INET:
		ver = IPVERSION;
		break;
	case AF_INET6:
		ver = IPV6_VERSION >> 4;
		break;
	case AF_UNSPEC:
		ver = 0;
		break;
	default:
		abort();
	}

	/*
	 * The memory store is populated with:
	 * - BPF_MW_IPVER: IP version (4 or 6).
	 * - BPF_MW_L4OFF: L4 header offset.
	 * - BPF_MW_L4PROTO: L4 protocol.
	 */
	if ((ctx->flags & FETCHED_L3) == 0 || (af && ctx->af == 0)) {
		const uint8_t jt = ver ? 0 : JUMP_MAGIC;
		const uint8_t jf = ver ? JUMP_MAGIC : 0;
		bool ingroup = ctx->ingroup;

		/*
		 * An L3 block cannot be inserted in the middle of a group
		 * (in fact, it never is).  End the group, emit the L3
		 * check and start the group again afterwards.
		 */
		if (ingroup) {
			assert(ctx->nblocks == ctx->gblock);
			npfctl_bpf_endgroup(ctx, false);
		}

		/*
		 * A <- IP version; A == expected-version?
		 * If no particular version is specified, check for non-zero.
		 */
		struct bpf_insn insns_af[] = {
			BPF_STMT(BPF_LD+BPF_W+BPF_MEM, BPF_MW_IPVER),
			BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, ver, jt, jf),
		};
		add_insns(ctx, insns_af, __arraycount(insns_af));
		ctx->flags |= FETCHED_L3;
		ctx->af = af;

		if (af) {
			uint32_t mwords[] = { BM_IPVER, 1, af };
			done_raw_block(ctx, mwords, sizeof(mwords));
		}
		if (ingroup) {
			npfctl_bpf_group(ctx);
		}

	} else if (af && af != ctx->af) {
		errx(EXIT_FAILURE, "address family mismatch");
	}

	if ((flags & X_EQ_L4OFF) != 0 && (ctx->flags & X_EQ_L4OFF) == 0) {
		/* X <- IP header length */
		struct bpf_insn insns_hlen[] = {
			BPF_STMT(BPF_LDX+BPF_MEM, BPF_MW_L4OFF),
		};
		add_insns(ctx, insns_hlen, __arraycount(insns_hlen));
		ctx->flags |= X_EQ_L4OFF;
	}
}

/*
 * npfctl_bpf_proto: code block to match IP version and L4 protocol.
 */
void
npfctl_bpf_proto(npf_bpf_t *ctx, sa_family_t af, int proto)
{
	assert(af != AF_UNSPEC || proto != -1);

	/* Note: fails if IP version does not match. */
	fetch_l3(ctx, af, 0);
	if (proto == -1) {
		return;
	}

	struct bpf_insn insns_proto[] = {
		/* A <- L4 protocol; A == expected-protocol? */
		BPF_STMT(BPF_LD+BPF_W+BPF_MEM, BPF_MW_L4PROTO),
		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, proto, 0, JUMP_MAGIC),
	};
	add_insns(ctx, insns_proto, __arraycount(insns_proto));

	uint32_t mwords[] = { BM_PROTO, 1, proto };
	done_block(ctx, mwords, sizeof(mwords));
	ctx->flags |= CHECKED_L4;
}

/*
 * npfctl_bpf_cidr: code block to match IPv4 or IPv6 CIDR.
 *
 * => IP address shall be in the network byte order.
 */
void
npfctl_bpf_cidr(npf_bpf_t *ctx, u_int opts, sa_family_t af,
    const npf_addr_t *addr, const npf_netmask_t mask)
{
	const uint32_t *awords = (const uint32_t *)addr;
	u_int nwords, length, maxmask, off;

	assert(((opts & MATCH_SRC) != 0) ^ ((opts & MATCH_DST) != 0));
	assert((mask && mask <= NPF_MAX_NETMASK) || mask == NPF_NO_NETMASK);

	switch (af) {
	case AF_INET:
		maxmask = 32;
		off = (opts & MATCH_SRC) ?
		    offsetof(struct ip, ip_src) :
		    offsetof(struct ip, ip_dst);
		nwords = sizeof(struct in_addr) / sizeof(uint32_t);
		break;
	case AF_INET6:
		maxmask = 128;
		off = (opts & MATCH_SRC) ?
		    offsetof(struct ip6_hdr, ip6_src) :
		    offsetof(struct ip6_hdr, ip6_dst);
		nwords = sizeof(struct in6_addr) / sizeof(uint32_t);
		break;
	default:
		abort();
	}

	/* Ensure address family. */
	fetch_l3(ctx, af, 0);

	length = (mask == NPF_NO_NETMASK) ? maxmask : mask;

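	/*
	 * Compare the address word by word, applying the netmask to the
	 * last partially covered word.  For example, a /48 IPv6 prefix
	 * compares the first word in full and the upper 16 bits of the
	 * second word, then stops.
	 */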
	/* CAUTION: BPF operates in host byte-order. */
	for (u_int i = 0; i < nwords; i++) {
		const u_int woff = i * sizeof(uint32_t);
		uint32_t word = ntohl(awords[i]);
		uint32_t wordmask;

		if (length >= 32) {
			/* The mask is a full word - do not apply it. */
			wordmask = 0;
			length -= 32;
		} else if (length) {
			wordmask = 0xffffffff << (32 - length);
			length = 0;
		} else {
			/* The mask became zero - skip the rest. */
			break;
		}

		/* A <- IP address (or one word of it) */
		struct bpf_insn insns_ip[] = {
			BPF_STMT(BPF_LD+BPF_W+BPF_ABS, off + woff),
		};
		add_insns(ctx, insns_ip, __arraycount(insns_ip));

		/* A <- (A & MASK) */
		if (wordmask) {
			struct bpf_insn insns_mask[] = {
				BPF_STMT(BPF_ALU+BPF_AND+BPF_K, wordmask),
			};
			add_insns(ctx, insns_mask, __arraycount(insns_mask));
		}

		/* A == expected-IP-word ? */
		struct bpf_insn insns_cmp[] = {
			BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, word, 0, JUMP_MAGIC),
		};
		add_insns(ctx, insns_cmp, __arraycount(insns_cmp));
	}

	uint32_t mwords[] = {
		(opts & MATCH_SRC) ? BM_SRC_CIDR: BM_DST_CIDR, 6,
		af, mask, awords[0], awords[1], awords[2], awords[3],
	};
	done_block(ctx, mwords, sizeof(mwords));
}

/*
 * npfctl_bpf_ports: code block to match TCP/UDP port range.
 *
 * => Port numbers shall be in the network byte order.
 */
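/*
 * A single port is matched with one equality check; a range [from, to]
 * is matched with two conditional jumps: if the port is below 'from'
 * or above 'to', the comparison jumps to the failure path.
 */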
void
npfctl_bpf_ports(npf_bpf_t *ctx, u_int opts, in_port_t from, in_port_t to)
{
	const u_int sport_off = offsetof(struct udphdr, uh_sport);
	const u_int dport_off = offsetof(struct udphdr, uh_dport);
	u_int off;

	/* TCP and UDP port offsets are the same. */
	assert(sport_off == offsetof(struct tcphdr, th_sport));
	assert(dport_off == offsetof(struct tcphdr, th_dport));
	assert(ctx->flags & CHECKED_L4);

	assert(((opts & MATCH_SRC) != 0) ^ ((opts & MATCH_DST) != 0));
	off = (opts & MATCH_SRC) ? sport_off : dport_off;

	/* X <- IP header length */
	fetch_l3(ctx, AF_UNSPEC, X_EQ_L4OFF);

	struct bpf_insn insns_fetch[] = {
		/* A <- port */
		BPF_STMT(BPF_LD+BPF_H+BPF_IND, off),
	};
	add_insns(ctx, insns_fetch, __arraycount(insns_fetch));

	/* CAUTION: BPF operates in host byte-order. */
	from = ntohs(from);
	to = ntohs(to);

	if (from == to) {
		/* Single port case. */
		struct bpf_insn insns_port[] = {
			BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, from, 0, JUMP_MAGIC),
		};
		add_insns(ctx, insns_port, __arraycount(insns_port));
	} else {
		/* Port range case. */
		struct bpf_insn insns_range[] = {
			BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, from, 0, JUMP_MAGIC),
			BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, to, JUMP_MAGIC, 0),
		};
		add_insns(ctx, insns_range, __arraycount(insns_range));
	}

	uint32_t mwords[] = {
		opts & MATCH_SRC ? BM_SRC_PORTS : BM_DST_PORTS, 2, from, to
	};
	done_block(ctx, mwords, sizeof(mwords));
}

/*
 * npfctl_bpf_tcpfl: code block to match TCP flags.
 */
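/*
 * If 'checktcp' is set, the L4 protocol is verified inline and a
 * non-TCP packet skips over the flag comparison instead of taking
 * the failure path.
 */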
void
npfctl_bpf_tcpfl(npf_bpf_t *ctx, uint8_t tf, uint8_t tf_mask, bool checktcp)
{
	const u_int tcpfl_off = offsetof(struct tcphdr, th_flags);
	const bool usingmask = tf_mask != tf;

	/* X <- IP header length */
	fetch_l3(ctx, AF_UNSPEC, X_EQ_L4OFF);
	if (checktcp) {
		const u_int jf = usingmask ? 3 : 2;
		assert(ctx->ingroup == false);

		/* A <- L4 protocol; A == TCP?  If not, jump out. */
		struct bpf_insn insns_tcp[] = {
			BPF_STMT(BPF_LD+BPF_W+BPF_MEM, BPF_MW_L4PROTO),
			BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, IPPROTO_TCP, 0, jf),
		};
		add_insns(ctx, insns_tcp, __arraycount(insns_tcp));
	} else {
		assert(ctx->flags & CHECKED_L4);
	}

	struct bpf_insn insns_tf[] = {
		/* A <- TCP flags */
		BPF_STMT(BPF_LD+BPF_B+BPF_IND, tcpfl_off),
	};
	add_insns(ctx, insns_tf, __arraycount(insns_tf));

	if (usingmask) {
		/* A <- (A & mask) */
		struct bpf_insn insns_mask[] = {
			BPF_STMT(BPF_ALU+BPF_AND+BPF_K, tf_mask),
		};
		add_insns(ctx, insns_mask, __arraycount(insns_mask));
	}

	struct bpf_insn insns_cmp[] = {
		/* A == expected-TCP-flags? */
		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, tf, 0, JUMP_MAGIC),
	};
	add_insns(ctx, insns_cmp, __arraycount(insns_cmp));

	if (!checktcp) {
		uint32_t mwords[] = { BM_TCPFL, 2, tf, tf_mask };
		done_block(ctx, mwords, sizeof(mwords));
	}
}

/*
 * npfctl_bpf_icmp: code block to match ICMP type and/or code.
 * Note: suitable for both ICMPv4 and ICMPv6.
 */
void
npfctl_bpf_icmp(npf_bpf_t *ctx, int type, int code)
{
	const u_int type_off = offsetof(struct icmp, icmp_type);
	const u_int code_off = offsetof(struct icmp, icmp_code);

	assert(ctx->flags & CHECKED_L4);
	assert(offsetof(struct icmp6_hdr, icmp6_type) == type_off);
	assert(offsetof(struct icmp6_hdr, icmp6_code) == code_off);
	assert(type != -1 || code != -1);

	/* X <- IP header length */
	fetch_l3(ctx, AF_UNSPEC, X_EQ_L4OFF);

	if (type != -1) {
		struct bpf_insn insns_type[] = {
			BPF_STMT(BPF_LD+BPF_B+BPF_IND, type_off),
			BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, type, 0, JUMP_MAGIC),
		};
		add_insns(ctx, insns_type, __arraycount(insns_type));

		uint32_t mwords[] = { BM_ICMP_TYPE, 1, type };
		done_block(ctx, mwords, sizeof(mwords));
	}

	if (code != -1) {
		struct bpf_insn insns_code[] = {
			BPF_STMT(BPF_LD+BPF_B+BPF_IND, code_off),
			BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, code, 0, JUMP_MAGIC),
		};
		add_insns(ctx, insns_code, __arraycount(insns_code));

		uint32_t mwords[] = { BM_ICMP_CODE, 1, code };
		done_block(ctx, mwords, sizeof(mwords));
	}
}

#define	SRC_FLAG_BIT	(1U << 31)

/*
 * npfctl_bpf_table: code block to match source/destination IP address
 * against an NPF table specified by ID.
 */
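/*
 * The lookup is performed by the NPF BPF coprocessor (NPF_COP_TABLE):
 * the A register is loaded with the table ID, with the top bit set for
 * a source address lookup; a zero result means no match.
 */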
void
npfctl_bpf_table(npf_bpf_t *ctx, u_int opts, u_int tid)
{
	const bool src = (opts & MATCH_SRC) != 0;

	struct bpf_insn insns_table[] = {
		BPF_STMT(BPF_LD+BPF_IMM, (src ? SRC_FLAG_BIT : 0) | tid),
		BPF_STMT(BPF_MISC+BPF_COP, NPF_COP_TABLE),
		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, 0, JUMP_MAGIC, 0),
	};
	add_insns(ctx, insns_table, __arraycount(insns_table));

	uint32_t mwords[] = { src ? BM_SRC_TABLE: BM_DST_TABLE, 1, tid };
	done_block(ctx, mwords, sizeof(mwords));
}