npf_bpf_comp.c revision 1.6
1/*	$NetBSD: npf_bpf_comp.c,v 1.6 2014/05/31 22:41:37 rmind Exp $	*/
2
3/*-
4 * Copyright (c) 2010-2014 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This material is based upon work partially supported by The
8 * NetBSD Foundation under a contract with Mindaugas Rasiukevicius.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32/*
33 * BPF byte-code generation for NPF rules.
34 */
35
36#include <sys/cdefs.h>
37__RCSID("$NetBSD: npf_bpf_comp.c,v 1.6 2014/05/31 22:41:37 rmind Exp $");
38
39#include <stdlib.h>
40#include <stdbool.h>
41#include <stddef.h>
42#include <string.h>
43#include <inttypes.h>
44#include <err.h>
45#include <assert.h>
46
47#include <netinet/in.h>
48#include <netinet/in_systm.h>
49#include <netinet/ip.h>
50#include <netinet/ip6.h>
51#include <netinet/udp.h>
52#include <netinet/tcp.h>
53#include <netinet/ip_icmp.h>
54#include <netinet/icmp6.h>
55
56#include <net/bpf.h>
57
58#include "npfctl.h"
59
/*
 * Code generation state bits, kept in the context flags and also used
 * as the request flags of fetch_l3().
 *
 * Note: X_EQ_L4OFF has to be cleared whenever register X is invalidated,
 * i.e. it holds something other than the L4 header offset.  Generally,
 * that is whenever BPF_LDX is used.
 */
#define	FETCHED_L3		(1U << 0)
#define	CHECKED_L4		(1U << 1)
#define	X_EQ_L4OFF		(1U << 2)
67
68struct npf_bpf {
69	/*
70	 * BPF program code, the allocated length (in bytes), the number
71	 * of logical blocks and the flags.
72	 */
73	struct bpf_program	prog;
74	size_t			alen;
75	u_int			nblocks;
76	sa_family_t		af;
77	uint32_t		flags;
78
79	/* The current group offset and block number. */
80	bool			ingroup;
81	u_int			goff;
82	u_int			gblock;
83
84	/* BPF marks, allocated length and the real length. */
85	uint32_t *		marks;
86	size_t			malen;
87	size_t			mlen;
88};
89
/*
 * Values the generated BPF program returns to NPF: all bits set on
 * success and zero on failure.
 */
#define	NPF_BPF_SUCCESS		(~(u_int)0)
#define	NPF_BPF_FAILURE		0

/*
 * Placeholder jump offset which denotes the failure path; it is patched
 * with the real offset on completion.
 * Note: this is the longest jump offset in BPF, since the offset is
 * stored in one byte.
 */
#define	JUMP_MAGIC		UINT8_MAX
101
/*
 * Buffers grow in multiples of 64 bytes in order to cut down on the
 * number of re-allocations; ALLOC_ROUND() rounds a size up accordingly.
 */
#define	ALLOC_MASK		0x3f
#define	ALLOC_ROUND(x)		(((x) + ALLOC_MASK) & ~ALLOC_MASK)
105
106npf_bpf_t *
107npfctl_bpf_create(void)
108{
109	return ecalloc(1, sizeof(npf_bpf_t));
110}
111
112static void
113fixup_jumps(npf_bpf_t *ctx, u_int start, u_int end, bool swap)
114{
115	struct bpf_program *bp = &ctx->prog;
116
117	for (u_int i = start; i < end; i++) {
118		struct bpf_insn *insn = &bp->bf_insns[i];
119		const u_int fail_off = end - i;
120
121		if (fail_off >= JUMP_MAGIC) {
122			errx(EXIT_FAILURE, "BPF generation error: "
123			    "the number of instructions is over the limit");
124		}
125		if (BPF_CLASS(insn->code) != BPF_JMP) {
126			continue;
127		}
128		if (swap) {
129			uint8_t jt = insn->jt;
130			insn->jt = insn->jf;
131			insn->jf = jt;
132		}
133		if (insn->jt == JUMP_MAGIC)
134			insn->jt = fail_off;
135		if (insn->jf == JUMP_MAGIC)
136			insn->jf = fail_off;
137	}
138}
139
140static void
141add_insns(npf_bpf_t *ctx, struct bpf_insn *insns, size_t count)
142{
143	struct bpf_program *bp = &ctx->prog;
144	size_t offset, len, reqlen;
145
146	/* Note: bf_len is the count of instructions. */
147	offset = bp->bf_len * sizeof(struct bpf_insn);
148	len = count * sizeof(struct bpf_insn);
149
150	/* Ensure the memory buffer for the program. */
151	reqlen = ALLOC_ROUND(offset + len);
152	if (reqlen > ctx->alen) {
153		bp->bf_insns = erealloc(bp->bf_insns, reqlen);
154		ctx->alen = reqlen;
155	}
156
157	/* Add the code block. */
158	memcpy((uint8_t *)bp->bf_insns + offset, insns, len);
159	bp->bf_len += count;
160}
161
162static void
163done_raw_block(npf_bpf_t *ctx, const uint32_t *m, size_t len)
164{
165	size_t reqlen, nargs = m[1];
166
167	if ((len / sizeof(uint32_t) - 2) != nargs) {
168		errx(EXIT_FAILURE, "invalid BPF block description");
169	}
170	reqlen = ALLOC_ROUND(ctx->mlen + len);
171	if (reqlen > ctx->malen) {
172		ctx->marks = erealloc(ctx->marks, reqlen);
173		ctx->malen = reqlen;
174	}
175	memcpy((uint8_t *)ctx->marks + ctx->mlen, m, len);
176	ctx->mlen += len;
177}
178
179static void
180done_block(npf_bpf_t *ctx, const uint32_t *m, size_t len)
181{
182	done_raw_block(ctx, m, len);
183	ctx->nblocks++;
184}
185
186struct bpf_program *
187npfctl_bpf_complete(npf_bpf_t *ctx)
188{
189	struct bpf_program *bp = &ctx->prog;
190	const u_int retoff = bp->bf_len;
191
192	/* Add the return fragment (success and failure paths). */
193	struct bpf_insn insns_ret[] = {
194		BPF_STMT(BPF_RET+BPF_K, NPF_BPF_SUCCESS),
195		BPF_STMT(BPF_RET+BPF_K, NPF_BPF_FAILURE),
196	};
197	add_insns(ctx, insns_ret, __arraycount(insns_ret));
198
199	/* Fixup all jumps to the main failure path. */
200	fixup_jumps(ctx, 0, retoff, false);
201
202	return &ctx->prog;
203}
204
205const void *
206npfctl_bpf_bmarks(npf_bpf_t *ctx, size_t *len)
207{
208	*len = ctx->mlen;
209	return ctx->marks;
210}
211
212void
213npfctl_bpf_destroy(npf_bpf_t *ctx)
214{
215	free(ctx->prog.bf_insns);
216	free(ctx->marks);
217	free(ctx);
218}
219
220/*
221 * npfctl_bpf_group: begin a logical group.  It merely uses logical
222 * disjunction (OR) for compares within the group.
223 */
224void
225npfctl_bpf_group(npf_bpf_t *ctx)
226{
227	struct bpf_program *bp = &ctx->prog;
228
229	assert(ctx->goff == 0);
230	assert(ctx->gblock == 0);
231
232	ctx->goff = bp->bf_len;
233	ctx->gblock = ctx->nblocks;
234	ctx->ingroup = true;
235}
236
237void
238npfctl_bpf_endgroup(npf_bpf_t *ctx)
239{
240	struct bpf_program *bp = &ctx->prog;
241	const size_t curoff = bp->bf_len;
242
243	/* If there are no blocks or only one - nothing to do. */
244	if ((ctx->nblocks - ctx->gblock) <= 1) {
245		ctx->goff = ctx->gblock = 0;
246		return;
247	}
248
249	/*
250	 * Append a failure return as a fall-through i.e. if there is
251	 * no match within the group.
252	 */
253	struct bpf_insn insns_ret[] = {
254		BPF_STMT(BPF_RET+BPF_K, NPF_BPF_FAILURE),
255	};
256	add_insns(ctx, insns_ret, __arraycount(insns_ret));
257
258	/*
259	 * Adjust jump offsets: on match - jump outside the group i.e.
260	 * to the current offset.  Otherwise, jump to the next instruction
261	 * which would lead to the fall-through code above if none matches.
262	 */
263	fixup_jumps(ctx, ctx->goff, curoff, true);
264	ctx->goff = ctx->gblock = 0;
265}
266
267static void
268fetch_l3(npf_bpf_t *ctx, sa_family_t af, u_int flags)
269{
270	u_int ver;
271
272	switch (af) {
273	case AF_INET:
274		ver = IPVERSION;
275		break;
276	case AF_INET6:
277		ver = IPV6_VERSION >> 4;
278		break;
279	case AF_UNSPEC:
280		ver = 0;
281		break;
282	default:
283		abort();
284	}
285
286	/*
287	 * Call NPF_COP_L3 to fetch L3 information.  The coprocessor
288	 * populates the following words in the scratch memory store:
289	 * - BPF_MW_IPVER: IP version (4 or 6).
290	 * - BPF_MW_L4OFF: L4 header offset.
291	 * - BPF_MW_L4PROTO: L4 protocol.
292	 */
293	if ((ctx->flags & FETCHED_L3) == 0 || (af && ctx->af == 0)) {
294		const uint8_t jt = ver ? 0 : JUMP_MAGIC;
295		const uint8_t jf = ver ? JUMP_MAGIC : 0;
296		bool ingroup = ctx->ingroup;
297
298		/*
299		 * L3 block cannot be inserted in the middle of a group.
300		 * In fact, it never is.  Check and start the group after.
301		 */
302		if (ingroup) {
303			assert(ctx->nblocks == ctx->gblock);
304			npfctl_bpf_endgroup(ctx);
305		}
306
307		/*
308		 * A <- IP version; A == expected-version?
309		 * If no particular version specified, check for non-zero.
310		 */
311		if ((ctx->flags & FETCHED_L3) == 0) {
312			struct bpf_insn insns_l3[] = {
313				BPF_STMT(BPF_MISC+BPF_COP, NPF_COP_L3),
314				BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, ver, jt, jf),
315			};
316			add_insns(ctx, insns_l3, __arraycount(insns_l3));
317			ctx->flags |= FETCHED_L3;
318		} else {
319			/* IP version is already fetched in BPF_MW_IPVER. */
320			struct bpf_insn insns_af[] = {
321				BPF_STMT(BPF_LD+BPF_W+BPF_MEM, BPF_MW_IPVER),
322				BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, ver, jt, jf),
323			};
324			add_insns(ctx, insns_af, __arraycount(insns_af));
325		}
326		ctx->af = af;
327
328		if (af) {
329			uint32_t mwords[] = { BM_IPVER, 1, af };
330			done_raw_block(ctx, mwords, sizeof(mwords));
331		}
332		if (ingroup) {
333			npfctl_bpf_group(ctx);
334		}
335
336	} else if (af && af != ctx->af) {
337		errx(EXIT_FAILURE, "address family mismatch");
338	}
339
340	if ((flags & X_EQ_L4OFF) != 0 && (ctx->flags & X_EQ_L4OFF) == 0) {
341		/* X <- IP header length */
342		struct bpf_insn insns_hlen[] = {
343			BPF_STMT(BPF_LDX+BPF_MEM, BPF_MW_L4OFF),
344		};
345		add_insns(ctx, insns_hlen, __arraycount(insns_hlen));
346		ctx->flags |= X_EQ_L4OFF;
347	}
348}
349
350/*
351 * npfctl_bpf_proto: code block to match IP version and L4 protocol.
352 */
353void
354npfctl_bpf_proto(npf_bpf_t *ctx, sa_family_t af, int proto)
355{
356	assert(af != AF_UNSPEC || proto != -1);
357
358	/* Note: fails if IP version does not match. */
359	fetch_l3(ctx, af, 0);
360	if (proto == -1) {
361		return;
362	}
363
364	struct bpf_insn insns_proto[] = {
365		/* A <- L4 protocol; A == expected-protocol? */
366		BPF_STMT(BPF_LD+BPF_W+BPF_MEM, BPF_MW_L4PROTO),
367		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, proto, 0, JUMP_MAGIC),
368	};
369	add_insns(ctx, insns_proto, __arraycount(insns_proto));
370
371	uint32_t mwords[] = { BM_PROTO, 1, proto };
372	done_block(ctx, mwords, sizeof(mwords));
373	ctx->flags |= CHECKED_L4;
374}
375
376/*
377 * npfctl_bpf_cidr: code block to match IPv4 or IPv6 CIDR.
378 *
379 * => IP address shall be in the network byte order.
380 */
381void
382npfctl_bpf_cidr(npf_bpf_t *ctx, u_int opts, sa_family_t af,
383    const npf_addr_t *addr, const npf_netmask_t mask)
384{
385	const uint32_t *awords = (const uint32_t *)addr;
386	u_int nwords, length, maxmask, off;
387
388	assert(((opts & MATCH_SRC) != 0) ^ ((opts & MATCH_DST) != 0));
389	assert((mask && mask <= NPF_MAX_NETMASK) || mask == NPF_NO_NETMASK);
390
391	switch (af) {
392	case AF_INET:
393		maxmask = 32;
394		off = (opts & MATCH_SRC) ?
395		    offsetof(struct ip, ip_src) :
396		    offsetof(struct ip, ip_dst);
397		nwords = sizeof(struct in_addr) / sizeof(uint32_t);
398		break;
399	case AF_INET6:
400		maxmask = 128;
401		off = (opts & MATCH_SRC) ?
402		    offsetof(struct ip6_hdr, ip6_src) :
403		    offsetof(struct ip6_hdr, ip6_dst);
404		nwords = sizeof(struct in6_addr) / sizeof(uint32_t);
405		break;
406	default:
407		abort();
408	}
409
410	/* Ensure address family. */
411	fetch_l3(ctx, af, 0);
412
413	length = (mask == NPF_NO_NETMASK) ? maxmask : mask;
414
415	/* CAUTION: BPF operates in host byte-order. */
416	for (u_int i = 0; i < nwords; i++) {
417		const u_int woff = i * sizeof(uint32_t);
418		uint32_t word = ntohl(awords[i]);
419		uint32_t wordmask;
420
421		if (length >= 32) {
422			/* The mask is a full word - do not apply it. */
423			wordmask = 0;
424			length -= 32;
425		} else if (length) {
426			wordmask = 0xffffffff << (32 - length);
427			length = 0;
428		} else {
429			/* The mask became zero - skip the rest. */
430			break;
431		}
432
433		/* A <- IP address (or one word of it) */
434		struct bpf_insn insns_ip[] = {
435			BPF_STMT(BPF_LD+BPF_W+BPF_ABS, off + woff),
436		};
437		add_insns(ctx, insns_ip, __arraycount(insns_ip));
438
439		/* A <- (A & MASK) */
440		if (wordmask) {
441			struct bpf_insn insns_mask[] = {
442				BPF_STMT(BPF_ALU+BPF_AND+BPF_K, wordmask),
443			};
444			add_insns(ctx, insns_mask, __arraycount(insns_mask));
445		}
446
447		/* A == expected-IP-word ? */
448		struct bpf_insn insns_cmp[] = {
449			BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, word, 0, JUMP_MAGIC),
450		};
451		add_insns(ctx, insns_cmp, __arraycount(insns_cmp));
452	}
453
454	uint32_t mwords[] = {
455		(opts & MATCH_SRC) ? BM_SRC_CIDR: BM_DST_CIDR, 6,
456		af, mask, awords[0], awords[1], awords[2], awords[3],
457	};
458	done_block(ctx, mwords, sizeof(mwords));
459}
460
461/*
462 * npfctl_bpf_ports: code block to match TCP/UDP port range.
463 *
464 * => Port numbers shall be in the network byte order.
465 */
466void
467npfctl_bpf_ports(npf_bpf_t *ctx, u_int opts, in_port_t from, in_port_t to)
468{
469	const u_int sport_off = offsetof(struct udphdr, uh_sport);
470	const u_int dport_off = offsetof(struct udphdr, uh_dport);
471	u_int off;
472
473	/* TCP and UDP port offsets are the same. */
474	assert(sport_off == offsetof(struct tcphdr, th_sport));
475	assert(dport_off == offsetof(struct tcphdr, th_dport));
476	assert(ctx->flags & CHECKED_L4);
477
478	assert(((opts & MATCH_SRC) != 0) ^ ((opts & MATCH_DST) != 0));
479	off = (opts & MATCH_SRC) ? sport_off : dport_off;
480
481	/* X <- IP header length */
482	fetch_l3(ctx, AF_UNSPEC, X_EQ_L4OFF);
483
484	struct bpf_insn insns_fetch[] = {
485		/* A <- port */
486		BPF_STMT(BPF_LD+BPF_H+BPF_IND, off),
487	};
488	add_insns(ctx, insns_fetch, __arraycount(insns_fetch));
489
490	/* CAUTION: BPF operates in host byte-order. */
491	from = ntohs(from);
492	to = ntohs(to);
493
494	if (from == to) {
495		/* Single port case. */
496		struct bpf_insn insns_port[] = {
497			BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, from, 0, JUMP_MAGIC),
498		};
499		add_insns(ctx, insns_port, __arraycount(insns_port));
500	} else {
501		/* Port range case. */
502		struct bpf_insn insns_range[] = {
503			BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, from, 0, JUMP_MAGIC),
504			BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, to, JUMP_MAGIC, 0),
505		};
506		add_insns(ctx, insns_range, __arraycount(insns_range));
507	}
508
509	uint32_t mwords[] = {
510		opts & MATCH_SRC ? BM_SRC_PORTS : BM_DST_PORTS, 2, from, to
511	};
512	done_block(ctx, mwords, sizeof(mwords));
513}
514
515/*
516 * npfctl_bpf_tcpfl: code block to match TCP flags.
517 */
518void
519npfctl_bpf_tcpfl(npf_bpf_t *ctx, uint8_t tf, uint8_t tf_mask, bool checktcp)
520{
521	const u_int tcpfl_off = offsetof(struct tcphdr, th_flags);
522	const bool usingmask = tf_mask != tf;
523
524	/* X <- IP header length */
525	fetch_l3(ctx, AF_UNSPEC, X_EQ_L4OFF);
526	if (checktcp) {
527		const u_int jf = usingmask ? 3 : 2;
528		assert(ctx->ingroup == false);
529
530		/* A <- L4 protocol; A == TCP?  If not, jump out. */
531		struct bpf_insn insns_tcp[] = {
532			BPF_STMT(BPF_LD+BPF_W+BPF_MEM, BPF_MW_L4PROTO),
533			BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, IPPROTO_TCP, 0, jf),
534		};
535		add_insns(ctx, insns_tcp, __arraycount(insns_tcp));
536	} else {
537		assert(ctx->flags & CHECKED_L4);
538	}
539
540	struct bpf_insn insns_tf[] = {
541		/* A <- TCP flags */
542		BPF_STMT(BPF_LD+BPF_B+BPF_IND, tcpfl_off),
543	};
544	add_insns(ctx, insns_tf, __arraycount(insns_tf));
545
546	if (usingmask) {
547		/* A <- (A & mask) */
548		struct bpf_insn insns_mask[] = {
549			BPF_STMT(BPF_ALU+BPF_AND+BPF_K, tf_mask),
550		};
551		add_insns(ctx, insns_mask, __arraycount(insns_mask));
552	}
553
554	struct bpf_insn insns_cmp[] = {
555		/* A == expected-TCP-flags? */
556		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, tf, 0, JUMP_MAGIC),
557	};
558	add_insns(ctx, insns_cmp, __arraycount(insns_cmp));
559
560	if (!checktcp) {
561		uint32_t mwords[] = { BM_TCPFL, 2, tf, tf_mask};
562		done_block(ctx, mwords, sizeof(mwords));
563	}
564}
565
566/*
567 * npfctl_bpf_icmp: code block to match ICMP type and/or code.
568 * Note: suitable both for the ICMPv4 and ICMPv6.
569 */
570void
571npfctl_bpf_icmp(npf_bpf_t *ctx, int type, int code)
572{
573	const u_int type_off = offsetof(struct icmp, icmp_type);
574	const u_int code_off = offsetof(struct icmp, icmp_code);
575
576	assert(ctx->flags & CHECKED_L4);
577	assert(offsetof(struct icmp6_hdr, icmp6_type) == type_off);
578	assert(offsetof(struct icmp6_hdr, icmp6_code) == code_off);
579	assert(type != -1 || code != -1);
580
581	/* X <- IP header length */
582	fetch_l3(ctx, AF_UNSPEC, X_EQ_L4OFF);
583
584	if (type != -1) {
585		struct bpf_insn insns_type[] = {
586			BPF_STMT(BPF_LD+BPF_B+BPF_IND, type_off),
587			BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, type, 0, JUMP_MAGIC),
588		};
589		add_insns(ctx, insns_type, __arraycount(insns_type));
590
591		uint32_t mwords[] = { BM_ICMP_TYPE, 1, type };
592		done_block(ctx, mwords, sizeof(mwords));
593	}
594
595	if (code != -1) {
596		struct bpf_insn insns_code[] = {
597			BPF_STMT(BPF_LD+BPF_B+BPF_IND, code_off),
598			BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, code, 0, JUMP_MAGIC),
599		};
600		add_insns(ctx, insns_code, __arraycount(insns_code));
601
602		uint32_t mwords[] = { BM_ICMP_CODE, 1, code };
603		done_block(ctx, mwords, sizeof(mwords));
604	}
605}
606
607#define	SRC_FLAG_BIT	(1U << 31)
608
609/*
610 * npfctl_bpf_table: code block to match source/destination IP address
611 * against NPF table specified by ID.
612 */
613void
614npfctl_bpf_table(npf_bpf_t *ctx, u_int opts, u_int tid)
615{
616	const bool src = (opts & MATCH_SRC) != 0;
617
618	struct bpf_insn insns_table[] = {
619		BPF_STMT(BPF_LD+BPF_IMM, (src ? SRC_FLAG_BIT : 0) | tid),
620		BPF_STMT(BPF_MISC+BPF_COP, NPF_COP_TABLE),
621		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, 0, JUMP_MAGIC, 0),
622	};
623	add_insns(ctx, insns_table, __arraycount(insns_table));
624
625	uint32_t mwords[] = { src ? BM_SRC_TABLE: BM_DST_TABLE, 1, tid };
626	done_block(ctx, mwords, sizeof(mwords));
627}
628