/* npf_bpf_comp.c revision 1.5 */
1/* $NetBSD: npf_bpf_comp.c,v 1.5 2014/05/15 02:34:29 rmind Exp $ */ 2 3/*- 4 * Copyright (c) 2010-2014 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This material is based upon work partially supported by The 8 * NetBSD Foundation under a contract with Mindaugas Rasiukevicius. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32/* 33 * BPF byte-code generation for NPF rules. 
34 */ 35 36#include <sys/cdefs.h> 37__RCSID("$NetBSD: npf_bpf_comp.c,v 1.5 2014/05/15 02:34:29 rmind Exp $"); 38 39#include <stdlib.h> 40#include <stdbool.h> 41#include <stddef.h> 42#include <string.h> 43#include <inttypes.h> 44#include <err.h> 45#include <assert.h> 46 47#include <netinet/in.h> 48#include <netinet/in_systm.h> 49#include <netinet/ip.h> 50#include <netinet/ip6.h> 51#include <netinet/udp.h> 52#include <netinet/tcp.h> 53#include <netinet/ip_icmp.h> 54#include <netinet/icmp6.h> 55 56#include <net/bpf.h> 57 58#include "npfctl.h" 59 60/* 61 * Note: clear X_EQ_L4OFF when register X is invalidated i.e. it stores 62 * something other than L4 header offset. Generally, when BPF_LDX is used. 63 */ 64#define FETCHED_L3 0x01 65#define X_EQ_L4OFF 0x02 66 67struct npf_bpf { 68 /* 69 * BPF program code, the allocated length (in bytes), the number 70 * of logical blocks and the flags. 71 */ 72 struct bpf_program prog; 73 size_t alen; 74 u_int nblocks; 75 sa_family_t af; 76 uint32_t flags; 77 78 /* The current group offset and block number. */ 79 bool ingroup; 80 u_int goff; 81 u_int gblock; 82 83 /* BPF marks, allocated length and the real length. */ 84 uint32_t * marks; 85 size_t malen; 86 size_t mlen; 87}; 88 89/* 90 * NPF success and failure values to be returned from BPF. 91 */ 92#define NPF_BPF_SUCCESS ((u_int)-1) 93#define NPF_BPF_FAILURE 0 94 95/* 96 * Magic value to indicate the failure path, which is fixed up on completion. 97 * Note: this is the longest jump offset in BPF, since the offset is one byte. 98 */ 99#define JUMP_MAGIC 0xff 100 101/* Reduce re-allocations by expanding in 64 byte blocks. 
*/ 102#define ALLOC_MASK (64 - 1) 103#define ALLOC_ROUND(x) (((x) + ALLOC_MASK) & ~ALLOC_MASK) 104 105npf_bpf_t * 106npfctl_bpf_create(void) 107{ 108 return ecalloc(1, sizeof(npf_bpf_t)); 109} 110 111static void 112fixup_jumps(npf_bpf_t *ctx, u_int start, u_int end, bool swap) 113{ 114 struct bpf_program *bp = &ctx->prog; 115 116 for (u_int i = start; i < end; i++) { 117 struct bpf_insn *insn = &bp->bf_insns[i]; 118 const u_int fail_off = end - i; 119 120 if (fail_off >= JUMP_MAGIC) { 121 errx(EXIT_FAILURE, "BPF generation error: " 122 "the number of instructions is over the limit"); 123 } 124 if (BPF_CLASS(insn->code) != BPF_JMP) { 125 continue; 126 } 127 if (swap) { 128 uint8_t jt = insn->jt; 129 insn->jt = insn->jf; 130 insn->jf = jt; 131 } 132 if (insn->jt == JUMP_MAGIC) 133 insn->jt = fail_off; 134 if (insn->jf == JUMP_MAGIC) 135 insn->jf = fail_off; 136 } 137} 138 139static void 140add_insns(npf_bpf_t *ctx, struct bpf_insn *insns, size_t count) 141{ 142 struct bpf_program *bp = &ctx->prog; 143 size_t offset, len, reqlen; 144 145 /* Note: bf_len is the count of instructions. */ 146 offset = bp->bf_len * sizeof(struct bpf_insn); 147 len = count * sizeof(struct bpf_insn); 148 149 /* Ensure the memory buffer for the program. */ 150 reqlen = ALLOC_ROUND(offset + len); 151 if (reqlen > ctx->alen) { 152 bp->bf_insns = erealloc(bp->bf_insns, reqlen); 153 ctx->alen = reqlen; 154 } 155 156 /* Add the code block. 
*/ 157 memcpy((uint8_t *)bp->bf_insns + offset, insns, len); 158 bp->bf_len += count; 159} 160 161static void 162done_raw_block(npf_bpf_t *ctx, const uint32_t *m, size_t len) 163{ 164 size_t reqlen, nargs = m[1]; 165 166 if ((len / sizeof(uint32_t) - 2) != nargs) { 167 errx(EXIT_FAILURE, "invalid BPF block description"); 168 } 169 reqlen = ALLOC_ROUND(ctx->mlen + len); 170 if (reqlen > ctx->malen) { 171 ctx->marks = erealloc(ctx->marks, reqlen); 172 ctx->malen = reqlen; 173 } 174 memcpy((uint8_t *)ctx->marks + ctx->mlen, m, len); 175 ctx->mlen += len; 176} 177 178static void 179done_block(npf_bpf_t *ctx, const uint32_t *m, size_t len) 180{ 181 done_raw_block(ctx, m, len); 182 ctx->nblocks++; 183} 184 185struct bpf_program * 186npfctl_bpf_complete(npf_bpf_t *ctx) 187{ 188 struct bpf_program *bp = &ctx->prog; 189 const u_int retoff = bp->bf_len; 190 191 /* Add the return fragment (success and failure paths). */ 192 struct bpf_insn insns_ret[] = { 193 BPF_STMT(BPF_RET+BPF_K, NPF_BPF_SUCCESS), 194 BPF_STMT(BPF_RET+BPF_K, NPF_BPF_FAILURE), 195 }; 196 add_insns(ctx, insns_ret, __arraycount(insns_ret)); 197 198 /* Fixup all jumps to the main failure path. */ 199 fixup_jumps(ctx, 0, retoff, false); 200 201 return &ctx->prog; 202} 203 204const void * 205npfctl_bpf_bmarks(npf_bpf_t *ctx, size_t *len) 206{ 207 *len = ctx->mlen; 208 return ctx->marks; 209} 210 211void 212npfctl_bpf_destroy(npf_bpf_t *ctx) 213{ 214 free(ctx->prog.bf_insns); 215 free(ctx->marks); 216 free(ctx); 217} 218 219/* 220 * npfctl_bpf_group: begin a logical group. It merely uses logical 221 * disjunction (OR) for compares within the group. 
222 */ 223void 224npfctl_bpf_group(npf_bpf_t *ctx) 225{ 226 struct bpf_program *bp = &ctx->prog; 227 228 assert(ctx->goff == 0); 229 assert(ctx->gblock == 0); 230 231 ctx->goff = bp->bf_len; 232 ctx->gblock = ctx->nblocks; 233 ctx->ingroup = true; 234} 235 236void 237npfctl_bpf_endgroup(npf_bpf_t *ctx) 238{ 239 struct bpf_program *bp = &ctx->prog; 240 const size_t curoff = bp->bf_len; 241 242 /* If there are no blocks or only one - nothing to do. */ 243 if ((ctx->nblocks - ctx->gblock) <= 1) { 244 ctx->goff = ctx->gblock = 0; 245 return; 246 } 247 248 /* 249 * Append a failure return as a fall-through i.e. if there is 250 * no match within the group. 251 */ 252 struct bpf_insn insns_ret[] = { 253 BPF_STMT(BPF_RET+BPF_K, NPF_BPF_FAILURE), 254 }; 255 add_insns(ctx, insns_ret, __arraycount(insns_ret)); 256 257 /* 258 * Adjust jump offsets: on match - jump outside the group i.e. 259 * to the current offset. Otherwise, jump to the next instruction 260 * which would lead to the fall-through code above if none matches. 261 */ 262 fixup_jumps(ctx, ctx->goff, curoff, true); 263 ctx->goff = ctx->gblock = 0; 264} 265 266static void 267fetch_l3(npf_bpf_t *ctx, sa_family_t af, u_int flags) 268{ 269 u_int ver; 270 271 switch (af) { 272 case AF_INET: 273 ver = IPVERSION; 274 break; 275 case AF_INET6: 276 ver = IPV6_VERSION >> 4; 277 break; 278 case AF_UNSPEC: 279 ver = 0; 280 break; 281 default: 282 abort(); 283 } 284 285 /* 286 * Fetch L3 information. The coprocessor populates the following 287 * words in the scratch memory store: 288 * - BPF_MW_IPVER: IP version (4 or 6). 289 * - BPF_MW_L4OFF: L4 header offset. 290 * - BPF_MW_L4PROTO: L4 protocol. 291 */ 292 if ((ctx->flags & FETCHED_L3) == 0 || (af && ctx->af == 0)) { 293 const uint8_t jt = ver ? 0 : JUMP_MAGIC; 294 const uint8_t jf = ver ? JUMP_MAGIC : 0; 295 bool ingroup = ctx->ingroup; 296 297 /* 298 * L3 block cannot be inserted in the middle of a group. 299 * In fact, it never is. Check and start the group after. 
300 */ 301 if (ingroup) { 302 assert(ctx->nblocks == ctx->gblock); 303 npfctl_bpf_endgroup(ctx); 304 } 305 306 /* 307 * A <- IP version; A == expected-version? 308 * If no particular version specified, check for non-zero. 309 */ 310 if ((ctx->flags & FETCHED_L3) == 0) { 311 struct bpf_insn insns_l3[] = { 312 BPF_STMT(BPF_MISC+BPF_COP, NPF_COP_L3), 313 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, ver, jt, jf), 314 }; 315 add_insns(ctx, insns_l3, __arraycount(insns_l3)); 316 ctx->flags |= FETCHED_L3; 317 } else { 318 /* IP version is already fetched in BPF_MW_IPVER. */ 319 struct bpf_insn insns_af[] = { 320 BPF_STMT(BPF_LD+BPF_W+BPF_MEM, BPF_MW_IPVER), 321 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, ver, jt, jf), 322 }; 323 add_insns(ctx, insns_af, __arraycount(insns_af)); 324 } 325 ctx->af = af; 326 327 if (af) { 328 uint32_t mwords[] = { BM_IPVER, 1, af }; 329 done_raw_block(ctx, mwords, sizeof(mwords)); 330 } 331 if (ingroup) { 332 npfctl_bpf_group(ctx); 333 } 334 335 } else if (af && af != ctx->af) { 336 errx(EXIT_FAILURE, "address family mismatch"); 337 } 338 339 if ((flags & X_EQ_L4OFF) != 0 && (ctx->flags & X_EQ_L4OFF) == 0) { 340 /* X <- IP header length */ 341 struct bpf_insn insns_hlen[] = { 342 BPF_STMT(BPF_LDX+BPF_MEM, BPF_MW_L4OFF), 343 }; 344 add_insns(ctx, insns_hlen, __arraycount(insns_hlen)); 345 ctx->flags |= X_EQ_L4OFF; 346 } 347} 348 349/* 350 * npfctl_bpf_proto: code block to match IP version and L4 protocol. 351 */ 352void 353npfctl_bpf_proto(npf_bpf_t *ctx, sa_family_t af, int proto) 354{ 355 assert(af != AF_UNSPEC || proto != -1); 356 357 /* Note: fails if IP version does not match. */ 358 fetch_l3(ctx, af, 0); 359 if (proto == -1) { 360 return; 361 } 362 363 struct bpf_insn insns_proto[] = { 364 /* A <- L4 protocol; A == expected-protocol? 
*/ 365 BPF_STMT(BPF_LD+BPF_W+BPF_MEM, BPF_MW_L4PROTO), 366 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, proto, 0, JUMP_MAGIC), 367 }; 368 add_insns(ctx, insns_proto, __arraycount(insns_proto)); 369 370 uint32_t mwords[] = { BM_PROTO, 1, proto }; 371 done_block(ctx, mwords, sizeof(mwords)); 372} 373 374/* 375 * npfctl_bpf_cidr: code block to match IPv4 or IPv6 CIDR. 376 * 377 * => IP address shall be in the network byte order. 378 */ 379void 380npfctl_bpf_cidr(npf_bpf_t *ctx, u_int opts, sa_family_t af, 381 const npf_addr_t *addr, const npf_netmask_t mask) 382{ 383 const uint32_t *awords = (const uint32_t *)addr; 384 u_int nwords, length, maxmask, off; 385 386 assert(((opts & MATCH_SRC) != 0) ^ ((opts & MATCH_DST) != 0)); 387 assert((mask && mask <= NPF_MAX_NETMASK) || mask == NPF_NO_NETMASK); 388 389 switch (af) { 390 case AF_INET: 391 maxmask = 32; 392 off = (opts & MATCH_SRC) ? 393 offsetof(struct ip, ip_src) : 394 offsetof(struct ip, ip_dst); 395 nwords = sizeof(struct in_addr) / sizeof(uint32_t); 396 break; 397 case AF_INET6: 398 maxmask = 128; 399 off = (opts & MATCH_SRC) ? 400 offsetof(struct ip6_hdr, ip6_src) : 401 offsetof(struct ip6_hdr, ip6_dst); 402 nwords = sizeof(struct in6_addr) / sizeof(uint32_t); 403 break; 404 default: 405 abort(); 406 } 407 408 /* Ensure address family. */ 409 fetch_l3(ctx, af, 0); 410 411 length = (mask == NPF_NO_NETMASK) ? maxmask : mask; 412 413 /* CAUTION: BPF operates in host byte-order. */ 414 for (u_int i = 0; i < nwords; i++) { 415 const u_int woff = i * sizeof(uint32_t); 416 uint32_t word = ntohl(awords[i]); 417 uint32_t wordmask; 418 419 if (length >= 32) { 420 /* The mask is a full word - do not apply it. */ 421 wordmask = 0; 422 length -= 32; 423 } else if (length) { 424 wordmask = 0xffffffff << (32 - length); 425 length = 0; 426 } else { 427 /* The mask became zero - skip the rest. 
*/ 428 break; 429 } 430 431 /* A <- IP address (or one word of it) */ 432 struct bpf_insn insns_ip[] = { 433 BPF_STMT(BPF_LD+BPF_W+BPF_ABS, off + woff), 434 }; 435 add_insns(ctx, insns_ip, __arraycount(insns_ip)); 436 437 /* A <- (A & MASK) */ 438 if (wordmask) { 439 struct bpf_insn insns_mask[] = { 440 BPF_STMT(BPF_ALU+BPF_AND+BPF_K, wordmask), 441 }; 442 add_insns(ctx, insns_mask, __arraycount(insns_mask)); 443 } 444 445 /* A == expected-IP-word ? */ 446 struct bpf_insn insns_cmp[] = { 447 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, word, 0, JUMP_MAGIC), 448 }; 449 add_insns(ctx, insns_cmp, __arraycount(insns_cmp)); 450 } 451 452 uint32_t mwords[] = { 453 (opts & MATCH_SRC) ? BM_SRC_CIDR: BM_DST_CIDR, 6, 454 af, mask, awords[0], awords[1], awords[2], awords[3], 455 }; 456 done_block(ctx, mwords, sizeof(mwords)); 457} 458 459/* 460 * npfctl_bpf_ports: code block to match TCP/UDP port range. 461 * 462 * => Port numbers shall be in the network byte order. 463 */ 464void 465npfctl_bpf_ports(npf_bpf_t *ctx, u_int opts, in_port_t from, in_port_t to) 466{ 467 const u_int sport_off = offsetof(struct udphdr, uh_sport); 468 const u_int dport_off = offsetof(struct udphdr, uh_dport); 469 u_int off; 470 471 /* TCP and UDP port offsets are the same. */ 472 assert(sport_off == offsetof(struct tcphdr, th_sport)); 473 assert(dport_off == offsetof(struct tcphdr, th_dport)); 474 475 assert(((opts & MATCH_SRC) != 0) ^ ((opts & MATCH_DST) != 0)); 476 off = (opts & MATCH_SRC) ? sport_off : dport_off; 477 478 /* X <- IP header length */ 479 fetch_l3(ctx, AF_UNSPEC, X_EQ_L4OFF); 480 481 struct bpf_insn insns_fetch[] = { 482 /* A <- port */ 483 BPF_STMT(BPF_LD+BPF_H+BPF_IND, off), 484 }; 485 add_insns(ctx, insns_fetch, __arraycount(insns_fetch)); 486 487 /* CAUTION: BPF operates in host byte-order. */ 488 from = ntohs(from); 489 to = ntohs(to); 490 491 if (from == to) { 492 /* Single port case. 
*/ 493 struct bpf_insn insns_port[] = { 494 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, from, 0, JUMP_MAGIC), 495 }; 496 add_insns(ctx, insns_port, __arraycount(insns_port)); 497 } else { 498 /* Port range case. */ 499 struct bpf_insn insns_range[] = { 500 BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, from, 0, JUMP_MAGIC), 501 BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, to, JUMP_MAGIC, 0), 502 }; 503 add_insns(ctx, insns_range, __arraycount(insns_range)); 504 } 505 506 uint32_t mwords[] = { 507 opts & MATCH_SRC ? BM_SRC_PORTS : BM_DST_PORTS, 2, from, to 508 }; 509 done_block(ctx, mwords, sizeof(mwords)); 510} 511 512/* 513 * npfctl_bpf_tcpfl: code block to match TCP flags. 514 */ 515void 516npfctl_bpf_tcpfl(npf_bpf_t *ctx, uint8_t tf, uint8_t tf_mask, bool checktcp) 517{ 518 const u_int tcpfl_off = offsetof(struct tcphdr, th_flags); 519 520 /* X <- IP header length */ 521 fetch_l3(ctx, AF_UNSPEC, X_EQ_L4OFF); 522 if (checktcp) { 523 const u_int jf = (tf_mask != tf) ? 3 : 2; 524 assert(ctx->ingroup == false); 525 526 /* A <- L4 protocol; A == TCP? If not, jump out. */ 527 struct bpf_insn insns_tcp[] = { 528 BPF_STMT(BPF_LD+BPF_W+BPF_MEM, BPF_MW_L4PROTO), 529 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, IPPROTO_TCP, 0, jf), 530 }; 531 add_insns(ctx, insns_tcp, __arraycount(insns_tcp)); 532 } 533 534 struct bpf_insn insns_tf[] = { 535 /* A <- TCP flags */ 536 BPF_STMT(BPF_LD+BPF_B+BPF_IND, tcpfl_off), 537 }; 538 add_insns(ctx, insns_tf, __arraycount(insns_tf)); 539 540 if (tf_mask != tf) { 541 /* A <- (A & mask) */ 542 struct bpf_insn insns_mask[] = { 543 BPF_STMT(BPF_ALU+BPF_AND+BPF_K, tf_mask), 544 }; 545 add_insns(ctx, insns_mask, __arraycount(insns_mask)); 546 } 547 548 struct bpf_insn insns_cmp[] = { 549 /* A == expected-TCP-flags? 
*/ 550 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, tf, 0, JUMP_MAGIC), 551 }; 552 add_insns(ctx, insns_cmp, __arraycount(insns_cmp)); 553 554 if (!checktcp) { 555 uint32_t mwords[] = { BM_TCPFL, 2, tf, tf_mask}; 556 done_block(ctx, mwords, sizeof(mwords)); 557 } 558} 559 560/* 561 * npfctl_bpf_icmp: code block to match ICMP type and/or code. 562 * Note: suitable both for the ICMPv4 and ICMPv6. 563 */ 564void 565npfctl_bpf_icmp(npf_bpf_t *ctx, int type, int code) 566{ 567 const u_int type_off = offsetof(struct icmp, icmp_type); 568 const u_int code_off = offsetof(struct icmp, icmp_code); 569 570 assert(offsetof(struct icmp6_hdr, icmp6_type) == type_off); 571 assert(offsetof(struct icmp6_hdr, icmp6_code) == code_off); 572 assert(type != -1 || code != -1); 573 574 /* X <- IP header length */ 575 fetch_l3(ctx, AF_UNSPEC, X_EQ_L4OFF); 576 577 if (type != -1) { 578 struct bpf_insn insns_type[] = { 579 BPF_STMT(BPF_LD+BPF_B+BPF_IND, type_off), 580 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, type, 0, JUMP_MAGIC), 581 }; 582 add_insns(ctx, insns_type, __arraycount(insns_type)); 583 584 uint32_t mwords[] = { BM_ICMP_TYPE, 1, type }; 585 done_block(ctx, mwords, sizeof(mwords)); 586 } 587 588 if (code != -1) { 589 struct bpf_insn insns_code[] = { 590 BPF_STMT(BPF_LD+BPF_B+BPF_IND, code_off), 591 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, code, 0, JUMP_MAGIC), 592 }; 593 add_insns(ctx, insns_code, __arraycount(insns_code)); 594 595 uint32_t mwords[] = { BM_ICMP_CODE, 1, code }; 596 done_block(ctx, mwords, sizeof(mwords)); 597 } 598} 599 600#define SRC_FLAG_BIT (1U << 31) 601 602/* 603 * npfctl_bpf_table: code block to match source/destination IP address 604 * against NPF table specified by ID. 605 */ 606void 607npfctl_bpf_table(npf_bpf_t *ctx, u_int opts, u_int tid) 608{ 609 const bool src = (opts & MATCH_SRC) != 0; 610 611 struct bpf_insn insns_table[] = { 612 BPF_STMT(BPF_LD+BPF_IMM, (src ? 
SRC_FLAG_BIT : 0) | tid), 613 BPF_STMT(BPF_MISC+BPF_COP, NPF_COP_TABLE), 614 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, 0, JUMP_MAGIC, 0), 615 }; 616 add_insns(ctx, insns_table, __arraycount(insns_table)); 617 618 uint32_t mwords[] = { src ? BM_SRC_TABLE: BM_DST_TABLE, 1, tid }; 619 done_block(ctx, mwords, sizeof(mwords)); 620} 621