1/*- 2 * Copyright (c) 1999-2011 Apple Inc. 3 * Copyright (c) 2006-2008 Robert N. M. Watson 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 3. Neither the name of Apple Inc. ("Apple") nor the names of 15 * its contributors may be used to endorse or promote products derived 16 * from this software without specific prior written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND 19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR 22 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 26 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 27 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 * POSSIBILITY OF SUCH DAMAGE. 29 */ 30 31#include <sys/param.h> 32#include <sys/fcntl.h> 33#include <sys/kernel.h> 34#include <sys/lock.h> 35#include <sys/namei.h> 36#include <sys/proc_internal.h> 37#include <sys/kauth.h> 38#include <sys/queue.h> 39#include <sys/systm.h> 40#include <sys/time.h> 41#include <sys/ucred.h> 42#include <sys/uio.h> 43#include <sys/unistd.h> 44#include <sys/file_internal.h> 45#include <sys/vnode_internal.h> 46#include <sys/user.h> 47#include <sys/syscall.h> 48#include <sys/malloc.h> 49#include <sys/un.h> 50#include <sys/sysent.h> 51#include <sys/sysproto.h> 52#include <sys/vfs_context.h> 53#include <sys/domain.h> 54#include <sys/protosw.h> 55#include <sys/socketvar.h> 56 57#include <bsm/audit.h> 58#include <bsm/audit_internal.h> 59#include <bsm/audit_kevents.h> 60 61#include <security/audit/audit.h> 62#include <security/audit/audit_bsd.h> 63#include <security/audit/audit_private.h> 64 65#include <mach/host_priv.h> 66#include <mach/host_special_ports.h> 67#include <mach/audit_triggers_server.h> 68 69#include <kern/host.h> 70#include <kern/zalloc.h> 71#include <kern/lock.h> 72#include <kern/sched_prim.h> 73#include <kern/task.h> 74#include <kern/wait_queue.h> 75 76#include <net/route.h> 77 78#include <netinet/in.h> 79#include <netinet/in_pcb.h> 80 81/* 82 * Worker thread that will schedule disk I/O, etc. 83 */ 84static thread_t audit_thread; 85 86/* 87 * audit_ctx and audit_vp are the stored credential and vnode to use for 88 * active audit trail. They are protected by audit_worker_sl, which will be 89 * held across all I/O and all rotation to prevent them from being replaced 90 * (rotated) while in use. The audit_file_rotate_wait flag is set when the 91 * kernel has delivered a trigger to auditd to rotate the trail, and is 92 * cleared when the next rotation takes place. It is also protected by 93 * audit_worker_sl. 94 */ 95static int audit_file_rotate_wait; 96static struct slck audit_worker_sl; 97static struct vfs_context audit_ctx; 98static struct vnode *audit_vp; 99 100#define AUDIT_WORKER_SX_INIT() slck_init(&audit_worker_sl, \ 101 "audit_worker_sl") 102#define AUDIT_WORKER_SX_XLOCK() slck_lock(&audit_worker_sl) 103#define AUDIT_WORKER_SX_XUNLOCK() slck_unlock(&audit_worker_sl) 104#define AUDIT_WORKER_SX_ASSERT() slck_assert(&audit_worker_sl, SL_OWNED) 105#define AUDIT_WORKER_SX_DESTROY() slck_destroy(&audit_worker_sl) 106 107/* 108 * The audit_q_draining flag is set when audit is disabled and the audit 109 * worker queue is being drained. 110 */ 111static int audit_q_draining; 112 113/* 114 * The special kernel audit record, audit_drain_kar, is used to mark the end of 115 * the queue when draining it. 116 */ 117static struct kaudit_record audit_drain_kar = { 118 .k_ar = { 119 .ar_event = AUE_NULL, 120 }, 121 .k_ar_commit = AR_DRAIN_QUEUE, 122}; 123 124/* 125 * Write an audit record to a file, performed as the last stage after both 126 * preselection and BSM conversion. Both space management and write failures 127 * are handled in this function. 128 * 129 * No attempt is made to deal with possible failure to deliver a trigger to 130 * the audit daemon, since the message is asynchronous anyway. 131 */ 132static void 133audit_record_write(struct vnode *vp, struct vfs_context *ctx, void *data, 134 size_t len) 135{ 136 static struct timeval last_lowspace_trigger; 137 static struct timeval last_fail; 138 static int cur_lowspace_trigger; 139 struct vfsstatfs *mnt_stat; 140 int error; 141 static int cur_fail; 142 uint64_t temp; 143 off_t file_size; 144 145 AUDIT_WORKER_SX_ASSERT(); /* audit_file_rotate_wait. */ 146 147 if (vp == NULL) 148 return; 149 150 if (vnode_getwithref(vp)) 151 return /*(ENOENT)*/; 152 153 mnt_stat = &vp->v_mount->mnt_vfsstat; 154 155 /* 156 * First, gather statistics on the audit log file and file system so 157 * that we know how we're doing on space. Consider failure of these 158 * operations to indicate a future inability to write to the file. 159 */ 160 error = vfs_update_vfsstat(vp->v_mount, ctx, VFS_KERNEL_EVENT); 161 if (error) 162 goto fail; 163 error = vnode_size(vp, &file_size, ctx); 164 if (error) 165 goto fail; 166 audit_fstat.af_currsz = (u_quad_t)file_size; 167 168 /* 169 * We handle four different space-related limits: 170 * 171 * - A fixed (hard) limit on the minimum free blocks we require on 172 * the file system, and results in record loss, a trigger, and 173 * possible fail stop due to violating invariants. 174 * 175 * - An administrative (soft) limit, which when fallen below, results 176 * in the kernel notifying the audit daemon of low space. 177 * 178 * - An audit trail size limit, which when gone above, results in the 179 * kernel notifying the audit daemon that rotation is desired. 180 * 181 * - The total depth of the kernel audit record exceeding free space, 182 * which can lead to possible fail stop (with drain), in order to 183 * prevent violating invariants. Failure here doesn't halt 184 * immediately, but prevents new records from being generated. 185 * 186 * Possibly, the last of these should be handled differently, always 187 * allowing a full queue to be lost, rather than trying to prevent 188 * loss. 189 * 190 * First, handle the hard limit, which generates a trigger and may 191 * fail stop. This is handled in the same manner as ENOSPC from 192 * VOP_WRITE, and results in record loss. 193 */ 194 if (mnt_stat->f_bfree < AUDIT_HARD_LIMIT_FREE_BLOCKS) { 195 error = ENOSPC; 196 goto fail_enospc; 197 } 198 199 /* 200 * Second, handle falling below the soft limit, if defined; we send 201 * the daemon a trigger and continue processing the record. Triggers 202 * are limited to 1/sec. 203 */ 204 if (audit_qctrl.aq_minfree != 0) { 205 temp = mnt_stat->f_blocks / (100 / audit_qctrl.aq_minfree); 206 if (mnt_stat->f_bfree < temp && 207 ppsratecheck(&last_lowspace_trigger, 208 &cur_lowspace_trigger, 1)) 209 (void)audit_send_trigger( 210 AUDIT_TRIGGER_LOW_SPACE); 211 } 212 213 /* 214 * If the current file is getting full, generate a rotation trigger 215 * to the daemon. This is only approximate, which is fine as more 216 * records may be generated before the daemon rotates the file. 217 */ 218 if ((audit_fstat.af_filesz != 0) && (audit_file_rotate_wait == 0) && 219 ((u_quad_t)file_size >= audit_fstat.af_filesz)) { 220 AUDIT_WORKER_SX_ASSERT(); 221 222 audit_file_rotate_wait = 1; 223 (void)audit_send_trigger(AUDIT_TRIGGER_ROTATE_KERNEL); 224 } 225 226 /* 227 * If the estimated amount of audit data in the audit event queue 228 * (plus records allocated but not yet queued) has reached the amount 229 * of free space on the disk, then we need to go into an audit fail 230 * stop state, in which we do not permit the allocation/committing of 231 * any new audit records. We continue to process records but don't 232 * allow any activities that might generate new records. In the 233 * future, we might want to detect when space is available again and 234 * allow operation to continue, but this behavior is sufficient to 235 * meet fail stop requirements in CAPP. 236 */ 237 if (audit_fail_stop) { 238 if ((unsigned long)((audit_q_len + audit_pre_q_len + 1) * 239 MAX_AUDIT_RECORD_SIZE) / mnt_stat->f_bsize >= 240 (unsigned long)(mnt_stat->f_bfree)) { 241 if (ppsratecheck(&last_fail, &cur_fail, 1)) 242 printf("audit_record_write: free space " 243 "below size of audit queue, failing " 244 "stop\n"); 245 audit_in_failure = 1; 246 } else if (audit_in_failure) { 247 /* 248 * Note: if we want to handle recovery, this is the 249 * spot to do it: unset audit_in_failure, and issue a 250 * wakeup on the cv. 251 */ 252 } 253 } 254 255 error = vn_rdwr(UIO_WRITE, vp, data, len, (off_t)0, UIO_SYSSPACE, 256 IO_APPEND|IO_UNIT, vfs_context_ucred(ctx), NULL, 257 vfs_context_proc(ctx)); 258 if (error == ENOSPC) 259 goto fail_enospc; 260 else if (error) 261 goto fail; 262 263 /* 264 * Catch completion of a queue drain here; if we're draining and the 265 * queue is now empty, fail stop. That audit_fail_stop is implicitly 266 * true, since audit_in_failure can only be set of audit_fail_stop is 267 * set. 268 * 269 * Note: if we handle recovery from audit_in_failure, then we need to 270 * make panic here conditional. 271 */ 272 if (audit_in_failure) { 273 if (audit_q_len == 0 && audit_pre_q_len == 0) { 274 (void)VNOP_FSYNC(vp, MNT_WAIT, ctx); 275 panic("Audit store overflow; record queue drained."); 276 } 277 } 278 279 vnode_put(vp); 280 return; 281 282fail_enospc: 283 /* 284 * ENOSPC is considered a special case with respect to failures, as 285 * this can reflect either our preemptive detection of insufficient 286 * space, or ENOSPC returned by the vnode write call. 287 */ 288 if (audit_fail_stop) { 289 (void)VNOP_FSYNC(vp, MNT_WAIT, ctx); 290 panic("Audit log space exhausted and fail-stop set."); 291 } 292 (void)audit_send_trigger(AUDIT_TRIGGER_NO_SPACE); 293 audit_suspended = 1; 294 295 /* FALLTHROUGH */ 296fail: 297 /* 298 * We have failed to write to the file, so the current record is 299 * lost, which may require an immediate system halt. 300 */ 301 if (audit_panic_on_write_fail) { 302 (void)VNOP_FSYNC(vp, MNT_WAIT, ctx); 303 panic("audit_worker: write error %d\n", error); 304 } else if (ppsratecheck(&last_fail, &cur_fail, 1)) 305 printf("audit_worker: write error %d\n", error); 306 vnode_put(vp); 307} 308 309/* 310 * Given a kernel audit record, process as required. Kernel audit records 311 * are converted to one, or possibly two, BSM records, depending on whether 312 * there is a user audit record present also. Kernel records need be 313 * converted to BSM before they can be written out. Both types will be 314 * written to disk, and audit pipes. 315 */ 316static void 317audit_worker_process_record(struct kaudit_record *ar) 318{ 319 struct au_record *bsm; 320 au_class_t class; 321 au_event_t event; 322 au_id_t auid; 323 int error, sorf; 324 int trail_locked; 325 326 /* 327 * We hold the audit_worker_sl lock over both writes, if there are 328 * two, so that the two records won't be split across a rotation and 329 * end up in two different trail files. 330 */ 331 if (((ar->k_ar_commit & AR_COMMIT_USER) && 332 (ar->k_ar_commit & AR_PRESELECT_USER_TRAIL)) || 333 (ar->k_ar_commit & AR_PRESELECT_TRAIL)) { 334 AUDIT_WORKER_SX_XLOCK(); 335 trail_locked = 1; 336 } else 337 trail_locked = 0; 338 339 /* 340 * First, handle the user record, if any: commit to the system trail 341 * and audit pipes as selected. 342 */ 343 if ((ar->k_ar_commit & AR_COMMIT_USER) && 344 (ar->k_ar_commit & AR_PRESELECT_USER_TRAIL)) { 345 AUDIT_WORKER_SX_ASSERT(); 346 audit_record_write(audit_vp, &audit_ctx, ar->k_udata, 347 ar->k_ulen); 348 } 349 350 if ((ar->k_ar_commit & AR_COMMIT_USER) && 351 (ar->k_ar_commit & AR_PRESELECT_USER_PIPE)) 352 audit_pipe_submit_user(ar->k_udata, ar->k_ulen); 353 354 if (!(ar->k_ar_commit & AR_COMMIT_KERNEL) || 355 ((ar->k_ar_commit & AR_PRESELECT_PIPE) == 0 && 356 (ar->k_ar_commit & AR_PRESELECT_TRAIL) == 0 && 357 (ar->k_ar_commit & AR_PRESELECT_FILTER) == 0)) 358 goto out; 359 360 auid = ar->k_ar.ar_subj_auid; 361 event = ar->k_ar.ar_event; 362 class = au_event_class(event); 363 if (ar->k_ar.ar_errno == 0) 364 sorf = AU_PRS_SUCCESS; 365 else 366 sorf = AU_PRS_FAILURE; 367 368 error = kaudit_to_bsm(ar, &bsm); 369 switch (error) { 370 case BSM_NOAUDIT: 371 goto out; 372 373 case BSM_FAILURE: 374 printf("audit_worker_process_record: BSM_FAILURE\n"); 375 goto out; 376 377 case BSM_SUCCESS: 378 break; 379 380 default: 381 panic("kaudit_to_bsm returned %d", error); 382 } 383 384 if (ar->k_ar_commit & AR_PRESELECT_TRAIL) { 385 AUDIT_WORKER_SX_ASSERT(); 386 audit_record_write(audit_vp, &audit_ctx, bsm->data, bsm->len); 387 } 388 389 if (ar->k_ar_commit & AR_PRESELECT_PIPE) 390 audit_pipe_submit(auid, event, class, sorf, 391 ar->k_ar_commit & AR_PRESELECT_TRAIL, bsm->data, 392 bsm->len); 393 394 if (ar->k_ar_commit & AR_PRESELECT_FILTER) { 395 396 /* 397 * XXXss - This needs to be generalized so new filters can 398 * be easily plugged in. 399 */ 400 audit_sdev_submit(auid, ar->k_ar.ar_subj_asid, bsm->data, 401 bsm->len); 402 } 403 404 kau_free(bsm); 405out: 406 if (trail_locked) 407 AUDIT_WORKER_SX_XUNLOCK(); 408} 409 410/* 411 * The audit_worker thread is responsible for watching the event queue, 412 * dequeueing records, converting them to BSM format, and committing them to 413 * disk. In order to minimize lock thrashing, records are dequeued in sets 414 * to a thread-local work queue. 415 * 416 * Note: this means that the effect bound on the size of the pending record 417 * queue is 2x the length of the global queue. 418 */ 419static void 420audit_worker(void) 421{ 422 struct kaudit_queue ar_worklist; 423 struct kaudit_record *ar; 424 int lowater_signal; 425 426 if (audit_ctx.vc_thread == NULL) 427 audit_ctx.vc_thread = current_thread(); 428 429 TAILQ_INIT(&ar_worklist); 430 mtx_lock(&audit_mtx); 431 while (1) { 432 mtx_assert(&audit_mtx, MA_OWNED); 433 434 /* 435 * Wait for a record. 436 */ 437 while (TAILQ_EMPTY(&audit_q)) 438 cv_wait_continuation(&audit_worker_cv, &audit_mtx, 439 (thread_continue_t)audit_worker); 440 441 /* 442 * If there are records in the global audit record queue, 443 * transfer them to a thread-local queue and process them 444 * one by one. If we cross the low watermark threshold, 445 * signal any waiting processes that they may wake up and 446 * continue generating records. 447 */ 448 lowater_signal = 0; 449 while ((ar = TAILQ_FIRST(&audit_q))) { 450 TAILQ_REMOVE(&audit_q, ar, k_q); 451 audit_q_len--; 452 if (audit_q_len == audit_qctrl.aq_lowater) 453 lowater_signal++; 454 TAILQ_INSERT_TAIL(&ar_worklist, ar, k_q); 455 } 456 if (lowater_signal) 457 cv_broadcast(&audit_watermark_cv); 458 459 mtx_unlock(&audit_mtx); 460 while ((ar = TAILQ_FIRST(&ar_worklist))) { 461 TAILQ_REMOVE(&ar_worklist, ar, k_q); 462 if (ar->k_ar_commit & AR_DRAIN_QUEUE) { 463 audit_q_draining = 0; 464 cv_broadcast(&audit_drain_cv); 465 } else { 466 audit_worker_process_record(ar); 467 audit_free(ar); 468 } 469 } 470 mtx_lock(&audit_mtx); 471 } 472} 473 474/* 475 * audit_rotate_vnode() is called by a user or kernel thread to configure or 476 * de-configure auditing on a vnode. The arguments are the replacement 477 * credential (referenced) and vnode (referenced and opened) to substitute 478 * for the current credential and vnode, if any. If either is set to NULL, 479 * both should be NULL, and this is used to indicate that audit is being 480 * disabled. Any previous cred/vnode will be closed and freed. We re-enable 481 * generating rotation requests to auditd. 482 */ 483void 484audit_rotate_vnode(kauth_cred_t cred, struct vnode *vp) 485{ 486 kauth_cred_t old_audit_cred; 487 struct vnode *old_audit_vp; 488 489 KASSERT((cred != NULL && vp != NULL) || (cred == NULL && vp == NULL), 490 ("audit_rotate_vnode: cred %p vp %p", cred, vp)); 491 492 493 mtx_lock(&audit_mtx); 494 if (audit_enabled && (NULL == vp)) { 495 /* Auditing is currently enabled but will be disabled. */ 496 497 /* 498 * Disable auditing now so nothing more is added while the 499 * audit worker thread is draining the audit record queue. 500 */ 501 audit_enabled = 0; 502 503 /* 504 * Drain the auditing queue by inserting a drain record at the 505 * end of the queue and waiting for the audit worker thread 506 * to find this record and signal that it is done before 507 * we close the audit trail. 508 */ 509 audit_q_draining = 1; 510 while (audit_q_len >= audit_qctrl.aq_hiwater) 511 cv_wait(&audit_watermark_cv, &audit_mtx); 512 TAILQ_INSERT_TAIL(&audit_q, &audit_drain_kar, k_q); 513 audit_q_len++; 514 cv_signal(&audit_worker_cv); 515 } 516 517 /* If the audit queue is draining then wait here until it's done. */ 518 while (audit_q_draining) 519 cv_wait(&audit_drain_cv, &audit_mtx); 520 mtx_unlock(&audit_mtx); 521 522 523 /* 524 * Rotate the vnode/cred, and clear the rotate flag so that we will 525 * send a rotate trigger if the new file fills. 526 */ 527 AUDIT_WORKER_SX_XLOCK(); 528 old_audit_cred = audit_ctx.vc_ucred; 529 old_audit_vp = audit_vp; 530 audit_ctx.vc_ucred = cred; 531 audit_vp = vp; 532 audit_file_rotate_wait = 0; 533 audit_enabled = (audit_vp != NULL); 534 AUDIT_WORKER_SX_XUNLOCK(); 535 536 /* 537 * If there was an old vnode/credential, close and free. 538 */ 539 if (old_audit_vp != NULL) { 540 if (vnode_get(old_audit_vp) == 0) { 541 vn_close(old_audit_vp, AUDIT_CLOSE_FLAGS, 542 vfs_context_kernel()); 543 vnode_put(old_audit_vp); 544 } else 545 printf("audit_rotate_vnode: Couldn't close " 546 "audit file.\n"); 547 kauth_cred_unref(&old_audit_cred); 548 } 549} 550 551void 552audit_worker_init(void) 553{ 554 555 AUDIT_WORKER_SX_INIT(); 556 kernel_thread_start((thread_continue_t)audit_worker, NULL, 557 &audit_thread); 558 if (audit_thread == THREAD_NULL) 559 panic("audit_worker_init: Couldn't create audit_worker thread"); 560} 561