1/*
2 * ntfs_page.c - NTFS kernel page operations.
3 *
4 * Copyright (c) 2006-2011 Anton Altaparmakov.  All Rights Reserved.
5 * Portions Copyright (c) 2006-2011 Apple Inc.  All Rights Reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 *    this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright notice,
13 *    this list of conditions and the following disclaimer in the documentation
14 *    and/or other materials provided with the distribution.
15 * 3. Neither the name of Apple Inc. ("Apple") nor the names of its
16 *    contributors may be used to endorse or promote products derived from this
17 *    software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
20 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
23 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
26 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 *
30 * ALTERNATIVELY, provided that this notice and licensing terms are retained in
31 * full, this file may be redistributed and/or modified under the terms of the
32 * GNU General Public License (GPL) Version 2, in which case the provisions of
33 * that version of the GPL will apply to you instead of the license terms
34 * above.  You can obtain a copy of the GPL Version 2 at
35 * http://developer.apple.com/opensource/licenses/gpl-2.txt.
36 */
37
38#include <sys/errno.h>
39#include <sys/stat.h>
40#include <sys/time.h>
41#include <sys/ucred.h>
42#include <sys/ubc.h>
43#include <sys/vnode.h>
44
45#include <kern/debug.h>
46#include <kern/locks.h>
47
48#include "ntfs_attr.h"
49#include "ntfs_compress.h"
50#include "ntfs_debug.h"
51#include "ntfs_inode.h"
52#include "ntfs_layout.h"
53#include "ntfs_page.h"
54#include "ntfs_types.h"
55#include "ntfs_volume.h"
56
57/**
58 * ntfs_pagein - read a range of pages into memory
59 * @ni:		ntfs inode whose data to read into the page range
60 * @attr_ofs:	byte offset in the inode at which to start
61 * @size:	number of bytes to read from the inode
62 * @upl:	page list describing destination page range
63 * @upl_ofs:	byte offset into page list at which to start
64 * @flags:	flags further describing the pagein request
65 *
66 * Read @size bytes from the ntfs inode @ni, starting at byte offset @attr_ofs
67 * into the inode, into the range of pages specified by the page list @upl,
68 * starting at byte offset @upl_ofs into the page list.
69 *
70 * The @flags further describe the pagein request.  The following pagein flags
71 * are currently defined in OSX kernel:
72 *	UPL_IOSYNC	- Perform synchronous i/o.
73 *	UPL_NOCOMMIT	- Do not commit/abort the page range.
74 *	UPL_NORDAHEAD	- Do not perform any speculative read-ahead.
75 *	IO_PASSIVE	- This is background i/o so do not throttle other i/o.
76 *
77 * Inside the ntfs driver we have the need to perform pageins whilst the inode
78 * is locked for writing (@ni->lock) thus we cheat and set UPL_NESTED_PAGEOUT
79 * in @flags when this is the case.  We make sure to clear it in @flags before
80 * calling into the cluster layer so we do not accidentally cause confusion.
81 *
82 * For encrypted attributes we abort for now as we do not support them yet.
83 *
84 * For non-resident, non-compressed attributes we use cluster_pagein_ext()
85 * which deals with both normal and multi sector transfer protected attributes.
86 *
87 * For resident attributes and non-resident, compressed attributes we read the
88 * data ourselves by mapping the page list, and in the resident case, mapping
89 * the mft record, looking up the attribute in it, and copying the requested
90 * data from the mapped attribute into the page list, then unmapping the mft
91 * record, whilst for non-resident, compressed attributes, we get the raw inode
92 * and use it with ntfs_read_compressed() to read and decompress the data into
93 * our mapped page list.  We then unmap the page list and finally, if
94 * UPL_NOCOMMIT is not specified, we commit (success) or abort (error) the page
95 * range.
96 *
97 * Return 0 on success and errno on error.
98 *
99 * Note the pages in the page list are marked busy on entry and the busy bit is
100 * cleared when we commit the page range.  Thus it is perfectly safe for us to
101 * fill the pages with encrypted or mst protected data and to decrypt or mst
102 * deprotect in place before committing the page range.
103 *
104 * Adapted from cluster_pagein_ext().
105 *
106 * Locking: - Caller must hold an iocount reference on the vnode of @ni.
107 *	    - Caller must not hold @ni->lock or if it is held it must be for
108 *	      reading unless UPL_NESTED_PAGEOUT is set in @flags in which case
109 *	      the caller must hold @ni->lock for reading or writing.
110 */
111int ntfs_pagein(ntfs_inode *ni, s64 attr_ofs, unsigned size, upl_t upl,
112		upl_offset_t upl_ofs, int flags)
113{
114	s64 attr_size;
115	u8 *kaddr;
116	kern_return_t kerr;
117	unsigned to_read;
118	int err;
119	BOOL locked = FALSE;
120
121	ntfs_debug("Entering for mft_no 0x%llx, offset 0x%llx, size 0x%x, "
122			"pagein flags 0x%x, page list offset 0x%llx.",
123			(unsigned long long)ni->mft_no,
124			(unsigned long long)attr_ofs, size, flags,
125			(unsigned long long)upl_ofs);
126	/*
127	 * If the caller did not specify any i/o, then we are done.  We cannot
128	 * issue an abort because we do not have a upl or we do not know its
129	 * size.
130	 */
131	if (!upl) {
132		ntfs_error(ni->vol->mp, "NULL page list passed in (error "
133				"EINVAL).");
134		return EINVAL;
135	}
136	if (S_ISDIR(ni->mode)) {
137		ntfs_error(ni->vol->mp, "Called for directory vnode.");
138		err = EISDIR;
139		goto err;
140	}
141	/*
142	 * Protect against changes in initialized_size and thus against
143	 * truncation also unless UPL_NESTED_PAGEOUT is set in which case the
144	 * caller has already taken @ni->lock for exclusive access.  We simply
145	 * leave @locked to be FALSE in this case so we do not try to drop the
146	 * lock later on.
147	 *
148	 * If UPL_NESTED_PAGEOUT is set we clear it in @flags to ensure we do
149	 * not cause confusion in the cluster layer or the VM.
150	 */
151	if (flags & UPL_NESTED_PAGEOUT)
152		flags &= ~UPL_NESTED_PAGEOUT;
153	else {
154		locked = TRUE;
155		lck_rw_lock_shared(&ni->lock);
156	}
157	/* Do not allow messing with the inode once it has been deleted. */
158	if (NInoDeleted(ni)) {
159		/* Remove the inode from the name cache. */
160		cache_purge(ni->vn);
161		err = ENOENT;
162		goto err;
163	}
164retry_pagein:
165	/*
166	 * We guarantee that the size in the ubc will be smaller or equal to
167	 * the size in the ntfs inode thus no need to check @ni->data_size.
168	 */
169	attr_size = ubc_getsize(ni->vn);
170	/*
171	 * Only $DATA attributes can be encrypted/compressed.  Index root can
172	 * have the flags set but this means to create compressed/encrypted
173	 * files, not that the attribute is compressed/encrypted.  Note we need
174	 * to check for AT_INDEX_ALLOCATION since this is the type of directory
175	 * index inodes.
176	 */
177	if (ni->type != AT_INDEX_ALLOCATION) {
178		/* TODO: Deny access to encrypted attributes, just like NT4. */
179		if (NInoEncrypted(ni)) {
180			if (ni->type != AT_DATA)
181				panic("%s(): Encrypted non-data attribute.\n",
182						__FUNCTION__);
183			ntfs_warning(ni->vol->mp, "Denying access to "
184					"encrypted attribute (EACCES).");
185			err = EACCES;
186			goto err;
187		}
188		/* Compressed data streams need special handling. */
189		if (NInoNonResident(ni) && NInoCompressed(ni) && !NInoRaw(ni)) {
190			if (ni->type != AT_DATA)
191				panic("%s(): Compressed non-data attribute.\n",
192						__FUNCTION__);
193			goto compressed;
194		}
195	}
196	/* NInoNonResident() == NInoIndexAllocPresent() */
197	if (NInoNonResident(ni)) {
198		int (*callback)(buf_t, void *);
199
200		callback = NULL;
201		if (NInoMstProtected(ni) || NInoEncrypted(ni))
202			callback = ntfs_cluster_iodone;
203		/* Non-resident, possibly mst protected, attribute. */
204		err = cluster_pagein_ext(ni->vn, upl, upl_ofs, attr_ofs, size,
205				attr_size, flags, callback, NULL);
206		if (!err)
207			ntfs_debug("Done (cluster_pagein_ext()).");
208		else
209			ntfs_error(ni->vol->mp, "Failed (cluster_pagein_ext(), "
210					"error %d).", err);
211		if (locked)
212			lck_rw_unlock_shared(&ni->lock);
213		return err;
214	}
215compressed:
216	/*
217	 * The attribute is resident and/or compressed.
218	 *
219	 * Cannot pagein from a negative offset or if we are starting beyond
220	 * the end of the attribute or if the attribute offset is not page
221	 * aligned or the size requested is not a multiple of PAGE_SIZE.
222	 */
223	if (attr_ofs < 0 || attr_ofs >= attr_size || attr_ofs & PAGE_MASK_64 ||
224			size & PAGE_MASK || upl_ofs & PAGE_MASK) {
225		err = EINVAL;
226		goto err;
227	}
228	to_read = size;
229	attr_size -= attr_ofs;
230	if (to_read > attr_size)
231		to_read = attr_size;
232	/*
233	 * We do not need @attr_size any more so reuse it to hold the number of
234	 * bytes available in the attribute starting at offset @attr_ofs up to
235	 * a maximum of the requested number of bytes rounded up to a multiple
236	 * of the system page size.
237	 */
238	attr_size = (to_read + PAGE_MASK) & ~PAGE_MASK;
239	/* Abort any pages outside the end of the attribute. */
240	if (size > attr_size && !(flags & UPL_NOCOMMIT)) {
241		ubc_upl_abort_range(upl, upl_ofs + attr_size, size - attr_size,
242				UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
243		/* Update @size. */
244		size = attr_size;
245	}
246	/* To access the page list contents, we need to map the page list. */
247	kerr = ubc_upl_map(upl, (vm_offset_t*)&kaddr);
248	if (kerr != KERN_SUCCESS) {
249		ntfs_error(ni->vol->mp, "ubc_upl_map() failed (error %d).",
250				(int)kerr);
251		err = EIO;
252		goto err;
253	}
254	if (!NInoNonResident(ni)) {
255		/*
256		 * Read the data from the resident attribute into the page
257		 * list.
258		 */
259		err = ntfs_resident_attr_read(ni, attr_ofs, size,
260				kaddr + upl_ofs);
261		if (err && err != EAGAIN)
262			ntfs_error(ni->vol->mp, "ntfs_resident_attr_read() "
263					"failed (error %d).", err);
264	} else {
265		ntfs_inode *raw_ni;
266		int ioflags;
267
268		/*
269		 * Get the raw inode.  We take the inode lock shared to protect
270		 * against concurrent writers as the compressed data is invalid
271		 * whilst a write is in progress.
272		 */
273		err = ntfs_raw_inode_get(ni, LCK_RW_TYPE_SHARED, &raw_ni);
274		if (err)
275			ntfs_error(ni->vol->mp, "Failed to get raw inode "
276					"(error %d).", err);
277		else {
278			if (!NInoRaw(raw_ni))
279				panic("%s(): Requested raw inode but got "
280						"non-raw one.\n", __FUNCTION__);
281			ioflags = 0;
282			if (vnode_isnocache(ni->vn) ||
283					vnode_isnocache(raw_ni->vn))
284				ioflags |= IO_NOCACHE;
285			if (vnode_isnoreadahead(ni->vn) ||
286					vnode_isnoreadahead(raw_ni->vn))
287				ioflags |= IO_RAOFF;
288			err = ntfs_read_compressed(ni, raw_ni, attr_ofs, size,
289					kaddr + upl_ofs, NULL, ioflags);
290			if (err)
291				ntfs_error(ni->vol->mp,
292						"ntfs_read_compressed() "
293						"failed (error %d).", err);
294			lck_rw_unlock_shared(&raw_ni->lock);
295			(void)vnode_put(raw_ni->vn);
296		}
297	}
298	kerr = ubc_upl_unmap(upl);
299	if (kerr != KERN_SUCCESS) {
300		ntfs_error(ni->vol->mp, "ubc_upl_unmap() failed (error %d).",
301				(int)kerr);
302		if (!err)
303			err = EIO;
304	}
305	if (!err) {
306		if (!(flags & UPL_NOCOMMIT)) {
307			/* Commit the page range we brought up to date. */
308			ubc_upl_commit_range(upl, upl_ofs, size,
309					UPL_COMMIT_FREE_ON_EMPTY);
310		}
311		ntfs_debug("Done (%s).", !NInoNonResident(ni) ?
312				"ntfs_resident_attr_read()" :
313				"ntfs_read_compressed()");
314	} else /* if (err) */ {
315		/*
316		 * If the attribute was converted to non-resident under our
317		 * nose, retry the pagein.
318		 *
319		 * TODO: This may no longer be possible to happen now that we
320		 * lock against changes in initialized size and thus
321		 * truncation...  Revisit this issue when the write code has
322		 * been written and remove the check + goto if appropriate.
323		 */
324		if (err == EAGAIN)
325			goto retry_pagein;
326err:
327		if (!(flags & UPL_NOCOMMIT)) {
328			int upl_flags = UPL_ABORT_FREE_ON_EMPTY;
329			if (err != ENOMEM)
330				upl_flags |= UPL_ABORT_ERROR;
331			ubc_upl_abort_range(upl, upl_ofs, size, upl_flags);
332		}
333		ntfs_error(ni->vol->mp, "Failed (error %d).", err);
334	}
335	if (locked)
336		lck_rw_unlock_shared(&ni->lock);
337	return err;
338}
339
340/**
341 * ntfs_page_map_ext - map a page of a vnode into memory
342 * @ni:		ntfs inode of which to map a page
343 * @ofs:	byte offset into @ni of which to map a page
344 * @upl:	destination page list for the page
345 * @pl:		destination array of pages containing the page itself
346 * @kaddr:	destination pointer for the address of the mapped page contents
347 * @uptodate:	if true return an uptodate page and if false return it as is
348 * @rw:		if true we intend to modify the page and if false we do not
349 *
350 * Map the page corresponding to byte offset @ofs into the ntfs inode @ni into
351 * memory and return the page list in @upl, the array of pages containing the
352 * page in @pl and the address of the mapped page contents in @kaddr.
353 *
354 * If @uptodate is true the page is returned uptodate, i.e. if the page is
355 * currently not valid, it will be brought uptodate via a call to ntfs_pagein()
356 * before it is returned.  And if @uptodate is false, the page is just returned
357 * ignoring its state.  This means the page may or may not be uptodate.
358 *
359 * The caller must set @rw to true if the page is going to be modified and to
360 * false otherwise.
361 *
362 * Note: @ofs must be page aligned.
363 *
364 * Locking: - Caller must hold an iocount reference on the vnode of @ni.
365 *	    - Caller must hold @ni->lock for reading or writing.
366 *
367 * Return 0 on success and errno on error in which case *@upl is set to NULL.
368 */
369errno_t ntfs_page_map_ext(ntfs_inode *ni, s64 ofs, upl_t *upl,
370		upl_page_info_array_t *pl, u8 **kaddr, const BOOL uptodate,
371		const BOOL rw)
372{
373	s64 size;
374	kern_return_t kerr;
375	int abort_flags;
376	errno_t err;
377
378	ntfs_debug("Entering for inode 0x%llx, offset 0x%llx, rw is %s.",
379			(unsigned long long)ni->mft_no,
380			(unsigned long long)ofs,
381			rw ? "true" : "false");
382	if (ofs & PAGE_MASK)
383		panic("%s() called with non page aligned offset (0x%llx).",
384				__FUNCTION__, (unsigned long long)ofs);
385	lck_spin_lock(&ni->size_lock);
386	size = ubc_getsize(ni->vn);
387	if (size > ni->data_size)
388		size = ni->data_size;
389	lck_spin_unlock(&ni->size_lock);
390	if (ofs > size) {
391		ntfs_error(ni->vol->mp, "Offset 0x%llx is outside the end of "
392				"the attribute (0x%llx).",
393				(unsigned long long)ofs,
394				(unsigned long long)size);
395		err = EINVAL;
396		goto err;
397	}
398	/* Create a page list for the wanted page. */
399	kerr = ubc_create_upl(ni->vn, ofs, PAGE_SIZE, upl, pl, UPL_SET_LITE |
400			(rw ? UPL_WILL_MODIFY : 0));
401	if (kerr != KERN_SUCCESS)
402		panic("%s(): Failed to get page (error %d).\n", __FUNCTION__,
403				(int)kerr);
404	/*
405	 * If the page is not valid, need to read it in from the vnode now thus
406	 * making it valid.
407	 *
408	 * We set UPL_NESTED_PAGEOUT to let ntfs_pagein() know that we already
409	 * have the inode locked (@ni->lock is held by the caller).
410	 */
411	if (uptodate && !upl_valid_page(*pl, 0)) {
412		ntfs_debug("Reading page as it was not valid.");
413		err = ntfs_pagein(ni, ofs, PAGE_SIZE, *upl, 0, UPL_IOSYNC |
414				UPL_NOCOMMIT | UPL_NESTED_PAGEOUT);
415		if (err) {
416			ntfs_error(ni->vol->mp, "Failed to read page (error "
417					"%d).", err);
418			goto pagein_err;
419		}
420	}
421	/* Map the page into the kernel's address space. */
422	kerr = ubc_upl_map(*upl, (vm_offset_t*)kaddr);
423	if (kerr == KERN_SUCCESS) {
424		ntfs_debug("Done.");
425		return 0;
426	}
427	ntfs_error(ni->vol->mp, "Failed to map page (error %d).",
428			(int)kerr);
429	err = EIO;
430pagein_err:
431	abort_flags = UPL_ABORT_FREE_ON_EMPTY;
432	if (!upl_valid_page(*pl, 0) ||
433			(vnode_isnocache(ni->vn) && !upl_dirty_page(*pl, 0)))
434		abort_flags |= UPL_ABORT_DUMP_PAGES;
435	ubc_upl_abort_range(*upl, 0, PAGE_SIZE, abort_flags);
436err:
437	*upl = NULL;
438	return err;
439}
440
441/**
442 * ntfs_page_unmap - unmap a page belonging to a vnode from memory
443 * @ni:		ntfs inode to which the page belongs
444 * @upl:	page list of the page
445 * @pl:		array of pages containing the page itself
446 * @mark_dirty:	mark the page dirty
447 *
448 * Unmap the page belonging to the ntfs inode @ni from memory releasing it back
449 * to the vm.
450 *
451 * The page is described by the page list @upl, the array of pages containing
452 * the page @pl and the address of the mapped page contents @kaddr.
453 *
454 * If @mark_dirty is TRUE, tell the vm to mark the page dirty when releasing
455 * the page.
456 *
457 * Locking: Caller must hold an iocount reference on the vnode of @ni.
458 */
459void ntfs_page_unmap(ntfs_inode *ni, upl_t upl, upl_page_info_array_t pl,
460		const BOOL mark_dirty)
461{
462	kern_return_t kerr;
463	BOOL was_valid, was_dirty;
464
465	was_valid = upl_valid_page(pl, 0);
466	/* The page dirty bit is only valid if the page was valid. */
467	was_dirty = (was_valid && upl_dirty_page(pl, 0));
468	ntfs_debug("Entering for inode 0x%llx, page was %svalid %s %sdirty%s.",
469			(unsigned long long)ni->mft_no,
470			was_valid ? "" : "not ",
471			(int)was_valid ^ (int)was_dirty ? "but" : "and",
472			was_dirty ? "" : "not ",
473			mark_dirty ? ", marking it dirty" : "");
474	/* Unmap the page from the kernel's address space. */
475	kerr = ubc_upl_unmap(upl);
476	if (kerr != KERN_SUCCESS)
477		ntfs_warning(ni->vol->mp, "ubc_upl_unmap() failed (error %d).",
478				(int)kerr);
479	/*
480	 * If the page was valid and dirty or is being made dirty or if caching
481	 * for the vnode is enabled (as it will usually be the case for all
482	 * metadata files), commit it thus releasing it into the vm taking care
483	 * to preserve the dirty state and marking the page dirty if requested
484	 * when committing the page.
485	 *
486	 * If the page was not valid or was valid but not dirty, it is not
487	 * being marked dirty, and caching is disabled on the vnode, dump the
488	 * page.
489	 */
490	if (was_dirty || mark_dirty || !vnode_isnocache(ni->vn)) {
491		int commit_flags;
492
493		commit_flags = UPL_COMMIT_FREE_ON_EMPTY |
494				UPL_COMMIT_INACTIVATE;
495		if (!was_valid && !mark_dirty)
496			commit_flags |= UPL_COMMIT_CLEAR_DIRTY;
497		else if (was_dirty || mark_dirty)
498			commit_flags |= UPL_COMMIT_SET_DIRTY;
499		ubc_upl_commit_range(upl, 0, PAGE_SIZE, commit_flags);
500		ntfs_debug("Done (committed page).");
501	} else {
502		ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES |
503				UPL_ABORT_FREE_ON_EMPTY);
504		ntfs_debug("Done (dumped page).");
505	}
506}
507
508/**
509 * ntfs_page_dump - discard a page belonging to a vnode from memory
510 * @ni:		ntfs inode to which the page belongs
511 * @upl:	page list of the page
512 * @pl:		array of pages containing the page itself
513 *
514 * Unmap the page belonging to the ntfs inode @ni from memory throwing it away.
515 * Note that if the page is dirty all changes to the page will be lost as it
516 * will be discarded so use this function with extreme caution.
517 *
518 * The page is described by the page list @upl, the array of pages containing
519 * the page @pl and the address of the mapped page contents @kaddr.
520 *
521 * Locking: Caller must hold an iocount reference on the vnode of @ni.
522 */
523void ntfs_page_dump(ntfs_inode *ni, upl_t upl,
524		upl_page_info_array_t pl __unused)
525{
526	kern_return_t kerr;
527
528	ntfs_debug("Entering for inode 0x%llx, page is %svalid, %sdirty.",
529			(unsigned long long)ni->mft_no,
530			upl_valid_page(pl, 0) ? "" : "not ",
531			upl_dirty_page(pl, 0) ? "" : "not ");
532	/* Unmap the page from the kernel's address space. */
533	kerr = ubc_upl_unmap(upl);
534	if (kerr != KERN_SUCCESS)
535		ntfs_warning(ni->vol->mp, "ubc_upl_unmap() failed (error %d).",
536				(int)kerr);
537	/* Dump the page. */
538	ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES |
539			UPL_ABORT_FREE_ON_EMPTY);
540	ntfs_debug("Done.");
541}
542