// Copyright 2016 The Fuchsia Authors
//
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT

#include "vm/vm_object_paged.h"

#include "vm_priv.h"

#include <arch/ops.h>
#include <assert.h>
#include <err.h>
#include <fbl/alloc_checker.h>
#include <fbl/auto_call.h>
#include <inttypes.h>
#include <lib/console.h>
#include <stdlib.h>
#include <string.h>
#include <trace.h>
#include <vm/fault.h>
#include <vm/physmap.h>
#include <vm/vm.h>
#include <vm/vm_address_region.h>
#include <zircon/types.h>

#define LOCAL_TRACE MAX(VM_GLOBAL_TRACE, 0)

namespace {

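// Zero the page at physical address |pa| through its physmap mapping.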
void ZeroPage(paddr_t pa) {
    void* ptr = paddr_to_physmap(pa);
    DEBUG_ASSERT(ptr);

    arch_zero_page(ptr);
}

void ZeroPage(vm_page_t* p) {
    paddr_t pa = p->paddr();
    ZeroPage(pa);
}

void InitializeVmPage(vm_page_t* p) {
    DEBUG_ASSERT(p->state == VM_PAGE_STATE_ALLOC);
    p->state = VM_PAGE_STATE_OBJECT;
    p->object.pin_count = 0;
}

// round up the size to the next page size boundary and make sure we don't wrap
zx_status_t RoundSize(uint64_t size, uint64_t* out_size) {
    *out_size = ROUNDUP_PAGE_SIZE(size);
    if (*out_size < size) {
        return ZX_ERR_OUT_OF_RANGE;
    }

    // there's a max size to keep indexes within range
    if (*out_size > VmObjectPaged::MAX_SIZE) {
        return ZX_ERR_OUT_OF_RANGE;
    }

    return ZX_OK;
}

} // namespace

VmObjectPaged::VmObjectPaged(
    uint32_t options, uint32_t pmm_alloc_flags, uint64_t size, fbl::RefPtr<VmObject> parent)
    : VmObject(fbl::move(parent)),
      options_(options),
      size_(size),
      pmm_alloc_flags_(pmm_alloc_flags) {
    LTRACEF("%p\n", this);

    DEBUG_ASSERT(IS_PAGE_ALIGNED(size_));
}

VmObjectPaged::~VmObjectPaged() {
    canary_.Assert();

    LTRACEF("%p\n", this);

    page_list_.ForEveryPage(
        [this](const auto p, uint64_t off) {
            if (this->is_contiguous()) {
                p->object.pin_count--;
            }
            ASSERT(p->object.pin_count == 0);
            return ZX_ERR_NEXT;
        });

    // free all of the pages attached to us
    page_list_.FreeAllPages();
}

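// Creates a non-contiguous paged VMO of |size| bytes (rounded up to a page boundary).
// Pages are not committed until they are first touched or explicitly committed.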
zx_status_t VmObjectPaged::Create(uint32_t pmm_alloc_flags,
                                  uint32_t options,
                                  uint64_t size, fbl::RefPtr<VmObject>* obj) {
    // make sure size is page aligned
    zx_status_t status = RoundSize(size, &size);
    if (status != ZX_OK) {
        return status;
    }

    if (options & kContiguous) {
        // Force callers to use CreateContiguous() instead.
        return ZX_ERR_INVALID_ARGS;
    }

    fbl::AllocChecker ac;
    auto vmo = fbl::AdoptRef<VmObject>(
        new (&ac) VmObjectPaged(options, pmm_alloc_flags, size, nullptr));
    if (!ac.check()) {
        return ZX_ERR_NO_MEMORY;
    }

    *obj = fbl::move(vmo);

    return ZX_OK;
}

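// Creates a paged VMO backed by a physically contiguous, optionally aligned run of
// pages. The pages are allocated, zeroed, and pinned up front so their physical
// addresses cannot change underneath the caller.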
zx_status_t VmObjectPaged::CreateContiguous(uint32_t pmm_alloc_flags, uint64_t size,
                                            uint8_t alignment_log2, fbl::RefPtr<VmObject>* obj) {
    DEBUG_ASSERT(alignment_log2 < sizeof(uint64_t) * 8);
    // make sure size is page aligned
    zx_status_t status = RoundSize(size, &size);
    if (status != ZX_OK) {
        return status;
    }

    fbl::AllocChecker ac;
    auto vmo = fbl::AdoptRef<VmObject>(
        new (&ac) VmObjectPaged(kContiguous, pmm_alloc_flags, size, nullptr));
    if (!ac.check()) {
        return ZX_ERR_NO_MEMORY;
    }

    if (size == 0) {
        *obj = fbl::move(vmo);
        return ZX_OK;
    }

    // allocate the pages
    list_node page_list;
    list_initialize(&page_list);

    size_t num_pages = size / PAGE_SIZE;
    paddr_t pa;
    status = pmm_alloc_contiguous(num_pages, pmm_alloc_flags, alignment_log2, &pa, &page_list);
    if (status != ZX_OK) {
        LTRACEF("failed to allocate enough pages (asked for %zu)\n", num_pages);
        return ZX_ERR_NO_MEMORY;
    }
    auto cleanup_phys_pages = fbl::MakeAutoCall([&page_list]() {
        pmm_free(&page_list);
    });

    // add them to the appropriate range of the object
    VmObjectPaged* vmop = static_cast<VmObjectPaged*>(vmo.get());
    for (uint64_t off = 0; off < size; off += PAGE_SIZE) {
        vm_page_t* p = list_remove_head_type(&page_list, vm_page_t, queue_node);
        ASSERT(p);

        InitializeVmPage(p);

        // TODO: remove once pmm returns zeroed pages
        ZeroPage(p);

        // We don't need thread-safety analysis here, since this VMO has not
        // been shared anywhere yet.
        [&]() TA_NO_THREAD_SAFETY_ANALYSIS {
            status = vmop->page_list_.AddPage(p, off);
        }();
        if (status != ZX_OK) {
            return status;
        }

        // Mark the pages as pinned, so they can't be physically rearranged
        // underneath us.
        p->object.pin_count++;
    }

    cleanup_phys_pages.cancel();
    *obj = fbl::move(vmo);
    return ZX_OK;
}

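// Wraps an existing page-aligned, read-only region of the kernel's address space in a
// VMO without copying it: the physical pages backing |data| are looked up and inserted
// directly into the new object.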
zx_status_t VmObjectPaged::CreateFromROData(const void* data, size_t size, fbl::RefPtr<VmObject>* obj) {
    LTRACEF("data %p, size %zu\n", data, size);

    fbl::RefPtr<VmObject> vmo;
    zx_status_t status = Create(PMM_ALLOC_FLAG_ANY, 0, size, &vmo);
    if (status != ZX_OK) {
        return status;
    }

    if (size > 0) {
        ASSERT(IS_PAGE_ALIGNED(size));
        ASSERT(IS_PAGE_ALIGNED(reinterpret_cast<uintptr_t>(data)));

        // Do a direct lookup of the physical pages backing the range of
        // the kernel that these addresses belong to and jam them directly
        // into the VMO.
        //
        // NOTE: This relies on the kernel not otherwise owning the pages.
        // If the setup of the kernel's address space changes so that the
        // pages are attached to a kernel VMO, this will need to change.

        paddr_t start_paddr = vaddr_to_paddr(data);
        ASSERT(start_paddr != 0);

        for (size_t count = 0; count < size / PAGE_SIZE; count++) {
            paddr_t pa = start_paddr + count * PAGE_SIZE;
            vm_page_t* page = paddr_to_vm_page(pa);
            ASSERT(page);

            if (page->state == VM_PAGE_STATE_WIRED) {
                // it's wired to the kernel, so we can just use it directly
            } else if (page->state == VM_PAGE_STATE_FREE) {
                list_node list = LIST_INITIAL_VALUE(list);
                ASSERT(pmm_alloc_range(pa, 1, &list) == ZX_OK);
                page->state = VM_PAGE_STATE_WIRED;
            } else {
                panic("page used to back static vmo in unusable state: paddr %#" PRIxPTR " state %u\n", pa,
                      page->state);
            }

            // XXX hack to work around the ref pointer to the base class
            auto vmo2 = static_cast<VmObjectPaged*>(vmo.get());
            vmo2->AddPage(page, count * PAGE_SIZE);
        }
    }

    *obj = fbl::move(vmo);

    return ZX_OK;
}

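// Creates a copy-on-write clone of this VMO covering [offset, offset + size). The clone
// initially shares the parent's pages; a page is copied into the clone only when it is
// written through the clone.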
zx_status_t VmObjectPaged::CloneCOW(bool resizable, uint64_t offset, uint64_t size,
                                    bool copy_name, fbl::RefPtr<VmObject>* clone_vmo) {
    LTRACEF("vmo %p offset %#" PRIx64 " size %#" PRIx64 "\n", this, offset, size);

    canary_.Assert();

    // make sure size is page aligned
    zx_status_t status = RoundSize(size, &size);
    if (status != ZX_OK) {
        return status;
    }

    auto options = resizable ? kResizable : 0u;

    // allocate the clone up front outside of our lock
    fbl::AllocChecker ac;
    auto vmo = fbl::AdoptRef<VmObjectPaged>(
        new (&ac) VmObjectPaged(options, pmm_alloc_flags_, size, fbl::WrapRefPtr(this)));
    if (!ac.check()) {
        return ZX_ERR_NO_MEMORY;
    }

    Guard<fbl::Mutex> guard{&lock_};

    // add the new VMO as a child before we do anything, since its
    // dtor expects to find it in its parent's child list
    AddChildLocked(vmo.get());

    // check that we're not uncached in some way
    if (cache_policy_ != ARCH_MMU_FLAG_CACHED) {
        return ZX_ERR_BAD_STATE;
    }

    // set the offset with the parent
    status = vmo->SetParentOffsetLocked(offset);
    if (status != ZX_OK) {
        return status;
    }

    if (copy_name) {
        vmo->name_ = name_;
    }

    *clone_vmo = fbl::move(vmo);

    return ZX_OK;
}

void VmObjectPaged::Dump(uint depth, bool verbose) {
    canary_.Assert();

    // This can grab our lock.
    uint64_t parent_id = parent_user_id();

    Guard<fbl::Mutex> guard{&lock_};

    size_t count = 0;
    page_list_.ForEveryPage([&count](const auto p, uint64_t) {
        count++;
        return ZX_ERR_NEXT;
    });

    for (uint i = 0; i < depth; ++i) {
        printf("  ");
    }
    printf("vmo %p/k%" PRIu64 " size %#" PRIx64
           " pages %zu ref %d parent k%" PRIu64 "\n",
           this, user_id_, size_, count, ref_count_debug(), parent_id);

    if (verbose) {
        auto f = [depth](const auto p, uint64_t offset) {
            for (uint i = 0; i < depth + 1; ++i) {
                printf("  ");
            }
            printf("offset %#" PRIx64 " page %p paddr %#" PRIxPTR "\n", offset, p, p->paddr());
            return ZX_ERR_NEXT;
        };
        page_list_.ForEveryPage(f);
    }
}

size_t VmObjectPaged::AllocatedPagesInRange(uint64_t offset, uint64_t len) const {
    canary_.Assert();
    Guard<fbl::Mutex> guard{&lock_};
    uint64_t new_len;
    if (!TrimRange(offset, len, size_, &new_len)) {
        return 0;
    }
    size_t count = 0;
    // TODO: Figure out what to do with our parent's pages. If we're a clone,
    // page_list_ only contains pages that we've made copies of.
    page_list_.ForEveryPage(
        [&count, offset, new_len](const auto p, uint64_t off) {
            if (off >= offset && off < offset + new_len) {
                count++;
            }
            return ZX_ERR_NEXT;
        });
    return count;
}

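// Takes the VMO lock and adds |p| to the object at |offset|; see AddPageLocked().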
zx_status_t VmObjectPaged::AddPage(vm_page_t* p, uint64_t offset) {
    Guard<fbl::Mutex> guard{&lock_};

    return AddPageLocked(p, offset);
}

zx_status_t VmObjectPaged::AddPageLocked(vm_page_t* p, uint64_t offset) {
    canary_.Assert();
    DEBUG_ASSERT(lock_.lock().IsHeld());

    LTRACEF("vmo %p, offset %#" PRIx64 ", page %p (%#" PRIxPTR ")\n", this, offset, p, p->paddr());

    DEBUG_ASSERT(p);

    if (offset >= size_) {
        return ZX_ERR_OUT_OF_RANGE;
    }

    zx_status_t err = page_list_.AddPage(p, offset);
    if (err != ZX_OK) {
        return err;
    }

    // other mappings may have covered this offset into the vmo, so unmap those ranges
    RangeChangeUpdateLocked(offset, PAGE_SIZE);

    return ZX_OK;
}

// Looks up the page at the requested offset, faulting it in if requested and necessary.  If
// this VMO has a parent and the requested page isn't found, the parent will be searched.
//
// |free_list|, if not NULL, is a list of allocated but unused vm_page_t that
// this function may allocate from.  This function will need at most one entry,
// and will not fail if |free_list| is a non-empty list, faulting in was requested,
// and offset is in range.
zx_status_t VmObjectPaged::GetPageLocked(uint64_t offset, uint pf_flags, list_node* free_list,
                                         vm_page_t** const page_out, paddr_t* const pa_out) {
    canary_.Assert();
    DEBUG_ASSERT(lock_.lock().IsHeld());

    if (offset >= size_) {
        return ZX_ERR_OUT_OF_RANGE;
    }

    vm_page_t* p;
    paddr_t pa;

    // see if we already have a page at that offset
    p = page_list_.GetPage(offset);
    if (p) {
        if (page_out) {
            *page_out = p;
        }
        if (pa_out) {
            *pa_out = p->paddr();
        }
        return ZX_OK;
    }

    __UNUSED char pf_string[5];
    LTRACEF("vmo %p, offset %#" PRIx64 ", pf_flags %#x (%s)\n", this, offset, pf_flags,
            vmm_pf_flags_to_string(pf_flags, pf_string));

    // if we have a parent see if they have a page for us
    if (parent_) {
        uint64_t parent_offset;
        bool overflowed = add_overflow(parent_offset_, offset, &parent_offset);
        ASSERT(!overflowed);

        // make sure we don't cause the parent to fault in new pages, just ask for any that already exist
        uint parent_pf_flags = pf_flags & ~(VMM_PF_FLAG_FAULT_MASK);

        zx_status_t status = parent_->GetPageLocked(parent_offset, parent_pf_flags,
                                                    nullptr, &p, &pa);
        if (status == ZX_OK) {
            // we have a page from them. if we're read-only faulting, return that page so they can map
            // or read from it directly
            if ((pf_flags & VMM_PF_FLAG_WRITE) == 0) {
                if (page_out) {
                    *page_out = p;
                }
                if (pa_out) {
                    *pa_out = pa;
                }

                LTRACEF("read only faulting in page %p, pa %#" PRIxPTR " from parent\n", p, pa);

                return ZX_OK;
            }

            // if we're write faulting, we need to clone it and return the new page
            paddr_t pa_clone;
            vm_page_t* p_clone = nullptr;
            if (free_list) {
                p_clone = list_remove_head_type(free_list, vm_page, queue_node);
                if (p_clone) {
                    pa_clone = p_clone->paddr();
                }
            }
            if (!p_clone) {
                status = pmm_alloc_page(pmm_alloc_flags_, &p_clone, &pa_clone);
            }
            if (!p_clone) {
                return ZX_ERR_NO_MEMORY;
            }

            InitializeVmPage(p_clone);

            // do a direct copy of the two pages
            const void* src = paddr_to_physmap(pa);
            void* dst = paddr_to_physmap(pa_clone);

            DEBUG_ASSERT(src && dst);

            memcpy(dst, src, PAGE_SIZE);

            // add the new page and return it
            status = AddPageLocked(p_clone, offset);
            DEBUG_ASSERT(status == ZX_OK);

            LTRACEF("copy-on-write faulted in page %p, pa %#" PRIxPTR " copied from %p, pa %#" PRIxPTR "\n",
                    p_clone, pa_clone, p, pa);

            if (page_out) {
                *page_out = p_clone;
            }
            if (pa_out) {
                *pa_out = pa_clone;
            }

            return ZX_OK;
        }
    }

    // if we're not being asked to sw or hw fault in the page, return not found
    if ((pf_flags & VMM_PF_FLAG_FAULT_MASK) == 0) {
        return ZX_ERR_NOT_FOUND;
    }

    // if we're read faulting, we don't already have a page, and the parent doesn't have it,
    // return the single global zero page
    if ((pf_flags & VMM_PF_FLAG_WRITE) == 0) {
        LTRACEF("returning the zero page\n");
        if (page_out) {
            *page_out = vm_get_zero_page();
        }
        if (pa_out) {
            *pa_out = vm_get_zero_page_paddr();
        }
        return ZX_OK;
    }

    // allocate a page
    if (free_list) {
        p = list_remove_head_type(free_list, vm_page, queue_node);
        if (p) {
            pa = p->paddr();
        }
    }
    if (!p) {
        pmm_alloc_page(pmm_alloc_flags_, &p, &pa);
    }
    if (!p) {
        return ZX_ERR_NO_MEMORY;
    }

    InitializeVmPage(p);

    // TODO: remove once pmm returns zeroed pages
    ZeroPage(pa);

// if ARM and not fully cached, clean/invalidate the page after zeroing it
#if ARCH_ARM64
    if (cache_policy_ != ARCH_MMU_FLAG_CACHED) {
        arch_clean_invalidate_cache_range((addr_t)paddr_to_physmap(pa), PAGE_SIZE);
    }
#endif

    zx_status_t status = AddPageLocked(p, offset);
    DEBUG_ASSERT(status == ZX_OK);

    // other mappings may have covered this offset into the vmo, so unmap those ranges
    RangeChangeUpdateLocked(offset, PAGE_SIZE);

    LTRACEF("faulted in page %p, pa %#" PRIxPTR "\n", p, pa);

    if (page_out) {
        *page_out = p;
    }
    if (pa_out) {
        *pa_out = pa;
    }

    return ZX_OK;
}

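// Allocates backing pages for any uncommitted pages in the range [offset, offset + len).
// If |committed| is non-null it is set to the number of bytes newly committed.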
zx_status_t VmObjectPaged::CommitRange(uint64_t offset, uint64_t len, uint64_t* committed) {
    canary_.Assert();
    LTRACEF("offset %#" PRIx64 ", len %#" PRIx64 "\n", offset, len);

    if (committed) {
        *committed = 0;
    }

    Guard<fbl::Mutex> guard{&lock_};

    // trim the size
    uint64_t new_len;
    if (!TrimRange(offset, len, size_, &new_len)) {
        return ZX_ERR_OUT_OF_RANGE;
    }

    // was in range, just zero length
    if (new_len == 0) {
        return ZX_OK;
    }

    // compute a page aligned end to do our searches in to make sure we cover all the pages
    uint64_t end = ROUNDUP_PAGE_SIZE(offset + new_len);
    DEBUG_ASSERT(end > offset);
    offset = ROUNDDOWN(offset, PAGE_SIZE);

    // make a pass through the list, counting the number of pages we need to allocate
    size_t count = 0;
    uint64_t expected_next_off = offset;
    page_list_.ForEveryPageInRange(
        [&count, &expected_next_off](const auto p, uint64_t off) {

            count += (off - expected_next_off) / PAGE_SIZE;
            expected_next_off = off + PAGE_SIZE;
            return ZX_ERR_NEXT;
        },
        expected_next_off, end);

    // If expected_next_off isn't at the end of the range, there was a gap at
    // the end.  Add it back in
    DEBUG_ASSERT(end >= expected_next_off);
    count += (end - expected_next_off) / PAGE_SIZE;
    if (count == 0) {
        return ZX_OK;
    }

    // allocate count number of pages
    list_node page_list;
    list_initialize(&page_list);

    zx_status_t status = pmm_alloc_pages(count, pmm_alloc_flags_, &page_list);
    if (status != ZX_OK) {
        return status;
    }

    // unmap all of the pages in this range on all the mapping regions
    RangeChangeUpdateLocked(offset, end - offset);

    // add them to the appropriate range of the object
    for (uint64_t o = offset; o < end; o += PAGE_SIZE) {
        // Don't commit if we already have this page
        vm_page_t* p = page_list_.GetPage(o);
        if (p) {
            continue;
        }

        // Check if our parent has the page
        paddr_t pa;
        const uint flags = VMM_PF_FLAG_SW_FAULT | VMM_PF_FLAG_WRITE;
        // Should not be able to fail, since we're providing it memory and the
        // range should be valid.
        zx_status_t status = GetPageLocked(o, flags, &page_list, &p, &pa);
        ASSERT(status == ZX_OK);

        if (committed) {
            *committed += PAGE_SIZE;
        }
    }

    DEBUG_ASSERT(list_is_empty(&page_list));

    // for now we only support committing as much as we were asked for
    DEBUG_ASSERT(!committed || *committed == count * PAGE_SIZE);

    return ZX_OK;
}

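// Frees the committed pages in the range [offset, offset + len). Not supported for
// contiguous VMOs, and fails if any page in the range is currently pinned.
// If |decommitted| is non-null it is set to the number of bytes freed.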
zx_status_t VmObjectPaged::DecommitRange(uint64_t offset, uint64_t len, uint64_t* decommitted) {
    canary_.Assert();
    LTRACEF("offset %#" PRIx64 ", len %#" PRIx64 "\n", offset, len);

    if (decommitted) {
        *decommitted = 0;
    }

    if (options_ & kContiguous) {
        return ZX_ERR_NOT_SUPPORTED;
    }

    Guard<fbl::Mutex> guard{&lock_};

    // trim the size
    uint64_t new_len;
    if (!TrimRange(offset, len, size_, &new_len)) {
        return ZX_ERR_OUT_OF_RANGE;
    }

    // was in range, just zero length
    if (new_len == 0) {
        return ZX_OK;
    }

    // figure the starting and ending page offset
    uint64_t start = ROUNDDOWN(offset, PAGE_SIZE);
    uint64_t end = ROUNDUP_PAGE_SIZE(offset + new_len);
    DEBUG_ASSERT(end > offset);
    DEBUG_ASSERT(end > start);
    uint64_t page_aligned_len = end - start;

    LTRACEF("start offset %#" PRIx64 ", end %#" PRIx64 ", page_aligned_len %#" PRIx64 "\n", start, end,
            page_aligned_len);

    // TODO(teisenbe): Allow decommitting of pages pinned by
    // CommitRangeContiguous

    if (AnyPagesPinnedLocked(start, page_aligned_len)) {
        return ZX_ERR_BAD_STATE;
    }

    // unmap all of the pages in this range on all the mapping regions
    RangeChangeUpdateLocked(start, page_aligned_len);

    // iterate through the pages, freeing them
    // TODO: use page_list iterator, move pages to list, free at once
    while (start < end) {
        auto status = page_list_.FreePage(start);
        if (status == ZX_OK && decommitted) {
            *decommitted += PAGE_SIZE;
        }
        start += PAGE_SIZE;
    }

    return ZX_OK;
}

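// Pins every page in the range [offset, offset + len) so it cannot be decommitted or
// physically moved. All pages in the range must already be committed; on failure any
// pages pinned so far are unpinned again.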
zx_status_t VmObjectPaged::Pin(uint64_t offset, uint64_t len) {
    canary_.Assert();

    Guard<fbl::Mutex> guard{&lock_};
    return PinLocked(offset, len);
}

zx_status_t VmObjectPaged::PinLocked(uint64_t offset, uint64_t len) {
    canary_.Assert();

    // verify that the range is within the object
    if (unlikely(!InRange(offset, len, size_))) {
        return ZX_ERR_OUT_OF_RANGE;
    }

    if (unlikely(len == 0)) {
        return ZX_OK;
    }

    const uint64_t start_page_offset = ROUNDDOWN(offset, PAGE_SIZE);
    const uint64_t end_page_offset = ROUNDUP(offset + len, PAGE_SIZE);

    uint64_t expected_next_off = start_page_offset;
    zx_status_t status = page_list_.ForEveryPageInRange(
        [&expected_next_off](const auto p, uint64_t off) {
            if (off != expected_next_off) {
                return ZX_ERR_NOT_FOUND;
            }

            DEBUG_ASSERT(p->state == VM_PAGE_STATE_OBJECT);
            if (p->object.pin_count == VM_PAGE_OBJECT_MAX_PIN_COUNT) {
                return ZX_ERR_UNAVAILABLE;
            }

            p->object.pin_count++;
            expected_next_off = off + PAGE_SIZE;
            return ZX_ERR_NEXT;
        },
        start_page_offset, end_page_offset);

    if (status == ZX_OK && expected_next_off != end_page_offset) {
        status = ZX_ERR_NOT_FOUND;
    }
    if (status != ZX_OK) {
        UnpinLocked(start_page_offset, expected_next_off - start_page_offset);
        return status;
    }

    return ZX_OK;
}

void VmObjectPaged::Unpin(uint64_t offset, uint64_t len) {
    Guard<fbl::Mutex> guard{&lock_};
    UnpinLocked(offset, len);
}

void VmObjectPaged::UnpinLocked(uint64_t offset, uint64_t len) {
    canary_.Assert();
    DEBUG_ASSERT(lock_.lock().IsHeld());

    // verify that the range is within the object
    ASSERT(InRange(offset, len, size_));

    if (unlikely(len == 0)) {
        return;
    }

    const uint64_t start_page_offset = ROUNDDOWN(offset, PAGE_SIZE);
    const uint64_t end_page_offset = ROUNDUP(offset + len, PAGE_SIZE);

    uint64_t expected_next_off = start_page_offset;
    zx_status_t status = page_list_.ForEveryPageInRange(
        [&expected_next_off](const auto p, uint64_t off) {
            if (off != expected_next_off) {
                return ZX_ERR_NOT_FOUND;
            }

            DEBUG_ASSERT(p->state == VM_PAGE_STATE_OBJECT);
            ASSERT(p->object.pin_count > 0);
            p->object.pin_count--;
            expected_next_off = off + PAGE_SIZE;
            return ZX_ERR_NEXT;
        },
        start_page_offset, end_page_offset);
    ASSERT_MSG(status == ZX_OK && expected_next_off == end_page_offset,
               "Tried to unpin an uncommitted page");
    return;
}

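// Returns true if any page in the page-aligned range [offset, offset + len) has a
// non-zero pin count.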
bool VmObjectPaged::AnyPagesPinnedLocked(uint64_t offset, size_t len) {
    canary_.Assert();
    DEBUG_ASSERT(lock_.lock().IsHeld());
    DEBUG_ASSERT(IS_PAGE_ALIGNED(offset));
    DEBUG_ASSERT(IS_PAGE_ALIGNED(len));

    const uint64_t start_page_offset = offset;
    const uint64_t end_page_offset = offset + len;

    bool found_pinned = false;
    page_list_.ForEveryPageInRange(
        [&found_pinned, start_page_offset, end_page_offset](const auto p, uint64_t off) {
            DEBUG_ASSERT(off >= start_page_offset && off < end_page_offset);
            if (p->object.pin_count > 0) {
                found_pinned = true;
                return ZX_ERR_STOP;
            }
            return ZX_ERR_NEXT;
        },
        start_page_offset, end_page_offset);

    return found_pinned;
}

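// Resizes the object to |s| bytes, rounded up to a page boundary. Shrinking frees the
// trimmed pages and fails if any of them are pinned; growing leaves the new range
// uncommitted. Only valid for VMOs created with kResizable.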
zx_status_t VmObjectPaged::ResizeLocked(uint64_t s) {
    canary_.Assert();
    DEBUG_ASSERT(lock_.lock().IsHeld());

    LTRACEF("vmo %p, size %" PRIu64 "\n", this, s);

    if (!(options_ & kResizable)) {
        return ZX_ERR_UNAVAILABLE;
    }

    // round up the size to the next page size boundary and make sure we don't wrap
    zx_status_t status = RoundSize(s, &s);
    if (status != ZX_OK) {
        return status;
    }

    // make sure everything is aligned before we get started
    DEBUG_ASSERT(IS_PAGE_ALIGNED(size_));
    DEBUG_ASSERT(IS_PAGE_ALIGNED(s));

    // see if we're shrinking or expanding the vmo
    if (s < size_) {
        // shrinking
        uint64_t start = s;
        uint64_t end = size_;
        uint64_t len = end - start;

        // bail if there are any pinned pages in the range we're trimming
        if (AnyPagesPinnedLocked(start, len)) {
            return ZX_ERR_BAD_STATE;
        }

        // unmap all of the pages in this range on all the mapping regions
        RangeChangeUpdateLocked(start, len);

        // iterate through the pages, freeing them
        // TODO: use page_list iterator, move pages to list, free at once
        while (start < end) {
            page_list_.FreePage(start);
            start += PAGE_SIZE;
        }
    } else if (s > size_) {
        // expanding
        // figure the starting and ending page offset that is affected
        uint64_t start = size_;
        uint64_t end = s;
        uint64_t len = end - start;

        // inform all our children and mappings that there are new bits
        RangeChangeUpdateLocked(start, len);
    }

    // save bytewise size
    size_ = s;

    return ZX_OK;
}

zx_status_t VmObjectPaged::Resize(uint64_t s) {
    Guard<fbl::Mutex> guard{&lock_};

    return ResizeLocked(s);
}

zx_status_t VmObjectPaged::SetParentOffsetLocked(uint64_t offset) {
    DEBUG_ASSERT(lock_.lock().IsHeld());

    // offset must be page aligned
    if (!IS_PAGE_ALIGNED(offset)) {
        return ZX_ERR_INVALID_ARGS;
    }

    // TODO: ZX-692 make sure that the accumulated offset of the entire parent chain doesn't wrap 64bit space

    // make sure the size + this offset are still valid
    uint64_t end;
    if (add_overflow(offset, size_, &end)) {
        return ZX_ERR_OUT_OF_RANGE;
    }

    parent_offset_ = offset;

    return ZX_OK;
}

// perform some sort of copy in/out on a range of the object using a passed-in lambda
// for the copy routine
template <typename T>
zx_status_t VmObjectPaged::ReadWriteInternal(uint64_t offset, size_t len, bool write, T copyfunc) {
    canary_.Assert();

    Guard<fbl::Mutex> guard{&lock_};

    // are we uncached? abort in this case
    if (cache_policy_ != ARCH_MMU_FLAG_CACHED) {
        return ZX_ERR_BAD_STATE;
    }

    // test if in range
    uint64_t end_offset;
    if (add_overflow(offset, len, &end_offset) || end_offset > size_) {
        return ZX_ERR_OUT_OF_RANGE;
    }

    // walk the list of pages and do the write
    uint64_t src_offset = offset;
    size_t dest_offset = 0;
    while (len > 0) {
        size_t page_offset = src_offset % PAGE_SIZE;
        size_t tocopy = MIN(PAGE_SIZE - page_offset, len);

        // fault in the page
        paddr_t pa;
        auto status = GetPageLocked(src_offset,
                                    VMM_PF_FLAG_SW_FAULT | (write ? VMM_PF_FLAG_WRITE : 0),
                                    nullptr, nullptr, &pa);
        if (status != ZX_OK) {
            return status;
        }

        // compute the kernel mapping of this page
        uint8_t* page_ptr = reinterpret_cast<uint8_t*>(paddr_to_physmap(pa));

        // call the copy routine
        auto err = copyfunc(page_ptr + page_offset, dest_offset, tocopy);
        if (err < 0) {
            return err;
        }

        src_offset += tocopy;
        dest_offset += tocopy;
        len -= tocopy;
    }

    return ZX_OK;
}

zx_status_t VmObjectPaged::Read(void* _ptr, uint64_t offset, size_t len) {
    canary_.Assert();
    // test to make sure this is a kernel pointer
    if (!is_kernel_address(reinterpret_cast<vaddr_t>(_ptr))) {
        DEBUG_ASSERT_MSG(0, "non kernel pointer passed\n");
        return ZX_ERR_INVALID_ARGS;
    }

    // read routine that just uses a memcpy
    uint8_t* ptr = reinterpret_cast<uint8_t*>(_ptr);
    auto read_routine = [ptr](const void* src, size_t offset, size_t len) -> zx_status_t {
        memcpy(ptr + offset, src, len);
        return ZX_OK;
    };

    return ReadWriteInternal(offset, len, false, read_routine);
}

zx_status_t VmObjectPaged::Write(const void* _ptr, uint64_t offset, size_t len) {
    canary_.Assert();
    // test to make sure this is a kernel pointer
    if (!is_kernel_address(reinterpret_cast<vaddr_t>(_ptr))) {
        DEBUG_ASSERT_MSG(0, "non kernel pointer passed\n");
        return ZX_ERR_INVALID_ARGS;
    }

    // write routine that just uses a memcpy
    const uint8_t* ptr = reinterpret_cast<const uint8_t*>(_ptr);
    auto write_routine = [ptr](void* dst, size_t offset, size_t len) -> zx_status_t {
        memcpy(dst, ptr + offset, len);
        return ZX_OK;
    };

    return ReadWriteInternal(offset, len, true, write_routine);
}

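// Walks the range [offset, offset + len) and calls |lookup_fn| with the physical
// address and index of each page, using GetPageLocked (subject to |pf_flags|) for
// pages that are not already present in this object's page list.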
zx_status_t VmObjectPaged::Lookup(uint64_t offset, uint64_t len, uint pf_flags,
                                  vmo_lookup_fn_t lookup_fn, void* context) {
    canary_.Assert();
    if (unlikely(len == 0)) {
        return ZX_ERR_INVALID_ARGS;
    }

    Guard<fbl::Mutex> guard{&lock_};

    // verify that the range is within the object
    if (unlikely(!InRange(offset, len, size_))) {
        return ZX_ERR_OUT_OF_RANGE;
    }

    const uint64_t start_page_offset = ROUNDDOWN(offset, PAGE_SIZE);
    const uint64_t end_page_offset = ROUNDUP(offset + len, PAGE_SIZE);

    uint64_t expected_next_off = start_page_offset;
    zx_status_t status = page_list_.ForEveryPageInRange(
        [&expected_next_off, this, pf_flags, lookup_fn, context,
         start_page_offset](const auto p, uint64_t off) {

            // If some page was missing from our list, run the more expensive
            // GetPageLocked to see if our parent has it.
            for (uint64_t missing_off = expected_next_off; missing_off < off;
                 missing_off += PAGE_SIZE) {

                paddr_t pa;
                zx_status_t status = this->GetPageLocked(missing_off, pf_flags, nullptr,
                                                         nullptr, &pa);
                if (status != ZX_OK) {
                    return ZX_ERR_NO_MEMORY;
                }
                const size_t index = (missing_off - start_page_offset) / PAGE_SIZE;
                status = lookup_fn(context, missing_off, index, pa);
                if (status != ZX_OK) {
                    if (unlikely(status == ZX_ERR_NEXT || status == ZX_ERR_STOP)) {
                        status = ZX_ERR_INTERNAL;
                    }
                    return status;
                }
            }

            const size_t index = (off - start_page_offset) / PAGE_SIZE;
            paddr_t pa = p->paddr();
            zx_status_t status = lookup_fn(context, off, index, pa);
            if (status != ZX_OK) {
                if (unlikely(status == ZX_ERR_NEXT || status == ZX_ERR_STOP)) {
                    status = ZX_ERR_INTERNAL;
                }
                return status;
            }

            expected_next_off = off + PAGE_SIZE;
            return ZX_ERR_NEXT;
        },
        start_page_offset, end_page_offset);
    if (status != ZX_OK) {
        return status;
    }

    // If expected_next_off isn't at the end, there's a gap to process
    for (uint64_t off = expected_next_off; off < end_page_offset; off += PAGE_SIZE) {
        paddr_t pa;
        zx_status_t status = GetPageLocked(off, pf_flags, nullptr, nullptr, &pa);
        if (status != ZX_OK) {
            return ZX_ERR_NO_MEMORY;
        }
        const size_t index = (off - start_page_offset) / PAGE_SIZE;
        status = lookup_fn(context, off, index, pa);
        if (status != ZX_OK) {
            return status;
        }
    }

    return ZX_OK;
}

zx_status_t VmObjectPaged::ReadUser(user_out_ptr<void> ptr, uint64_t offset, size_t len) {
    canary_.Assert();

    // read routine that uses copy_to_user
    auto read_routine = [ptr](const void* src, size_t offset, size_t len) -> zx_status_t {
        return ptr.byte_offset(offset).copy_array_to_user(src, len);
    };

    return ReadWriteInternal(offset, len, false, read_routine);
}

zx_status_t VmObjectPaged::WriteUser(user_in_ptr<const void> ptr, uint64_t offset, size_t len) {
    canary_.Assert();

    // write routine that uses copy_from_user
    auto write_routine = [ptr](void* dst, size_t offset, size_t len) -> zx_status_t {
        return ptr.byte_offset(offset).copy_array_from_user(dst, len);
    };

    return ReadWriteInternal(offset, len, true, write_routine);
}

zx_status_t VmObjectPaged::LookupUser(uint64_t offset, uint64_t len, user_inout_ptr<paddr_t> buffer,
                                      size_t buffer_size) {
    canary_.Assert();

    uint64_t start_page_offset = ROUNDDOWN(offset, PAGE_SIZE);
    uint64_t end_page_offset = ROUNDUP(offset + len, PAGE_SIZE);
    // compute the size of the table we'll need and make sure it fits in the user buffer
    uint64_t table_size = ((end_page_offset - start_page_offset) / PAGE_SIZE) * sizeof(paddr_t);
    if (unlikely(table_size > buffer_size)) {
        return ZX_ERR_BUFFER_TOO_SMALL;
    }

    auto copy_to_user = [](void* context, size_t offset, size_t index, paddr_t pa) -> zx_status_t {
        user_inout_ptr<paddr_t>* buffer = static_cast<user_inout_ptr<paddr_t>*>(context);
        return buffer->element_offset(index).copy_to_user(pa);
    };
    // only lookup pages that are already present
    return Lookup(offset, len, 0, copy_to_user, &buffer);
}

zx_status_t VmObjectPaged::InvalidateCache(const uint64_t offset, const uint64_t len) {
    return CacheOp(offset, len, CacheOpType::Invalidate);
}

zx_status_t VmObjectPaged::CleanCache(const uint64_t offset, const uint64_t len) {
    return CacheOp(offset, len, CacheOpType::Clean);
}

zx_status_t VmObjectPaged::CleanInvalidateCache(const uint64_t offset, const uint64_t len) {
    return CacheOp(offset, len, CacheOpType::CleanInvalidate);
}

zx_status_t VmObjectPaged::SyncCache(const uint64_t offset, const uint64_t len) {
    return CacheOp(offset, len, CacheOpType::Sync);
}

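// Performs the given cache maintenance operation over the range
// [start_offset, start_offset + len), one page at a time through the physmap,
// skipping offsets for which no page is currently present.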
zx_status_t VmObjectPaged::CacheOp(const uint64_t start_offset, const uint64_t len,
                                   const CacheOpType type) {
    canary_.Assert();

    if (unlikely(len == 0)) {
        return ZX_ERR_INVALID_ARGS;
    }

    Guard<fbl::Mutex> guard{&lock_};

    if (unlikely(!InRange(start_offset, len, size_))) {
        return ZX_ERR_OUT_OF_RANGE;
    }

    const size_t end_offset = static_cast<size_t>(start_offset + len);
    size_t op_start_offset = static_cast<size_t>(start_offset);

    while (op_start_offset != end_offset) {
        // Offset at the end of the current page.
        const size_t page_end_offset = ROUNDUP(op_start_offset + 1, PAGE_SIZE);

        // This cache op will either terminate at the end of the current page or
        // at the end of the whole op range -- whichever comes first.
        const size_t op_end_offset = MIN(page_end_offset, end_offset);

        const size_t cache_op_len = op_end_offset - op_start_offset;

        const size_t page_offset = op_start_offset % PAGE_SIZE;

        // lookup the physical address of the page, careful not to fault in a new one
        paddr_t pa;
        auto status = GetPageLocked(op_start_offset, 0, nullptr, nullptr, &pa);

        if (likely(status == ZX_OK)) {
            // Convert the page address to a Kernel virtual address.
            const void* ptr = paddr_to_physmap(pa);
            const addr_t cache_op_addr = reinterpret_cast<addr_t>(ptr) + page_offset;

            LTRACEF("ptr %p op %d\n", ptr, (int)type);

            // Perform the necessary cache op against this page.
            switch (type) {
            case CacheOpType::Invalidate:
                arch_invalidate_cache_range(cache_op_addr, cache_op_len);
                break;
            case CacheOpType::Clean:
                arch_clean_cache_range(cache_op_addr, cache_op_len);
                break;
            case CacheOpType::CleanInvalidate:
                arch_clean_invalidate_cache_range(cache_op_addr, cache_op_len);
                break;
            case CacheOpType::Sync:
                arch_sync_cache_range(cache_op_addr, cache_op_len);
                break;
            }
        }

        op_start_offset += cache_op_len;
    }

    return ZX_OK;
}

zx_status_t VmObjectPaged::GetMappingCachePolicy(uint32_t* cache_policy) {
    Guard<fbl::Mutex> guard{&lock_};

    *cache_policy = cache_policy_;

    return ZX_OK;
}

zx_status_t VmObjectPaged::SetMappingCachePolicy(const uint32_t cache_policy) {
    // Is it a valid cache flag?
    if (cache_policy & ~ZX_CACHE_POLICY_MASK) {
        return ZX_ERR_INVALID_ARGS;
    }

    Guard<fbl::Mutex> guard{&lock_};

    // conditions for allowing the cache policy to be set:
    // 1) vmo has no pages committed currently
    // 2) vmo has no mappings
    // 3) vmo has no clones
    // 4) vmo is not a clone
    if (!page_list_.IsEmpty()) {
        return ZX_ERR_BAD_STATE;
    }
    if (!mapping_list_.is_empty()) {
        return ZX_ERR_BAD_STATE;
    }
    if (!children_list_.is_empty()) {
        return ZX_ERR_BAD_STATE;
    }
    if (parent_) {
        return ZX_ERR_BAD_STATE;
    }

    cache_policy_ = cache_policy;

    return ZX_OK;
}

void VmObjectPaged::RangeChangeUpdateFromParentLocked(const uint64_t offset, const uint64_t len) {
    canary_.Assert();

    LTRACEF("offset %#" PRIx64 " len %#" PRIx64 " p_offset %#" PRIx64 " size_ %#" PRIx64 "\n",
            offset, len, parent_offset_, size_);

    // our parent is notifying that a range of theirs changed, see where it intersects
    // with our offset into the parent and pass it on
    uint64_t offset_new;
    uint64_t len_new;
    if (!GetIntersect(parent_offset_, size_, offset, len,
                      &offset_new, &len_new)) {
        return;
    }

    // if they intersect with us, then by definition the new offset must be >= parent_offset_
    DEBUG_ASSERT(offset_new >= parent_offset_);

    // subtract our offset
    offset_new -= parent_offset_;

    // verify that it's still within range of us
    DEBUG_ASSERT(offset_new + len_new <= size_);

    LTRACEF("new offset %#" PRIx64 " new len %#" PRIx64 "\n",
            offset_new, len_new);

    // pass it on
    // TODO: optimize by not passing on ranges that are completely covered by pages local to this vmo
    RangeChangeUpdateLocked(offset_new, len_new);
}