// Copyright 2017 The Fuchsia Authors
//
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT

#include <arch/x86/page_tables/page_tables.h>

#include <arch/x86/feature.h>
#include <arch/x86/page_tables/constants.h>
#include <assert.h>
#include <fbl/algorithm.h>
#include <fbl/auto_call.h>
#include <fbl/auto_lock.h>
#include <trace.h>
#include <vm/physmap.h>
#include <vm/pmm.h>

#define LOCAL_TRACE 0

namespace {

// Return the page size for this level
size_t page_size(PageTableLevel level) {
    switch (level) {
    case PT_L:
        return 1ULL << PT_SHIFT;
    case PD_L:
        return 1ULL << PD_SHIFT;
    case PDP_L:
        return 1ULL << PDP_SHIFT;
    case PML4_L:
        return 1ULL << PML4_SHIFT;
    default:
        panic("page_size: invalid level\n");
    }
}

// Whether an address is aligned to the page size of this level
bool page_aligned(PageTableLevel level, vaddr_t vaddr) {
    return (vaddr & (page_size(level) - 1)) == 0;
}

// Extract the index needed for finding |vaddr| for the given level
uint vaddr_to_index(PageTableLevel level, vaddr_t vaddr) {
    switch (level) {
    case PML4_L:
        return VADDR_TO_PML4_INDEX(vaddr);
    case PDP_L:
        return VADDR_TO_PDP_INDEX(vaddr);
    case PD_L:
        return VADDR_TO_PD_INDEX(vaddr);
    case PT_L:
        return VADDR_TO_PT_INDEX(vaddr);
    default:
        panic("vaddr_to_index: invalid level\n");
    }
}
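
// For reference: with 4-level x86-64 paging, the indices above select bits
// 47:39 (PML4), 38:30 (PDP), 29:21 (PD), and 20:12 (PT) of the virtual
// address, so each table holds 512 (NO_OF_PT_ENTRIES) entries.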

// Convert a PTE to a physical address
paddr_t paddr_from_pte(PageTableLevel level, pt_entry_t pte) {
    DEBUG_ASSERT(IS_PAGE_PRESENT(pte));

    paddr_t pa;
    switch (level) {
    case PDP_L:
        pa = (pte & X86_HUGE_PAGE_FRAME);
        break;
    case PD_L:
        pa = (pte & X86_LARGE_PAGE_FRAME);
        break;
    case PT_L:
        pa = (pte & X86_PG_FRAME);
        break;
    default:
        panic("paddr_from_pte at unhandled level %d\n", level);
    }

    return pa;
}

PageTableLevel lower_level(PageTableLevel level) {
    DEBUG_ASSERT(level != 0);
    return (PageTableLevel)(level - 1);
}

} // namespace

void PendingTlbInvalidation::enqueue(vaddr_t v, PageTableLevel level, bool is_global_page,
                                     bool is_terminal) {
    if (is_global_page) {
        contains_global = true;
    }

    // We mark PML4_L entries as full shootdowns, since it's going to be
    // expensive one way or another.
    if (count >= fbl::count_of(item) || level == PML4_L) {
        full_shootdown = true;
        return;
    }
    item[count].set_page_level(static_cast<uint64_t>(level));
    item[count].set_is_global(is_global_page);
    item[count].set_is_terminal(is_terminal);
    item[count].set_encoded_addr(v >> PAGE_SIZE_SHIFT);
    count++;
}
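
// Note on enqueue(): the vaddr is recorded shifted down by PAGE_SIZE_SHIFT
// (it is always page-aligned), presumably so the freed-up low bits of each
// item can carry the level/global/terminal metadata set above.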

void PendingTlbInvalidation::clear() {
    count = 0;
    full_shootdown = false;
    contains_global = false;
}

PendingTlbInvalidation::~PendingTlbInvalidation() {
    DEBUG_ASSERT(count == 0);
}

// Utility for coalescing cache line flushes when modifying page tables.  This
// allows us to mutate adjacent page table entries without having to flush for
// each cache line multiple times.
class X86PageTableBase::CacheLineFlusher {
public:
    // If |perform_invalidations| is false, this class acts as a no-op.
    explicit CacheLineFlusher(bool perform_invalidations);
    ~CacheLineFlusher();
    void FlushPtEntry(const volatile pt_entry_t* entry);

    void ForceFlush();

private:
    DISALLOW_COPY_ASSIGN_AND_MOVE(CacheLineFlusher);

    // The cache-aligned address that is currently dirty.  If 0, there is no
    // dirty line.
    uintptr_t dirty_line_;

    const uintptr_t cl_mask_;
    const bool perform_invalidations_;
};
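
// Typical use, as seen in ConsistencyManager below: construct one flusher per
// page table mutation, call FlushPtEntry() after each PTE write, and rely on
// ForceFlush() (or the destructor) for the final clflush.  Several 8-byte
// entries share a cache line (eight on a typical 64-byte line), so adjacent
// updates collapse into a single flush.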

X86PageTableBase::CacheLineFlusher::CacheLineFlusher(bool perform_invalidations)
    : dirty_line_(0), cl_mask_(~(x86_get_clflush_line_size() - 1ull)),
      perform_invalidations_(perform_invalidations) {
}

X86PageTableBase::CacheLineFlusher::~CacheLineFlusher() {
    ForceFlush();
}

void X86PageTableBase::CacheLineFlusher::ForceFlush() {
    if (dirty_line_ && perform_invalidations_) {
        __asm__ volatile("clflush %0\n"
                         :
                         : "m"(*reinterpret_cast<char*>(dirty_line_))
                         : "memory");
        dirty_line_ = 0;
    }
}

void X86PageTableBase::CacheLineFlusher::FlushPtEntry(const volatile pt_entry_t* entry) {
    uintptr_t entry_line = reinterpret_cast<uintptr_t>(entry) & cl_mask_;
    if (entry_line != dirty_line_) {
        ForceFlush();
        dirty_line_ = entry_line;
    }
}

// Utility for managing consistency of the page tables from a cache and TLB
// point-of-view.  It ensures that memory is not freed while a TLB entry may
// refer to it, and that changes to the page tables have appropriate visibility
// to the hardware interpreting them.  Finish MUST be called on this
// class, even if the page table change failed.
class X86PageTableBase::ConsistencyManager {
public:
    explicit ConsistencyManager(X86PageTableBase* pt);
    ~ConsistencyManager();

    // Disable thread safety analysis here because it has trouble identifying
    // that |pt_->lock_| is held here.
    void queue_free(vm_page_t* page) TA_NO_THREAD_SAFETY_ANALYSIS {
        DEBUG_ASSERT(pt_->lock_.IsHeld());

        list_add_tail(&to_free_, &page->queue_node);
        pt_->pages_--;
    }

    CacheLineFlusher* cache_line_flusher() { return &clf_; }
    PendingTlbInvalidation* pending_tlb() { return &tlb_; }

    // This function must be called while holding pt_->lock_.
    void Finish();

private:
    X86PageTableBase* pt_;

    // Cache line to flush prior to TLB invalidations
    X86PageTableBase::CacheLineFlusher clf_;

    // TLB invalidations that need to occur
    PendingTlbInvalidation tlb_;

    // vm_page_t's to release to the PMM after the TLB invalidation occurs
    list_node to_free_;
};

X86PageTableBase::ConsistencyManager::ConsistencyManager(X86PageTableBase* pt)
    : pt_(pt), clf_(pt->needs_cache_flushes()) {

    to_free_ = LIST_INITIAL_VALUE(to_free_);
}

X86PageTableBase::ConsistencyManager::~ConsistencyManager() {
    DEBUG_ASSERT(pt_ == nullptr);

    // We free the paging structures here rather than in Finish(), so that
    // invoking pmm_free() can be deferred until after we've left the page
    // table lock.
    if (!list_is_empty(&to_free_)) {
        pmm_free(&to_free_);
    }
}

void X86PageTableBase::ConsistencyManager::Finish() {
    DEBUG_ASSERT(pt_->lock_.IsHeld());

    clf_.ForceFlush();
    if (pt_->needs_cache_flushes()) {
        // If the hardware needs cache flushes for the tables to be visible,
        // make sure we serialize the flushes before issuing the TLB
        // invalidations.
        mb();
    }
    pt_->TlbInvalidate(&tlb_);
    pt_ = nullptr;
}
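
// Canonical usage (see MapPages() and UnmapPages() below):
//
//   ConsistencyManager cm(this);
//   {
//       fbl::AutoLock a(&lock_);
//       // ... mutate page tables, queueing flushes/invalidations/frees on cm ...
//       cm.Finish();  // flush + TLB shootdown while lock_ is still held
//   }
//   // ~ConsistencyManager releases queued page-table pages outside the lock.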

struct X86PageTableBase::MappingCursor {
public:
    /**
     * @brief Update the cursor to skip over a not-present page table entry.
     */
    void SkipEntry(PageTableLevel level) {
        const size_t ps = page_size(level);
        // Calculate the amount the cursor should skip to get to the next entry at
        // this page table level.
        const size_t skipped_size = ps - (vaddr & (ps - 1));
        // If our endpoint was in the middle of this range, clamp the
        // amount we remove from the cursor
        const size_t _size = (size > skipped_size) ? skipped_size : size;

        size -= _size;
        vaddr += _size;
    }

    paddr_t paddr;
    vaddr_t vaddr;
    size_t size;
};
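
// Worked example of SkipEntry() at PD_L (2 MiB entries): a cursor with
// vaddr = 0x201000 and size = 0x400000 skips 0x1ff000 bytes to reach the next
// 2 MiB boundary, leaving vaddr = 0x400000 and size = 0x201000.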

void X86PageTableBase::UpdateEntry(ConsistencyManager* cm, PageTableLevel level, vaddr_t vaddr,
                                   volatile pt_entry_t* pte, paddr_t paddr, PtFlags flags,
                                   bool was_terminal) {
    DEBUG_ASSERT(pte);
    DEBUG_ASSERT(IS_PAGE_ALIGNED(paddr));

    pt_entry_t olde = *pte;

    /* set the new entry */
    *pte = paddr | flags | X86_MMU_PG_P;
    cm->cache_line_flusher()->FlushPtEntry(pte);

    /* attempt to invalidate the page */
    if (IS_PAGE_PRESENT(olde)) {
        // TODO(teisenbe): the is_kernel_address should be a check for the
        // global bit
        cm->pending_tlb()->enqueue(vaddr, level, is_kernel_address(vaddr), was_terminal);
    }
}

void X86PageTableBase::UnmapEntry(ConsistencyManager* cm, PageTableLevel level, vaddr_t vaddr,
                                  volatile pt_entry_t* pte, bool was_terminal) {
    DEBUG_ASSERT(pte);

    pt_entry_t olde = *pte;

    *pte = 0;
    cm->cache_line_flusher()->FlushPtEntry(pte);

    /* attempt to invalidate the page */
    if (IS_PAGE_PRESENT(olde)) {
        // TODO(teisenbe): the is_kernel_address should be a check for the
        // global bit
        cm->pending_tlb()->enqueue(vaddr, level, is_kernel_address(vaddr), was_terminal);
    }
}
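
// Both UpdateEntry() and UnmapEntry() only queue a TLB invalidation when the
// old entry was present: a not-present entry cannot have been cached by the
// TLB, so there is nothing to shoot down for it.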

/**
 * @brief Allocate a new page table
 */
static volatile pt_entry_t* _map_alloc_page(void) {
    paddr_t pa;
    vm_page* p;
    zx_status_t status = pmm_alloc_page(0, &p, &pa);
    if (status != ZX_OK) {
        return nullptr;
    }
    p->state = VM_PAGE_STATE_MMU;

    pt_entry_t* page_ptr = static_cast<pt_entry_t*>(paddr_to_physmap(pa));
    DEBUG_ASSERT(page_ptr);

    arch_zero_page(page_ptr);

    return page_ptr;
}
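
// The page backing a new table is tagged VM_PAGE_STATE_MMU above;
// RemoveMapping() later asserts that state before handing the page back to
// the PMM.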

/**
 * @brief Split the given large page into smaller pages
 */
zx_status_t X86PageTableBase::SplitLargePage(PageTableLevel level, vaddr_t vaddr,
                                             volatile pt_entry_t* pte, ConsistencyManager* cm) {
    DEBUG_ASSERT_MSG(level != PT_L, "tried splitting PT_L");
    LTRACEF_LEVEL(2, "splitting table %p at level %d\n", pte, level);

    DEBUG_ASSERT(IS_PAGE_PRESENT(*pte) && IS_LARGE_PAGE(*pte));
    volatile pt_entry_t* m = _map_alloc_page();
    if (m == nullptr) {
        return ZX_ERR_NO_MEMORY;
    }

    paddr_t paddr_base = paddr_from_pte(level, *pte);
    PtFlags flags = split_flags(level, *pte & X86_LARGE_FLAGS_MASK);

    DEBUG_ASSERT(page_aligned(level, vaddr));
    vaddr_t new_vaddr = vaddr;
    paddr_t new_paddr = paddr_base;
    size_t ps = page_size(lower_level(level));
    for (int i = 0; i < NO_OF_PT_ENTRIES; i++) {
        volatile pt_entry_t* e = m + i;
        // If this is a PDP_L (i.e. huge page), flags will include the
        // PS bit still, so the new PD entries will be large pages.
        UpdateEntry(cm, lower_level(level), new_vaddr, e, new_paddr, flags,
                    false /* was_terminal */);
        new_vaddr += ps;
        new_paddr += ps;
    }
    DEBUG_ASSERT(new_vaddr == vaddr + page_size(level));

    flags = intermediate_flags();
    UpdateEntry(cm, level, vaddr, pte, X86_VIRT_TO_PHYS(m), flags, true /* was_terminal */);
    pages_++;
    return ZX_OK;
}

/**
 * @brief Given a page table entry, return a pointer to the next page table one level down
 */
static inline volatile pt_entry_t* get_next_table_from_entry(pt_entry_t entry) {
    if (!IS_PAGE_PRESENT(entry) || IS_LARGE_PAGE(entry))
        return nullptr;

    return reinterpret_cast<volatile pt_entry_t*>(X86_PHYS_TO_VIRT(entry & X86_PG_FRAME));
}

/**
 * @brief Walk the page table structures returning the entry and level that maps the address.
 *
 * @param table The top-level paging structure's virtual address
 * @param vaddr The virtual address to retrieve the mapping for
 * @param ret_level The level of the table that defines the found mapping
 * @param mapping The mapping that was found
 *
 * @return ZX_OK if mapping is found
 * @return ZX_ERR_NOT_FOUND if mapping is not found
 */
zx_status_t X86PageTableBase::GetMapping(volatile pt_entry_t* table, vaddr_t vaddr,
                                         PageTableLevel level,
                                         PageTableLevel* ret_level,
                                         volatile pt_entry_t** mapping) {
    DEBUG_ASSERT(table);
    DEBUG_ASSERT(ret_level);
    DEBUG_ASSERT(mapping);

    if (level == PT_L) {
        return GetMappingL0(table, vaddr, ret_level, mapping);
    }

    LTRACEF_LEVEL(2, "table %p\n", table);

    uint index = vaddr_to_index(level, vaddr);
    volatile pt_entry_t* e = table + index;
    pt_entry_t pt_val = *e;
    if (!IS_PAGE_PRESENT(pt_val))
        return ZX_ERR_NOT_FOUND;

    /* if this is a large page, stop here */
    if (IS_LARGE_PAGE(pt_val)) {
        *mapping = e;
        *ret_level = level;
        return ZX_OK;
    }

    volatile pt_entry_t* next_table = get_next_table_from_entry(pt_val);
    return GetMapping(next_table, vaddr, lower_level(level), ret_level, mapping);
}

zx_status_t X86PageTableBase::GetMappingL0(volatile pt_entry_t* table, vaddr_t vaddr,
                                           PageTableLevel* ret_level,
                                           volatile pt_entry_t** mapping) {
    /* do the final page table lookup */
    uint index = vaddr_to_index(PT_L, vaddr);
    volatile pt_entry_t* e = table + index;
    if (!IS_PAGE_PRESENT(*e))
        return ZX_ERR_NOT_FOUND;

    *mapping = e;
    *ret_level = PT_L;
    return ZX_OK;
}

/**
 * @brief Unmaps the range specified by start_cursor.
 *
 * Level must be top_level() when invoked.  The caller must, even on failure,
 * free all pages in the |to_free| list and adjust the |pages_| count.
 *
 * @param table The top-level paging structure's virtual address.
 * @param start_cursor A cursor describing the range of address space to
 * unmap within table
 * @param new_cursor A returned cursor describing how much work was not
 * completed.  Must be non-null.
 *
 * @return true if at least one page was unmapped at this level
 */
bool X86PageTableBase::RemoveMapping(volatile pt_entry_t* table, PageTableLevel level,
                                     const MappingCursor& start_cursor, MappingCursor* new_cursor,
                                     ConsistencyManager* cm) {
    DEBUG_ASSERT(table);
    LTRACEF("L: %d, %016" PRIxPTR " %016zx\n", level, start_cursor.vaddr,
            start_cursor.size);
    DEBUG_ASSERT(check_vaddr(start_cursor.vaddr));

    if (level == PT_L) {
        return RemoveMappingL0(table, start_cursor, new_cursor, cm);
    }

    *new_cursor = start_cursor;

    bool unmapped = false;
    size_t ps = page_size(level);
    uint index = vaddr_to_index(level, new_cursor->vaddr);
    for (; index != NO_OF_PT_ENTRIES && new_cursor->size != 0; ++index) {
        volatile pt_entry_t* e = table + index;
        pt_entry_t pt_val = *e;
        // If the page isn't even mapped, just skip it
        if (!IS_PAGE_PRESENT(pt_val)) {
            new_cursor->SkipEntry(level);
            DEBUG_ASSERT(new_cursor->size <= start_cursor.size);
            continue;
        }

        if (IS_LARGE_PAGE(pt_val)) {
            bool vaddr_level_aligned = page_aligned(level, new_cursor->vaddr);
            // If the request covers the entire large page, just unmap it
            if (vaddr_level_aligned && new_cursor->size >= ps) {
                UnmapEntry(cm, level, new_cursor->vaddr, e, true /* was_terminal */);
                unmapped = true;

                new_cursor->vaddr += ps;
                new_cursor->size -= ps;
                DEBUG_ASSERT(new_cursor->size <= start_cursor.size);
                continue;
            }
            // Otherwise, we need to split it
            vaddr_t page_vaddr = new_cursor->vaddr & ~(ps - 1);
            zx_status_t status = SplitLargePage(level, page_vaddr, e, cm);
            if (status != ZX_OK) {
                // If split fails, just unmap the whole thing, and let a
                // subsequent page fault clean it up.
                UnmapEntry(cm, level, new_cursor->vaddr, e, true /* was_terminal */);
                unmapped = true;

                new_cursor->SkipEntry(level);
                DEBUG_ASSERT(new_cursor->size <= start_cursor.size);
                // The entry is gone; there is no lower table to recurse into.
                continue;
            }
            pt_val = *e;
        }

        MappingCursor cursor;
        volatile pt_entry_t* next_table = get_next_table_from_entry(pt_val);
        bool lower_unmapped = RemoveMapping(next_table, lower_level(level),
                                            *new_cursor, &cursor, cm);

        // If we were requesting to unmap everything in the lower page table,
        // we know we can unmap the lower level page table.  Otherwise, if
        // we unmapped anything in the lower level, check to see if that
        // level is now empty.
        bool unmap_page_table =
            page_aligned(level, new_cursor->vaddr) && new_cursor->size >= ps;
        if (!unmap_page_table && lower_unmapped) {
            uint lower_idx;
            for (lower_idx = 0; lower_idx < NO_OF_PT_ENTRIES; ++lower_idx) {
                if (IS_PAGE_PRESENT(next_table[lower_idx])) {
                    break;
                }
            }
            if (lower_idx == NO_OF_PT_ENTRIES) {
                unmap_page_table = true;
            }
        }
        if (unmap_page_table) {
            paddr_t ptable_phys = X86_VIRT_TO_PHYS(next_table);
            LTRACEF("L: %d free pt v %#" PRIxPTR " phys %#" PRIxPTR "\n",
                    level, (uintptr_t)next_table, ptable_phys);

            UnmapEntry(cm, level, new_cursor->vaddr, e, false /* was_terminal */);
            vm_page_t* page = paddr_to_vm_page(ptable_phys);

            DEBUG_ASSERT(page);
            DEBUG_ASSERT_MSG(page->state == VM_PAGE_STATE_MMU,
                             "page %p state %u, paddr %#" PRIxPTR "\n", page, page->state,
                             X86_VIRT_TO_PHYS(next_table));
            DEBUG_ASSERT(!list_in_list(&page->queue_node));

            cm->queue_free(page);
            unmapped = true;
        }
        *new_cursor = cursor;
        DEBUG_ASSERT(new_cursor->size <= start_cursor.size);

        DEBUG_ASSERT(new_cursor->size == 0 || page_aligned(level, new_cursor->vaddr));
    }

    return unmapped;
}

// Base case of RemoveMapping for smallest page size.
bool X86PageTableBase::RemoveMappingL0(volatile pt_entry_t* table,
                                       const MappingCursor& start_cursor, MappingCursor* new_cursor,
                                       ConsistencyManager* cm) {
    LTRACEF("%016" PRIxPTR " %016zx\n", start_cursor.vaddr, start_cursor.size);
    DEBUG_ASSERT(IS_PAGE_ALIGNED(start_cursor.size));

    *new_cursor = start_cursor;

    bool unmapped = false;
    uint index = vaddr_to_index(PT_L, new_cursor->vaddr);
    for (; index != NO_OF_PT_ENTRIES && new_cursor->size != 0; ++index) {
        volatile pt_entry_t* e = table + index;
        if (IS_PAGE_PRESENT(*e)) {
            UnmapEntry(cm, PT_L, new_cursor->vaddr, e, true /* was_terminal */);
            unmapped = true;
        }

        new_cursor->vaddr += PAGE_SIZE;
        new_cursor->size -= PAGE_SIZE;
        DEBUG_ASSERT(new_cursor->size <= start_cursor.size);
    }
    return unmapped;
}

/**
 * @brief Creates mappings for the range specified by start_cursor
 *
 * Level must be top_level() when invoked.
 *
 * @param table The top-level paging structure's virtual address.
 * @param start_cursor A cursor describing the range of address space to
 * act on within table
 * @param new_cursor A returned cursor describing how much work was not
 * completed.  Must be non-null.
 *
 * @return ZX_OK if successful
 * @return ZX_ERR_ALREADY_EXISTS if the range overlaps an existing mapping
 * @return ZX_ERR_NO_MEMORY if intermediate page tables could not be allocated
 */
zx_status_t X86PageTableBase::AddMapping(volatile pt_entry_t* table, uint mmu_flags,
                                         PageTableLevel level, const MappingCursor& start_cursor,
                                         MappingCursor* new_cursor,
                                         ConsistencyManager* cm) {
    DEBUG_ASSERT(table);
    DEBUG_ASSERT(check_vaddr(start_cursor.vaddr));
    DEBUG_ASSERT(check_paddr(start_cursor.paddr));

    zx_status_t ret = ZX_OK;
    *new_cursor = start_cursor;

    if (level == PT_L) {
        return AddMappingL0(table, mmu_flags, start_cursor, new_cursor, cm);
    }

    // Disable thread safety analysis, since Clang has trouble noticing that
    // lock_ is held when RemoveMapping is called.
    auto abort = fbl::MakeAutoCall([&]() TA_NO_THREAD_SAFETY_ANALYSIS {
        if (level == top_level()) {
            // new_cursor->size is the amount that is still left to be mapped,
            // so the mapped prefix that needs to be torn down is the
            // difference from start_cursor.size.
            MappingCursor cursor = start_cursor;
            MappingCursor result;
            cursor.size -= new_cursor->size;
            if (cursor.size > 0) {
                RemoveMapping(table, level, cursor, &result, cm);
                DEBUG_ASSERT(result.size == 0);
            }
        }
    });

    X86PageTableBase::IntermediatePtFlags interm_flags = intermediate_flags();
    X86PageTableBase::PtFlags term_flags = terminal_flags(level, mmu_flags);

    size_t ps = page_size(level);
    bool level_supports_large_pages = supports_page_size(level);
    uint index = vaddr_to_index(level, new_cursor->vaddr);
    for (; index != NO_OF_PT_ENTRIES && new_cursor->size != 0; ++index) {
        volatile pt_entry_t* e = table + index;
        pt_entry_t pt_val = *e;
        // See if there's a large page in our way
        if (IS_PAGE_PRESENT(pt_val) && IS_LARGE_PAGE(pt_val)) {
            return ZX_ERR_ALREADY_EXISTS;
        }

        // Check if this is a candidate for a new large page
        bool level_valigned = page_aligned(level, new_cursor->vaddr);
        bool level_paligned = page_aligned(level, new_cursor->paddr);
        if (level_supports_large_pages && !IS_PAGE_PRESENT(pt_val) && level_valigned &&
            level_paligned && new_cursor->size >= ps) {

            UpdateEntry(cm, level, new_cursor->vaddr, table + index,
                        new_cursor->paddr, term_flags | X86_MMU_PG_PS, false /* was_terminal */);
            new_cursor->paddr += ps;
            new_cursor->vaddr += ps;
            new_cursor->size -= ps;
            DEBUG_ASSERT(new_cursor->size <= start_cursor.size);
        } else {
            // See if we need to create a new table
            if (!IS_PAGE_PRESENT(pt_val)) {
                volatile pt_entry_t* m = _map_alloc_page();
                if (m == nullptr) {
                    return ZX_ERR_NO_MEMORY;
                }

                LTRACEF_LEVEL(2, "new table %p at level %d\n", m, level);

                UpdateEntry(cm, level, new_cursor->vaddr, e,
                            X86_VIRT_TO_PHYS(m), interm_flags, false /* was_terminal */);
                pt_val = *e;
                pages_++;
            }

            MappingCursor cursor;
            ret = AddMapping(get_next_table_from_entry(pt_val), mmu_flags,
                             lower_level(level), *new_cursor, &cursor, cm);
            *new_cursor = cursor;
            DEBUG_ASSERT(new_cursor->size <= start_cursor.size);
            if (ret != ZX_OK) {
                return ret;
            }
        }
    }
    abort.cancel();
    return ZX_OK;
}

// Base case of AddMapping for smallest page size.
zx_status_t X86PageTableBase::AddMappingL0(volatile pt_entry_t* table, uint mmu_flags,
                                           const MappingCursor& start_cursor,
                                           MappingCursor* new_cursor, ConsistencyManager* cm) {
    DEBUG_ASSERT(IS_PAGE_ALIGNED(start_cursor.size));

    *new_cursor = start_cursor;

    X86PageTableBase::PtFlags term_flags = terminal_flags(PT_L, mmu_flags);

    uint index = vaddr_to_index(PT_L, new_cursor->vaddr);
    for (; index != NO_OF_PT_ENTRIES && new_cursor->size != 0; ++index) {
        volatile pt_entry_t* e = table + index;
        if (IS_PAGE_PRESENT(*e)) {
            return ZX_ERR_ALREADY_EXISTS;
        }

        UpdateEntry(cm, PT_L, new_cursor->vaddr, e, new_cursor->paddr, term_flags,
                    false /* was_terminal */);

        new_cursor->paddr += PAGE_SIZE;
        new_cursor->vaddr += PAGE_SIZE;
        new_cursor->size -= PAGE_SIZE;
        DEBUG_ASSERT(new_cursor->size <= start_cursor.size);
    }

    return ZX_OK;
}

/**
 * @brief Changes the permissions/caching of the range specified by start_cursor
 *
 * Level must be top_level() when invoked.  The caller must, even on failure,
 * free all pages in the |to_free| list and adjust the |pages_| count.
 *
 * @param table The top-level paging structure's virtual address.
 * @param start_cursor A cursor describing the range of address space to
 * act on within table
 * @param new_cursor A returned cursor describing how much work was not
 * completed.  Must be non-null.
 */
zx_status_t X86PageTableBase::UpdateMapping(volatile pt_entry_t* table, uint mmu_flags,
                                            PageTableLevel level, const MappingCursor& start_cursor,
                                            MappingCursor* new_cursor, ConsistencyManager* cm) {
    DEBUG_ASSERT(table);
    LTRACEF("L: %d, %016" PRIxPTR " %016zx\n", level, start_cursor.vaddr,
            start_cursor.size);
    DEBUG_ASSERT(check_vaddr(start_cursor.vaddr));

    if (level == PT_L) {
        return UpdateMappingL0(table, mmu_flags, start_cursor, new_cursor, cm);
    }

    zx_status_t ret = ZX_OK;
    *new_cursor = start_cursor;

    X86PageTableBase::PtFlags term_flags = terminal_flags(level, mmu_flags);

    size_t ps = page_size(level);
    uint index = vaddr_to_index(level, new_cursor->vaddr);
    for (; index != NO_OF_PT_ENTRIES && new_cursor->size != 0; ++index) {
        volatile pt_entry_t* e = table + index;
        pt_entry_t pt_val = *e;
        // Skip unmapped pages (we may encounter these due to demand paging)
        if (!IS_PAGE_PRESENT(pt_val)) {
            new_cursor->SkipEntry(level);
            continue;
        }

        if (IS_LARGE_PAGE(pt_val)) {
            bool vaddr_level_aligned = page_aligned(level, new_cursor->vaddr);
            // If the request covers the entire large page, just change the
            // permissions
            if (vaddr_level_aligned && new_cursor->size >= ps) {
                UpdateEntry(cm, level, new_cursor->vaddr, e,
                            paddr_from_pte(level, pt_val),
                            term_flags | X86_MMU_PG_PS, true /* was_terminal */);
                new_cursor->vaddr += ps;
                new_cursor->size -= ps;
                DEBUG_ASSERT(new_cursor->size <= start_cursor.size);
                continue;
            }
            // Otherwise, we need to split it
            vaddr_t page_vaddr = new_cursor->vaddr & ~(ps - 1);
            ret = SplitLargePage(level, page_vaddr, e, cm);
            if (ret != ZX_OK) {
                // If we failed to split the table, just unmap it.  Subsequent
                // page faults will bring it back in.
                MappingCursor cursor;
                cursor.vaddr = new_cursor->vaddr;
                cursor.size = ps;

                MappingCursor tmp_cursor;
                RemoveMapping(table, level, cursor, &tmp_cursor, cm);

                new_cursor->SkipEntry(level);
                // The entry was unmapped; there is no lower table to recurse
                // into.
                continue;
            }
            pt_val = *e;
        }

        MappingCursor cursor;
        volatile pt_entry_t* next_table = get_next_table_from_entry(pt_val);
        ret = UpdateMapping(next_table, mmu_flags, lower_level(level),
                            *new_cursor, &cursor, cm);
        *new_cursor = cursor;
        if (ret != ZX_OK) {
            // Currently this can't happen
            ASSERT(false);
        }
        DEBUG_ASSERT(new_cursor->size <= start_cursor.size);
        DEBUG_ASSERT(new_cursor->size == 0 || page_aligned(level, new_cursor->vaddr));
    }
    return ZX_OK;
}

// Base case of UpdateMapping for smallest page size.
zx_status_t X86PageTableBase::UpdateMappingL0(volatile pt_entry_t* table, uint mmu_flags,
                                              const MappingCursor& start_cursor,
                                              MappingCursor* new_cursor,
                                              ConsistencyManager* cm) {
    LTRACEF("%016" PRIxPTR " %016zx\n", start_cursor.vaddr, start_cursor.size);
    DEBUG_ASSERT(IS_PAGE_ALIGNED(start_cursor.size));

    *new_cursor = start_cursor;

    X86PageTableBase::PtFlags term_flags = terminal_flags(PT_L, mmu_flags);

    uint index = vaddr_to_index(PT_L, new_cursor->vaddr);
    for (; index != NO_OF_PT_ENTRIES && new_cursor->size != 0; ++index) {
        volatile pt_entry_t* e = table + index;
        pt_entry_t pt_val = *e;
        // Skip unmapped pages (we may encounter these due to demand paging)
        if (IS_PAGE_PRESENT(pt_val)) {
            UpdateEntry(cm, PT_L, new_cursor->vaddr, e, paddr_from_pte(PT_L, pt_val),
                        term_flags, true /* was_terminal */);
        }

        new_cursor->vaddr += PAGE_SIZE;
        new_cursor->size -= PAGE_SIZE;
        DEBUG_ASSERT(new_cursor->size <= start_cursor.size);
    }
    DEBUG_ASSERT(new_cursor->size == 0 || page_aligned(PT_L, new_cursor->vaddr));
    return ZX_OK;
}

zx_status_t X86PageTableBase::UnmapPages(vaddr_t vaddr, const size_t count,
                                         size_t* unmapped) {
    LTRACEF("aspace %p, vaddr %#" PRIxPTR ", count %#zx\n", this, vaddr, count);

    canary_.Assert();

    if (!check_vaddr(vaddr))
        return ZX_ERR_INVALID_ARGS;
    if (count == 0)
        return ZX_OK;

    MappingCursor start = {
        .paddr = 0, .vaddr = vaddr, .size = count * PAGE_SIZE,
    };
    MappingCursor result;

    ConsistencyManager cm(this);
    {
        fbl::AutoLock a(&lock_);
        DEBUG_ASSERT(virt_);
        RemoveMapping(virt_, top_level(), start, &result, &cm);
        cm.Finish();
    }
    DEBUG_ASSERT(result.size == 0);

    if (unmapped)
        *unmapped = count;

    return ZX_OK;
}

zx_status_t X86PageTableBase::MapPages(vaddr_t vaddr, paddr_t* phys, size_t count,
                                       uint mmu_flags, size_t* mapped) {
    canary_.Assert();

    LTRACEF("aspace %p, vaddr %#" PRIxPTR " count %#zx mmu_flags 0x%x\n",
            this, vaddr, count, mmu_flags);

    if (!check_vaddr(vaddr))
        return ZX_ERR_INVALID_ARGS;
    for (size_t i = 0; i < count; ++i) {
        if (!check_paddr(phys[i]))
            return ZX_ERR_INVALID_ARGS;
    }
    if (count == 0)
        return ZX_OK;

    if (!allowed_flags(mmu_flags))
        return ZX_ERR_INVALID_ARGS;

    PageTableLevel top = top_level();
    ConsistencyManager cm(this);
    {
        fbl::AutoLock a(&lock_);
        DEBUG_ASSERT(virt_);

        // TODO(teisenbe): Improve performance of this function by integrating deeper into
        // the algorithm (e.g. make the cursors aware of the page array).
        size_t idx = 0;
        auto undo = fbl::MakeAutoCall([&]() TA_NO_THREAD_SAFETY_ANALYSIS {
            if (idx > 0) {
                MappingCursor start = {
                    .paddr = 0, .vaddr = vaddr, .size = idx * PAGE_SIZE,
                };

                MappingCursor result;
                RemoveMapping(virt_, top, start, &result, &cm);
                DEBUG_ASSERT(result.size == 0);
            }
            cm.Finish();
        });

        vaddr_t v = vaddr;
        for (; idx < count; ++idx) {
            MappingCursor start = {
                .paddr = phys[idx], .vaddr = v, .size = PAGE_SIZE,
            };
            MappingCursor result;
            zx_status_t status = AddMapping(virt_, mmu_flags, top, start, &result, &cm);
            if (status != ZX_OK) {
                dprintf(SPEW, "Add mapping failed with err=%d\n", status);
                return status;
            }
            DEBUG_ASSERT(result.size == 0);

            v += PAGE_SIZE;
        }

        undo.cancel();
        cm.Finish();
    }

    if (mapped) {
        *mapped = count;
    }
    return ZX_OK;
}

zx_status_t X86PageTableBase::MapPagesContiguous(vaddr_t vaddr, paddr_t paddr,
                                                 const size_t count, uint mmu_flags,
                                                 size_t* mapped) {
    canary_.Assert();

    LTRACEF("aspace %p, vaddr %#" PRIxPTR " paddr %#" PRIxPTR " count %#zx mmu_flags 0x%x\n",
            this, vaddr, paddr, count, mmu_flags);

    if (!check_paddr(paddr))
        return ZX_ERR_INVALID_ARGS;
    if (!check_vaddr(vaddr))
        return ZX_ERR_INVALID_ARGS;
    if (count == 0)
        return ZX_OK;

    if (!allowed_flags(mmu_flags))
        return ZX_ERR_INVALID_ARGS;

    MappingCursor start = {
        .paddr = paddr, .vaddr = vaddr, .size = count * PAGE_SIZE,
    };
    MappingCursor result;
    ConsistencyManager cm(this);
    {
        fbl::AutoLock a(&lock_);
        DEBUG_ASSERT(virt_);
        zx_status_t status = AddMapping(virt_, mmu_flags, top_level(), start, &result, &cm);
        cm.Finish();
        if (status != ZX_OK) {
            dprintf(SPEW, "Add mapping failed with err=%d\n", status);
            return status;
        }
    }
    DEBUG_ASSERT(result.size == 0);

    if (mapped)
        *mapped = count;

    return ZX_OK;
}

zx_status_t X86PageTableBase::ProtectPages(vaddr_t vaddr, size_t count, uint mmu_flags) {
    canary_.Assert();

    LTRACEF("aspace %p, vaddr %#" PRIxPTR " count %#zx mmu_flags 0x%x\n",
            this, vaddr, count, mmu_flags);

    if (!check_vaddr(vaddr))
        return ZX_ERR_INVALID_ARGS;
    if (count == 0)
        return ZX_OK;

    if (!allowed_flags(mmu_flags))
        return ZX_ERR_INVALID_ARGS;

    MappingCursor start = {
        .paddr = 0, .vaddr = vaddr, .size = count * PAGE_SIZE,
    };
    MappingCursor result;
    ConsistencyManager cm(this);
    {
        fbl::AutoLock a(&lock_);
        zx_status_t status = UpdateMapping(virt_, mmu_flags, top_level(), start, &result, &cm);
        cm.Finish();
        if (status != ZX_OK) {
            return status;
        }
    }
    DEBUG_ASSERT(result.size == 0);
    return ZX_OK;
}

zx_status_t X86PageTableBase::QueryVaddr(vaddr_t vaddr, paddr_t* paddr, uint* mmu_flags) {
    canary_.Assert();

    PageTableLevel ret_level;

    LTRACEF("aspace %p, vaddr %#" PRIxPTR ", paddr %p, mmu_flags %p\n", this, vaddr, paddr,
            mmu_flags);

    fbl::AutoLock a(&lock_);

    volatile pt_entry_t* last_valid_entry;
    zx_status_t status = GetMapping(virt_, vaddr, top_level(), &ret_level, &last_valid_entry);
    if (status != ZX_OK)
        return status;

    DEBUG_ASSERT(last_valid_entry);
    LTRACEF("last_valid_entry (%p) 0x%" PRIxPTE ", level %d\n", last_valid_entry, *last_valid_entry,
            ret_level);

    /* based on the return level, parse the page table entry */
    if (paddr) {
        switch (ret_level) {
        case PDP_L: /* 1GB page */
            *paddr = paddr_from_pte(PDP_L, *last_valid_entry);
            *paddr |= vaddr & PAGE_OFFSET_MASK_HUGE;
            break;
        case PD_L: /* 2MB page */
            *paddr = paddr_from_pte(PD_L, *last_valid_entry);
            *paddr |= vaddr & PAGE_OFFSET_MASK_LARGE;
            break;
        case PT_L: /* 4K page */
            *paddr = paddr_from_pte(PT_L, *last_valid_entry);
            *paddr |= vaddr & PAGE_OFFSET_MASK_4KB;
            break;
        default:
            panic("arch_mmu_query: unhandled frame level\n");
        }

        LTRACEF("paddr %#" PRIxPTR "\n", *paddr);
    }

    /* converting arch-specific flags to mmu flags */
    if (mmu_flags) {
        *mmu_flags = pt_flags_to_mmu_flags(*last_valid_entry, ret_level);
    }

    return ZX_OK;
}

void X86PageTableBase::Destroy(vaddr_t base, size_t size) {
    canary_.Assert();

#if LK_DEBUGLEVEL > 1
    PageTableLevel top = top_level();
    if (virt_) {
        pt_entry_t* table = static_cast<pt_entry_t*>(virt_);
        uint start = vaddr_to_index(top, base);
        uint end = vaddr_to_index(top, base + size - 1);

        // Don't check start if that table is shared with another aspace.
        if (!page_aligned(top, base)) {
            start += 1;
        }
        // Do check the end if it fills out the table entry.
        if (page_aligned(top, base + size)) {
            end += 1;
        }

        for (uint i = start; i < end; ++i) {
            DEBUG_ASSERT(!IS_PAGE_PRESENT(table[i]));
        }
    }
#endif

    if (phys_) {
        pmm_free_page(paddr_to_vm_page(phys_));
        phys_ = 0;
    }
}