/*
 * Copyright 2002 Andi Kleen, SuSE Labs.
 * Thanks to Ben LaHaise for precious feedback.
 */
#include <linux/highmem.h>
#include <linux/bootmem.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/pfn.h>
#include <linux/percpu.h>
#include <linux/gfp.h>

#include <asm/e820.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/setup.h>
#include <asm/uaccess.h>
#include <asm/pgalloc.h>
#include <asm/proto.h>
#include <asm/pat.h>

/*
 * The current flushing context - we pass it instead of 5 arguments:
 */
struct cpa_data {
	unsigned long	*vaddr;
	pgprot_t	mask_set;
	pgprot_t	mask_clr;
	int		numpages;
	int		flags;
	unsigned long	pfn;
	unsigned	force_split : 1;
	int		curpage;
	struct page	**pages;
};

/*
 * Serialize cpa() (for !DEBUG_PAGEALLOC, which uses large identity mappings)
 * using cpa_lock, so that no other CPU with stale large TLB entries can
 * change the page attributes in parallel with another CPU that is splitting
 * a large page entry while changing the attributes.
 */
static DEFINE_SPINLOCK(cpa_lock);

#define CPA_FLUSHTLB 1
#define CPA_ARRAY 2
#define CPA_PAGES_ARRAY 4

#ifdef CONFIG_PROC_FS
static unsigned long direct_pages_count[PG_LEVEL_NUM];

void update_page_count(int level, unsigned long pages)
{
	unsigned long flags;

	/* Protect against CPA */
	spin_lock_irqsave(&pgd_lock, flags);
	direct_pages_count[level] += pages;
	spin_unlock_irqrestore(&pgd_lock, flags);
}

static void split_page_count(int level)
{
	direct_pages_count[level]--;
	direct_pages_count[level - 1] += PTRS_PER_PTE;
}

void arch_report_meminfo(struct seq_file *m)
{
	seq_printf(m, "DirectMap4k: %8lu kB\n",
			direct_pages_count[PG_LEVEL_4K] << 2);
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
	seq_printf(m, "DirectMap2M: %8lu kB\n",
			direct_pages_count[PG_LEVEL_2M] << 11);
#else
	seq_printf(m, "DirectMap4M: %8lu kB\n",
			direct_pages_count[PG_LEVEL_2M] << 12);
#endif
#ifdef CONFIG_X86_64
	if (direct_gbpages)
		seq_printf(m, "DirectMap1G: %8lu kB\n",
			direct_pages_count[PG_LEVEL_1G] << 20);
#endif
}
#else
static inline void split_page_count(int level) { }
#endif

#ifdef CONFIG_X86_64

static inline unsigned long highmap_start_pfn(void)
{
	return __pa(_text) >> PAGE_SHIFT;
}

static inline unsigned long highmap_end_pfn(void)
{
	return __pa(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT;
}

#endif

#ifdef CONFIG_DEBUG_PAGEALLOC
# define debug_pagealloc 1
#else
# define debug_pagealloc 0
#endif

static inline int
within(unsigned long addr, unsigned long start, unsigned long end)
{
	return addr >= start && addr < end;
}

/*
 * Flushing functions
 */

/**
 * clflush_cache_range - flush a cache range with clflush
 * @vaddr:	virtual start address
 * @size:	number of bytes to flush
 *
 * clflush is an unordered instruction which needs fencing with mfence
 * to avoid ordering issues.
 */
void clflush_cache_range(void *vaddr, unsigned int size)
{
	void *vend = vaddr + size - 1;

	mb();

	for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
		clflush(vaddr);
	/*
	 * Flush any possible final partial cacheline:
	 */
	clflush(vend);

	mb();
}
EXPORT_SYMBOL_GPL(clflush_cache_range);
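/*
 * Example usage sketch (illustrative only, not a code path in this file):
 * a hypothetical driver that builds a descriptor in write-back memory and
 * wants the data visible to an agent that snoops main memory but not the
 * CPU caches could flush just that range instead of using wbinvd:
 *
 *	struct my_desc *d = ...;		// hypothetical descriptor
 *
 *	fill_descriptor(d);			// hypothetical helper
 *	clflush_cache_range(d, sizeof(*d));	// push the lines to memory
 *
 * The mfence barriers inside clflush_cache_range() order the flushes
 * against the surrounding stores, so no extra fencing is needed here.
 */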
static void __cpa_flush_all(void *arg)
{
	unsigned long cache = (unsigned long)arg;

	__flush_tlb_all();

	if (cache && boot_cpu_data.x86 >= 4)
		wbinvd();
}

static void cpa_flush_all(unsigned long cache)
{
	BUG_ON(irqs_disabled());

	on_each_cpu(__cpa_flush_all, (void *) cache, 1);
}

static void __cpa_flush_range(void *arg)
{
	/*
	 * We could optimize this further and do individual per-page
	 * TLB invalidates for a small number of pages. Caveat: we must
	 * flush the high aliases on 64-bit as well.
	 */
	__flush_tlb_all();
}

static void cpa_flush_range(unsigned long start, int numpages, int cache)
{
	unsigned int i, level;
	unsigned long addr;

	BUG_ON(irqs_disabled());
	WARN_ON(PAGE_ALIGN(start) != start);

	on_each_cpu(__cpa_flush_range, NULL, 1);

	if (!cache)
		return;

	/*
	 * We only need to flush on one CPU,
	 * clflush is a MESI-coherent instruction that
	 * will cause all other CPUs to flush the same
	 * cachelines:
	 */
	for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) {
		pte_t *pte = lookup_address(addr, &level);

		/*
		 * Only flush present addresses:
		 */
		if (pte && (pte_val(*pte) & _PAGE_PRESENT))
			clflush_cache_range((void *) addr, PAGE_SIZE);
	}
}

static void cpa_flush_array(unsigned long *start, int numpages, int cache,
			    int in_flags, struct page **pages)
{
	unsigned int i, level;
	unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */

	BUG_ON(irqs_disabled());

	on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1);

	if (!cache || do_wbinvd)
		return;

	/*
	 * We only need to flush on one CPU,
	 * clflush is a MESI-coherent instruction that
	 * will cause all other CPUs to flush the same
	 * cachelines:
	 */
	for (i = 0; i < numpages; i++) {
		unsigned long addr;
		pte_t *pte;

		if (in_flags & CPA_PAGES_ARRAY)
			addr = (unsigned long)page_address(pages[i]);
		else
			addr = start[i];

		pte = lookup_address(addr, &level);

		/*
		 * Only flush present addresses:
		 */
		if (pte && (pte_val(*pte) & _PAGE_PRESENT))
			clflush_cache_range((void *)addr, PAGE_SIZE);
	}
}
/*
 * Certain areas of memory on x86 require very specific protection flags,
 * for example the BIOS area or kernel text. Callers don't always get this
 * right (again, ioremap() on BIOS memory is not uncommon) so this function
 * checks and fixes these known static required protection bits.
 */
static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
					  unsigned long pfn)
{
	pgprot_t forbidden = __pgprot(0);

	/*
	 * The BIOS area between 640k and 1Mb needs to be executable for
	 * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
	 */
	if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
		pgprot_val(forbidden) |= _PAGE_NX;

	/*
	 * The kernel text needs to be executable for obvious reasons.
	 * This does not cover __inittext, since that is gone after init.
	 * On 64-bit we do not enforce !NX on the low mapping.
	 */
	if (within(address, (unsigned long)_text, (unsigned long)_etext))
		pgprot_val(forbidden) |= _PAGE_NX;

	/*
	 * The .rodata section needs to be read-only. Using the pfn
	 * catches all aliases.
	 */
	if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT,
		   __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
		pgprot_val(forbidden) |= _PAGE_RW;

#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
	/*
	 * Once the kernel maps the text as RO (kernel_set_to_readonly is set),
	 * the kernel text mappings for the large-page-aligned .text/.rodata
	 * sections will always be read-only. The kernel identity mappings
	 * covering the holes caused by this alignment can be whatever the
	 * caller asks for.
	 *
	 * This preserves the large page mappings for kernel text/data at no
	 * extra cost.
	 */
	if (kernel_set_to_readonly &&
	    within(address, (unsigned long)_text,
		   (unsigned long)__end_rodata_hpage_align)) {
		unsigned int level;

		/*
		 * Don't enforce the !RW mapping for the kernel text mapping,
		 * if the current mapping is already using small page mapping.
		 * No need to work hard to preserve large page mappings in this
		 * case.
		 *
		 * This also fixes the Linux Xen paravirt guest boot failure
		 * caused by unexpected read-only mappings for kernel identity
		 * mappings. In this paravirt guest case, the kernel text
		 * mapping and the kernel identity mapping share the same
		 * page-table pages, so we can't really use different
		 * protections for the kernel text and identity mappings. Also,
		 * these shared mappings are made of small page mappings.
		 * Thus, not enforcing the !RW mapping for small page kernel
		 * text mappings also helps Linux Xen paravirt guests boot.
		 */
		if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
			pgprot_val(forbidden) |= _PAGE_RW;
	}
#endif

	prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));

	return prot;
}
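/*
 * Worked example (illustrative, not an actual code path): suppose a caller
 * asks the CPA machinery to set _PAGE_RW on a page whose pfn lies inside
 * [__start_rodata, __end_rodata). The computed new protections then include
 * _PAGE_RW, but static_protections() accumulates _PAGE_RW in 'forbidden'
 * for that pfn and masks it out again, so the value actually written keeps
 * .rodata read-only no matter what the caller requested. The same mechanism
 * keeps kernel text non-writable and the BIOS range executable.
 */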
/*
 * Lookup the page table entry for a virtual address. Return a pointer
 * to the entry and the level of the mapping.
 *
 * Note: We return pud and pmd either when the entry is marked large
 * or when the present bit is not set. Otherwise we would return a
 * pointer to a nonexisting mapping.
 */
pte_t *lookup_address(unsigned long address, unsigned int *level)
{
	pgd_t *pgd = pgd_offset_k(address);
	pud_t *pud;
	pmd_t *pmd;

	*level = PG_LEVEL_NONE;

	if (pgd_none(*pgd))
		return NULL;

	pud = pud_offset(pgd, address);
	if (pud_none(*pud))
		return NULL;

	*level = PG_LEVEL_1G;
	if (pud_large(*pud) || !pud_present(*pud))
		return (pte_t *)pud;

	pmd = pmd_offset(pud, address);
	if (pmd_none(*pmd))
		return NULL;

	*level = PG_LEVEL_2M;
	if (pmd_large(*pmd) || !pmd_present(*pmd))
		return (pte_t *)pmd;

	*level = PG_LEVEL_4K;

	return pte_offset_kernel(pmd, address);
}
EXPORT_SYMBOL_GPL(lookup_address);
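/*
 * Example usage sketch (illustrative): callers typically use
 * lookup_address() to inspect how an address is currently mapped, e.g. to
 * check whether the direct mapping for some address still uses a 2M page:
 *
 *	unsigned int level;
 *	pte_t *kpte = lookup_address(addr, &level);
 *
 *	if (kpte && (pte_val(*kpte) & _PAGE_PRESENT) && level == PG_LEVEL_2M)
 *		pr_info("addr %lx is backed by a 2M page\n", addr);
 *
 * As the comment above notes, for large or non-present entries the returned
 * pointer actually refers to the pud/pmd entry, so it must be interpreted
 * together with 'level'.
 */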
/*
 * Set the new pmd in all the pgds we know about:
 */
static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
{
	/* change init_mm */
	set_pte_atomic(kpte, pte);
#ifdef CONFIG_X86_32
	if (!SHARED_KERNEL_PMD) {
		struct page *page;

		list_for_each_entry(page, &pgd_list, lru) {
			pgd_t *pgd;
			pud_t *pud;
			pmd_t *pmd;

			pgd = (pgd_t *)page_address(page) + pgd_index(address);
			pud = pud_offset(pgd, address);
			pmd = pmd_offset(pud, address);
			set_pte_atomic((pte_t *)pmd, pte);
		}
	}
#endif
}

static int
try_preserve_large_page(pte_t *kpte, unsigned long address,
			struct cpa_data *cpa)
{
	unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn;
	pte_t new_pte, old_pte, *tmp;
	pgprot_t old_prot, new_prot;
	int i, do_split = 1;
	unsigned int level;

	if (cpa->force_split)
		return 1;

	spin_lock_irqsave(&pgd_lock, flags);
	/*
	 * Check for races, another CPU might have split this page
	 * up already:
	 */
	tmp = lookup_address(address, &level);
	if (tmp != kpte)
		goto out_unlock;

	switch (level) {
	case PG_LEVEL_2M:
		psize = PMD_PAGE_SIZE;
		pmask = PMD_PAGE_MASK;
		break;
#ifdef CONFIG_X86_64
	case PG_LEVEL_1G:
		psize = PUD_PAGE_SIZE;
		pmask = PUD_PAGE_MASK;
		break;
#endif
	default:
		do_split = -EINVAL;
		goto out_unlock;
	}

	/*
	 * Calculate the number of pages, which fit into this large
	 * page starting at address:
	 */
	nextpage_addr = (address + psize) & pmask;
	numpages = (nextpage_addr - address) >> PAGE_SHIFT;
	if (numpages < cpa->numpages)
		cpa->numpages = numpages;

	/*
	 * We are safe now. Check whether the new pgprot is the same:
	 */
	old_pte = *kpte;
	old_prot = new_prot = pte_pgprot(old_pte);

	pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
	pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);

	/*
	 * old_pte points to the large page base address. So we need
	 * to add the offset of the virtual address:
	 */
	pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);
	cpa->pfn = pfn;

	new_prot = static_protections(new_prot, address, pfn);

	/*
	 * We need to check the full range, whether
	 * static_protections() requires a different pgprot for one of
	 * the pages in the range we try to preserve:
	 */
	addr = address + PAGE_SIZE;
	pfn++;
	for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE, pfn++) {
		pgprot_t chk_prot = static_protections(new_prot, addr, pfn);

		if (pgprot_val(chk_prot) != pgprot_val(new_prot))
			goto out_unlock;
	}

	/*
	 * If there are no changes, return. cpa->numpages has been updated
	 * above:
	 */
	if (pgprot_val(new_prot) == pgprot_val(old_prot)) {
		do_split = 0;
		goto out_unlock;
	}

	/*
	 * We need to change the attributes. Check, whether we can
	 * change the large page in one go. We request a split, when
	 * the address is not aligned and the number of pages is
	 * smaller than the number of pages in the large page. Note
	 * that we limited the number of possible pages already to
	 * the number of pages in the large page.
	 */
	if (address == (nextpage_addr - psize) && cpa->numpages == numpages) {
		/*
		 * The address is aligned and the number of pages
		 * covers the full page.
		 */
		new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot));
		__set_pmd_pte(kpte, address, new_pte);
		cpa->flags |= CPA_FLUSHTLB;
		do_split = 0;
	}

out_unlock:
	spin_unlock_irqrestore(&pgd_lock, flags);

	return do_split;
}

static int split_large_page(pte_t *kpte, unsigned long address)
{
	unsigned long flags, pfn, pfninc = 1;
	unsigned int i, level;
	pte_t *pbase, *tmp;
	pgprot_t ref_prot;
	struct page *base;

	if (!debug_pagealloc)
		spin_unlock(&cpa_lock);
	base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
	if (!debug_pagealloc)
		spin_lock(&cpa_lock);
	if (!base)
		return -ENOMEM;

	spin_lock_irqsave(&pgd_lock, flags);
	/*
	 * Check for races, another CPU might have split this page
	 * up for us already:
	 */
	tmp = lookup_address(address, &level);
	if (tmp != kpte)
		goto out_unlock;

	pbase = (pte_t *)page_address(base);
	paravirt_alloc_pte(&init_mm, page_to_pfn(base));
	ref_prot = pte_pgprot(pte_clrhuge(*kpte));
	/*
	 * If we ever want to utilize the PAT bit, we need to
	 * update this function to make sure it's converted from
	 * bit 12 to bit 7 when we cross from the 2MB level to
	 * the 4K level:
	 */
	WARN_ON_ONCE(pgprot_val(ref_prot) & _PAGE_PAT_LARGE);

#ifdef CONFIG_X86_64
	if (level == PG_LEVEL_1G) {
		pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
		pgprot_val(ref_prot) |= _PAGE_PSE;
	}
#endif

	/*
	 * Get the target pfn from the original entry:
	 */
	pfn = pte_pfn(*kpte);
	for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
		set_pte(&pbase[i], pfn_pte(pfn, ref_prot));

	if (address >= (unsigned long)__va(0) &&
	    address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT))
		split_page_count(level);

#ifdef CONFIG_X86_64
	if (address >= (unsigned long)__va(1UL<<32) &&
	    address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT))
		split_page_count(level);
#endif

	/*
	 * Install the new, split up pagetable.
	 *
	 * We use the standard kernel pagetable protections for the new
	 * pagetable protections, the actual ptes set above control the
	 * primary protection behavior:
	 */
	__set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));

	__flush_tlb_all();

	base = NULL;

out_unlock:
	/*
	 * If we dropped out via the lookup_address check under
	 * pgd_lock then stick the page back into the pool:
	 */
	if (base)
		__free_page(base);
	spin_unlock_irqrestore(&pgd_lock, flags);

	return 0;
}
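/*
 * Worked numbers (illustrative): on x86-64 with 4K pages, PTRS_PER_PTE is
 * 512. Splitting a 2M mapping therefore installs 512 PTEs with pfninc = 1
 * (512 * 4K = 2M), while splitting a 1G mapping installs 512 PMD-sized
 * entries with pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT = 512 and _PAGE_PSE set
 * (512 * 2M = 1G). Only one level is split per call.
 */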
static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
			       int primary)
{
	/*
	 * Ignore all non primary paths.
	 */
	if (!primary)
		return 0;

	/*
	 * Ignore the NULL PTE for kernel identity mapping, as it is expected
	 * to have holes.
	 * Also set numpages to '1' indicating that we processed cpa req for
	 * one virtual address page and its pfn. TBD: numpages can be set based
	 * on the initial value and the level returned by lookup_address().
	 */
	if (within(vaddr, PAGE_OFFSET,
		   PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
		cpa->numpages = 1;
		cpa->pfn = __pa(vaddr) >> PAGE_SHIFT;
		return 0;
	} else {
		WARN(1, KERN_WARNING "CPA: called for zero pte. "
		     "vaddr = %lx cpa->vaddr = %lx\n", vaddr,
		     *cpa->vaddr);

		return -EFAULT;
	}
}

static int __change_page_attr(struct cpa_data *cpa, int primary)
{
	unsigned long address;
	int do_split, err;
	unsigned int level;
	pte_t *kpte, old_pte;

	if (cpa->flags & CPA_PAGES_ARRAY) {
		struct page *page = cpa->pages[cpa->curpage];
		if (unlikely(PageHighMem(page)))
			return 0;
		address = (unsigned long)page_address(page);
	} else if (cpa->flags & CPA_ARRAY)
		address = cpa->vaddr[cpa->curpage];
	else
		address = *cpa->vaddr;
repeat:
	kpte = lookup_address(address, &level);
	if (!kpte)
		return __cpa_process_fault(cpa, address, primary);

	old_pte = *kpte;
	if (!pte_val(old_pte))
		return __cpa_process_fault(cpa, address, primary);

	if (level == PG_LEVEL_4K) {
		pte_t new_pte;
		pgprot_t new_prot = pte_pgprot(old_pte);
		unsigned long pfn = pte_pfn(old_pte);

		pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
		pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);

		new_prot = static_protections(new_prot, address, pfn);

		/*
		 * We need to keep the pfn from the existing PTE;
		 * after all we're only going to change its attributes,
		 * not the memory it points to.
		 */
		new_pte = pfn_pte(pfn, canon_pgprot(new_prot));
		cpa->pfn = pfn;
		/*
		 * Do we really change anything ?
		 */
		if (pte_val(old_pte) != pte_val(new_pte)) {
			set_pte_atomic(kpte, new_pte);
			cpa->flags |= CPA_FLUSHTLB;
		}
		cpa->numpages = 1;
		return 0;
	}

	/*
	 * Check, whether we can keep the large page intact
	 * and just change the pte:
	 */
	do_split = try_preserve_large_page(kpte, address, cpa);
	/*
	 * When the range fits into the existing large page,
	 * return. cpa->numpages and the CPA_FLUSHTLB flag have been
	 * updated in try_preserve_large_page():
	 */
	if (do_split <= 0)
		return do_split;

	/*
	 * We have to split the large page:
	 */
	err = split_large_page(kpte, address);
	if (!err) {
		/*
		 * Do a global flush tlb after splitting the large page
		 * and before we do the actual change page attribute in the PTE.
		 *
		 * Without this, we violate the TLB application note, which says
		 * "The TLBs may contain both ordinary and large-page
		 * translations for a 4-KByte range of linear addresses. This
		 * may occur if software modifies the paging structures so that
		 * the page size used for the address range changes. If the two
		 * translations differ with respect to page frame or attributes
		 * (e.g., permissions), processor behavior is undefined and may
		 * be implementation-specific."
		 *
		 * We do this global tlb flush inside the cpa_lock, so that no
		 * other cpu with stale tlb entries can change, in parallel,
		 * the attributes of a page that also falls into the just
		 * split large page entry.
		 */
		flush_tlb_all();
		goto repeat;
	}

	return err;
}
static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);

static int cpa_process_alias(struct cpa_data *cpa)
{
	struct cpa_data alias_cpa;
	unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT);
	unsigned long vaddr;
	int ret;

	if (cpa->pfn >= max_pfn_mapped)
		return 0;

#ifdef CONFIG_X86_64
	if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
		return 0;
#endif
	/*
	 * No need to redo, when the primary call touched the direct
	 * mapping already:
	 */
	if (cpa->flags & CPA_PAGES_ARRAY) {
		struct page *page = cpa->pages[cpa->curpage];
		if (unlikely(PageHighMem(page)))
			return 0;
		vaddr = (unsigned long)page_address(page);
	} else if (cpa->flags & CPA_ARRAY)
		vaddr = cpa->vaddr[cpa->curpage];
	else
		vaddr = *cpa->vaddr;

	if (!(within(vaddr, PAGE_OFFSET,
		     PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) {

		alias_cpa = *cpa;
		alias_cpa.vaddr = &laddr;
		alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);

		ret = __change_page_attr_set_clr(&alias_cpa, 0);
		if (ret)
			return ret;
	}

#ifdef CONFIG_X86_64
	/*
	 * If the primary call didn't touch the high mapping already
	 * and the physical address is inside the kernel map, we need
	 * to touch the high mapped kernel as well:
	 */
	if (!within(vaddr, (unsigned long)_text, _brk_end) &&
	    within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) {
		unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) +
					       __START_KERNEL_map - phys_base;
		alias_cpa = *cpa;
		alias_cpa.vaddr = &temp_cpa_vaddr;
		alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);

		/*
		 * The high mapping range is imprecise, so ignore the
		 * return value.
		 */
		__change_page_attr_set_clr(&alias_cpa, 0);
	}
#endif

	return 0;
}
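/*
 * Why aliases matter (illustrative example): if a caller changes the
 * caching attributes of a RAM page through some other virtual address
 * (e.g. a vmalloc or ioremap mapping), the same pfn usually stays mapped
 * cacheable in the kernel direct mapping and, on 64-bit, possibly in the
 * high kernel mapping. cpa_process_alias() therefore replays the attribute
 * change on __va(pfn << PAGE_SHIFT) (and the highmap alias) so all mappings
 * of that pfn keep consistent attributes.
 */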
static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
{
	int ret, numpages = cpa->numpages;

	while (numpages) {
		/*
		 * Store the remaining nr of pages for the large page
		 * preservation check.
		 */
		cpa->numpages = numpages;
		/* for array changes, we can't use large page */
		if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY))
			cpa->numpages = 1;

		if (!debug_pagealloc)
			spin_lock(&cpa_lock);
		ret = __change_page_attr(cpa, checkalias);
		if (!debug_pagealloc)
			spin_unlock(&cpa_lock);
		if (ret)
			return ret;

		if (checkalias) {
			ret = cpa_process_alias(cpa);
			if (ret)
				return ret;
		}

		/*
		 * Adjust the number of pages with the result of the
		 * CPA operation. Either a large page has been
		 * preserved or a single page update happened.
		 */
		BUG_ON(cpa->numpages > numpages);
		numpages -= cpa->numpages;
		if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY))
			cpa->curpage++;
		else
			*cpa->vaddr += cpa->numpages * PAGE_SIZE;

	}
	return 0;
}

static inline int cache_attr(pgprot_t attr)
{
	return pgprot_val(attr) &
		(_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD);
}

static int change_page_attr_set_clr(unsigned long *addr, int numpages,
				    pgprot_t mask_set, pgprot_t mask_clr,
				    int force_split, int in_flag,
				    struct page **pages)
{
	struct cpa_data cpa;
	int ret, cache, checkalias;
	unsigned long baddr = 0;

	/*
	 * Check whether we are asked to change an unsupported feature:
	 */
	mask_set = canon_pgprot(mask_set);
	mask_clr = canon_pgprot(mask_clr);
	if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split)
		return 0;

	/* Ensure we are PAGE_SIZE aligned */
	if (in_flag & CPA_ARRAY) {
		int i;
		for (i = 0; i < numpages; i++) {
			if (addr[i] & ~PAGE_MASK) {
				addr[i] &= PAGE_MASK;
				WARN_ON_ONCE(1);
			}
		}
	} else if (!(in_flag & CPA_PAGES_ARRAY)) {
		/*
		 * in_flag of CPA_PAGES_ARRAY implies it is aligned.
		 * No need to check in that case.
		 */
		if (*addr & ~PAGE_MASK) {
			*addr &= PAGE_MASK;
			/*
			 * People should not be passing in unaligned addresses:
			 */
			WARN_ON_ONCE(1);
		}
		/*
		 * Save address for cache flush. *addr is modified in the call
		 * to __change_page_attr_set_clr() below.
		 */
		baddr = *addr;
	}

	/* Must avoid aliasing mappings in the highmem code */
	kmap_flush_unused();

	vm_unmap_aliases();

	cpa.vaddr = addr;
	cpa.pages = pages;
	cpa.numpages = numpages;
	cpa.mask_set = mask_set;
	cpa.mask_clr = mask_clr;
	cpa.flags = 0;
	cpa.curpage = 0;
	cpa.force_split = force_split;

	if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY))
		cpa.flags |= in_flag;

	/* No alias checking for _NX bit modifications */
	checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;

	ret = __change_page_attr_set_clr(&cpa, checkalias);

	/*
	 * Check whether we really changed something:
	 */
	if (!(cpa.flags & CPA_FLUSHTLB))
		goto out;

	/*
	 * No need to flush, when we did not set any of the caching
	 * attributes:
	 */
	cache = cache_attr(mask_set);

	/*
	 * On success we use clflush, when the CPU supports it, to
	 * avoid the wbinvd. If the CPU does not support it, and in the
	 * error case, we fall back to cpa_flush_all (which uses
	 * wbinvd):
	 */
	if (!ret && cpu_has_clflush) {
		if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
			cpa_flush_array(addr, numpages, cache,
					cpa.flags, pages);
		} else
			cpa_flush_range(baddr, numpages, cache);
	} else
		cpa_flush_all(cache);

out:
	return ret;
}
static inline int change_page_attr_set(unsigned long *addr, int numpages,
					pgprot_t mask, int array)
{
	return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
		(array ? CPA_ARRAY : 0), NULL);
}

static inline int change_page_attr_clear(unsigned long *addr, int numpages,
					 pgprot_t mask, int array)
{
	return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
		(array ? CPA_ARRAY : 0), NULL);
}

static inline int cpa_set_pages_array(struct page **pages, int numpages,
				      pgprot_t mask)
{
	return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0,
		CPA_PAGES_ARRAY, pages);
}

static inline int cpa_clear_pages_array(struct page **pages, int numpages,
					pgprot_t mask)
{
	return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0,
		CPA_PAGES_ARRAY, pages);
}

int _set_memory_uc(unsigned long addr, int numpages)
{
	/*
	 * for now UC MINUS. see comments in ioremap_nocache()
	 */
	return change_page_attr_set(&addr, numpages,
				    __pgprot(_PAGE_CACHE_UC_MINUS), 0);
}

int set_memory_uc(unsigned long addr, int numpages)
{
	int ret;

	/*
	 * for now UC MINUS. see comments in ioremap_nocache()
	 */
	ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
			      _PAGE_CACHE_UC_MINUS, NULL);
	if (ret)
		goto out_err;

	ret = _set_memory_uc(addr, numpages);
	if (ret)
		goto out_free;

	return 0;

out_free:
	free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
out_err:
	return ret;
}
EXPORT_SYMBOL(set_memory_uc);

int _set_memory_array(unsigned long *addr, int addrinarray,
		unsigned long new_type)
{
	int i, j;
	int ret;

	/*
	 * for now UC MINUS. see comments in ioremap_nocache()
	 */
	for (i = 0; i < addrinarray; i++) {
		ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE,
					new_type, NULL);
		if (ret)
			goto out_free;
	}

	ret = change_page_attr_set(addr, addrinarray,
				   __pgprot(_PAGE_CACHE_UC_MINUS), 1);

	if (!ret && new_type == _PAGE_CACHE_WC)
		ret = change_page_attr_set_clr(addr, addrinarray,
					       __pgprot(_PAGE_CACHE_WC),
					       __pgprot(_PAGE_CACHE_MASK),
					       0, CPA_ARRAY, NULL);
	if (ret)
		goto out_free;

	return 0;

out_free:
	for (j = 0; j < i; j++)
		free_memtype(__pa(addr[j]), __pa(addr[j]) + PAGE_SIZE);

	return ret;
}

int set_memory_array_uc(unsigned long *addr, int addrinarray)
{
	return _set_memory_array(addr, addrinarray, _PAGE_CACHE_UC_MINUS);
}
EXPORT_SYMBOL(set_memory_array_uc);

int set_memory_array_wc(unsigned long *addr, int addrinarray)
{
	return _set_memory_array(addr, addrinarray, _PAGE_CACHE_WC);
}
EXPORT_SYMBOL(set_memory_array_wc);

int _set_memory_wc(unsigned long addr, int numpages)
{
	int ret;
	unsigned long addr_copy = addr;

	ret = change_page_attr_set(&addr, numpages,
				   __pgprot(_PAGE_CACHE_UC_MINUS), 0);
	if (!ret) {
		ret = change_page_attr_set_clr(&addr_copy, numpages,
					       __pgprot(_PAGE_CACHE_WC),
					       __pgprot(_PAGE_CACHE_MASK),
					       0, 0, NULL);
	}
	return ret;
}

int set_memory_wc(unsigned long addr, int numpages)
{
	int ret;

	if (!pat_enabled)
		return set_memory_uc(addr, numpages);

	ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
			      _PAGE_CACHE_WC, NULL);
	if (ret)
		goto out_err;

	ret = _set_memory_wc(addr, numpages);
	if (ret)
		goto out_free;

	return 0;

out_free:
	free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
out_err:
	return ret;
}
EXPORT_SYMBOL(set_memory_wc);
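/*
 * Example usage sketch (hypothetical driver, not part of this file): code
 * that streams to a RAM buffer with write-combining and restores the default
 * write-back type when done would typically pair the calls:
 *
 *	unsigned long vaddr = (unsigned long)buf;	// page-aligned buffer
 *	int nr = size >> PAGE_SHIFT;			// hypothetical size
 *
 *	if (set_memory_wc(vaddr, nr))
 *		return -EIO;				// hypothetical handling
 *	...
 *	set_memory_wb(vaddr, nr);			// drop the memtype again
 *
 * set_memory_wc() falls back to UC- when PAT is disabled, and set_memory_wb()
 * releases the region reserved via reserve_memtype().
 */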
int _set_memory_wb(unsigned long addr, int numpages)
{
	return change_page_attr_clear(&addr, numpages,
				      __pgprot(_PAGE_CACHE_MASK), 0);
}

int set_memory_wb(unsigned long addr, int numpages)
{
	int ret;

	ret = _set_memory_wb(addr, numpages);
	if (ret)
		return ret;

	free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
	return 0;
}
EXPORT_SYMBOL(set_memory_wb);

int set_memory_array_wb(unsigned long *addr, int addrinarray)
{
	int i;
	int ret;

	ret = change_page_attr_clear(addr, addrinarray,
				     __pgprot(_PAGE_CACHE_MASK), 1);
	if (ret)
		return ret;

	for (i = 0; i < addrinarray; i++)
		free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE);

	return 0;
}
EXPORT_SYMBOL(set_memory_array_wb);

int set_memory_x(unsigned long addr, int numpages)
{
	if (!(__supported_pte_mask & _PAGE_NX))
		return 0;

	return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
}
EXPORT_SYMBOL(set_memory_x);

int set_memory_nx(unsigned long addr, int numpages)
{
	if (!(__supported_pte_mask & _PAGE_NX))
		return 0;

	return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
}
EXPORT_SYMBOL(set_memory_nx);

int set_memory_ro(unsigned long addr, int numpages)
{
	return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
}
EXPORT_SYMBOL_GPL(set_memory_ro);

int set_memory_rw(unsigned long addr, int numpages)
{
	return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
}
EXPORT_SYMBOL_GPL(set_memory_rw);

int set_memory_np(unsigned long addr, int numpages)
{
	return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
}

int set_memory_4k(unsigned long addr, int numpages)
{
	return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
					__pgprot(0), 1, 0, NULL);
}

int set_pages_uc(struct page *page, int numpages)
{
	unsigned long addr = (unsigned long)page_address(page);

	return set_memory_uc(addr, numpages);
}
EXPORT_SYMBOL(set_pages_uc);

static int _set_pages_array(struct page **pages, int addrinarray,
		unsigned long new_type)
{
	unsigned long start;
	unsigned long end;
	int i;
	int free_idx;
	int ret;

	for (i = 0; i < addrinarray; i++) {
		if (PageHighMem(pages[i]))
			continue;
		start = page_to_pfn(pages[i]) << PAGE_SHIFT;
		end = start + PAGE_SIZE;
		if (reserve_memtype(start, end, new_type, NULL))
			goto err_out;
	}

	ret = cpa_set_pages_array(pages, addrinarray,
				  __pgprot(_PAGE_CACHE_UC_MINUS));
	if (!ret && new_type == _PAGE_CACHE_WC)
		ret = change_page_attr_set_clr(NULL, addrinarray,
					       __pgprot(_PAGE_CACHE_WC),
					       __pgprot(_PAGE_CACHE_MASK),
					       0, CPA_PAGES_ARRAY, pages);
	if (ret)
		goto err_out;
	return 0; /* Success */
err_out:
	free_idx = i;
	for (i = 0; i < free_idx; i++) {
		if (PageHighMem(pages[i]))
			continue;
		start = page_to_pfn(pages[i]) << PAGE_SHIFT;
		end = start + PAGE_SIZE;
		free_memtype(start, end);
	}
	return -EINVAL;
}

int set_pages_array_uc(struct page **pages, int addrinarray)
{
	return _set_pages_array(pages, addrinarray, _PAGE_CACHE_UC_MINUS);
}
EXPORT_SYMBOL(set_pages_array_uc);
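/*
 * Example usage sketch (hypothetical, e.g. code that finalizes a generated
 * code buffer in the kernel direct mapping): write protection and
 * executability are typically changed together once the contents are final:
 *
 *	unsigned long vaddr = (unsigned long)code_buf;	// hypothetical buffer
 *	int nr = buf_pages;				// hypothetical count
 *
 *	set_memory_ro(vaddr, nr);	// clear _PAGE_RW
 *	set_memory_x(vaddr, nr);	// clear _PAGE_NX
 *
 * Note that set_memory_x()/set_memory_nx() quietly do nothing when the CPU
 * does not support the NX bit (the __supported_pte_mask check above).
 */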
int set_pages_array_wc(struct page **pages, int addrinarray)
{
	return _set_pages_array(pages, addrinarray, _PAGE_CACHE_WC);
}
EXPORT_SYMBOL(set_pages_array_wc);

int set_pages_wb(struct page *page, int numpages)
{
	unsigned long addr = (unsigned long)page_address(page);

	return set_memory_wb(addr, numpages);
}
EXPORT_SYMBOL(set_pages_wb);

int set_pages_array_wb(struct page **pages, int addrinarray)
{
	int retval;
	unsigned long start;
	unsigned long end;
	int i;

	retval = cpa_clear_pages_array(pages, addrinarray,
				       __pgprot(_PAGE_CACHE_MASK));
	if (retval)
		return retval;

	for (i = 0; i < addrinarray; i++) {
		if (PageHighMem(pages[i]))
			continue;
		start = page_to_pfn(pages[i]) << PAGE_SHIFT;
		end = start + PAGE_SIZE;
		free_memtype(start, end);
	}

	return 0;
}
EXPORT_SYMBOL(set_pages_array_wb);

int set_pages_x(struct page *page, int numpages)
{
	unsigned long addr = (unsigned long)page_address(page);

	return set_memory_x(addr, numpages);
}
EXPORT_SYMBOL(set_pages_x);

int set_pages_nx(struct page *page, int numpages)
{
	unsigned long addr = (unsigned long)page_address(page);

	return set_memory_nx(addr, numpages);
}
EXPORT_SYMBOL(set_pages_nx);

int set_pages_ro(struct page *page, int numpages)
{
	unsigned long addr = (unsigned long)page_address(page);

	return set_memory_ro(addr, numpages);
}

int set_pages_rw(struct page *page, int numpages)
{
	unsigned long addr = (unsigned long)page_address(page);

	return set_memory_rw(addr, numpages);
}

#ifdef CONFIG_DEBUG_PAGEALLOC

static int __set_pages_p(struct page *page, int numpages)
{
	unsigned long tempaddr = (unsigned long) page_address(page);
	struct cpa_data cpa = { .vaddr = &tempaddr,
				.numpages = numpages,
				.mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
				.mask_clr = __pgprot(0),
				.flags = 0};

	/*
	 * No alias checking needed for setting the present flag. Otherwise,
	 * we may need to break large pages for 64-bit kernel text
	 * mappings (this adds to complexity if we want to do this from
	 * atomic context especially). Let's keep it simple!
	 */
	return __change_page_attr_set_clr(&cpa, 0);
}

static int __set_pages_np(struct page *page, int numpages)
{
	unsigned long tempaddr = (unsigned long) page_address(page);
	struct cpa_data cpa = { .vaddr = &tempaddr,
				.numpages = numpages,
				.mask_set = __pgprot(0),
				.mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
				.flags = 0};

	/*
	 * No alias checking needed for clearing the present flag. Otherwise,
	 * we may need to break large pages for 64-bit kernel text
	 * mappings (this adds to complexity if we want to do this from
	 * atomic context especially). Let's keep it simple!
	 */
	return __change_page_attr_set_clr(&cpa, 0);
}
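/*
 * Context (illustrative): with CONFIG_DEBUG_PAGEALLOC the page allocator
 * calls kernel_map_pages() below, roughly as
 *
 *	kernel_map_pages(page, 1 << order, 0);	// on free: unmap the pages
 *	kernel_map_pages(page, 1 << order, 1);	// on alloc: map them again
 *
 * so stray accesses to freed pages fault immediately instead of silently
 * corrupting memory.
 */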
void kernel_map_pages(struct page *page, int numpages, int enable)
{
	if (PageHighMem(page))
		return;
	if (!enable) {
		debug_check_no_locks_freed(page_address(page),
					   numpages * PAGE_SIZE);
	}

	/*
	 * If page allocator is not up yet then do not call c_p_a():
	 */
	if (!debug_pagealloc_enabled)
		return;

	/*
	 * The return value is ignored as the calls cannot fail.
	 * Large pages for identity mappings are not used at boot time
	 * and hence no memory allocations happen during large page split.
	 */
	if (enable)
		__set_pages_p(page, numpages);
	else
		__set_pages_np(page, numpages);

	/*
	 * We should perform an IPI and flush all TLBs,
	 * but that can deadlock, so flush only the current CPU:
	 */
	__flush_tlb_all();
}

#ifdef CONFIG_HIBERNATION

bool kernel_page_present(struct page *page)
{
	unsigned int level;
	pte_t *pte;

	if (PageHighMem(page))
		return false;

	pte = lookup_address((unsigned long)page_address(page), &level);
	return (pte_val(*pte) & _PAGE_PRESENT);
}

#endif /* CONFIG_HIBERNATION */

#endif /* CONFIG_DEBUG_PAGEALLOC */

/*
 * The testcases use internal knowledge of the implementation that shouldn't
 * be exposed to the rest of the kernel. Include these directly here.
 */
#ifdef CONFIG_CPA_DEBUG
#include "pageattr-test.c"
#endif