/*
 * Copyright (C) 1995  Linus Torvalds
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, version 2.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT.  See the GNU General Public License for
 * more details.
 */

#include <linux/module.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/poison.h>
#include <linux/bootmem.h>
#include <linux/slab.h>
#include <linux/proc_fs.h>
#include <linux/efi.h>
#include <linux/memory_hotplug.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/processor.h>
#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/setup.h>
#include <asm/homecache.h>
#include <hv/hypervisor.h>
#include <arch/chip.h>

#include "migrate.h"

/*
 * We could set FORCE_MAX_ZONEORDER to "(HPAGE_SHIFT - PAGE_SHIFT + 1)"
 * in the Tile Kconfig, but this generates configure warnings.
 * Do it here and force people to get it right to compile this file.
 * The problem is that with 4KB small pages and 16MB huge pages,
 * the default value doesn't allow us to group enough small pages
 * together to make up a huge page.
 */
#if CONFIG_FORCE_MAX_ZONEORDER < HPAGE_SHIFT - PAGE_SHIFT + 1
# error "Change FORCE_MAX_ZONEORDER in arch/tile/Kconfig to match page size"
#endif

#define clear_pgd(pmdptr) (*(pmdptr) = hv_pte(0))

#ifndef __tilegx__
unsigned long VMALLOC_RESERVE = CONFIG_VMALLOC_RESERVE;
#endif

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);

/* Create an L2 page table */
static pte_t * __init alloc_pte(void)
{
	return __alloc_bootmem(L2_KERNEL_PGTABLE_SIZE, HV_PAGE_TABLE_ALIGN, 0);
}

/*
 * L2 page tables per controller.  We allocate these all at once from
 * the bootmem allocator and store them here.  This saves on kernel L2
 * page table memory, compared to allocating a full 64K page per L2
 * page table, and also means that in cases where we use huge pages,
 * we are guaranteed to later be able to shatter those huge pages and
 * switch to using these page tables instead, without requiring
 * further allocation.  Each l2_ptes[] entry points to the first page
 * table for the first hugepage-size piece of memory on the
 * controller; other page tables are just indexed directly, i.e. the
 * L2 page tables are contiguous in memory for each controller.
 */
static pte_t *l2_ptes[MAX_NUMNODES];
static int num_l2_ptes[MAX_NUMNODES];

static void init_prealloc_ptes(int node, int pages)
{
	BUG_ON(pages & (HV_L2_ENTRIES-1));
	if (pages) {
		num_l2_ptes[node] = pages;
		l2_ptes[node] = __alloc_bootmem(pages * sizeof(pte_t),
						HV_PAGE_TABLE_ALIGN, 0);
	}
}

pte_t *get_prealloc_pte(unsigned long pfn)
{
	int node = pfn_to_nid(pfn);
	pfn &= ~(-1UL << (NR_PA_HIGHBIT_SHIFT - PAGE_SHIFT));
	BUG_ON(node >= MAX_NUMNODES);
	BUG_ON(pfn >= num_l2_ptes[node]);
	return &l2_ptes[node][pfn];
}

/*
 * What caching do we expect pages from the heap to have when
 * they are allocated during bootup?  (Once we've installed the
 * "real" swapper_pg_dir.)
 */
static int initial_heap_home(void)
{
#if CHIP_HAS_CBOX_HOME_MAP()
	if (hash_default)
		return PAGE_HOME_HASH;
#endif
	return smp_processor_id();
}

/*
 * Place a pointer to an L2 page table in a middle page
 * directory entry.
 */
static void __init assign_pte(pmd_t *pmd, pte_t *page_table)
{
	phys_addr_t pa = __pa(page_table);
	unsigned long l2_ptfn = pa >> HV_LOG2_PAGE_TABLE_ALIGN;
	pte_t pteval = hv_pte_set_ptfn(__pgprot(_PAGE_TABLE), l2_ptfn);
	BUG_ON((pa & (HV_PAGE_TABLE_ALIGN-1)) != 0);
	pteval = pte_set_home(pteval, initial_heap_home());
	*(pte_t *)pmd = pteval;
	if (page_table != (pte_t *)pmd_page_vaddr(*pmd))
		BUG();
}

#ifdef __tilegx__

#if HV_L1_SIZE != HV_L2_SIZE
# error Rework assumption that L1 and L2 page tables are same size.
#endif

/* Since pmd_t arrays and pte_t arrays are the same size, just use casts. */
static inline pmd_t *alloc_pmd(void)
{
	return (pmd_t *)alloc_pte();
}

static inline void assign_pmd(pud_t *pud, pmd_t *pmd)
{
	assign_pte((pmd_t *)pud, (pte_t *)pmd);
}

#endif /* __tilegx__ */

/* Replace the given pmd with a full PTE table. */
void __init shatter_pmd(pmd_t *pmd)
{
	pte_t *pte = get_prealloc_pte(pte_pfn(*(pte_t *)pmd));
	assign_pte(pmd, pte);
}

#ifdef CONFIG_HIGHMEM
/*
 * This function initializes a certain range of kernel virtual memory
 * with new bootmem page tables, everywhere page tables are missing in
 * the given range.
 */

/*
 * NOTE: The pagetables are allocated contiguous on the physical space
 * so we can cache the place of the first one and move around without
 * checking the pgd every time.
 */
static void __init page_table_range_init(unsigned long start,
					 unsigned long end, pgd_t *pgd_base)
{
	pgd_t *pgd;
	int pgd_idx;
	unsigned long vaddr;

	vaddr = start;
	pgd_idx = pgd_index(vaddr);
	pgd = pgd_base + pgd_idx;

	for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
		pmd_t *pmd = pmd_offset(pud_offset(pgd, vaddr), vaddr);
		if (pmd_none(*pmd))
			assign_pte(pmd, alloc_pte());
		vaddr += PMD_SIZE;
	}
}
#endif /* CONFIG_HIGHMEM */


#if CHIP_HAS_CBOX_HOME_MAP()

static int __initdata ktext_hash = 1;  /* .text pages */
static int __initdata kdata_hash = 1;  /* .data and .bss pages */
int __write_once hash_default = 1;     /* kernel allocator pages */
EXPORT_SYMBOL(hash_default);
int __write_once kstack_hash = 1;      /* if no homecaching, use h4h */
#endif /* CHIP_HAS_CBOX_HOME_MAP */

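/*
 * Several boot arguments influence the caching policy set up below.
 * "ktext=" and "initfree=" are parsed in this file (see setup_ktext()
 * and set_initfree()); "kcache_hash" and "kdata" are parsed outside
 * this file but consulted here, e.g. "kdata=huge" and "kcache_hash=all"
 * as referenced in the warnings in kernel_physical_mapping_init().
 */
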
/*
 * CPUs to use for striping the pages of kernel data.  If hash-for-home
 * is available, this is only relevant if kcache_hash sets up the
 * .data and .bss to be page-homed, and we don't want the default mode
 * of using the full set of kernel cpus for the striping.
 */
static __initdata struct cpumask kdata_mask;
static __initdata int kdata_arg_seen;

int __write_once kdata_huge;       /* if no homecaching, small pages */


/* Combine a generic pgprot_t with cache home to get a cache-aware pgprot. */
static pgprot_t __init construct_pgprot(pgprot_t prot, int home)
{
	prot = pte_set_home(prot, home);
#if CHIP_HAS_CBOX_HOME_MAP()
	if (home == PAGE_HOME_IMMUTABLE) {
		if (ktext_hash)
			prot = hv_pte_set_mode(prot, HV_PTE_MODE_CACHE_HASH_L3);
		else
			prot = hv_pte_set_mode(prot, HV_PTE_MODE_CACHE_NO_L3);
	}
#endif
	return prot;
}

/*
 * For a given kernel data VA, how should it be cached?
 * We return the complete pgprot_t with caching bits set.
 */
static pgprot_t __init init_pgprot(ulong address)
{
	int cpu;
	unsigned long page;
	enum { CODE_DELTA = MEM_SV_INTRPT - PAGE_OFFSET };

#if CHIP_HAS_CBOX_HOME_MAP()
	/* For kdata=huge, everything is just hash-for-home. */
	if (kdata_huge)
		return construct_pgprot(PAGE_KERNEL, PAGE_HOME_HASH);
#endif

	/* We map the aliased pages of permanent text inaccessible. */
	if (address < (ulong) _sinittext - CODE_DELTA)
		return PAGE_NONE;

	/*
	 * We map read-only data non-coherent for performance.  We could
	 * use neighborhood caching on TILE64, but it's not clear it's a win.
	 */
	if ((address >= (ulong) __start_rodata &&
	     address < (ulong) __end_rodata) ||
	    address == (ulong) empty_zero_page) {
		return construct_pgprot(PAGE_KERNEL_RO, PAGE_HOME_IMMUTABLE);
	}

	/* As a performance optimization, keep the boot init stack here. */
	if (address >= (ulong)&init_thread_union &&
	    address < (ulong)&init_thread_union + THREAD_SIZE)
		return construct_pgprot(PAGE_KERNEL, smp_processor_id());

#ifndef __tilegx__
#if !ATOMIC_LOCKS_FOUND_VIA_TABLE()
	/* Force the atomic_locks[] array page to be hash-for-home. */
	if (address == (ulong) atomic_locks)
		return construct_pgprot(PAGE_KERNEL, PAGE_HOME_HASH);
#endif
#endif

	/*
	 * Everything else that isn't data or bss is heap, so mark it
	 * with the initial heap home (hash-for-home, or this cpu).  This
	 * includes any addresses after the loaded image and any address before
	 * _einitdata, since we already captured the case of text before
	 * _sinittext, and __pa(einittext) is approximately __pa(sinitdata).
	 *
	 * All the LOWMEM pages that we mark this way will get their
	 * struct page homecache properly marked later, in set_page_homes().
	 * The HIGHMEM pages we leave with a default zero for their
	 * homes, but with a zero free_time we don't have to actually
	 * do a flush action the first time we use them, either.
	 */
	if (address >= (ulong) _end || address < (ulong) _einitdata)
		return construct_pgprot(PAGE_KERNEL, initial_heap_home());

#if CHIP_HAS_CBOX_HOME_MAP()
	/* Use hash-for-home if requested for data/bss. */
	if (kdata_hash)
		return construct_pgprot(PAGE_KERNEL, PAGE_HOME_HASH);
#endif

	/*
	 * Make the w1data homed like heap to start with, to avoid
	 * making it part of the page-striped data area when we're just
	 * going to convert it to read-only soon anyway.
	 */
	if (address >= (ulong)__w1data_begin && address < (ulong)__w1data_end)
		return construct_pgprot(PAGE_KERNEL, initial_heap_home());

	/*
	 * Otherwise we just hand out consecutive cpus.  To avoid
	 * requiring this function to hold state, we just walk forward from
	 * _sdata by PAGE_SIZE, skipping the readonly and init data, to reach
	 * the requested address, while walking cpu home around kdata_mask.
	 * This is typically no more than a dozen or so iterations.
	 */
	page = (((ulong)__w1data_end) + PAGE_SIZE - 1) & PAGE_MASK;
	BUG_ON(address < page || address >= (ulong)_end);
	cpu = cpumask_first(&kdata_mask);
	for (; page < address; page += PAGE_SIZE) {
		if (page >= (ulong)&init_thread_union &&
		    page < (ulong)&init_thread_union + THREAD_SIZE)
			continue;
		if (page == (ulong)empty_zero_page)
			continue;
#ifndef __tilegx__
#if !ATOMIC_LOCKS_FOUND_VIA_TABLE()
		if (page == (ulong)atomic_locks)
			continue;
#endif
#endif
		cpu = cpumask_next(cpu, &kdata_mask);
		if (cpu == NR_CPUS)
			cpu = cpumask_first(&kdata_mask);
	}
	return construct_pgprot(PAGE_KERNEL, cpu);
}

/*
 * This function sets up how we cache the kernel text.  If we have
 * hash-for-home support, normally that is used instead (see the
 * kcache_hash boot flag for more information).  But if we end up
 * using a page-based caching technique, this option sets up the
 * details of that.  In addition, the "ktext=nocache" option may
 * always be used to disable local caching of text pages, if desired.
 */

static int __initdata ktext_arg_seen;
static int __initdata ktext_small;
static int __initdata ktext_local;
static int __initdata ktext_all;
static int __initdata ktext_nondataplane;
static int __initdata ktext_nocache;
static struct cpumask __initdata ktext_mask;

static int __init setup_ktext(char *str)
{
	if (str == NULL)
		return -EINVAL;

	/* If you have a leading "nocache", turn off ktext caching */
	if (strncmp(str, "nocache", 7) == 0) {
		ktext_nocache = 1;
		pr_info("ktext: disabling local caching of kernel text\n");
		str += 7;
		if (*str == ',')
			++str;
		if (*str == '\0')
			return 0;
	}

	ktext_arg_seen = 1;

	/* Default setting on Tile64: use a huge page */
	if (strcmp(str, "huge") == 0)
		pr_info("ktext: using one huge locally cached page\n");

	/* Pay TLB cost but get no cache benefit: cache small pages locally */
	else if (strcmp(str, "local") == 0) {
		ktext_small = 1;
		ktext_local = 1;
		pr_info("ktext: using small pages with local caching\n");
	}

	/* Neighborhood cache ktext pages on all cpus. */
	else if (strcmp(str, "all") == 0) {
		ktext_small = 1;
		ktext_all = 1;
		pr_info("ktext: using maximal caching neighborhood\n");
	}


	/* Neighborhood ktext pages on specified mask */
	else if (cpulist_parse(str, &ktext_mask) == 0) {
		char buf[NR_CPUS * 5];
		cpulist_scnprintf(buf, sizeof(buf), &ktext_mask);
		if (cpumask_weight(&ktext_mask) > 1) {
			ktext_small = 1;
			pr_info("ktext: using caching neighborhood %s "
				"with small pages\n", buf);
		} else {
			pr_info("ktext: caching on cpu %s with one huge page\n",
				buf);
		}
	}

	else if (*str)
		return -EINVAL;

	return 0;
}

early_param("ktext", setup_ktext);


static inline pgprot_t ktext_set_nocache(pgprot_t prot)
{
	if (!ktext_nocache)
		prot = hv_pte_set_nc(prot);
#if CHIP_HAS_NC_AND_NOALLOC_BITS()
	else
		prot = hv_pte_set_no_alloc_l2(prot);
#endif
	return prot;
}

#ifndef __tilegx__
static pmd_t *__init get_pmd(pgd_t pgtables[], unsigned long va)
{
	return pmd_offset(pud_offset(&pgtables[pgd_index(va)], va), va);
}
#else
static pmd_t *__init get_pmd(pgd_t pgtables[], unsigned long va)
{
	pud_t *pud = pud_offset(&pgtables[pgd_index(va)], va);
	if (pud_none(*pud))
		assign_pmd(pud, alloc_pmd());
	return pmd_offset(pud, va);
}
#endif

/* Temporary page table we use for staging. */
static pgd_t pgtables[PTRS_PER_PGD]
 __attribute__((section(".init.page")));

/*
 * This maps the physical memory to kernel virtual address space, a total
 * of max_low_pfn pages, by creating page tables starting from address
 * PAGE_OFFSET.
 *
 * This routine transitions us from using a set of compiled-in large
 * pages to using some more precise caching, including removing access
 * to code pages mapped at PAGE_OFFSET (executed only at MEM_SV_START),
 * marking read-only data as locally cacheable, striping the remaining
 * .data and .bss across all the available tiles, and removing access
 * to pages above the top of RAM (thus ensuring a page fault from a bad
 * virtual address rather than a hypervisor shoot down for accessing
 * memory outside the assigned limits).
 */
static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
{
	unsigned long address, pfn;
	pmd_t *pmd;
	pte_t *pte;
	int pte_ofs;
	const struct cpumask *my_cpu_mask = cpumask_of(smp_processor_id());
	struct cpumask kstripe_mask;
	int rc, i;

#if CHIP_HAS_CBOX_HOME_MAP()
	if (ktext_arg_seen && ktext_hash) {
		pr_warning("warning: \"ktext\" boot argument ignored"
			   " if \"kcache_hash\" sets up text hash-for-home\n");
		ktext_small = 0;
	}

	if (kdata_arg_seen && kdata_hash) {
		pr_warning("warning: \"kdata\" boot argument ignored"
			   " if \"kcache_hash\" sets up data hash-for-home\n");
	}

	if (kdata_huge && !hash_default) {
		pr_warning("warning: disabling \"kdata=huge\"; requires"
			   " kcache_hash=all or =allbutstack\n");
		kdata_huge = 0;
	}
#endif

	/*
	 * Set up a mask for cpus to use for kernel striping.
	 * This is normally all cpus, but minus dataplane cpus if any.
	 * If the dataplane covers the whole chip, we stripe over
	 * the whole chip too.
	 */
	cpumask_copy(&kstripe_mask, cpu_possible_mask);
	if (!kdata_arg_seen)
		kdata_mask = kstripe_mask;

	/* Allocate and fill in L2 page tables */
	for (i = 0; i < MAX_NUMNODES; ++i) {
#ifdef CONFIG_HIGHMEM
		unsigned long end_pfn = node_lowmem_end_pfn[i];
#else
		unsigned long end_pfn = node_end_pfn[i];
#endif
		unsigned long end_huge_pfn = 0;

		/* Pre-shatter the last huge page to allow per-cpu pages. */
		if (kdata_huge)
			end_huge_pfn = end_pfn - (HPAGE_SIZE >> PAGE_SHIFT);

		pfn = node_start_pfn[i];

		/* Allocate enough memory to hold L2 page tables for node. */
		init_prealloc_ptes(i, end_pfn - pfn);

		address = (unsigned long) pfn_to_kaddr(pfn);
		while (pfn < end_pfn) {
			BUG_ON(address & (HPAGE_SIZE-1));
			pmd = get_pmd(pgtables, address);
			pte = get_prealloc_pte(pfn);
			if (pfn < end_huge_pfn) {
				pgprot_t prot = init_pgprot(address);
				*(pte_t *)pmd = pte_mkhuge(pfn_pte(pfn, prot));
				for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE;
				     pfn++, pte_ofs++, address += PAGE_SIZE)
					pte[pte_ofs] = pfn_pte(pfn, prot);
			} else {
				if (kdata_huge)
					printk(KERN_DEBUG "pre-shattered huge"
					       " page at %#lx\n", address);
				for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE;
				     pfn++, pte_ofs++, address += PAGE_SIZE) {
					pgprot_t prot = init_pgprot(address);
					pte[pte_ofs] = pfn_pte(pfn, prot);
				}
				assign_pte(pmd, pte);
			}
		}
	}

	/*
	 * Set or check ktext_map now that we have cpu_possible_mask
	 * and kstripe_mask to work with.
	 */
	if (ktext_all)
		cpumask_copy(&ktext_mask, cpu_possible_mask);
	else if (ktext_nondataplane)
		ktext_mask = kstripe_mask;
	else if (!cpumask_empty(&ktext_mask)) {
		/* Sanity-check any mask that was requested */
		struct cpumask bad;
		cpumask_andnot(&bad, &ktext_mask, cpu_possible_mask);
		cpumask_and(&ktext_mask, &ktext_mask, cpu_possible_mask);
		if (!cpumask_empty(&bad)) {
			char buf[NR_CPUS * 5];
			cpulist_scnprintf(buf, sizeof(buf), &bad);
			pr_info("ktext: not using unavailable cpus %s\n", buf);
		}
		if (cpumask_empty(&ktext_mask)) {
			pr_warning("ktext: no valid cpus; caching on %d.\n",
				   smp_processor_id());
			cpumask_copy(&ktext_mask,
				     cpumask_of(smp_processor_id()));
		}
	}

	address = MEM_SV_INTRPT;
	pmd = get_pmd(pgtables, address);
	if (ktext_small) {
		/* Allocate an L2 PTE for the kernel text */
		int cpu = 0;
		pgprot_t prot = construct_pgprot(PAGE_KERNEL_EXEC,
						 PAGE_HOME_IMMUTABLE);

		if (ktext_local) {
			if (ktext_nocache)
				prot = hv_pte_set_mode(prot,
						       HV_PTE_MODE_UNCACHED);
			else
				prot = hv_pte_set_mode(prot,
						       HV_PTE_MODE_CACHE_NO_L3);
		} else {
			prot = hv_pte_set_mode(prot,
					       HV_PTE_MODE_CACHE_TILE_L3);
			cpu = cpumask_first(&ktext_mask);

			prot = ktext_set_nocache(prot);
		}

		BUG_ON(address != (unsigned long)_stext);
		pfn = 0;  /* code starts at PA 0 */
		pte = alloc_pte();
		for (pte_ofs = 0; address < (unsigned long)_einittext;
		     pfn++, pte_ofs++, address += PAGE_SIZE) {
			if (!ktext_local) {
				prot = set_remote_cache_cpu(prot, cpu);
				cpu = cpumask_next(cpu, &ktext_mask);
				if (cpu == NR_CPUS)
					cpu = cpumask_first(&ktext_mask);
			}
			pte[pte_ofs] = pfn_pte(pfn, prot);
		}
		assign_pte(pmd, pte);
	} else {
		pte_t pteval = pfn_pte(0, PAGE_KERNEL_EXEC);
		pteval = pte_mkhuge(pteval);
		/*
		 * Choose the caching mode for the single huge text mapping:
		 * hash-for-home if enabled, a single remote tile if exactly
		 * one cpu is in ktext_mask, otherwise uncached or no-L3
		 * depending on "ktext=nocache".
		 */
#if CHIP_HAS_CBOX_HOME_MAP()
		if (ktext_hash) {
			pteval = hv_pte_set_mode(pteval,
						 HV_PTE_MODE_CACHE_HASH_L3);
			pteval = ktext_set_nocache(pteval);
		} else
#endif /* CHIP_HAS_CBOX_HOME_MAP() */
		if (cpumask_weight(&ktext_mask) == 1) {
			pteval = set_remote_cache_cpu(pteval,
					cpumask_first(&ktext_mask));
			pteval = hv_pte_set_mode(pteval,
						 HV_PTE_MODE_CACHE_TILE_L3);
			pteval = ktext_set_nocache(pteval);
		} else if (ktext_nocache)
			pteval = hv_pte_set_mode(pteval,
						 HV_PTE_MODE_UNCACHED);
		else
			pteval = hv_pte_set_mode(pteval,
						 HV_PTE_MODE_CACHE_NO_L3);
		*(pte_t *)pmd = pteval;
	}

	/* Set swapper_pgprot here so it is flushed to memory right away. */
	swapper_pgprot = init_pgprot((unsigned long)swapper_pg_dir);

	/*
	 * Since we may be changing the caching of the stack and page
	 * table itself, we invoke an assembly helper to do the
	 * following steps:
	 *
	 *  - flush the cache so we start with an empty slate
	 *  - install pgtables[] as the real page table
	 *  - flush the TLB so the new page table takes effect
	 */
	rc = flush_and_install_context(__pa(pgtables),
				       init_pgprot((unsigned long)pgtables),
				       __get_cpu_var(current_asid),
				       cpumask_bits(my_cpu_mask));
	BUG_ON(rc != 0);

	/* Copy the page table back to the normal swapper_pg_dir. */
	memcpy(pgd_base, pgtables, sizeof(pgtables));
	__install_page_table(pgd_base, __get_cpu_var(current_asid),
			     swapper_pgprot);
}

/*
 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
 * is valid.  The argument is a physical page number.
 *
 * On Tile, the only valid things for which we can just hand out unchecked
 * PTEs are the kernel code and data.  Anything else might change its
 * homing with time, and we wouldn't know to adjust the /dev/mem PTEs.
 * Note that init_thread_union is released to heap soon after boot,
 * so we include it in the init data.
 *
 * For TILE-Gx, we might want to consider allowing access to PA
 * regions corresponding to PCI space, etc.
 */
int devmem_is_allowed(unsigned long pagenr)
{
	return pagenr < kaddr_to_pfn(_end) &&
		!(pagenr >= kaddr_to_pfn(&init_thread_union) ||
		  pagenr < kaddr_to_pfn(_einitdata)) &&
		!(pagenr >= kaddr_to_pfn(_sinittext) ||
		  pagenr <= kaddr_to_pfn(_einittext-1));
}

#ifdef CONFIG_HIGHMEM
static void __init permanent_kmaps_init(pgd_t *pgd_base)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	unsigned long vaddr;

	vaddr = PKMAP_BASE;
	page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);

	pgd = swapper_pg_dir + pgd_index(vaddr);
	pud = pud_offset(pgd, vaddr);
	pmd = pmd_offset(pud, vaddr);
	pte = pte_offset_kernel(pmd, vaddr);
	pkmap_page_table = pte;
}
#endif /* CONFIG_HIGHMEM */


static void __init init_free_pfn_range(unsigned long start, unsigned long end)
{
	unsigned long pfn;
	struct page *page = pfn_to_page(start);

	for (pfn = start; pfn < end; ) {
		/* Optimize by freeing pages in large batches */
		int order = __ffs(pfn);
		int count, i;
		struct page *p;

		if (order >= MAX_ORDER)
			order = MAX_ORDER-1;
		count = 1 << order;
		while (pfn + count > end) {
			count >>= 1;
			--order;
		}
		for (p = page, i = 0; i < count; ++i, ++p) {
			__ClearPageReserved(p);
			/*
			 * Hacky direct set to avoid unnecessary
			 * lock take/release for EVERY page here.
			 */
			p->_count.counter = 0;
			p->_mapcount.counter = -1;
		}
		init_page_count(page);
		__free_pages(page, order);
		totalram_pages += count;

		page += count;
		pfn += count;
	}
}

static void __init set_non_bootmem_pages_init(void)
{
	struct zone *z;
	for_each_zone(z) {
		unsigned long start, end;
		int nid = z->zone_pgdat->node_id;
		int idx = zone_idx(z);

		start = z->zone_start_pfn;
		if (start == 0)
			continue;  /* bootmem */
		end = start + z->spanned_pages;
		if (idx == ZONE_NORMAL) {
			BUG_ON(start != node_start_pfn[nid]);
			start = node_free_pfn[nid];
		}
#ifdef CONFIG_HIGHMEM
		if (idx == ZONE_HIGHMEM)
			totalhigh_pages += z->spanned_pages;
#endif
		if (kdata_huge) {
			unsigned long percpu_pfn = node_percpu_pfn[nid];
			if (start < percpu_pfn && end > percpu_pfn)
				end = percpu_pfn;
		}
#ifdef CONFIG_PCI
		if (start <= pci_reserve_start_pfn &&
		    end > pci_reserve_start_pfn) {
			if (end > pci_reserve_end_pfn)
				init_free_pfn_range(pci_reserve_end_pfn, end);
			end = pci_reserve_start_pfn;
		}
#endif
		init_free_pfn_range(start, end);
	}
}

/*
 * paging_init() sets up the page tables - note that all of lowmem is
 * already mapped by head.S.
 */
void __init paging_init(void)
{
#ifdef CONFIG_HIGHMEM
	unsigned long vaddr, end;
#endif
#ifdef __tilegx__
	pud_t *pud;
#endif
	pgd_t *pgd_base = swapper_pg_dir;

	kernel_physical_mapping_init(pgd_base);

#ifdef CONFIG_HIGHMEM
	/*
	 * Fixed mappings, only the page table structure has to be
	 * created - mappings will be set by set_fixmap():
	 */
	vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
	end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
	page_table_range_init(vaddr, end, pgd_base);
	permanent_kmaps_init(pgd_base);
#endif

#ifdef __tilegx__
	/*
	 * Since GX allocates just one pmd_t array worth of vmalloc space,
	 * we go ahead and allocate it statically here, then share it
	 * globally.  As a result we don't have to worry about any task
	 * changing init_mm once we get up and running, and there's no
	 * need for e.g. vmalloc_sync_all().
	 */
	BUILD_BUG_ON(pgd_index(VMALLOC_START) != pgd_index(VMALLOC_END));
	pud = pud_offset(pgd_base + pgd_index(VMALLOC_START), VMALLOC_START);
	assign_pmd(pud, alloc_pmd());
#endif
}


/*
 * Walk the kernel page tables and derive the page_home() from
 * the PTEs, so that set_pte() can properly validate the caching
 * of all PTEs it sees.
 */
void __init set_page_homes(void)
{
}

static void __init set_max_mapnr_init(void)
{
#ifdef CONFIG_FLATMEM
	max_mapnr = max_low_pfn;
#endif
}

void __init mem_init(void)
{
	int codesize, datasize, initsize;
	int i;
#ifndef __tilegx__
	void *last;
#endif

#ifdef CONFIG_FLATMEM
	if (!mem_map)
		BUG();
#endif

#ifdef CONFIG_HIGHMEM
	/* check that fixmap and pkmap do not overlap */
	if (PKMAP_ADDR(LAST_PKMAP-1) >= FIXADDR_START) {
		pr_err("fixmap and kmap areas overlap"
		       " - this will crash\n");
		pr_err("pkstart: %lxh pkend: %lxh fixstart %lxh\n",
		       PKMAP_BASE, PKMAP_ADDR(LAST_PKMAP-1),
		       FIXADDR_START);
		BUG();
	}
#endif

	set_max_mapnr_init();

	/* this will put all bootmem onto the freelists */
	totalram_pages += free_all_bootmem();

	/* count all remaining LOWMEM and give all HIGHMEM to page allocator */
	set_non_bootmem_pages_init();

	codesize =  (unsigned long)&_etext - (unsigned long)&_text;
	datasize =  (unsigned long)&_end - (unsigned long)&_sdata;
	initsize =  (unsigned long)&_einittext - (unsigned long)&_sinittext;
	initsize += (unsigned long)&_einitdata - (unsigned long)&_sinitdata;

	pr_info("Memory: %luk/%luk available (%dk kernel code, %dk data, %dk init, %ldk highmem)\n",
		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
		num_physpages << (PAGE_SHIFT-10),
		codesize >> 10,
		datasize >> 10,
		initsize >> 10,
		(unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
	       );

	/*
	 * In debug mode, dump some interesting memory mappings.
	 */
#ifdef CONFIG_HIGHMEM
	printk(KERN_DEBUG "  KMAP    %#lx - %#lx\n",
	       FIXADDR_START, FIXADDR_TOP + PAGE_SIZE - 1);
	printk(KERN_DEBUG "  PKMAP   %#lx - %#lx\n",
	       PKMAP_BASE, PKMAP_ADDR(LAST_PKMAP) - 1);
#endif
#ifdef CONFIG_HUGEVMAP
	printk(KERN_DEBUG "  HUGEMAP %#lx - %#lx\n",
	       HUGE_VMAP_BASE, HUGE_VMAP_END - 1);
#endif
	printk(KERN_DEBUG "  VMALLOC %#lx - %#lx\n",
	       _VMALLOC_START, _VMALLOC_END - 1);
#ifdef __tilegx__
	for (i = MAX_NUMNODES-1; i >= 0; --i) {
		struct pglist_data *node = &node_data[i];
		if (node->node_present_pages) {
			unsigned long start = (unsigned long)
				pfn_to_kaddr(node->node_start_pfn);
			unsigned long end = start +
				(node->node_present_pages << PAGE_SHIFT);
			printk(KERN_DEBUG "  MEM%d    %#lx - %#lx\n",
			       i, start, end - 1);
		}
	}
#else
	last = high_memory;
	for (i = MAX_NUMNODES-1; i >= 0; --i) {
		if ((unsigned long)vbase_map[i] != -1UL) {
			printk(KERN_DEBUG "  LOWMEM%d %#lx - %#lx\n",
			       i, (unsigned long) (vbase_map[i]),
			       (unsigned long) (last-1));
			last = vbase_map[i];
		}
	}
#endif

#ifndef __tilegx__
	/*
	 * Convert from using one lock for all atomic operations to
	 * one per cpu.
	 */
	__init_atomic_per_cpu();
#endif
}

/*
 * this is for the non-NUMA, single node SMP system case.
 * Specifically, in the case of x86, we will always add
 * memory to the highmem for now.
 */
#ifndef CONFIG_NEED_MULTIPLE_NODES
int arch_add_memory(u64 start, u64 size)
{
	struct pglist_data *pgdata = &contig_page_data;
	struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1;
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;

	return __add_pages(zone, start_pfn, nr_pages);
}

int remove_memory(u64 start, u64 size)
{
	return -EINVAL;
}
#endif

struct kmem_cache *pgd_cache;

void __init pgtable_cache_init(void)
{
	pgd_cache = kmem_cache_create("pgd",
				PTRS_PER_PGD*sizeof(pgd_t),
				PTRS_PER_PGD*sizeof(pgd_t),
				0,
				NULL);
	if (!pgd_cache)
		panic("pgtable_cache_init(): Cannot create pgd cache");
}

#if !CHIP_HAS_COHERENT_LOCAL_CACHE()
/*
 * The __w1data area holds data that is only written during initialization,
 * and is read-only and thus freely cacheable thereafter.  Fix the page
 * table entries that cover that region accordingly.
 */
static void mark_w1data_ro(void)
{
	/* Loop over page table entries */
	unsigned long addr = (unsigned long)__w1data_begin;
	BUG_ON((addr & (PAGE_SIZE-1)) != 0);
	for (; addr <= (unsigned long)__w1data_end - 1; addr += PAGE_SIZE) {
		unsigned long pfn = kaddr_to_pfn((void *)addr);
		pte_t *ptep = virt_to_pte(NULL, addr);
		BUG_ON(pte_huge(*ptep));   /* not relevant for kdata_huge */
		set_pte_at(&init_mm, addr, ptep, pfn_pte(pfn, PAGE_KERNEL_RO));
	}
}
#endif

#ifdef CONFIG_DEBUG_PAGEALLOC
static long __write_once initfree;
#else
static long __write_once initfree = 1;
#endif

/* Select whether to free (1) or mark unusable (0) the __init pages. */
static int __init set_initfree(char *str)
{
	strict_strtol(str, 0, &initfree);
	pr_info("initfree: %s free init pages\n", initfree ? "will" : "won't");
	return 1;
}
__setup("initfree=", set_initfree);

static void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
	unsigned long addr = (unsigned long) begin;

	if (kdata_huge && !initfree) {
		pr_warning("Warning: ignoring initfree=0:"
			   " incompatible with kdata=huge\n");
		initfree = 1;
	}
	end = (end + PAGE_SIZE - 1) & PAGE_MASK;
	local_flush_tlb_pages(NULL, begin, PAGE_SIZE, end - begin);
	for (addr = begin; addr < end; addr += PAGE_SIZE) {
		/*
		 * Note we just reset the home here directly in the
		 * page table.  We know this is safe because our caller
		 * just flushed the caches on all the other cpus,
		 * and they won't be touching any of these pages.
		 */
		int pfn = kaddr_to_pfn((void *)addr);
		struct page *page = pfn_to_page(pfn);
		pte_t *ptep = virt_to_pte(NULL, addr);
		if (!initfree) {
			/*
			 * If debugging page accesses then do not free
			 * this memory but mark them not present - any
			 * buggy init-section access will create a
			 * kernel page fault:
			 */
			pte_clear(&init_mm, addr, ptep);
			continue;
		}
		__ClearPageReserved(page);
		init_page_count(page);
		if (pte_huge(*ptep))
			BUG_ON(!kdata_huge);
		else
			set_pte_at(&init_mm, addr, ptep,
				   pfn_pte(pfn, PAGE_KERNEL));
		memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
		free_page(addr);
		totalram_pages++;
	}
	pr_info("Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
}

void free_initmem(void)
{
	const unsigned long text_delta = MEM_SV_INTRPT - PAGE_OFFSET;

	/*
	 * Evict the dirty initdata on the boot cpu, evict the w1data
	 * wherever it's homed, and evict all the init code everywhere.
	 * We are guaranteed that no one will touch the init pages any
	 * more, and although other cpus may be touching the w1data,
	 * we only actually change the caching on tile64, which won't
	 * be keeping local copies in the other tiles' caches anyway.
	 */
	homecache_evict(&cpu_cacheable_map);

	/* Free the data pages that we won't use again after init. */
	free_init_pages("unused kernel data",
			(unsigned long)_sinitdata,
			(unsigned long)_einitdata);

	/*
	 * Free the pages mapped from 0xc0000000 that correspond to code
	 * pages from 0xfd000000 that we won't use again after init.
	 */
	free_init_pages("unused kernel text",
			(unsigned long)_sinittext - text_delta,
			(unsigned long)_einittext - text_delta);

#if !CHIP_HAS_COHERENT_LOCAL_CACHE()
	/*
	 * Upgrade the .w1data section to globally cached.
	 * We don't do this on tilepro, since the cache architecture
	 * pretty much makes it irrelevant, and in any case we end
	 * up having racing issues with other tiles that may touch
	 * the data after we flush the cache but before we update
	 * the PTEs and flush the TLBs, causing sharer shootdowns
	 * later.  Even though this is to clean data, it seems like
	 * an unnecessary complication.
	 */
	mark_w1data_ro();
#endif

	/* Do a global TLB flush so everyone sees the changes. */
	flush_tlb_all();
}