/*
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, version 2.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT.  See the GNU General Public License for
 * more details.
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/io.h>
#include <linux/vmalloc.h>
#include <linux/smp.h>

#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/fixmap.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/homecache.h>

#define K(x) ((x) << (PAGE_SHIFT-10))

/*
 * The normal show_free_areas() is too verbose on Tile, with dozens
 * of processors and often four NUMA zones each with high and lowmem.
 */
void show_mem(void)
{
	struct zone *zone;

	pr_err("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu"
	       " free:%lu\n slab:%lu mapped:%lu pagetables:%lu bounce:%lu"
	       " pagecache:%lu swap:%lu\n",
	       (global_page_state(NR_ACTIVE_ANON) +
		global_page_state(NR_ACTIVE_FILE)),
	       (global_page_state(NR_INACTIVE_ANON) +
		global_page_state(NR_INACTIVE_FILE)),
	       global_page_state(NR_FILE_DIRTY),
	       global_page_state(NR_WRITEBACK),
	       global_page_state(NR_UNSTABLE_NFS),
	       global_page_state(NR_FREE_PAGES),
	       (global_page_state(NR_SLAB_RECLAIMABLE) +
		global_page_state(NR_SLAB_UNRECLAIMABLE)),
	       global_page_state(NR_FILE_MAPPED),
	       global_page_state(NR_PAGETABLE),
	       global_page_state(NR_BOUNCE),
	       global_page_state(NR_FILE_PAGES),
	       nr_swap_pages);

	for_each_zone(zone) {
		unsigned long flags, order, total = 0, largest_order = -1;

		if (!populated_zone(zone))
			continue;

		spin_lock_irqsave(&zone->lock, flags);
		for (order = 0; order < MAX_ORDER; order++) {
			int nr = zone->free_area[order].nr_free;
			total += nr << order;
			if (nr)
				largest_order = order;
		}
		spin_unlock_irqrestore(&zone->lock, flags);
		pr_err("Node %d %7s: %lukB (largest %lukB)\n",
		       zone_to_nid(zone), zone->name,
		       K(total), largest_order ? K(1UL) << largest_order : 0);
	}
}

/*
 * Associate a virtual page frame with a given physical page frame
 * and protection flags for that frame.
 */
static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = swapper_pg_dir + pgd_index(vaddr);
	if (pgd_none(*pgd)) {
		BUG();
		return;
	}
	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		BUG();
		return;
	}
	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		BUG();
		return;
	}
	pte = pte_offset_kernel(pmd, vaddr);
	/* <pfn,flags> stored as-is, to permit clearing entries */
	set_pte(pte, pfn_pte(pfn, flags));

	/*
	 * It's enough to flush this one mapping.
	 * This appears conservative since it is only called
	 * from __set_fixmap.
	 */
	local_flush_tlb_page(NULL, vaddr, PAGE_SIZE);
}

void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
{
	unsigned long address = __fix_to_virt(idx);

	if (idx >= __end_of_fixed_addresses) {
		BUG();
		return;
	}
	set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
}

#if defined(CONFIG_HIGHPTE)
pte_t *_pte_offset_map(pmd_t *dir, unsigned long address, enum km_type type)
{
	/* Add the page-table's offset within its page to the kmapped base. */
	pte_t *pte = kmap_atomic(pmd_page(*dir), type) +
		((pmd_ptfn(*dir) << HV_LOG2_PAGE_TABLE_ALIGN) & ~PAGE_MASK);
	return &pte[pte_index(address)];
}
#endif

/*
 * List of all pgd's needed so it can invalidate entries in both cached
 * and uncached pgd's.  This is essentially codepath-based locking
 * against pageattr.c; it is the unique case in which a valid change
 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
 * vmalloc faults work because attached pagetables are never freed.
 * The locking scheme was chosen on the basis of manfred's
 * recommendations and having no core impact whatsoever.
 * -- wli
 */
DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

static inline void pgd_list_add(pgd_t *pgd)
{
	list_add(pgd_to_list(pgd), &pgd_list);
}

static inline void pgd_list_del(pgd_t *pgd)
{
	list_del(pgd_to_list(pgd));
}

#define KERNEL_PGD_INDEX_START pgd_index(PAGE_OFFSET)
#define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_INDEX_START)

static void pgd_ctor(pgd_t *pgd)
{
	unsigned long flags;

	memset(pgd, 0, KERNEL_PGD_INDEX_START*sizeof(pgd_t));
	spin_lock_irqsave(&pgd_lock, flags);

#ifndef __tilegx__
	/*
	 * Check that the user interrupt vector has no L2.
	 * It never should for the swapper, and new page tables
	 * should always start with an empty user interrupt vector.
	 */
	BUG_ON(((u64 *)swapper_pg_dir)[pgd_index(MEM_USER_INTRPT)] != 0);
#endif

	clone_pgd_range(pgd + KERNEL_PGD_INDEX_START,
			swapper_pg_dir + KERNEL_PGD_INDEX_START,
			KERNEL_PGD_PTRS);

	pgd_list_add(pgd);
	spin_unlock_irqrestore(&pgd_lock, flags);
}

static void pgd_dtor(pgd_t *pgd)
{
	unsigned long flags; /* can be called from interrupt context */

	spin_lock_irqsave(&pgd_lock, flags);
	pgd_list_del(pgd);
	spin_unlock_irqrestore(&pgd_lock, flags);
}

pgd_t *pgd_alloc(struct mm_struct *mm)
{
	pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
	if (pgd)
		pgd_ctor(pgd);
	return pgd;
}

void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
	pgd_dtor(pgd);
	kmem_cache_free(pgd_cache, pgd);
}


#define L2_USER_PGTABLE_PAGES (1 << L2_USER_PGTABLE_ORDER)

struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
{
	gfp_t flags = GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO|__GFP_COMP;
	struct page *p;

#ifdef CONFIG_HIGHPTE
	flags |= __GFP_HIGHMEM;
#endif

	p = alloc_pages(flags, L2_USER_PGTABLE_ORDER);
	if (p == NULL)
		return NULL;

	pgtable_page_ctor(p);
	return p;
}

/*
 * Free page immediately (used in __pte_alloc if we raced with another
 * process).  We have to correct whatever pte_alloc_one() did before
 * returning the pages to the allocator.
 */
void pte_free(struct mm_struct *mm, struct page *p)
{
	pgtable_page_dtor(p);
	__free_pages(p, L2_USER_PGTABLE_ORDER);
}

void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
		    unsigned long address)
{
	int i;

	pgtable_page_dtor(pte);
	tlb->need_flush = 1;
	if (tlb_fast_mode(tlb)) {
		struct page *pte_pages[L2_USER_PGTABLE_PAGES];
		for (i = 0; i < L2_USER_PGTABLE_PAGES; ++i)
			pte_pages[i] = pte + i;
		free_pages_and_swap_cache(pte_pages, L2_USER_PGTABLE_PAGES);
		return;
	}
	for (i = 0; i < L2_USER_PGTABLE_PAGES; ++i) {
		tlb->pages[tlb->nr++] = pte + i;
		if (tlb->nr >= FREE_PTE_NR)
			tlb_flush_mmu(tlb, 0, 0);
	}
}

#ifndef __tilegx__

int ptep_test_and_clear_young(struct vm_area_struct *vma,
			      unsigned long addr, pte_t *ptep)
{
#if HV_PTE_INDEX_ACCESSED < 8 || HV_PTE_INDEX_ACCESSED >= 16
# error Code assumes HV_PTE "accessed" bit in second byte
#endif
	u8 *tmp = (u8 *)ptep;
	u8 second_byte = tmp[1];
	if (!(second_byte & (1 << (HV_PTE_INDEX_ACCESSED - 8))))
		return 0;
	tmp[1] = second_byte & ~(1 << (HV_PTE_INDEX_ACCESSED - 8));
	return 1;
}

/*
 * This implementation is atomic vs hypervisor writes, since the hypervisor
 * always writes the low word (where "accessed" and "dirty" are) and this
 * routine only writes the high word.
 */
void ptep_set_wrprotect(struct mm_struct *mm,
			unsigned long addr, pte_t *ptep)
{
#if HV_PTE_INDEX_WRITABLE < 32
# error Code assumes HV_PTE "writable" bit in high word
#endif
	u32 *tmp = (u32 *)ptep;
	tmp[1] = tmp[1] & ~(1 << (HV_PTE_INDEX_WRITABLE - 32));
}

#endif

pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	if (pgd_addr_invalid(addr))
		return NULL;

	pgd = mm ? pgd_offset(mm, addr) : swapper_pg_dir + pgd_index(addr);
	pud = pud_offset(pgd, addr);
	if (!pud_present(*pud))
		return NULL;
	pmd = pmd_offset(pud, addr);
	if (pmd_huge_page(*pmd))
		return (pte_t *)pmd;
	if (!pmd_present(*pmd))
		return NULL;
	return pte_offset_kernel(pmd, addr);
}

pgprot_t set_remote_cache_cpu(pgprot_t prot, int cpu)
{
	unsigned int width = smp_width;
	int x = cpu % width;
	int y = cpu / width;
	BUG_ON(y >= smp_height);
	BUG_ON(hv_pte_get_mode(prot) != HV_PTE_MODE_CACHE_TILE_L3);
	BUG_ON(cpu < 0 || cpu >= NR_CPUS);
	BUG_ON(!cpu_is_valid_lotar(cpu));
	return hv_pte_set_lotar(prot, HV_XY_TO_LOTAR(x, y));
}

int get_remote_cache_cpu(pgprot_t prot)
{
	HV_LOTAR lotar = hv_pte_get_lotar(prot);
	int x = HV_LOTAR_X(lotar);
	int y = HV_LOTAR_Y(lotar);
	BUG_ON(hv_pte_get_mode(prot) != HV_PTE_MODE_CACHE_TILE_L3);
	return x + y * smp_width;
}

void set_pte_order(pte_t *ptep, pte_t pte, int order)
{
	unsigned long pfn = pte_pfn(pte);
	struct page *page = pfn_to_page(pfn);

	/* Update the home of a PTE if necessary */
	pte = pte_set_home(pte, page_home(page));

#ifdef __tilegx__
	*ptep = pte;
#else
	/*
	 * When setting a PTE, write the high bits first, then write
	 * the low bits.  This sets the "present" bit only after the
	 * other bits are in place.  If a particular PTE update
	 * involves transitioning from one valid PTE to another, it
	 * may be necessary to call set_pte_order() more than once,
	 * transitioning via a suitable intermediate state.
	 * Note that this sequence also means that if we are transitioning
	 * from any migrating PTE to a non-migrating one, we will not
	 * see a half-updated PTE with the migrating bit off.
	 */
#if HV_PTE_INDEX_PRESENT >= 32 || HV_PTE_INDEX_MIGRATING >= 32
# error Must write the present and migrating bits last
#endif
	((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32);
	barrier();
	((u32 *)ptep)[0] = (u32)(pte_val(pte));
#endif
}

/* Can this mm load a PTE with cached_priority set? */
static inline int mm_is_priority_cached(struct mm_struct *mm)
{
	return mm->context.priority_cached;
}

/*
 * Add a priority mapping to an mm_context and
 * notify the hypervisor if this is the first one.
 */
void start_mm_caching(struct mm_struct *mm)
{
	if (!mm_is_priority_cached(mm)) {
		mm->context.priority_cached = -1U;
		hv_set_caching(-1U);
	}
}

/*
 * Validate and return the priority_cached flag.  We know if it's zero
 * that we don't need to scan, since we immediately set it non-zero
 * when we first consider a MAP_CACHE_PRIORITY mapping.
 *
 * We only _try_ to acquire the mmap_sem semaphore; if we can't acquire it,
 * since we're in an interrupt context (servicing switch_mm) we don't
 * worry about it and don't unset the "priority_cached" field.
 * Presumably we'll come back later and have more luck and clear
 * the value then; for now we'll just keep the cache marked for priority.
 */
static unsigned int update_priority_cached(struct mm_struct *mm)
{
	if (mm->context.priority_cached && down_write_trylock(&mm->mmap_sem)) {
		struct vm_area_struct *vm;
		for (vm = mm->mmap; vm; vm = vm->vm_next) {
			if (hv_pte_get_cached_priority(vm->vm_page_prot))
				break;
		}
		if (vm == NULL)
			mm->context.priority_cached = 0;
		up_write(&mm->mmap_sem);
	}
	return mm->context.priority_cached;
}

/* Set caching correctly for an mm that we are switching to. */
void check_mm_caching(struct mm_struct *prev, struct mm_struct *next)
{
	if (!mm_is_priority_cached(next)) {
		/*
		 * If the new mm doesn't use priority caching, just see if we
		 * need the hv_set_caching(), or can assume it's already zero.
		 */
		if (mm_is_priority_cached(prev))
			hv_set_caching(0);
	} else {
		hv_set_caching(update_priority_cached(next));
	}
}

#if CHIP_HAS_MMIO()

/* Map an arbitrary MMIO address, homed according to pgprot, into VA space. */
void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
			   pgprot_t home)
{
	void *addr;
	struct vm_struct *area;
	unsigned long offset, last_addr;
	pgprot_t pgprot;

	/* Don't allow wraparound or zero size */
	last_addr = phys_addr + size - 1;
	if (!size || last_addr < phys_addr)
		return NULL;

	/* Create a read/write, MMIO VA mapping homed at the requested shim. */
	pgprot = PAGE_KERNEL;
	pgprot = hv_pte_set_mode(pgprot, HV_PTE_MODE_MMIO);
	pgprot = hv_pte_set_lotar(pgprot, hv_pte_get_lotar(home));

	/*
	 * Mappings have to be page-aligned
	 */
	offset = phys_addr & ~PAGE_MASK;
	phys_addr &= PAGE_MASK;
	size = PAGE_ALIGN(last_addr+1) - phys_addr;

	/*
	 * Ok, go for it..
	 */
	area = get_vm_area(size, VM_IOREMAP /* | other flags? */);
	if (!area)
		return NULL;
	area->phys_addr = phys_addr;
	addr = area->addr;
	if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
			       phys_addr, pgprot)) {
		remove_vm_area((void *)(PAGE_MASK & (unsigned long)addr));
		return NULL;
	}
	return (__force void __iomem *) (offset + (char *)addr);
}
EXPORT_SYMBOL(ioremap_prot);

/* Map a PCI MMIO bus address into VA space. */
void __iomem *ioremap(resource_size_t phys_addr, unsigned long size)
{
	panic("ioremap for PCI MMIO is not supported");
}
EXPORT_SYMBOL(ioremap);

/* Unmap an MMIO VA mapping. */
void iounmap(volatile void __iomem *addr_in)
{
	volatile void __iomem *addr = (volatile void __iomem *)
		(PAGE_MASK & (unsigned long __force)addr_in);
	vunmap((void * __force)addr);
}
EXPORT_SYMBOL(iounmap);

#endif /* CHIP_HAS_MMIO() */
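
/*
 * Illustrative usage sketch (not part of the original file): on a
 * CHIP_HAS_MMIO() chip a driver might map a device register window with
 * ioremap_prot() and release it with iounmap() along these lines.  The
 * physical address, size, and homing pgprot named here (regs_pa,
 * regs_size, shim_prot) are hypothetical:
 *
 *	void __iomem *regs = ioremap_prot(regs_pa, regs_size, shim_prot);
 *	if (regs) {
 *		u32 id = readl(regs);	(read a hypothetical ID register)
 *		iounmap(regs);
 *	}
 */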