/*
 * Lockless get_user_pages_fast for x86
 *
 * Copyright (C) 2008 Nick Piggin
 * Copyright (C) 2008 Novell Inc.
 */
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/vmstat.h>
#include <linux/highmem.h>

#include <asm/pgtable.h>

static inline pte_t gup_get_pte(pte_t *ptep)
{
#ifndef CONFIG_X86_PAE
	return ACCESS_ONCE(*ptep);
#else
	/*
	 * With get_user_pages_fast, we walk down the pagetables without taking
	 * any locks. For this we would like to load the pointers atomically,
	 * but that is not possible (without expensive cmpxchg8b) on PAE. What
	 * we do have is the guarantee that a pte will only either go from not
	 * present to present, or present to not present, or both -- it will not
	 * switch to a completely different present page without a TLB flush in
	 * between; something that we are blocking by holding interrupts off.
	 *
	 * Setting ptes from not present to present goes:
	 * ptep->pte_high = h;
	 * smp_wmb();
	 * ptep->pte_low = l;
	 *
	 * And present to not present goes:
	 * ptep->pte_low = 0;
	 * smp_wmb();
	 * ptep->pte_high = 0;
	 *
	 * We must ensure here that the load of pte_low sees l iff pte_high
	 * sees h. We load pte_high *after* loading pte_low, which ensures we
	 * don't see an older value of pte_high. *Then* we recheck pte_low,
	 * which ensures that we haven't picked up a changed pte high. We might
	 * have got rubbish values from pte_low and pte_high, but we are
	 * guaranteed that pte_low will not have the present bit set *unless*
	 * it is 'l'. And get_user_pages_fast only operates on present ptes, so
	 * we're safe.
	 *
	 * gup_get_pte should not be used or copied outside gup.c without being
	 * very careful -- it does not atomically load the pte or anything that
	 * is likely to be useful for you.
	 */
	pte_t pte;

retry:
	pte.pte_low = ptep->pte_low;
	smp_rmb();
	pte.pte_high = ptep->pte_high;
	smp_rmb();
	if (unlikely(pte.pte_low != ptep->pte_low))
		goto retry;

	return pte;
#endif
}
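
/*
 * For reference, the store side that the scheme above relies on, exactly
 * as described in the comment: high word first when making a pte present,
 * low word (carrying the present bit) first when clearing it. This is an
 * illustrative sketch only -- the real PAE store routines live in the
 * pgtable headers, and the function names below are made up.
 */
#if 0
static inline void example_pae_set_pte(pte_t *ptep, pte_t pte)
{
	/* not present -> present: pte_high must be valid before the
	 * present bit in pte_low can be observed */
	ptep->pte_high = pte.pte_high;
	smp_wmb();
	ptep->pte_low = pte.pte_low;
}

static inline void example_pae_clear_pte(pte_t *ptep)
{
	/* present -> not present: kill the present bit first */
	ptep->pte_low = 0;
	smp_wmb();
	ptep->pte_high = 0;
}
#endif
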
/*
 * The performance critical leaf functions are made noinline otherwise gcc
 * inlines everything into a single function which results in too much
 * register pressure.
 */
static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
		unsigned long end, int write, struct page **pages, int *nr)
{
	unsigned long mask;
	pte_t *ptep;

	mask = _PAGE_PRESENT|_PAGE_USER;
	if (write)
		mask |= _PAGE_RW;

	ptep = pte_offset_map(&pmd, addr);
	do {
		pte_t pte = gup_get_pte(ptep);
		struct page *page;

		if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
			pte_unmap(ptep);
			return 0;
		}
		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
		page = pte_page(pte);
		get_page(page);
		pages[*nr] = page;
		(*nr)++;

	} while (ptep++, addr += PAGE_SIZE, addr != end);
	pte_unmap(ptep - 1);

	return 1;
}

static inline void get_head_page_multiple(struct page *page, int nr)
{
	VM_BUG_ON(page != compound_head(page));
	VM_BUG_ON(page_count(page) == 0);
	atomic_add(nr, &page->_count);
}
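
/*
 * A large pmd maps PMD_SIZE of memory at once (for example, 512 4K
 * subpages for a 2MB page), so instead of issuing one get_page() per
 * subpage, the walk below records each subpage in pages[] and then takes
 * all of the references in a single atomic_add() on the head page via
 * get_head_page_multiple().
 */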
static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
		unsigned long end, int write, struct page **pages, int *nr)
{
	unsigned long mask;
	pte_t pte = *(pte_t *)&pmd;
	struct page *head, *page;
	int refs;

	mask = _PAGE_PRESENT|_PAGE_USER;
	if (write)
		mask |= _PAGE_RW;
	if ((pte_flags(pte) & mask) != mask)
		return 0;
	/* hugepages are never "special" */
	VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

	refs = 0;
	head = pte_page(pte);
	page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
	do {
		VM_BUG_ON(compound_head(page) != head);
		pages[*nr] = page;
		(*nr)++;
		page++;
		refs++;
	} while (addr += PAGE_SIZE, addr != end);
	get_head_page_multiple(head, refs);

	return 1;
}

static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
		int write, struct page **pages, int *nr)
{
	unsigned long next;
	pmd_t *pmdp;

	pmdp = pmd_offset(&pud, addr);
	do {
		pmd_t pmd = *pmdp;

		next = pmd_addr_end(addr, end);
		if (pmd_none(pmd))
			return 0;
		if (unlikely(pmd_large(pmd))) {
			if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
				return 0;
		} else {
			if (!gup_pte_range(pmd, addr, next, write, pages, nr))
				return 0;
		}
	} while (pmdp++, addr = next, addr != end);

	return 1;
}

static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
		unsigned long end, int write, struct page **pages, int *nr)
{
	unsigned long mask;
	pte_t pte = *(pte_t *)&pud;
	struct page *head, *page;
	int refs;

	mask = _PAGE_PRESENT|_PAGE_USER;
	if (write)
		mask |= _PAGE_RW;
	if ((pte_flags(pte) & mask) != mask)
		return 0;
	/* hugepages are never "special" */
	VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

	refs = 0;
	head = pte_page(pte);
	page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
	do {
		VM_BUG_ON(compound_head(page) != head);
		pages[*nr] = page;
		(*nr)++;
		page++;
		refs++;
	} while (addr += PAGE_SIZE, addr != end);
	get_head_page_multiple(head, refs);

	return 1;
}

static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
		int write, struct page **pages, int *nr)
{
	unsigned long next;
	pud_t *pudp;

	pudp = pud_offset(&pgd, addr);
	do {
		pud_t pud = *pudp;

		next = pud_addr_end(addr, end);
		if (pud_none(pud))
			return 0;
		if (unlikely(pud_large(pud))) {
			if (!gup_huge_pud(pud, addr, next, write, pages, nr))
				return 0;
		} else {
			if (!gup_pmd_range(pud, addr, next, write, pages, nr))
				return 0;
		}
	} while (pudp++, addr = next, addr != end);

	return 1;
}

/*
 * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall
 * back to the regular GUP.
 */
int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
			  struct page **pages)
{
	struct mm_struct *mm = current->mm;
	unsigned long addr, len, end;
	unsigned long next;
	unsigned long flags;
	pgd_t *pgdp;
	int nr = 0;

	start &= PAGE_MASK;
	addr = start;
	len = (unsigned long) nr_pages << PAGE_SHIFT;
	end = start + len;
	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
					(void __user *)start, len)))
		return 0;

	/*
	 * This doesn't prevent pagetable teardown, but does prevent
	 * the pagetables and pages from being freed on x86.
	 *
	 * So long as we atomically load page table pointers versus teardown
	 * (which we do on x86, with the above PAE exception), we can follow the
	 * address down to the page and take a ref on it.
	 */
	local_irq_save(flags);
	pgdp = pgd_offset(mm, addr);
	do {
		pgd_t pgd = *pgdp;

		next = pgd_addr_end(addr, end);
		if (pgd_none(pgd))
			break;
		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
			break;
	} while (pgdp++, addr = next, addr != end);
	local_irq_restore(flags);

	return nr;
}
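
/*
 * Because the variant above never takes mmap_sem and never sleeps, it can
 * be used where get_user_pages_fast() cannot, e.g. with interrupts already
 * disabled. It returns only how many pages it pinned (possibly zero, never
 * -errno), so handling any shortfall is up to the caller, roughly:
 *
 *	if (__get_user_pages_fast(addr, 1, 0, &page) != 1)
 *		...fall back to a sleeping path, or fail...
 */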
/**
 * get_user_pages_fast() - pin user pages in memory
 * @start:	starting user address
 * @nr_pages:	number of pages from start to pin
 * @write:	whether pages will be written to
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_pages long.
 *
 * Attempt to pin user pages in memory without taking mm->mmap_sem.
 * If not successful, it will fall back to taking the lock and
 * calling get_user_pages().
 *
 * Returns number of pages pinned. This may be fewer than the number
 * requested. If nr_pages is 0 or negative, returns 0. If no pages
 * were pinned, returns -errno.
 */
int get_user_pages_fast(unsigned long start, int nr_pages, int write,
			struct page **pages)
{
	struct mm_struct *mm = current->mm;
	unsigned long addr, len, end;
	unsigned long next;
	pgd_t *pgdp;
	int nr = 0;

	start &= PAGE_MASK;
	addr = start;
	len = (unsigned long) nr_pages << PAGE_SHIFT;

	end = start + len;
	if (end < start)
		goto slow_irqon;

#ifdef CONFIG_X86_64
	if (end >> __VIRTUAL_MASK_SHIFT)
		goto slow_irqon;
#endif

	/*
	 * This doesn't prevent pagetable teardown, but does prevent
	 * the pagetables and pages from being freed on x86.
	 *
	 * So long as we atomically load page table pointers versus teardown
	 * (which we do on x86, with the above PAE exception), we can follow the
	 * address down to the page and take a ref on it.
	 */
	local_irq_disable();
	pgdp = pgd_offset(mm, addr);
	do {
		pgd_t pgd = *pgdp;

		next = pgd_addr_end(addr, end);
		if (pgd_none(pgd))
			goto slow;
		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
			goto slow;
	} while (pgdp++, addr = next, addr != end);
	local_irq_enable();

	VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
	return nr;

	{
		int ret;

slow:
		local_irq_enable();
slow_irqon:
		/* Try to get the remaining pages with get_user_pages */
		start += nr << PAGE_SHIFT;
		pages += nr;

		down_read(&mm->mmap_sem);
		ret = get_user_pages(current, mm, start,
			(end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
		up_read(&mm->mmap_sem);

		/* Have to be a bit careful with return values */
		if (nr > 0) {
			if (ret < 0)
				ret = nr;
			else
				ret += nr;
		}

		return ret;
	}
}
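
/*
 * Example usage (a sketch, not part of this file; every identifier other
 * than get_user_pages_fast() and put_page() is made up): pin a user
 * buffer, operate on the pinned pages, then drop the references.
 */
#if 0
static int example_with_user_buffer(unsigned long uaddr, int nr_pages,
				    struct page **pages)
{
	int i, pinned;

	pinned = get_user_pages_fast(uaddr, nr_pages, 1, pages);
	if (pinned < 0)
		return pinned;		/* nothing pinned: -errno */

	/* ... access pages[0..pinned-1], e.g. via kmap()/kunmap() ... */

	for (i = 0; i < pinned; i++)
		put_page(pages[i]);	/* undo the references taken above */

	return pinned == nr_pages ? 0 : -EFAULT;
}
#endif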