#include <linux/init.h>

#include <linux/mm.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/module.h>

#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/cache.h>
#include <asm/apic.h>
#include <asm/uv/uv.h>

/*
 * Per-CPU TLB state: the mm currently active on this CPU and whether the
 * CPU is using it (TLBSTATE_OK) or is in lazy TLB mode (TLBSTATE_LAZY).
 * The boot value points every CPU at init_mm.
 */
DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
			= { &init_mm, 0, };

/*
 *	Smarter SMP flushing macros.
 *		c/o Linus Torvalds.
 *
 *	These mean you can really definitely utterly forget about
 *	writing to user space from interrupts. (It's not allowed anyway).
 *
 *	Optimizations Manfred Spraul <manfred@colorfullife.com>
 *
 *	More scalable flush, from Andi Kleen
 *
 *	To avoid global state use 8 different call vectors.
 *	Each CPU uses a specific vector to trigger flushes on other
 *	CPUs. Depending on the received vector the target CPUs look into
 *	the right array slot for the flush data.
 *
 *	With more than 8 CPUs they are hashed to the 8 available
 *	vectors. The limited global vector space forces us to this right now.
 *	In future when interrupts are split into per CPU domains this could be
 *	fixed, at the cost of triggering multiple IPIs in some cases.
 */

/*
 * One flush-request slot per invalidate vector.  A sender fills in
 * flush_mm/flush_va/flush_cpumask under tlbstate_lock and then spins
 * until all target CPUs have cleared their bit in flush_cpumask from
 * the IPI handler.  The union pads each slot to a full internode cache
 * line so concurrent senders on different vectors do not false-share.
 */
union smp_flush_state {
	struct {
		struct mm_struct *flush_mm;	/* mm being flushed; NULL when idle */
		unsigned long flush_va;		/* single va, or TLB_FLUSH_ALL */
		raw_spinlock_t tlbstate_lock;	/* serializes senders on this slot */
		DECLARE_BITMAP(flush_cpumask, NR_CPUS);	/* CPUs that still must ack */
	};
	char pad[INTERNODE_CACHE_BYTES];
} ____cacheline_internodealigned_in_smp;

/* State is put into the per CPU data section, but padded
   to a full cache line because other CPUs can access it and we don't
   want false sharing in the per cpu data segment. */
static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS];

/*
 * We cannot call mmdrop() because we are in interrupt context,
 * instead update mm->cpu_vm_mask.
 */
void leave_mm(int cpu)
{
	/* Only a CPU in lazy TLB mode may leave its mm. */
	if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
		BUG();
	/*
	 * Drop this CPU from the mm's cpumask so other CPUs stop sending
	 * it flush IPIs for this mm, then switch cr3 to the kernel page
	 * tables so no user mappings can be speculatively loaded.
	 */
	cpumask_clear_cpu(cpu,
			  mm_cpumask(percpu_read(cpu_tlbstate.active_mm)));
	load_cr3(swapper_pg_dir);
}
EXPORT_SYMBOL_GPL(leave_mm);

/*
 *
 * The flush IPI assumes that a thread switch happens in this order:
 * [cpu0: the cpu that switches]
 * 1) switch_mm() either 1a) or 1b)
 * 1a) thread switch to a different mm
 * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
 *	Stop ipi delivery for the old mm. This is not synchronized with
 *	the other cpus, but smp_invalidate_interrupt ignores flush ipis
 *	for the wrong mm, and in the worst case we perform a superfluous
 *	tlb flush.
 * 1a2) set cpu mmu_state to TLBSTATE_OK
 *	Now the smp_invalidate_interrupt won't call leave_mm if cpu0
 *	was in lazy tlb mode.
 * 1a3) update cpu active_mm
 *	Now cpu0 accepts tlb flushes for the new mm.
 * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
 *	Now the other cpus will send tlb flush ipis.
 * 1a5) change cr3.
 * 1b) thread switch without mm change
 *	cpu active_mm is correct, cpu0 already handles
 *	flush ipis.
 * 1b1) set cpu mmu_state to TLBSTATE_OK
 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
 *	Atomically set the bit [other cpus will start sending flush ipis],
 *	and test the bit.
 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
 * 2) switch %%esp, ie current
 *
 * The interrupt must handle 2 special cases:
 * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
 * - the cpu performs speculative tlb reads, i.e. even if the cpu only
 *   runs in kernel space, the cpu could load tlb entries for user space
 *   pages.
 *
 * The good news is that cpu mmu_state is local to each cpu, no
 * write/read ordering problems.
 */

/*
 * TLB flush IPI:
 *
 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
 * 2) Leave the mm if we are in the lazy tlb mode.
 *
 * Interrupts are disabled.
 */

/*
 * IPI handler: decode the sender's slot from the interrupt vector,
 * perform the requested flush (or detach from the mm when lazy), and
 * acknowledge by clearing this CPU's bit in the slot's cpumask.
 */
#ifdef CONFIG_X86_64
asmlinkage
#endif
void smp_invalidate_interrupt(struct pt_regs *regs)
{
	unsigned int cpu;
	unsigned int sender;
	union smp_flush_state *f;

	cpu = smp_processor_id();
	/*
	 * orig_rax contains the negated interrupt vector.
	 * Use that to determine where the sender put the data.
	 */
	sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
	f = &flush_state[sender];

	/* Spurious IPI (not targeted at us): just ack and return. */
	if (!cpumask_test_cpu(cpu, to_cpumask(f->flush_cpumask)))
		goto out;
	/*
	 * This was a BUG() but until someone can quote me the
	 * line from the intel manual that guarantees an IPI to
	 * multiple CPUs is retried _only_ on the erroring CPUs
	 * its staying as a return
	 *
	 * BUG();
	 */

	if (f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) {
		if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
			if (f->flush_va == TLB_FLUSH_ALL)
				local_flush_tlb();
			else
				__flush_tlb_one(f->flush_va);
		} else
			/* Lazy TLB mode: detach instead of flushing. */
			leave_mm(cpu);
	}
out:
	ack_APIC_irq();
	/*
	 * Clearing our bit releases the sender's busy-wait in
	 * flush_tlb_others_ipi(); the barriers order the flush above
	 * against the bit clear, and the clear against what follows.
	 */
	smp_mb__before_clear_bit();
	cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask));
	smp_mb__after_clear_bit();
	inc_irq_stat(irq_tlb_count);
}

/*
 * Send a flush IPI for (mm, va) to every CPU in @cpumask except the
 * caller, then wait for all of them to acknowledge.  Caller must have
 * preemption disabled.
 */
static void flush_tlb_others_ipi(const struct cpumask *cpumask,
				 struct mm_struct *mm, unsigned long va)
{
	unsigned int sender;
	union smp_flush_state *f;

	/* Caller has disabled preemption; hash this CPU onto a vector slot. */
	sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
	f = &flush_state[sender];

	/*
	 * Could avoid this lock when
	 * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
	 * probably not worth checking this for a cache-hot lock.
	 */
	raw_spin_lock(&f->tlbstate_lock);

	f->flush_mm = mm;
	f->flush_va = va;
	if (cpumask_andnot(to_cpumask(f->flush_cpumask), cpumask, cpumask_of(smp_processor_id()))) {
		/*
		 * We have to send the IPI only to
		 * CPUs affected.
		 */
		apic->send_IPI_mask(to_cpumask(f->flush_cpumask),
			      INVALIDATE_TLB_VECTOR_START + sender);

		/*
		 * Spin until every target has cleared its bit in
		 * smp_invalidate_interrupt(); only then may this slot's
		 * flush_mm/flush_va be reused.
		 */
		while (!cpumask_empty(to_cpumask(f->flush_cpumask)))
			cpu_relax();
	}

	f->flush_mm = NULL;
	f->flush_va = 0;
	raw_spin_unlock(&f->tlbstate_lock);
}

/*
 * Flush (mm, va) on all CPUs in @cpumask other than the caller.  On UV
 * systems the hardware broadcast assist is tried first; it returns the
 * subset of CPUs that still need a conventional IPI, or NULL if none do.
 */
void native_flush_tlb_others(const struct cpumask *cpumask,
			     struct mm_struct *mm, unsigned long va)
{
	if (is_uv_system()) {
		unsigned int cpu;

		cpu = get_cpu();
		cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu);
		if (cpumask)
			flush_tlb_others_ipi(cpumask, mm, va);
		put_cpu();
		return;
	}
	flush_tlb_others_ipi(cpumask, mm, va);
}

/* One-time boot init: set up the per-vector sender locks. */
static int __cpuinit init_smp_flush(void)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(flush_state); i++)
		raw_spin_lock_init(&flush_state[i].tlbstate_lock);

	return 0;
}
core_initcall(init_smp_flush);

/* Flush the entire TLB for the current task's mm on all CPUs using it. */
void flush_tlb_current_task(void)
{
	struct mm_struct *mm = current->mm;

	preempt_disable();

	local_flush_tlb();
	/* Only IPI if some other CPU also has this mm in its cpumask. */
	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
		flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL);
	preempt_enable();
}

/* Flush all TLB entries belonging to @mm, locally and on other CPUs. */
void flush_tlb_mm(struct mm_struct *mm)
{
	preempt_disable();

	if (current->active_mm == mm) {
		if (current->mm)
			local_flush_tlb();
		else
			/* Lazy TLB (kernel thread): detach rather than flush. */
			leave_mm(smp_processor_id());
	}
	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
		flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL);

	preempt_enable();
}

/* Flush the single page at @va in @vma's mm, locally and on other CPUs. */
void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
{
	struct mm_struct *mm = vma->vm_mm;

	preempt_disable();

	if (current->active_mm == mm) {
		if (current->mm)
			__flush_tlb_one(va);
		else
			/* Lazy TLB (kernel thread): detach rather than flush. */
			leave_mm(smp_processor_id());
	}

	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
		flush_tlb_others(mm_cpumask(mm), mm, va);

	preempt_enable();
}

/* Per-CPU worker for flush_tlb_all(): full flush, then drop lazy mms. */
static void do_flush_tlb_all(void *info)
{
	__flush_tlb_all();
	if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
		leave_mm(smp_processor_id());
}

/* Flush the TLB on every online CPU, including the caller (waits). */
void flush_tlb_all(void)
{
	on_each_cpu(do_flush_tlb_all, NULL, 1);
}