/*
 * Copyright (c) 2010, 2014 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */

#include <linux/sched.h>
#include <linux/mutex.h>
#include <asm/atomic.h>

#include "mlx4.h"

#if defined(CONFIG_X86) && defined(CONFIG_APM_MODULE)

/* Each CPU is put into a group.  In most cases, the group number is
 * equal to the CPU number of one of the CPUs in the group.  The
 * exception is group NR_CPUS, which is the default group.  This is
 * protected by sys_tune_startup_mutex. */
DEFINE_PER_CPU(int, idle_cpu_group) = NR_CPUS;

/* For each group, a count of the number of CPUs in the group which
 * are known to be busy.  A busy CPU might be running the busy loop
 * below or general kernel code.  The count is decremented on entry to
 * the old pm_idle handler and incremented on exit.  The aim is to
 * avoid the count going to zero or negative.  This situation can
 * occur temporarily during module unload or CPU hot-plug, but
 * normality will be restored when the affected CPUs next exit the
 * idle loop. */
static atomic_t busy_cpu_count[NR_CPUS + 1];

/* A workqueue item to be executed to cause the CPU to exit from the
 * idle loop. */
DEFINE_PER_CPU(struct work_struct, sys_tune_cpu_work);

/* Optional state-tracking hook; a no-op in this build. */
#define sys_tune_set_state(CPU, STATE) \
	do { } while (0)


/* A mutex to protect most of the module data structures. */
static DEFINE_MUTEX(sys_tune_startup_mutex);

/* The old pm_idle handler. */
static void (*old_pm_idle)(void) = NULL;

static void sys_tune_pm_idle(void)
{
	atomic_t *busy_cpus_ptr;
	int busy_cpus;
	int cpu = smp_processor_id();

	busy_cpus_ptr = &(busy_cpu_count[per_cpu(idle_cpu_group, cpu)]);

	sys_tune_set_state(cpu, 2);

	local_irq_enable();
	while (!need_resched()) {
		busy_cpus = atomic_read(busy_cpus_ptr);

		/* If other CPUs in this group are busy then let this
		 * CPU go idle.  We mustn't let the number of busy
		 * CPUs drop below 1.
		 */
		if (busy_cpus > 1 &&
		    old_pm_idle != NULL &&
		    (atomic_cmpxchg(busy_cpus_ptr, busy_cpus,
				    busy_cpus - 1) == busy_cpus)) {
			local_irq_disable();
			sys_tune_set_state(cpu, 3);
			/* This check might not be necessary, but it
			 * seems safest to include it because there
			 * might be a kernel version which requires
			 * it. */
			if (need_resched())
				local_irq_enable();
			else
				old_pm_idle();
			/* This CPU is busy again. */
			sys_tune_set_state(cpu, 1);
			atomic_add(1, busy_cpus_ptr);
			return;
		}

		cpu_relax();
	}
	sys_tune_set_state(cpu, 0);
}


void sys_tune_work_func(struct work_struct *work)
{
	/* Do nothing.  Since this function is running in process
	 * context, the idle thread isn't running on this CPU. */
}


#ifdef CONFIG_SMP
static void sys_tune_smp_call(void *info)
{
	schedule_work(&get_cpu_var(sys_tune_cpu_work));
	put_cpu_var(sys_tune_cpu_work);
}
#endif


#ifdef CONFIG_SMP
static void sys_tune_refresh(void)
{
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26)
	on_each_cpu(&sys_tune_smp_call, NULL, 0, 1);
#else
	on_each_cpu(&sys_tune_smp_call, NULL, 1);
#endif
}
#else
static void sys_tune_refresh(void)
{
	/* The current thread is executing on the one and only CPU so
	 * the idle thread isn't running. */
}
#endif


static int sys_tune_cpu_group(int cpu)
{
#ifdef CONFIG_SMP
	const cpumask_t *mask;
	int other_cpu;
	int group;

#if defined(topology_thread_cpumask) && defined(ST_HAVE_EXPORTED_CPU_SIBLING_MAP)
	/* Keep one hyperthread busy per core. */
	mask = topology_thread_cpumask(cpu);
#else
	return cpu;
#endif
	/* Walk the siblings with a separate variable so that "cpu" is
	 * preserved for the fallback return below. */
	for_each_cpu_mask(other_cpu, *(mask)) {
		group = per_cpu(idle_cpu_group, other_cpu);
		if (group != NR_CPUS)
			return group;
	}
#endif

	return cpu;
}


static void sys_tune_add_cpu(int cpu)
{
	int group;

	/* Do nothing if this CPU has already been added. */
	if (per_cpu(idle_cpu_group, cpu) != NR_CPUS)
		return;

	group = sys_tune_cpu_group(cpu);
	per_cpu(idle_cpu_group, cpu) = group;
	atomic_inc(&(busy_cpu_count[group]));
}

static void sys_tune_del_cpu(int cpu)
{
	int group;

	if (per_cpu(idle_cpu_group, cpu) == NR_CPUS)
		return;

	group = per_cpu(idle_cpu_group, cpu);
	/* If the CPU was busy, this can cause the count to drop to
	 * zero.  To rectify this, we need to cause one of the other
	 * CPUs in the group to exit the idle loop.  If the CPU was
	 * not busy then this causes the contribution for this CPU to
	 * go to -1, which can cause the overall count to drop to zero
	 * or go negative.  To rectify this situation we need to cause
	 * this CPU to exit the idle loop. */
	atomic_dec(&(busy_cpu_count[group]));
	per_cpu(idle_cpu_group, cpu) = NR_CPUS;
}


static int sys_tune_cpu_notify(struct notifier_block *self,
			       unsigned long action, void *hcpu)
{
	int cpu = (long)hcpu;

	switch (action) {
#ifdef CPU_ONLINE_FROZEN
	case CPU_ONLINE_FROZEN:
#endif
	case CPU_ONLINE:
		mutex_lock(&sys_tune_startup_mutex);
		sys_tune_add_cpu(cpu);
		mutex_unlock(&sys_tune_startup_mutex);
		/* The CPU might have already entered the idle loop in
		 * the wrong group.  Make sure it exits the idle loop
		 * so that it picks up the correct group.
*/ 226 sys_tune_refresh(); 227 break; 228 229#ifdef CPU_DEAD_FROZEN 230 case CPU_DEAD_FROZEN: 231#endif 232 case CPU_DEAD: 233 mutex_lock(&sys_tune_startup_mutex); 234 sys_tune_del_cpu(cpu); 235 mutex_unlock(&sys_tune_startup_mutex); 236 /* The deleted CPU may have been the only busy CPU in 237 * the group. Make sure one of the other CPUs in the 238 * group exits the idle loop. */ 239 sys_tune_refresh(); 240 break; 241 } 242 return NOTIFY_OK; 243} 244 245 246static struct notifier_block sys_tune_cpu_nb = { 247 .notifier_call = sys_tune_cpu_notify, 248}; 249 250 251static void sys_tune_ensure_init(void) 252{ 253 BUG_ON (old_pm_idle != NULL); 254 255 /* Atomically update pm_idle to &sys_tune_pm_idle. The old value 256 * is stored in old_pm_idle before installing the new 257 * handler. */ 258 do { 259 old_pm_idle = pm_idle; 260 } while (cmpxchg(&pm_idle, old_pm_idle, &sys_tune_pm_idle) != 261 old_pm_idle); 262} 263#endif 264 265void sys_tune_fini(void) 266{ 267#if defined(CONFIG_X86) && defined(CONFIG_APM_MODULE) 268 void (*old)(void); 269 int cpu; 270 271 unregister_cpu_notifier(&sys_tune_cpu_nb); 272 273 mutex_lock(&sys_tune_startup_mutex); 274 275 276 old = cmpxchg(&pm_idle, &sys_tune_pm_idle, old_pm_idle); 277 278 for_each_online_cpu(cpu) 279 sys_tune_del_cpu(cpu); 280 281 mutex_unlock(&sys_tune_startup_mutex); 282 283 /* Our handler may still be executing on other CPUs. 284 * Schedule this thread on all CPUs to make sure all 285 * idle threads get interrupted. */ 286 sys_tune_refresh(); 287 288 /* Make sure the work item has finished executing on all CPUs. 289 * This in turn ensures that all idle threads have been 290 * interrupted. */ 291 flush_scheduled_work(); 292#endif /* CONFIG_X86 */ 293} 294 295void sys_tune_init(void) 296{ 297#if defined(CONFIG_X86) && defined(CONFIG_APM_MODULE) 298 int cpu; 299 300 for_each_possible_cpu(cpu) { 301 INIT_WORK(&per_cpu(sys_tune_cpu_work, cpu), 302 sys_tune_work_func); 303 } 304 305 /* Start by registering the handler to ensure we don't miss 306 * any updates. */ 307 register_cpu_notifier(&sys_tune_cpu_nb); 308 309 mutex_lock(&sys_tune_startup_mutex); 310 311 for_each_online_cpu(cpu) 312 sys_tune_add_cpu(cpu); 313 314 sys_tune_ensure_init(); 315 316 317 mutex_unlock(&sys_tune_startup_mutex); 318 319 /* Ensure our idle handler starts to run. */ 320 sys_tune_refresh(); 321#endif 322} 323 324