/*
 * sched.c - SPU scheduler.
 *
 * Copyright (C) IBM 2005
 * Author: Mark Nutter <mnutter@us.ibm.com>
 *
 * 2006-03-31	NUMA domains added.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#undef DEBUG

#include <linux/module.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/completion.h>
#include <linux/vmalloc.h>
#include <linux/smp.h>
#include <linux/stddef.h>
#include <linux/unistd.h>
#include <linux/numa.h>
#include <linux/mutex.h>
#include <linux/notifier.h>

#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/spu.h>
#include <asm/spu_csa.h>
#include <asm/spu_priv1.h>
#include "spufs.h"

#define SPU_TIMESLICE	(HZ)

struct spu_prio_array {
	DECLARE_BITMAP(bitmap, MAX_PRIO);
	struct list_head runq[MAX_PRIO];
	spinlock_t runq_lock;
	struct list_head active_list[MAX_NUMNODES];
	struct mutex active_mutex[MAX_NUMNODES];
};

static struct spu_prio_array *spu_prio;
static struct workqueue_struct *spu_sched_wq;

static inline int node_allowed(int node)
{
	cpumask_t mask;

	if (!nr_cpus_node(node))
		return 0;
	mask = node_to_cpumask(node);
	if (!cpus_intersects(mask, current->cpus_allowed))
		return 0;
	return 1;
}

void spu_start_tick(struct spu_context *ctx)
{
	if (ctx->policy == SCHED_RR) {
		/*
		 * Make sure the exiting bit is cleared.
		 */
		clear_bit(SPU_SCHED_EXITING, &ctx->sched_flags);
		mb();
		queue_delayed_work(spu_sched_wq, &ctx->sched_work, SPU_TIMESLICE);
	}
}

void spu_stop_tick(struct spu_context *ctx)
{
	if (ctx->policy == SCHED_RR) {
		/*
		 * The tick work normally rearms itself; setting this flag
		 * makes sure it does not rearm itself anymore.
		 */
		set_bit(SPU_SCHED_EXITING, &ctx->sched_flags);
		mb();
		cancel_delayed_work(&ctx->sched_work);
	}
}

/**
 * spu_add_to_active_list - add spu to active list
 * @spu: spu to add to the active list
 */
static void spu_add_to_active_list(struct spu *spu)
{
	mutex_lock(&spu_prio->active_mutex[spu->node]);
	list_add_tail(&spu->list, &spu_prio->active_list[spu->node]);
	mutex_unlock(&spu_prio->active_mutex[spu->node]);
}

/**
 * spu_remove_from_active_list - remove spu from active list
 * @spu: spu to remove from the active list
 */
static void spu_remove_from_active_list(struct spu *spu)
{
	int node = spu->node;

	mutex_lock(&spu_prio->active_mutex[node]);
	list_del_init(&spu->list);
	mutex_unlock(&spu_prio->active_mutex[node]);
}

static BLOCKING_NOTIFIER_HEAD(spu_switch_notifier);

static void spu_switch_notify(struct spu *spu, struct spu_context *ctx)
{
	blocking_notifier_call_chain(&spu_switch_notifier,
			    ctx ? ctx->object_id : 0, spu);
}

int spu_switch_event_register(struct notifier_block * n)
{
	return blocking_notifier_chain_register(&spu_switch_notifier, n);
}

int spu_switch_event_unregister(struct notifier_block * n)
{
	return blocking_notifier_chain_unregister(&spu_switch_notifier, n);
}
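/*
 * Illustrative sketch only: a client such as a profiler could watch
 * context switches by registering on the notifier chain above.  The names
 * below are made up; the callback receives the context's object_id as the
 * notifier value and the struct spu as the data pointer:
 *
 *	static int my_spu_switch_notify(struct notifier_block *nb,
 *					unsigned long object_id, void *data)
 *	{
 *		struct spu *spu = data;
 *
 *		pr_debug("spu %d now runs object %lx\n",
 *			 spu->number, object_id);
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block my_spu_switch_nb = {
 *		.notifier_call = my_spu_switch_notify,
 *	};
 *
 *	...
 *	spu_switch_event_register(&my_spu_switch_nb);
 *	...
 *	spu_switch_event_unregister(&my_spu_switch_nb);
 */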
/**
 * spu_bind_context - bind spu context to physical spu
 * @spu: physical spu to bind to
 * @ctx: context to bind
 */
static void spu_bind_context(struct spu *spu, struct spu_context *ctx)
{
	pr_debug("%s: pid=%d SPU=%d NODE=%d\n", __FUNCTION__, current->pid,
		 spu->number, spu->node);
	spu->ctx = ctx;
	spu->flags = 0;
	ctx->spu = spu;
	ctx->ops = &spu_hw_ops;
	spu->pid = current->pid;
	spu_associate_mm(spu, ctx->owner);
	spu->ibox_callback = spufs_ibox_callback;
	spu->wbox_callback = spufs_wbox_callback;
	spu->stop_callback = spufs_stop_callback;
	spu->mfc_callback = spufs_mfc_callback;
	spu->dma_callback = spufs_dma_callback;
	mb();
	spu_unmap_mappings(ctx);
	spu_restore(&ctx->csa, spu);
	spu->timestamp = jiffies;
	spu_cpu_affinity_set(spu, raw_smp_processor_id());
	spu_switch_notify(spu, ctx);
	spu_add_to_active_list(spu);
	ctx->state = SPU_STATE_RUNNABLE;
}

/**
 * spu_unbind_context - unbind spu context from physical spu
 * @spu: physical spu to unbind from
 * @ctx: context to unbind
 */
static void spu_unbind_context(struct spu *spu, struct spu_context *ctx)
{
	pr_debug("%s: unbind pid=%d SPU=%d NODE=%d\n", __FUNCTION__,
		 spu->pid, spu->number, spu->node);

	spu_remove_from_active_list(spu);
	spu_switch_notify(spu, NULL);
	spu_unmap_mappings(ctx);
	spu_save(&ctx->csa, spu);
	spu->timestamp = jiffies;
	ctx->state = SPU_STATE_SAVED;
	spu->ibox_callback = NULL;
	spu->wbox_callback = NULL;
	spu->stop_callback = NULL;
	spu->mfc_callback = NULL;
	spu->dma_callback = NULL;
	spu_associate_mm(spu, NULL);
	spu->pid = 0;
	ctx->ops = &spu_backing_ops;
	ctx->spu = NULL;
	spu->flags = 0;
	spu->ctx = NULL;
}

/**
 * __spu_add_to_rq - add a context to the runqueue
 * @ctx: context to add
 */
static void __spu_add_to_rq(struct spu_context *ctx)
{
	int prio = ctx->prio;

	list_add_tail(&ctx->rq, &spu_prio->runq[prio]);
	set_bit(prio, spu_prio->bitmap);
}

static void __spu_del_from_rq(struct spu_context *ctx)
{
	int prio = ctx->prio;

	if (!list_empty(&ctx->rq))
		list_del_init(&ctx->rq);
	if (list_empty(&spu_prio->runq[prio]))
		clear_bit(prio, spu_prio->bitmap);
}
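/**
 * spu_prio_wait - sleep until a spu becomes available
 * @ctx: context to put on the runqueue
 *
 * Put @ctx on the runqueue and sleep until another context frees up a
 * spu and wakes us through ctx->stop_wq.  The context's state_mutex is
 * dropped while we sleep so that other operations on @ctx are not
 * blocked for the whole wait; the runq_lock only covers the runqueue
 * manipulation itself.
 */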
static void spu_prio_wait(struct spu_context *ctx)
{
	DEFINE_WAIT(wait);

	spin_lock(&spu_prio->runq_lock);
	prepare_to_wait_exclusive(&ctx->stop_wq, &wait, TASK_INTERRUPTIBLE);
	if (!signal_pending(current)) {
		__spu_add_to_rq(ctx);
		spin_unlock(&spu_prio->runq_lock);
		mutex_unlock(&ctx->state_mutex);
		schedule();
		mutex_lock(&ctx->state_mutex);
		spin_lock(&spu_prio->runq_lock);
		__spu_del_from_rq(ctx);
	}
	spin_unlock(&spu_prio->runq_lock);
	__set_current_state(TASK_RUNNING);
	remove_wait_queue(&ctx->stop_wq, &wait);
}

static struct spu *spu_get_idle(struct spu_context *ctx)
{
	struct spu *spu = NULL;
	int node = cpu_to_node(raw_smp_processor_id());
	int n;

	for (n = 0; n < MAX_NUMNODES; n++, node++) {
		node = (node < MAX_NUMNODES) ? node : 0;
		if (!node_allowed(node))
			continue;
		spu = spu_alloc_node(node);
		if (spu)
			break;
	}
	return spu;
}

/**
 * find_victim - find a lower priority context to preempt
 * @ctx: candidate context for running
 *
 * Returns the freed physical spu to run the new context on.
 */
static struct spu *find_victim(struct spu_context *ctx)
{
	struct spu_context *victim = NULL;
	struct spu *spu;
	int node, n;

	/*
	 * Look for a possible preemption candidate on the local node first.
	 * If there is no candidate look at the other nodes.  This isn't
	 * exactly fair, but so far the whole spu scheduler tries to keep
	 * a strong node affinity.  We might want to fine-tune this in
	 * the future.
	 */
 restart:
	node = cpu_to_node(raw_smp_processor_id());
	for (n = 0; n < MAX_NUMNODES; n++, node++) {
		node = (node < MAX_NUMNODES) ? node : 0;
		if (!node_allowed(node))
			continue;

		mutex_lock(&spu_prio->active_mutex[node]);
		list_for_each_entry(spu, &spu_prio->active_list[node], list) {
			struct spu_context *tmp = spu->ctx;

			if (tmp->rt_priority < ctx->rt_priority &&
			    (!victim || tmp->rt_priority < victim->rt_priority))
				victim = spu->ctx;
		}
		mutex_unlock(&spu_prio->active_mutex[node]);

		if (victim) {
			/*
			 * This nests ctx->state_mutex, but we always lock
			 * higher priority contexts before lower priority
			 * ones, so this is safe until we introduce
			 * priority inheritance schemes.
			 */
			if (!mutex_trylock(&victim->state_mutex)) {
				victim = NULL;
				goto restart;
			}

			spu = victim->spu;
			if (!spu) {
				/*
				 * This race can happen because we've dropped
				 * the active list mutex.  Not a problem, just
				 * restart the search.
				 */
				mutex_unlock(&victim->state_mutex);
				victim = NULL;
				goto restart;
			}
			spu_unbind_context(spu, victim);
			mutex_unlock(&victim->state_mutex);
			/*
			 * We need to break out of the wait loop in spu_run
			 * manually to ensure this context gets put on the
			 * runqueue again ASAP.
			 */
			wake_up(&victim->stop_wq);
			return spu;
		}
	}

	return NULL;
}

/**
 * spu_activate - find a free spu for a context and execute it
 * @ctx: spu context to schedule
 * @flags: flags (currently ignored)
 *
 * Tries to find a free spu to run @ctx.  If no free spu is available,
 * add the context to the runqueue so it gets woken up once an spu
 * is available.
 */
int spu_activate(struct spu_context *ctx, unsigned long flags)
{
	if (ctx->spu)
		return 0;

	do {
		struct spu *spu;

		spu = spu_get_idle(ctx);
		/*
		 * If this is a realtime thread we try to get it running by
		 * preempting a lower priority thread.
		 */
		if (!spu && ctx->rt_priority)
			spu = find_victim(ctx);
		if (spu) {
			spu_bind_context(spu, ctx);
			return 0;
		}

		spu_prio_wait(ctx);
	} while (!signal_pending(current));

	return -ERESTARTSYS;
}
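/*
 * Note on the runqueue ordering: as in the core scheduler, a numerically
 * lower prio value means a higher priority, so sched_find_first_bit()
 * on the prio bitmap yields the best queued priority.  For example, with
 * contexts queued at prio 100 and prio 120, grab_runnable_context(110)
 * would pick up the prio 100 context and leave the prio 120 one queued.
 */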
/**
 * grab_runnable_context - try to find a runnable context
 * @prio: only grab a context with a priority value numerically below
 *        (i.e. better than) @prio
 *
 * Remove the highest priority context on the runqueue and return it
 * to the caller.  Returns %NULL if no runnable context was found.
 */
static struct spu_context *grab_runnable_context(int prio)
{
	struct spu_context *ctx = NULL;
	int best;

	spin_lock(&spu_prio->runq_lock);
	best = sched_find_first_bit(spu_prio->bitmap);
	if (best < prio) {
		struct list_head *rq = &spu_prio->runq[best];

		BUG_ON(list_empty(rq));

		ctx = list_entry(rq->next, struct spu_context, rq);
		__spu_del_from_rq(ctx);
	}
	spin_unlock(&spu_prio->runq_lock);

	return ctx;
}

static int __spu_deactivate(struct spu_context *ctx, int force, int max_prio)
{
	struct spu *spu = ctx->spu;
	struct spu_context *new = NULL;

	if (spu) {
		new = grab_runnable_context(max_prio);
		if (new || force) {
			spu_unbind_context(spu, ctx);
			spu_free(spu);
			if (new)
				wake_up(&new->stop_wq);
		}
	}

	return new != NULL;
}

/**
 * spu_deactivate - unbind a context from its physical spu
 * @ctx: spu context to unbind
 *
 * Unbind @ctx from the physical spu it is running on and schedule
 * the highest priority context to run on the freed physical spu.
 */
void spu_deactivate(struct spu_context *ctx)
{
	__spu_deactivate(ctx, 1, MAX_PRIO);
}

/**
 * spu_yield - yield a physical spu if others are waiting
 * @ctx: spu context to yield
 *
 * Check if there is a higher priority context waiting and if yes
 * unbind @ctx from the physical spu and schedule the highest
 * priority context to run on the freed physical spu instead.
 */
void spu_yield(struct spu_context *ctx)
{
	if (!(ctx->flags & SPU_CREATE_NOSCHED)) {
		mutex_lock(&ctx->state_mutex);
		__spu_deactivate(ctx, 0, MAX_PRIO);
		mutex_unlock(&ctx->state_mutex);
	}
}
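/*
 * Timeslice flow (sketch, not literal call sites): spufs arms the tick
 * when a SCHED_RR context starts running and cancels it when it stops:
 *
 *	spu_start_tick(ctx);	queues sched_work, which fires after
 *				SPU_TIMESLICE jiffies
 *	... context runs ...
 *	spu_stop_tick(ctx);	sets SPU_SCHED_EXITING and cancels the
 *				pending work
 *
 * If the work does fire, spu_sched_tick() below either hands the spu to
 * a waiting context of equal or higher priority or re-arms itself via
 * spu_start_tick().
 */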
void spu_sched_tick(struct work_struct *work)
{
	struct spu_context *ctx =
		container_of(work, struct spu_context, sched_work.work);
	int preempted;

	/*
	 * If this context is being stopped avoid rescheduling from the
	 * scheduler tick because we would block on the state_mutex.
	 * The caller will yield the spu later on anyway.
	 */
	if (test_bit(SPU_SCHED_EXITING, &ctx->sched_flags))
		return;

	mutex_lock(&ctx->state_mutex);
	preempted = __spu_deactivate(ctx, 0, ctx->prio + 1);
	mutex_unlock(&ctx->state_mutex);

	if (preempted) {
		/*
		 * We need to break out of the wait loop in spu_run manually
		 * to ensure this context gets put on the runqueue again
		 * ASAP.
		 */
		wake_up(&ctx->stop_wq);
	} else {
		spu_start_tick(ctx);
	}
}

int __init spu_sched_init(void)
{
	int i;

	spu_sched_wq = create_singlethread_workqueue("spusched");
	if (!spu_sched_wq)
		return 1;

	spu_prio = kzalloc(sizeof(struct spu_prio_array), GFP_KERNEL);
	if (!spu_prio) {
		printk(KERN_WARNING "%s: Unable to allocate priority queue.\n",
		       __FUNCTION__);
		destroy_workqueue(spu_sched_wq);
		return 1;
	}
	for (i = 0; i < MAX_PRIO; i++) {
		INIT_LIST_HEAD(&spu_prio->runq[i]);
		__clear_bit(i, spu_prio->bitmap);
	}
	__set_bit(MAX_PRIO, spu_prio->bitmap);
	for (i = 0; i < MAX_NUMNODES; i++) {
		mutex_init(&spu_prio->active_mutex[i]);
		INIT_LIST_HEAD(&spu_prio->active_list[i]);
	}
	spin_lock_init(&spu_prio->runq_lock);
	return 0;
}

void __exit spu_sched_exit(void)
{
	struct spu *spu, *tmp;
	int node;

	for (node = 0; node < MAX_NUMNODES; node++) {
		mutex_lock(&spu_prio->active_mutex[node]);
		list_for_each_entry_safe(spu, tmp, &spu_prio->active_list[node],
					 list) {
			list_del_init(&spu->list);
			spu_free(spu);
		}
		mutex_unlock(&spu_prio->active_mutex[node]);
	}
	kfree(spu_prio);
	destroy_workqueue(spu_sched_wq);
}