Lines Matching defs:group

22  * lowest level group contains CPUs, the next level groups of CPU groups
43 * Each group has a designated migrator CPU/group as long as a CPU/group is
44 * active in the group. This designated role is necessary to prevent all
45 * active CPUs in a group from trying to migrate expired timers from other CPUs,
48 * When a CPU is awake, it checks in its own timer tick the group
53 * If it finds expired timers in one of the group queues it pulls them over
55 * group and the parent groups if required.
60 * CPU does not queue an event in the LVL0 group. If the next migratable
62 * in the LVL0 group. In both cases the CPU marks itself idle in the LVL0
63 * group.
65 * When a CPU comes out of idle and when a group has at least a single active
73 * If the CPU is the migrator of the group then it delegates that role to
74 * the next active CPU in the group or sets migrator to TMIGR_NONE when
75 * there is no active CPU in the group. This delegation needs to be
89 * not destroyed when a group becomes empty due to offlining. The group
99 * child and parent group. The lock ordering is always bottom up. This also
101 * active CPU/group information, atomic_try_cmpxchg() is used instead and only
111 * Protection of the tmigr group state information:
117 * lockless and group-wise. The following scenario describes what happens
135 * 1. CPU0 goes idle. As the update is performed group-wise, in the first step
213 * first global timer of an idle CPU, the group and child states have to be read
249 * child going idle in top level group, the expiry of the next group event
286 * top level group.
328 * update of the group state from active path is no problem, as the upcoming CPU
329 * will take care of the group events.
360 * 2. CPU2 starts to expire remote timers. It starts with LVL0 group
390 * in GRP0:0's timerqueue and therefore set in the CPU field of the group
414 * of the group as migrator and any needed updates within the hierarchy.
434 * Returns true when @childmask corresponds to the group migrator or when the
435 * group is not active, i.e. no migrator is set.
437 static bool tmigr_check_migrator(struct tmigr_group *group, u8 childmask)
441 s.state = atomic_read(&group->migr_state);
449 static bool tmigr_check_migrator_and_lonely(struct tmigr_group *group, u8 childmask)
455 s.state = atomic_read(&group->migr_state);
466 static bool tmigr_check_lonely(struct tmigr_group *group)
471 s.state = atomic_read(&group->migr_state);
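
The three check helpers above all work on a single snapshot of group->migr_state. Below is a minimal user-space model of that pattern, assuming a packed state word similar in spirit to the kernel's migration state union; the field layout, the model type names and the TMIGR_NONE value are illustrative, not copied from the kernel headers.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define TMIGR_NONE 0xff        /* "no migrator assigned" marker (illustrative) */

    union grpstate {
        uint32_t state;            /* whole word, read and updated atomically */
        struct {
            uint8_t migrator;      /* childmask bit of the migrator child */
            uint8_t active;        /* bitmask of currently active children */
            uint8_t seq;           /* bumped on every state update */
        };
    };

    struct grp_model {
        _Atomic uint32_t migr_state;
    };

    /* True when @childmask is the migrator, or when no migrator is set at all */
    static bool check_migrator(struct grp_model *group, uint8_t childmask)
    {
        union grpstate s;

        s.state = atomic_load(&group->migr_state);
        return (s.migrator == childmask) || (s.migrator == TMIGR_NONE);
    }

The "lonely" variants read the same snapshot and additionally check that at most one bit is set in the active mask, i.e. that the caller's child is the only active one.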
483 struct tmigr_group *child = NULL, *group = tmc->tmgroup;
486 WARN_ON_ONCE(group->level >= tmigr_hierarchy_levels);
488 if (up(group, child, data))
491 child = group;
492 group = group->parent;
493 } while (group);
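
The do/while fragments above are the bottom-up hierarchy walk. A hedged reconstruction of the loop shape, assuming the kernel's internal types and the up() callback convention visible here (a callback returning true terminates the walk):

    typedef bool (*up_f)(struct tmigr_group *group, struct tmigr_group *child, void *data);

    static void walk_up(struct tmigr_cpu *tmc, up_f up, void *data)
    {
        struct tmigr_group *child = NULL, *group = tmc->tmgroup;

        do {
            WARN_ON_ONCE(group->level >= tmigr_hierarchy_levels);

            if (up(group, child, data))     /* callback says: nothing left to propagate */
                break;

            child = group;                  /* this level becomes the child ... */
            group = group->parent;          /* ... and the walk continues one level up */
        } while (group);
    }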
512 * child group)
513 * @childmask: childmask of child group
533 * @childmask: childmask of child group
552 * Returns the next event of the timerqueue @group->events
554 * Removes timers with the ignore flag set and updates next_expiry of the group. Values
555 * of the group event are updated in tmigr_update_events() only.
557 static struct tmigr_event *tmigr_next_groupevt(struct tmigr_group *group)
562 lockdep_assert_held(&group->lock);
564 WRITE_ONCE(group->next_expiry, KTIME_MAX);
566 while ((node = timerqueue_getnext(&group->events))) {
570 WRITE_ONCE(group->next_expiry, evt->nextevt.expires);
575 * Remove next timers with ignore flag, because the group lock
578 if (!timerqueue_del(&group->events, node))
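
A hedged reconstruction of the scan these fragments belong to, assuming the kernel timerqueue API and a struct tmigr_event that embeds its timerqueue node as nextevt, as the fragments suggest; the helper name is illustrative:

    static struct tmigr_event *next_groupevt(struct tmigr_group *group)
    {
        struct timerqueue_node *node;
        struct tmigr_event *evt;

        lockdep_assert_held(&group->lock);

        /* Assume nothing is pending until a live event is found */
        WRITE_ONCE(group->next_expiry, KTIME_MAX);

        while ((node = timerqueue_getnext(&group->events))) {
            evt = container_of(node, struct tmigr_event, nextevt);

            if (!evt->ignore) {
                WRITE_ONCE(group->next_expiry, evt->nextevt.expires);
                return evt;
            }

            /* Drop events carrying the ignore flag; the group lock is held anyway */
            if (!timerqueue_del(&group->events, node))
                break;              /* queue became empty */
        }

        return NULL;
    }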
590 static struct tmigr_event *tmigr_next_expired_groupevt(struct tmigr_group *group,
593 struct tmigr_event *evt = tmigr_next_groupevt(group);
599 * The event is ready to expire. Remove it and update next group event.
601 timerqueue_del(&group->events, &evt->nextevt);
602 tmigr_next_groupevt(group);
607 static u64 tmigr_next_groupevt_expires(struct tmigr_group *group)
611 evt = tmigr_next_groupevt(group);
619 static bool tmigr_active_up(struct tmigr_group *group,
631 * tmigr_inactive_up(), as the group state change does not depend on the
634 curstate.state = atomic_read(&group->migr_state);
650 } while (!atomic_try_cmpxchg(&group->migr_state, &curstate.state, newstate.state));
652 if ((walk_done == false) && group->parent)
653 data->childmask = group->childmask;
656 * The group is active (again). The group event might still be queued
657 * into the parent group's timerqueue but can now be handled by the
658 * migrator of this group. Therefore the ignore flag for the group event
662 * worst case the migrator of the parent group observes the change too
663 * late and expires remotely all events belonging to this group. The
667 group->groupevt.ignore = true;
669 trace_tmigr_group_set_cpu_active(group, newstate, childmask);
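
Continuing the user-space model from the sketch after the check helpers, the update done on the active path can be modelled as a compare-and-exchange retry loop; atomic_compare_exchange_weak() stands in for atomic_try_cmpxchg() and refreshes the snapshot on a lost race. The return convention (true means the walk is done) follows the up() callbacks:

    static bool group_set_active(struct grp_model *group, uint8_t childmask)
    {
        union grpstate cur, new;
        bool walk_done;

        cur.state = atomic_load(&group->migr_state);

        do {
            new = cur;
            walk_done = true;

            if (new.migrator == TMIGR_NONE) {
                /* First active child claims the migrator role ... */
                new.migrator = childmask;
                /* ... and the change has to be made visible one level up */
                walk_done = false;
            }

            new.active |= childmask;    /* mark this child active */
            new.seq++;                  /* let readers detect the update */

            /* On failure, cur is reloaded and the update is recomputed */
        } while (!atomic_compare_exchange_weak(&group->migr_state, &cur.state, new.state));

        return walk_done;
    }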
715 * The child and group states need to be read under the lock, to prevent a race
720 * This is the only place where the group event expiry value is set.
723 bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
734 raw_spin_lock_nested(&group->lock, SINGLE_DEPTH_NESTING);
737 groupstate.state = atomic_read(&group->migr_state);
768 * return is possible if there is a parent, sparing group
771 * within the group and update next_expiry accordingly.
774 * single level so @group is the top level group, make sure the
775 * first event information of the group is updated properly and
778 if (evt->ignore && !remote && group->parent)
781 raw_spin_lock(&group->lock);
784 groupstate.state = atomic_read(&group->migr_state);
788 * If the child event is already queued in the group, remove it from the
798 if (!timerqueue_del(&group->events, &evt->nextevt))
799 WRITE_ONCE(group->next_expiry, KTIME_MAX);
806 * the group is already active, there is no need to walk the
807 * hierarchy even if there is a parent group.
810 * if a remote timer handling was executed before and the group
812 * an enqueued timer in the non-active group. The enqueued timer
813 * of the group needs to be propagated to a higher level to
822 if (timerqueue_add(&group->events, &evt->nextevt))
823 WRITE_ONCE(group->next_expiry, nextexp);
827 if (!group->parent && (groupstate.migrator == TMIGR_NONE)) {
832 * handling. The first timer in the top level group which needs to be
833 * handled when the top level group is not active is calculated
840 * The top level group is idle and it has to be ensured the
846 data->firstexp = tmigr_next_groupevt_expires(group);
849 trace_tmigr_update_events(child, group, childstate, groupstate,
853 raw_spin_unlock(&group->lock);
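
The locking fragments in tmigr_update_events() follow the bottom-up lock ordering mentioned earlier. A sketch of that ordering, assuming the kernel spinlock API; SINGLE_DEPTH_NESTING tells lockdep that holding two locks of the same lock class (child first, then parent) is intentional here:

    if (child) {
        raw_spin_lock(&child->lock);
        raw_spin_lock_nested(&group->lock, SINGLE_DEPTH_NESTING);
    } else {
        raw_spin_lock(&group->lock);
    }

    /* ... re-evaluate the child event and group->next_expiry here ... */

    raw_spin_unlock(&group->lock);
    if (child)
        raw_spin_unlock(&child->lock);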
861 static bool tmigr_new_timer_up(struct tmigr_group *group,
867 return tmigr_update_events(group, child, data);
953 * the timerqueue and group events.
996 static bool tmigr_handle_remote_up(struct tmigr_group *group,
1011 trace_tmigr_handle_remote(group);
1014 * Handle the group only if @childmask is the migrator or if the
1015 * group has no migrator. Otherwise the group is active and is
1018 if (!tmigr_check_migrator(group, childmask))
1021 raw_spin_lock_irq(&group->lock);
1023 evt = tmigr_next_expired_groupevt(group, now);
1028 raw_spin_unlock_irq(&group->lock);
1038 * of the first event that needs to be handled (group->next_expiry was
1042 data->childmask = group->childmask;
1043 data->firstexp = group->next_expiry;
1045 raw_spin_unlock_irq(&group->lock);
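
A condensed sketch of the remote-expiry step these fragments come from; the callback signature is simplified, expire_timers_on() is a hypothetical stand-in for the per-CPU remote expiry handling, and the additional bookkeeping the real code does between iterations is omitted:

    static bool handle_remote_up(struct tmigr_group *group, u8 childmask, u64 now,
                                 struct tmigr_remote_data *data)
    {
        struct tmigr_event *evt;

    again:
        /* Skip the group if another, active child is its migrator */
        if (!tmigr_check_migrator(group, childmask))
            return true;

        raw_spin_lock_irq(&group->lock);

        evt = tmigr_next_expired_groupevt(group, now);
        if (evt) {
            unsigned int remote_cpu = evt->cpu;

            raw_spin_unlock_irq(&group->lock);

            expire_timers_on(remote_cpu);   /* hypothetical helper */
            goto again;                     /* more events may have expired meanwhile */
        }

        /* Nothing (more) to expire: note where the walk continues and when */
        data->childmask = group->childmask;
        data->firstexp = group->next_expiry;

        raw_spin_unlock_irq(&group->lock);
        return false;
    }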
1098 static bool tmigr_requires_handle_remote_up(struct tmigr_group *group,
1108 * Handle the group only if the child is the migrator or if the group
1109 * has no migrator. Otherwise the group is active and is handled by its
1112 if (!tmigr_check_migrator(group, childmask))
1116 * When there is a parent group and the CPU which triggered the
1118 * group before reading the next_expiry value.
1120 if (group->parent && !data->tmc_active)
1130 data->firstexp = READ_ONCE(group->next_expiry);
1136 raw_spin_lock(&group->lock);
1137 data->firstexp = group->next_expiry;
1138 if (data->now >= group->next_expiry) {
1140 raw_spin_unlock(&group->lock);
1143 raw_spin_unlock(&group->lock);
1148 data->childmask = group->childmask;
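
A sketch of the next_expiry sampling done in the fragments above, reduced to that step (the migrator check and the parent/tmc_active shortcut are left out). The split between a plain READ_ONCE() and a locked read presumably exists because a 64bit load cannot tear, while a 32bit architecture needs the group lock to read the u64 consistently with a concurrent writer; the check flag field is assumed from context:

    static bool remote_check_needed(struct tmigr_group *group, struct tmigr_remote_data *data)
    {
        if (IS_ENABLED(CONFIG_64BIT)) {
            /* A 64bit read of next_expiry cannot tear, so no lock is needed */
            data->firstexp = READ_ONCE(group->next_expiry);
            if (data->now >= data->firstexp) {
                data->check = true;
                return true;
            }
        } else {
            raw_spin_lock(&group->lock);
            data->firstexp = group->next_expiry;
            if (data->now >= group->next_expiry) {
                data->check = true;
                raw_spin_unlock(&group->lock);
                return true;
            }
            raw_spin_unlock(&group->lock);
        }

        /* Nothing due yet in this group: continue the walk with its childmask */
        data->childmask = group->childmask;
        return false;
    }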
1246 static bool tmigr_inactive_up(struct tmigr_group *group,
1260 * to make sure the updates of child and group states are ordered. The
1261 * ordering is mandatory, as the group state change depends on the child
1264 curstate.state = atomic_read_acquire(&group->migr_state);
1279 * Find a new migrator for the group, because the child
1280 * group is idle!
1302 if (atomic_try_cmpxchg(&group->migr_state, &curstate.state,
1308 * tmigr_active_up() to make sure the updates of child and group
1318 tmigr_update_events(group, child, data);
1320 if (group->parent && (walk_done == false))
1321 data->childmask = group->childmask;
1327 * - group is the top level group and
1328 * - group is idle (which means the CPU was the last active CPU in the
1332 WARN_ON_ONCE(data->firstexp != KTIME_MAX && group->parent);
1334 trace_tmigr_group_set_cpu_inactive(group, newstate, childmask);
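
Continuing the user-space model from the earlier sketches, the inactive path can be modelled as the mirror image of group_set_active(): the acquire load reflects the ordering requirement spelled out in the comments above (the group update must observe the previously published child state), and a new migrator is picked when the departing child held that role. Migrator selection is reduced to "lowest remaining active bit":

    static bool group_set_inactive(struct grp_model *group, uint8_t childmask)
    {
        union grpstate cur, new;
        bool walk_done;

        cur.state = atomic_load_explicit(&group->migr_state, memory_order_acquire);

        do {
            new = cur;
            walk_done = true;

            new.active &= ~childmask;       /* this child is no longer active */

            if (new.migrator == childmask) {
                if (new.active) {
                    /* Hand the migrator role to another active child (lowest set bit) */
                    new.migrator = new.active & -new.active;
                } else {
                    /* Group went fully idle: propagate that one level up */
                    new.migrator = TMIGR_NONE;
                    walk_done = false;
                }
            }

            new.seq++;
        } while (!atomic_compare_exchange_weak(&group->migr_state, &cur.state, new.state));

        return walk_done;
    }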
1400 * the only one in the level 0 group; and if it is the
1401 * only one in the level 0 group, but there is more than a
1402 * single group active on the way to the top level)
1404 * or when on the way to top in every group only a single
1414 struct tmigr_group *group = tmc->tmgroup;
1426 if (!tmigr_check_lonely(group)) {
1435 nextevt = min_t(u64, nextevt, READ_ONCE(group->next_expiry));
1436 if (!group->parent)
1439 group = group->parent;
1440 } while (group);
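
A hedged reconstruction of the quick-check walk around these fragments, assuming the kernel types: as long as every group on the path up has at most one active child, the smallest next_expiry seen on the way is a usable estimate for the first global event; otherwise another active CPU up there will take over migrator duties and KTIME_MAX is returned.

    static u64 quick_check_walk(struct tmigr_cpu *tmc, u64 nextevt)
    {
        struct tmigr_group *group = tmc->tmgroup;

        do {
            /* Someone else is active up here and will handle the global timers */
            if (!tmigr_check_lonely(group))
                return KTIME_MAX;

            /* Racy read is acceptable; this is only a best-effort estimate */
            nextevt = min_t(u64, nextevt, READ_ONCE(group->next_expiry));

            if (!group->parent)
                return nextevt;

            group = group->parent;
        } while (group);

        return nextevt;
    }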
1445 static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl,
1450 raw_spin_lock_init(&group->lock);
1452 group->level = lvl;
1453 group->numa_node = lvl < tmigr_crossnode_level ? node : NUMA_NO_NODE;
1455 group->num_children = 0;
1460 atomic_set(&group->migr_state, s.state);
1462 timerqueue_init_head(&group->events);
1463 timerqueue_init(&group->groupevt.nextevt);
1464 group->groupevt.nextevt.expires = KTIME_MAX;
1465 WRITE_ONCE(group->next_expiry, KTIME_MAX);
1466 group->groupevt.ignore = true;
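
In the user-space model from the earlier sketches, the "freshly initialized" state set up by these fragments would look roughly as below: no migrator, no active children, and the group's expiry values parked at KTIME_MAX so they never win a comparison. The field values are inferred from the fragments, not copied from the kernel.

    /* group: struct grp_model * from the model above */
    union grpstate s = { 0 };

    s.migrator = TMIGR_NONE;   /* nobody is responsible for this group yet */
    s.active   = 0;            /* no child has marked itself active */
    s.seq      = 0;

    atomic_store(&group->migr_state, s.state);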
1472 struct tmigr_group *tmp, *group = NULL;
1476 /* Try to attach to an existing group first */
1480 * this group belongs to the same NUMA node.
1491 * siblings end up in the same group of the lowest level of the
1496 group = tmp;
1500 if (group)
1501 return group;
1503 /* Allocate and set up a new group */
1504 group = kzalloc_node(sizeof(*group), GFP_KERNEL, node);
1505 if (!group)
1508 tmigr_init_group(group, lvl, node);
1511 list_add(&group->list, &tmigr_level_list[lvl]);
1512 trace_tmigr_group_set(group);
1513 return group;
1535 * in the parent group:
1542 * * But if a new group above the current top level is required, it is
1545 * executed with the formerly top level group (child) and the newly
1546 * created group (parent).
1566 struct tmigr_group *group, *child, **stack;
1575 group = tmigr_get_group(cpu, node, i);
1576 if (IS_ERR(group)) {
1577 err = PTR_ERR(group);
1582 stack[i++] = group;
1590 * single group.
1592 if (group->parent || i == tmigr_hierarchy_levels ||
1600 group = stack[--i];
1603 list_del(&group->list);
1604 kfree(group);
1608 WARN_ON_ONCE(i != group->level);
1611 * Update tmc -> group / child -> group connection
1616 raw_spin_lock_irq(&group->lock);
1618 tmc->tmgroup = group;
1619 tmc->childmask = BIT(group->num_children++);
1621 raw_spin_unlock_irq(&group->lock);
1629 tmigr_connect_child_parent(child, group);
1639 if (group->num_children == 1 && list_is_singular(lvllist)) {
1645 tmigr_connect_child_parent(child, group);
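
The childmask handed out above, BIT(group->num_children++), is the bit that later indexes the active and migrator bitfields of the group state, which is why it is assigned under the group lock. Connecting a child group to its parent presumably mirrors the per-CPU registration with the same lock nesting rule as in tmigr_update_events(); propagation of an already active child into the new parent is omitted from this sketch:

    raw_spin_lock_irq(&child->lock);
    raw_spin_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING);

    child->parent    = parent;
    child->childmask = BIT(parent->num_children++);   /* 0x01 for the first child, 0x02 next, ... */

    raw_spin_unlock(&parent->lock);
    raw_spin_unlock_irq(&child->lock);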
1779 * If a NUMA node spawns more than one CPU level group, then the next
1794 pr_info("Timer migration: %d hierarchy levels; %d children per group;"