1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef _LINUX_PSI_TYPES_H
3#define _LINUX_PSI_TYPES_H
4
5#include <linux/kthread.h>
6#include <linux/seqlock.h>
7#include <linux/types.h>
8#include <linux/kref.h>
9#include <linux/wait.h>
10
11#ifdef CONFIG_PSI
12
/*
 * Tracked task states: per-group counters of how many tasks are in
 * each state. These enumerators index psi_group_cpu::tasks[].
 */
enum psi_task_count {
	NR_IOWAIT,
	NR_MEMSTALL,
	NR_RUNNING,
	/*
	 * For IO and CPU stalls the presence of running/oncpu tasks
	 * in the domain means a partial rather than a full stall.
	 * For memory it's not so simple because of page reclaimers:
	 * they are running/oncpu while representing a stall. To tell
	 * whether a domain has productivity left or not, we need to
	 * distinguish between regular running (i.e. productive)
	 * threads and memstall ones.
	 */
	NR_MEMSTALL_RUNNING,
	/*
	 * Spelled out explicitly so the counter-array size is visible
	 * at a glance; must equal the number of enumerators above.
	 */
	NR_PSI_TASK_COUNTS = 4,
};
30
/*
 * Task state bitmasks: one bit per psi_task_count entry, used to
 * describe a task's state set when it enters/leaves a PSI group.
 */
#define TSK_IOWAIT	(1 << NR_IOWAIT)
#define TSK_MEMSTALL	(1 << NR_MEMSTALL)
#define TSK_RUNNING	(1 << NR_RUNNING)
#define TSK_MEMSTALL_RUNNING	(1 << NR_MEMSTALL_RUNNING)

/*
 * Only one task can be scheduled on a CPU at a time, so TSK_ONCPU
 * needs no per-group counter; it takes the next free bit after the
 * counted states.
 */
#define TSK_ONCPU	(1 << NR_PSI_TASK_COUNTS)
39
/*
 * Resources that workloads could be stalled on.
 *
 * Note: PSI_IRQ only exists with CONFIG_IRQ_TIME_ACCOUNTING, so the
 * value of NR_PSI_RESOURCES (and the numbering of anything placed
 * after PSI_CPU) is config-dependent.
 */
enum psi_res {
	PSI_IO,
	PSI_MEM,
	PSI_CPU,
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	PSI_IRQ,
#endif
	NR_PSI_RESOURCES,
};
50
/*
 * Pressure states for each resource:
 *
 * SOME: Stalled tasks & working tasks
 * FULL: Stalled tasks & no working tasks
 *
 * IO, MEM and CPU each contribute a SOME/FULL pair; IRQ only has a
 * FULL state (and only with CONFIG_IRQ_TIME_ACCOUNTING). Several
 * arrays below are sized NR_PSI_STATES - 1 to exclude the per-CPU
 * only PSI_NONIDLE entry, which must therefore stay last before
 * NR_PSI_STATES.
 */
enum psi_states {
	PSI_IO_SOME,
	PSI_IO_FULL,
	PSI_MEM_SOME,
	PSI_MEM_FULL,
	PSI_CPU_SOME,
	PSI_CPU_FULL,
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	PSI_IRQ_FULL,
#endif
	/* Only per-CPU, to weigh the CPU in the global average: */
	PSI_NONIDLE,
	NR_PSI_STATES,
};
71
/*
 * Use one bit in the state mask (psi_group_cpu::state_mask) above the
 * regular psi_states bits to track TSK_ONCPU.
 */
#define PSI_ONCPU	(1 << NR_PSI_STATES)

/* Flag whether to re-arm avgs_work, see details in get_recent_times() */
#define PSI_STATE_RESCHEDULE	(1 << (NR_PSI_STATES + 1))
77
/*
 * The two consumers of the per-CPU time buckets: the periodic
 * averages worker (PSI_AVGS) and the RT trigger-polling worker
 * (PSI_POLL). Each keeps its own delta-detection snapshot, see
 * psi_group_cpu::times_prev[].
 */
enum psi_aggregators {
	PSI_AVGS = 0,
	PSI_POLL,
	NR_PSI_AGGREGATORS,
};
83
/*
 * Per-CPU task-state tracking for one PSI group. Split across two
 * cachelines: the first is written by the scheduler hotpath, the
 * second only by the (much less frequent) aggregator, so the two
 * writers don't bounce each other's lines.
 */
struct psi_group_cpu {
	/* 1st cacheline updated by the scheduler */

	/*
	 * Aggregator needs to know of concurrent changes: the scheduler
	 * writes under this seqcount, the aggregator retries its reads
	 * until it gets a consistent snapshot.
	 */
	seqcount_t seq ____cacheline_aligned_in_smp;

	/* States of the tasks belonging to this group */
	unsigned int tasks[NR_PSI_TASK_COUNTS];

	/* Aggregate pressure state derived from the tasks */
	u32 state_mask;

	/* Period time sampling buckets for each state of interest (ns) */
	u32 times[NR_PSI_STATES];

	/* Time of last task change in this group (rq_clock) */
	u64 state_start;

	/* 2nd cacheline updated by the aggregator */

	/*
	 * Delta detection against the sampling buckets: one snapshot
	 * per aggregator, so AVGS and POLL can consume independently.
	 */
	u32 times_prev[NR_PSI_AGGREGATORS][NR_PSI_STATES]
			____cacheline_aligned_in_smp;
};
108
/*
 * PSI growth tracking window: measures how much stall time accrued
 * within a sliding time window, used by triggers to compare growth
 * against their threshold.
 */
struct psi_window {
	/* Window size in ns */
	u64 size;

	/* Start time of the current window in ns */
	u64 start_time;

	/* Value at the start of the window */
	u64 start_value;

	/* Value growth in the previous window */
	u64 prev_growth;
};
123
/*
 * A userspace-configured pressure trigger: fires an event when the
 * monitored state's stall time grows by more than @threshold within
 * the tracking window.
 */
struct psi_trigger {
	/* PSI state being monitored by the trigger */
	enum psi_states state;

	/* User-specified threshold in ns */
	u64 threshold;

	/* List node inside triggers list */
	struct list_head node;

	/* Backpointer needed during trigger destruction */
	struct psi_group *group;

	/* Wait queue for polling */
	wait_queue_head_t event_wait;

	/* Kernfs file for cgroup triggers */
	struct kernfs_open_file *of;

	/* Pending event flag */
	int event;

	/* Tracking window */
	struct psi_window win;

	/*
	 * Time last event was generated. Used for rate-limiting
	 * events to one per window
	 */
	u64 last_event_time;

	/* Deferred event(s) from previous ratelimit window */
	bool pending_event;

	/* Trigger type - PSI_AVGS for unprivileged, PSI_POLL for RT */
	enum psi_aggregators aggregator;
};
161
/*
 * One PSI accounting domain (the system, or a cgroup). Holds the
 * per-CPU state, the running averages maintained by avgs_work, and
 * the RT polling machinery driven by userspace triggers. Groups form
 * a hierarchy via @parent.
 */
struct psi_group {
	/* Parent domain; NULL for the root group — TODO confirm against psi.c */
	struct psi_group *parent;
	/* Whether accounting is active for this group */
	bool enabled;

	/* Protects data used by the aggregator */
	struct mutex avgs_lock;

	/* Per-cpu task state & time tracking */
	struct psi_group_cpu __percpu *pcpu;

	/*
	 * Running pressure averages. Arrays sized NR_PSI_STATES - 1
	 * exclude PSI_NONIDLE, which is only meaningful per-CPU.
	 */
	u64 avg_total[NR_PSI_STATES - 1];
	u64 avg_last_update;
	u64 avg_next_update;

	/* Aggregator work control */
	struct delayed_work avgs_work;

	/* Unprivileged triggers against N*PSI_FREQ windows */
	struct list_head avg_triggers;
	u32 avg_nr_triggers[NR_PSI_STATES - 1];

	/* Total stall times and sampled pressure averages */
	u64 total[NR_PSI_AGGREGATORS][NR_PSI_STATES - 1];
	unsigned long avg[NR_PSI_STATES - 1][3];

	/* Monitor RT polling work control */
	struct task_struct __rcu *rtpoll_task;
	struct timer_list rtpoll_timer;
	wait_queue_head_t rtpoll_wait;
	atomic_t rtpoll_wakeup;
	atomic_t rtpoll_scheduled;

	/* Protects data used by the monitor */
	struct mutex rtpoll_trigger_lock;

	/* Configured RT polling triggers */
	struct list_head rtpoll_triggers;
	u32 rtpoll_nr_triggers[NR_PSI_STATES - 1];
	/* Bitmask of states any RT trigger is watching */
	u32 rtpoll_states;
	/* Shortest trigger window; sets the polling period */
	u64 rtpoll_min_period;

	/* Total stall times at the start of RT polling monitor activation */
	u64 rtpoll_total[NR_PSI_STATES - 1];
	u64 rtpoll_next_update;
	u64 rtpoll_until;
};
209
210#else /* CONFIG_PSI */
211
/* !CONFIG_PSI stubs: no resources tracked, empty group type. */
#define NR_PSI_RESOURCES	0

struct psi_group { };
215
216#endif /* CONFIG_PSI */
217
218#endif /* _LINUX_PSI_TYPES_H */
219