1254721Semaste/* SPDX-License-Identifier: GPL-2.0 */ 2254721Semaste/* 3254721Semaste * linux/cgroup-defs.h - basic definitions for cgroup 4254721Semaste * 5254721Semaste * This file provides basic type and interface. Include this file directly 6254721Semaste * only if necessary to avoid cyclic dependencies. 7254721Semaste */ 8254721Semaste#ifndef _LINUX_CGROUP_DEFS_H 9254721Semaste#define _LINUX_CGROUP_DEFS_H 10254721Semaste 11254721Semaste#include <linux/limits.h> 12254721Semaste#include <linux/list.h> 13254721Semaste#include <linux/idr.h> 14254721Semaste#include <linux/wait.h> 15254721Semaste#include <linux/mutex.h> 16254721Semaste#include <linux/rcupdate.h> 17254721Semaste#include <linux/refcount.h> 18254721Semaste#include <linux/percpu-refcount.h> 19254721Semaste#include <linux/percpu-rwsem.h> 20254721Semaste#include <linux/u64_stats_sync.h> 21254721Semaste#include <linux/workqueue.h> 22254721Semaste#include <linux/bpf-cgroup-defs.h> 23254721Semaste#include <linux/psi_types.h> 24254721Semaste 25254721Semaste#ifdef CONFIG_CGROUPS 26254721Semaste 27254721Semastestruct cgroup; 28254721Semastestruct cgroup_root; 29254721Semastestruct cgroup_subsys; 30254721Semastestruct cgroup_taskset; 31254721Semastestruct kernfs_node; 32254721Semastestruct kernfs_ops; 33254721Semastestruct kernfs_open_file; 34254721Semastestruct seq_file; 35254721Semastestruct poll_table_struct; 36254721Semaste 37254721Semaste#define MAX_CGROUP_TYPE_NAMELEN 32 38269024Semaste#define MAX_CGROUP_ROOT_NAMELEN 64 39254721Semaste#define MAX_CFTYPE_NAME 64 40254721Semaste 41254721Semaste/* define the enumeration of all cgroup subsystems */ 42254721Semaste#define SUBSYS(_x) _x ## _cgrp_id, 43254721Semasteenum cgroup_subsys_id { 44254721Semaste#include <linux/cgroup_subsys.h> 45254721Semaste CGROUP_SUBSYS_COUNT, 46254721Semaste}; 47254721Semaste#undef SUBSYS 48254721Semaste 49254721Semaste/* bits in struct cgroup_subsys_state flags field */ 50254721Semasteenum { 51254721Semaste CSS_NO_REF = (1 << 0), /* 
no reference counting for this css */ 52254721Semaste CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */ 53254721Semaste CSS_RELEASED = (1 << 2), /* refcnt reached zero, released */ 54254721Semaste CSS_VISIBLE = (1 << 3), /* css is visible to userland */ 55254721Semaste CSS_DYING = (1 << 4), /* css is dying */ 56254721Semaste}; 57254721Semaste 58254721Semaste/* bits in struct cgroup flags field */ 59254721Semasteenum { 60254721Semaste /* Control Group requires release notifications to userspace */ 61254721Semaste CGRP_NOTIFY_ON_RELEASE, 62254721Semaste /* 63254721Semaste * Clone the parent's configuration when creating a new child 64254721Semaste * cpuset cgroup. For historical reasons, this option can be 65254721Semaste * specified at mount time and thus is implemented here. 66254721Semaste */ 67254721Semaste CGRP_CPUSET_CLONE_CHILDREN, 68254721Semaste 69254721Semaste /* Control group has to be frozen. */ 70254721Semaste CGRP_FREEZE, 71254721Semaste 72254721Semaste /* Cgroup is frozen. */ 73254721Semaste CGRP_FROZEN, 74254721Semaste 75254721Semaste /* Control group has to be killed. */ 76254721Semaste CGRP_KILL, 77254721Semaste}; 78254721Semaste 79254721Semaste/* cgroup_root->flags */ 80254721Semasteenum { 81254721Semaste CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */ 82254721Semaste CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */ 83254721Semaste 84254721Semaste /* 85254721Semaste * Consider namespaces as delegation boundaries. If this flag is 86254721Semaste * set, controller specific interface files in a namespace root 87254721Semaste * aren't writeable from inside the namespace. 88254721Semaste */ 89254721Semaste CGRP_ROOT_NS_DELEGATE = (1 << 3), 90254721Semaste 91254721Semaste /* 92254721Semaste * Reduce latencies on dynamic cgroup modifications such as task 93254721Semaste * migrations and controller on/offs by disabling percpu operation on 94254721Semaste * cgroup_threadgroup_rwsem. 
This makes hot path operations such as 95254721Semaste * forks and exits into the slow path and more expensive. 96254721Semaste * 97254721Semaste * The static usage pattern of creating a cgroup, enabling controllers, 98254721Semaste * and then seeding it with CLONE_INTO_CGROUP doesn't require write 99254721Semaste * locking cgroup_threadgroup_rwsem and thus doesn't benefit from 100254721Semaste * favordynmod. 101254721Semaste */ 102254721Semaste CGRP_ROOT_FAVOR_DYNMODS = (1 << 4), 103254721Semaste 104254721Semaste /* 105254721Semaste * Enable cpuset controller in v1 cgroup to use v2 behavior. 106254721Semaste */ 107254721Semaste CGRP_ROOT_CPUSET_V2_MODE = (1 << 16), 108254721Semaste 109254721Semaste /* 110254721Semaste * Enable legacy local memory.events. 111254721Semaste */ 112254721Semaste CGRP_ROOT_MEMORY_LOCAL_EVENTS = (1 << 17), 113254721Semaste 114254721Semaste /* 115254721Semaste * Enable recursive subtree protection 116254721Semaste */ 117254721Semaste CGRP_ROOT_MEMORY_RECURSIVE_PROT = (1 << 18), 118254721Semaste 119254721Semaste /* 120254721Semaste * Enable hugetlb accounting for the memory controller. 
121254721Semaste */ 122254721Semaste CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING = (1 << 19), 123254721Semaste}; 124254721Semaste 125254721Semaste/* cftype->flags */ 126254721Semasteenum { 127254721Semaste CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */ 128254721Semaste CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */ 129254721Semaste CFTYPE_NS_DELEGATABLE = (1 << 2), /* writeable beyond delegation boundaries */ 130254721Semaste 131254721Semaste CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ 132254721Semaste CFTYPE_WORLD_WRITABLE = (1 << 4), /* (DON'T USE FOR NEW FILES) S_IWUGO */ 133254721Semaste CFTYPE_DEBUG = (1 << 5), /* create when cgroup_debug */ 134254721Semaste 135254721Semaste /* internal flags, do not use outside cgroup core proper */ 136254721Semaste __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */ 137254721Semaste __CFTYPE_NOT_ON_DFL = (1 << 17), /* not on default hierarchy */ 138254721Semaste __CFTYPE_ADDED = (1 << 18), 139254721Semaste}; 140254721Semaste 141254721Semaste/* 142254721Semaste * cgroup_file is the handle for a file instance created in a cgroup which 143254721Semaste * is used, for example, to generate file changed notifications. This can 144254721Semaste * be obtained by setting cftype->file_offset. 145254721Semaste */ 146254721Semastestruct cgroup_file { 147254721Semaste /* do not access any fields from outside cgroup core */ 148254721Semaste struct kernfs_node *kn; 149254721Semaste unsigned long notified_at; 150254721Semaste struct timer_list notify_timer; 151254721Semaste}; 152254721Semaste 153254721Semaste/* 154254721Semaste * Per-subsystem/per-cgroup state maintained by the system. This is the 155254721Semaste * fundamental structural building block that controllers deal with. 156254721Semaste * 157254721Semaste * Fields marked with "PI:" are public and immutable and may be accessed 158254721Semaste * directly without synchronization. 
159254721Semaste */ 160254721Semastestruct cgroup_subsys_state { 161254721Semaste /* PI: the cgroup that this css is attached to */ 162254721Semaste struct cgroup *cgroup; 163254721Semaste 164254721Semaste /* PI: the cgroup subsystem that this css is attached to */ 165254721Semaste struct cgroup_subsys *ss; 166254721Semaste 167254721Semaste /* reference count - access via css_[try]get() and css_put() */ 168254721Semaste struct percpu_ref refcnt; 169254721Semaste 170254721Semaste /* siblings list anchored at the parent's ->children */ 171254721Semaste struct list_head sibling; 172254721Semaste struct list_head children; 173254721Semaste 174254721Semaste /* flush target list anchored at cgrp->rstat_css_list */ 175254721Semaste struct list_head rstat_css_node; 176254721Semaste 177254721Semaste /* 178254721Semaste * PI: Subsys-unique ID. 0 is unused and root is always 1. The 179254721Semaste * matching css can be looked up using css_from_id(). 180254721Semaste */ 181254721Semaste int id; 182254721Semaste 183254721Semaste unsigned int flags; 184254721Semaste 185254721Semaste /* 186254721Semaste * Monotonically increasing unique serial number which defines a 187254721Semaste * uniform order among all csses. It's guaranteed that all 188254721Semaste * ->children lists are in the ascending order of ->serial_nr and 189254721Semaste * used to allow interrupting and resuming iterations. 190254721Semaste */ 191254721Semaste u64 serial_nr; 192254721Semaste 193254721Semaste /* 194254721Semaste * Incremented by online self and children. Used to guarantee that 195254721Semaste * parents are not offlined before their children. 196254721Semaste */ 197254721Semaste atomic_t online_cnt; 198254721Semaste 199254721Semaste /* percpu_ref killing and RCU release */ 200254721Semaste struct work_struct destroy_work; 201254721Semaste struct rcu_work destroy_rwork; 202254721Semaste 203254721Semaste /* 204254721Semaste * PI: the parent css. 
Placed here for cache proximity to following 205254721Semaste * fields of the containing structure. 206254721Semaste */ 207254721Semaste struct cgroup_subsys_state *parent; 208254721Semaste}; 209254721Semaste 210254721Semaste/* 211254721Semaste * A css_set is a structure holding pointers to a set of 212254721Semaste * cgroup_subsys_state objects. This saves space in the task struct 213254721Semaste * object and speeds up fork()/exit(), since a single inc/dec and a 214254721Semaste * list_add()/del() can bump the reference count on the entire cgroup 215254721Semaste * set for a task. 216254721Semaste */ 217254721Semastestruct css_set { 218254721Semaste /* 219254721Semaste * Set of subsystem states, one for each subsystem. This array is 220254721Semaste * immutable after creation apart from the init_css_set during 221254721Semaste * subsystem registration (at boot time). 222254721Semaste */ 223254721Semaste struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; 224254721Semaste 225254721Semaste /* reference count */ 226254721Semaste refcount_t refcount; 227254721Semaste 228254721Semaste /* 229254721Semaste * For a domain cgroup, the following points to self. If threaded, 230254721Semaste * to the matching cset of the nearest domain ancestor. The 231254721Semaste * dom_cset provides access to the domain cgroup and its csses to 232254721Semaste * which domain level resource consumptions should be charged. 233254721Semaste */ 234254721Semaste struct css_set *dom_cset; 235254721Semaste 236254721Semaste /* the default cgroup associated with this css_set */ 237254721Semaste struct cgroup *dfl_cgrp; 238254721Semaste 239263363Semaste /* internal task count, protected by css_set_lock */ 240263363Semaste int nr_tasks; 241254721Semaste 242254721Semaste /* 243254721Semaste * Lists running through all tasks using this cgroup group. 244254721Semaste * mg_tasks lists tasks which belong to this cset but are in the 245254721Semaste * process of being migrated out or in. 
Protected by 246254721Semaste * css_set_lock, but, during migration, once tasks are moved to 247254721Semaste * mg_tasks, it can be read safely while holding cgroup_mutex. 248254721Semaste */ 249254721Semaste struct list_head tasks; 250254721Semaste struct list_head mg_tasks; 251254721Semaste struct list_head dying_tasks; 252254721Semaste 253254721Semaste /* all css_task_iters currently walking this cset */ 254254721Semaste struct list_head task_iters; 255254721Semaste 256254721Semaste /* 257254721Semaste * On the default hierarchy, ->subsys[ssid] may point to a css 258254721Semaste * attached to an ancestor instead of the cgroup this css_set is 259254721Semaste * associated with. The following node is anchored at 260254721Semaste * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to 261254721Semaste * iterate through all css's attached to a given cgroup. 262254721Semaste */ 263254721Semaste struct list_head e_cset_node[CGROUP_SUBSYS_COUNT]; 264254721Semaste 265254721Semaste /* all threaded csets whose ->dom_cset points to this cset */ 266254721Semaste struct list_head threaded_csets; 267254721Semaste struct list_head threaded_csets_node; 268254721Semaste 269254721Semaste /* 270254721Semaste * List running through all cgroup groups in the same hash 271254721Semaste * slot. Protected by css_set_lock 272254721Semaste */ 273254721Semaste struct hlist_node hlist; 274254721Semaste 275254721Semaste /* 276254721Semaste * List of cgrp_cset_links pointing at cgroups referenced from this 277254721Semaste * css_set. Protected by css_set_lock. 278254721Semaste */ 279254721Semaste struct list_head cgrp_links; 280254721Semaste 281254721Semaste /* 282254721Semaste * List of csets participating in the on-going migration either as 283254721Semaste * source or destination. Protected by cgroup_mutex. 
284254721Semaste */ 285254721Semaste struct list_head mg_src_preload_node; 286254721Semaste struct list_head mg_dst_preload_node; 287254721Semaste struct list_head mg_node; 288254721Semaste 289254721Semaste /* 290254721Semaste * If this cset is acting as the source of migration the following 291254721Semaste * two fields are set. mg_src_cgrp and mg_dst_cgrp are 292254721Semaste * respectively the source and destination cgroups of the on-going 293254721Semaste * migration. mg_dst_cset is the destination cset the target tasks 294254721Semaste * on this cset should be migrated to. Protected by cgroup_mutex. 295254721Semaste */ 296254721Semaste struct cgroup *mg_src_cgrp; 297254721Semaste struct cgroup *mg_dst_cgrp; 298254721Semaste struct css_set *mg_dst_cset; 299254721Semaste 300254721Semaste /* dead and being drained, ignore for migration */ 301254721Semaste bool dead; 302254721Semaste 303254721Semaste /* For RCU-protected deletion */ 304254721Semaste struct rcu_head rcu_head; 305254721Semaste}; 306254721Semaste 307254721Semastestruct cgroup_base_stat { 308254721Semaste struct task_cputime cputime; 309254721Semaste 310254721Semaste#ifdef CONFIG_SCHED_CORE 311254721Semaste u64 forceidle_sum; 312254721Semaste#endif 313254721Semaste}; 314254721Semaste 315254721Semaste/* 316254721Semaste * rstat - cgroup scalable recursive statistics. Accounting is done 317254721Semaste * per-cpu in cgroup_rstat_cpu which is then lazily propagated up the 318254721Semaste * hierarchy on reads. 319254721Semaste * 320254721Semaste * When a stat gets updated, the cgroup_rstat_cpu and its ancestors are 321254721Semaste * linked into the updated tree. On the following read, propagation only 322254721Semaste * considers and consumes the updated tree. This makes reading O(the 323254721Semaste * number of descendants which have been active since last read) instead of 324254721Semaste * O(the total number of descendants). 
325254721Semaste * 326254721Semaste * This is important because there can be a lot of (draining) cgroups which 327254721Semaste * aren't active and stat may be read frequently. The combination can 328254721Semaste * become very expensive. By propagating selectively, increasing reading 329254721Semaste * frequency decreases the cost of each read. 330254721Semaste * 331254721Semaste * This struct hosts both the fields which implement the above - 332254721Semaste * updated_children and updated_next - and the fields which track basic 333254721Semaste * resource statistics on top of it - bsync, bstat and last_bstat. 334254721Semaste */ 335254721Semastestruct cgroup_rstat_cpu { 336254721Semaste /* 337254721Semaste * ->bsync protects ->bstat. These are the only fields which get 338254721Semaste * updated in the hot path. 339254721Semaste */ 340254721Semaste struct u64_stats_sync bsync; 341254721Semaste struct cgroup_base_stat bstat; 342254721Semaste 343254721Semaste /* 344254721Semaste * Snapshots at the last reading. These are used to calculate the 345254721Semaste * deltas to propagate to the global counters. 346254721Semaste */ 347254721Semaste struct cgroup_base_stat last_bstat; 348254721Semaste 349254721Semaste /* 350254721Semaste * This field is used to record the cumulative per-cpu time of 351254721Semaste * the cgroup and its descendants. Currently it can be read via 352254721Semaste * eBPF/drgn etc, and we are still trying to determine how to 353254721Semaste * expose it in the cgroupfs interface. 354254721Semaste */ 355254721Semaste struct cgroup_base_stat subtree_bstat; 356254721Semaste 357254721Semaste /* 358254721Semaste * Snapshots at the last reading. These are used to calculate the 359254721Semaste * deltas to propagate to the per-cpu subtree_bstat. 
360254721Semaste */ 361254721Semaste struct cgroup_base_stat last_subtree_bstat; 362254721Semaste 363254721Semaste /* 364254721Semaste * Child cgroups with stat updates on this cpu since the last read 365254721Semaste * are linked on the parent's ->updated_children through 366254721Semaste * ->updated_next. 367254721Semaste * 368254721Semaste * In addition to being more compact, singly-linked list pointing 369254721Semaste * to the cgroup makes it unnecessary for each per-cpu struct to 370254721Semaste * point back to the associated cgroup. 371254721Semaste * 372254721Semaste * Protected by per-cpu cgroup_rstat_cpu_lock. 373254721Semaste */ 374254721Semaste struct cgroup *updated_children; /* terminated by self cgroup */ 375254721Semaste struct cgroup *updated_next; /* NULL iff not on the list */ 376254721Semaste}; 377254721Semaste 378254721Semastestruct cgroup_freezer_state { 379254721Semaste /* Should the cgroup and its descendants be frozen. */ 380254721Semaste bool freeze; 381254721Semaste 382254721Semaste /* Should the cgroup actually be frozen? */ 383254721Semaste int e_freeze; 384254721Semaste 385254721Semaste /* Fields below are protected by css_set_lock */ 386254721Semaste 387254721Semaste /* Number of frozen descendant cgroups */ 388254721Semaste int nr_frozen_descendants; 389254721Semaste 390254721Semaste /* 391254721Semaste * Number of tasks, which are counted as frozen: 392254721Semaste * frozen, SIGSTOPped, and PTRACEd. 393254721Semaste */ 394254721Semaste int nr_frozen_tasks; 395254721Semaste}; 396254721Semaste 397254721Semastestruct cgroup { 398254721Semaste /* self css with NULL ->ss, points back to this cgroup */ 399254721Semaste struct cgroup_subsys_state self; 400254721Semaste 401254721Semaste unsigned long flags; /* "unsigned long" so bitops work */ 402254721Semaste 403254721Semaste /* 404254721Semaste * The depth this cgroup is at. The root is at depth zero and each 405254721Semaste * step down the hierarchy increments the level. 
This along with 406254721Semaste * ancestors[] can determine whether a given cgroup is a 407254721Semaste * descendant of another without traversing the hierarchy. 408254721Semaste */ 409254721Semaste int level; 410254721Semaste 411254721Semaste /* Maximum allowed descent tree depth */ 412254721Semaste int max_depth; 413254721Semaste 414254721Semaste /* 415254721Semaste * Keep track of total numbers of visible and dying descent cgroups. 416254721Semaste * Dying cgroups are cgroups which were deleted by a user, 417254721Semaste * but are still existing because someone else is holding a reference. 418254721Semaste * max_descendants is a maximum allowed number of descent cgroups. 419254721Semaste * 420254721Semaste * nr_descendants and nr_dying_descendants are protected 421254721Semaste * by cgroup_mutex and css_set_lock. It's fine to read them holding 422254721Semaste * any of cgroup_mutex and css_set_lock; for writing both locks 423254721Semaste * should be held. 424254721Semaste */ 425254721Semaste int nr_descendants; 426254721Semaste int nr_dying_descendants; 427254721Semaste int max_descendants; 428254721Semaste 429254721Semaste /* 430254721Semaste * Each non-empty css_set associated with this cgroup contributes 431254721Semaste * one to nr_populated_csets. The counter is zero iff this cgroup 432254721Semaste * doesn't have any tasks. 433254721Semaste * 434254721Semaste * All children which have non-zero nr_populated_csets and/or 435254721Semaste * nr_populated_children of their own contribute one to either 436254721Semaste * nr_populated_domain_children or nr_populated_threaded_children 437254721Semaste * depending on their type. Each counter is zero iff all cgroups 438254721Semaste * of the type in the subtree proper don't have any tasks. 
439254721Semaste */ 440254721Semaste int nr_populated_csets; 441254721Semaste int nr_populated_domain_children; 442254721Semaste int nr_populated_threaded_children; 443254721Semaste 444254721Semaste int nr_threaded_children; /* # of live threaded child cgroups */ 445254721Semaste 446254721Semaste struct kernfs_node *kn; /* cgroup kernfs entry */ 447254721Semaste struct cgroup_file procs_file; /* handle for "cgroup.procs" */ 448254721Semaste struct cgroup_file events_file; /* handle for "cgroup.events" */ 449254721Semaste 450254721Semaste /* handles for "{cpu,memory,io,irq}.pressure" */ 451254721Semaste struct cgroup_file psi_files[NR_PSI_RESOURCES]; 452254721Semaste 453254721Semaste /* 454254721Semaste * The bitmask of subsystems enabled on the child cgroups. 455254721Semaste * ->subtree_control is the one configured through 456254721Semaste * "cgroup.subtree_control" while ->subtree_ss_mask is the effective 457254721Semaste * one which may have more subsystems enabled. Controller knobs 458254721Semaste * are made available iff it's enabled in ->subtree_control. 459254721Semaste */ 460254721Semaste u16 subtree_control; 461254721Semaste u16 subtree_ss_mask; 462254721Semaste u16 old_subtree_control; 463254721Semaste u16 old_subtree_ss_mask; 464254721Semaste 465254721Semaste /* Private pointers for each registered subsystem */ 466254721Semaste struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; 467254721Semaste 468254721Semaste struct cgroup_root *root; 469254721Semaste 470254721Semaste /* 471254721Semaste * List of cgrp_cset_links pointing at css_sets with tasks in this 472254721Semaste * cgroup. Protected by css_set_lock. 473254721Semaste */ 474254721Semaste struct list_head cset_links; 475254721Semaste 476254721Semaste /* 477254721Semaste * On the default hierarchy, a css_set for a cgroup with some 478254721Semaste * susbsys disabled will point to css's which are associated with 479254721Semaste * the closest ancestor which has the subsys enabled. 
The 480254721Semaste * following lists all css_sets which point to this cgroup's css 481254721Semaste * for the given subsystem. 482254721Semaste */ 483254721Semaste struct list_head e_csets[CGROUP_SUBSYS_COUNT]; 484254721Semaste 485254721Semaste /* 486254721Semaste * If !threaded, self. If threaded, it points to the nearest 487254721Semaste * domain ancestor. Inside a threaded subtree, cgroups are exempt 488254721Semaste * from process granularity and no-internal-task constraint. 489254721Semaste * Domain level resource consumptions which aren't tied to a 490254721Semaste * specific task are charged to the dom_cgrp. 491254721Semaste */ 492254721Semaste struct cgroup *dom_cgrp; 493254721Semaste struct cgroup *old_dom_cgrp; /* used while enabling threaded */ 494254721Semaste 495254721Semaste /* per-cpu recursive resource statistics */ 496254721Semaste struct cgroup_rstat_cpu __percpu *rstat_cpu; 497254721Semaste struct list_head rstat_css_list; 498254721Semaste 499254721Semaste /* 500254721Semaste * Add padding to separate the read mostly rstat_cpu and 501254721Semaste * rstat_css_list into a different cacheline from the following 502254721Semaste * rstat_flush_next and *bstat fields which can have frequent updates. 503254721Semaste */ 504254721Semaste CACHELINE_PADDING(_pad_); 505254721Semaste 506254721Semaste /* 507254721Semaste * A singly-linked list of cgroup structures to be rstat flushed. 508254721Semaste * This is a scratch field to be used exclusively by 509254721Semaste * cgroup_rstat_flush_locked() and protected by cgroup_rstat_lock. 
510254721Semaste */ 511254721Semaste struct cgroup *rstat_flush_next; 512254721Semaste 513254721Semaste /* cgroup basic resource statistics */ 514254721Semaste struct cgroup_base_stat last_bstat; 515254721Semaste struct cgroup_base_stat bstat; 516254721Semaste struct prev_cputime prev_cputime; /* for printing out cputime */ 517254721Semaste 518254721Semaste /* 519254721Semaste * list of pidlists, up to two for each namespace (one for procs, one 520254721Semaste * for tasks); created on demand. 521254721Semaste */ 522254721Semaste struct list_head pidlists; 523254721Semaste struct mutex pidlist_mutex; 524254721Semaste 525254721Semaste /* used to wait for offlining of csses */ 526254721Semaste wait_queue_head_t offline_waitq; 527254721Semaste 528254721Semaste /* used to schedule release agent */ 529254721Semaste struct work_struct release_agent_work; 530254721Semaste 531254721Semaste /* used to track pressure stalls */ 532254721Semaste struct psi_group *psi; 533254721Semaste 534254721Semaste /* used to store eBPF programs */ 535254721Semaste struct cgroup_bpf bpf; 536254721Semaste 537254721Semaste /* If there is block congestion on this cgroup. */ 538254721Semaste atomic_t congestion_count; 539254721Semaste 540254721Semaste /* Used to store internal freezer state */ 541254721Semaste struct cgroup_freezer_state freezer; 542254721Semaste 543254721Semaste#ifdef CONFIG_BPF_SYSCALL 544254721Semaste struct bpf_local_storage __rcu *bpf_cgrp_storage; 545254721Semaste#endif 546254721Semaste 547254721Semaste /* All ancestors including self */ 548254721Semaste struct cgroup *ancestors[]; 549254721Semaste}; 550254721Semaste 551254721Semaste/* 552254721Semaste * A cgroup_root represents the root of a cgroup hierarchy, and may be 553254721Semaste * associated with a kernfs_root to form an active hierarchy. This is 554254721Semaste * internal to cgroup core. Don't access directly from controllers. 
555254721Semaste */ 556254721Semastestruct cgroup_root { 557254721Semaste struct kernfs_root *kf_root; 558254721Semaste 559254721Semaste /* The bitmask of subsystems attached to this hierarchy */ 560254721Semaste unsigned int subsys_mask; 561254721Semaste 562254721Semaste /* Unique id for this hierarchy. */ 563254721Semaste int hierarchy_id; 564254721Semaste 565254721Semaste /* A list running through the active hierarchies */ 566254721Semaste struct list_head root_list; 567254721Semaste struct rcu_head rcu; /* Must be near the top */ 568254721Semaste 569254721Semaste /* 570254721Semaste * The root cgroup. The containing cgroup_root will be destroyed on its 571254721Semaste * release. cgrp->ancestors[0] will be used overflowing into the 572254721Semaste * following field. cgrp_ancestor_storage must immediately follow. 573254721Semaste */ 574254721Semaste struct cgroup cgrp; 575254721Semaste 576254721Semaste /* must follow cgrp for cgrp->ancestors[0], see above */ 577254721Semaste struct cgroup *cgrp_ancestor_storage; 578254721Semaste 579254721Semaste /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ 580254721Semaste atomic_t nr_cgrps; 581254721Semaste 582254721Semaste /* Hierarchy-specific flags */ 583254721Semaste unsigned int flags; 584254721Semaste 585254721Semaste /* The path to use for release notifications. 
*/ 586254721Semaste char release_agent_path[PATH_MAX]; 587254721Semaste 588254721Semaste /* The name for this hierarchy - may be empty */ 589254721Semaste char name[MAX_CGROUP_ROOT_NAMELEN]; 590254721Semaste}; 591254721Semaste 592254721Semaste/* 593254721Semaste * struct cftype: handler definitions for cgroup control files 594254721Semaste * 595254721Semaste * When reading/writing to a file: 596254721Semaste * - the cgroup to use is file->f_path.dentry->d_parent->d_fsdata 597254721Semaste * - the 'cftype' of the file is file->f_path.dentry->d_fsdata 598254721Semaste */ 599254721Semastestruct cftype { 600254721Semaste /* 601254721Semaste * By convention, the name should begin with the name of the 602254721Semaste * subsystem, followed by a period. Zero length string indicates 603254721Semaste * end of cftype array. 604254721Semaste */ 605254721Semaste char name[MAX_CFTYPE_NAME]; 606254721Semaste unsigned long private; 607254721Semaste 608254721Semaste /* 609254721Semaste * The maximum length of string, excluding trailing nul, that can 610254721Semaste * be passed to write. If < PAGE_SIZE-1, PAGE_SIZE-1 is assumed. 611254721Semaste */ 612254721Semaste size_t max_write_len; 613254721Semaste 614254721Semaste /* CFTYPE_* flags */ 615263363Semaste unsigned int flags; 616263363Semaste 617254721Semaste /* 618254721Semaste * If non-zero, should contain the offset from the start of css to 619254721Semaste * a struct cgroup_file field. cgroup will record the handle of 620254721Semaste * the created file into it. The recorded handle can be used as 621254721Semaste * long as the containing css remains accessible. 622254721Semaste */ 623254721Semaste unsigned int file_offset; 624254721Semaste 625254721Semaste /* 626254721Semaste * Fields used for internal bookkeeping. Initialized automatically 627254721Semaste * during registration. 
628254721Semaste */ 629254721Semaste struct cgroup_subsys *ss; /* NULL for cgroup core files */ 630254721Semaste struct list_head node; /* anchored at ss->cfts */ 631254721Semaste struct kernfs_ops *kf_ops; 632254721Semaste 633254721Semaste int (*open)(struct kernfs_open_file *of); 634254721Semaste void (*release)(struct kernfs_open_file *of); 635254721Semaste 636254721Semaste /* 637254721Semaste * read_u64() is a shortcut for the common case of returning a 638254721Semaste * single integer. Use it in place of read() 639254721Semaste */ 640254721Semaste u64 (*read_u64)(struct cgroup_subsys_state *css, struct cftype *cft); 641254721Semaste /* 642254721Semaste * read_s64() is a signed version of read_u64() 643254721Semaste */ 644254721Semaste s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft); 645254721Semaste 646254721Semaste /* generic seq_file read interface */ 647254721Semaste int (*seq_show)(struct seq_file *sf, void *v); 648254721Semaste 649254721Semaste /* optional ops, implement all or none */ 650254721Semaste void *(*seq_start)(struct seq_file *sf, loff_t *ppos); 651254721Semaste void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos); 652254721Semaste void (*seq_stop)(struct seq_file *sf, void *v); 653254721Semaste 654254721Semaste /* 655254721Semaste * write_u64() is a shortcut for the common case of accepting 656254721Semaste * a single integer (as parsed by simple_strtoull) from 657254721Semaste * userspace. Use in place of write(); return 0 or error. 
658254721Semaste */ 659254721Semaste int (*write_u64)(struct cgroup_subsys_state *css, struct cftype *cft, 660254721Semaste u64 val); 661254721Semaste /* 662254721Semaste * write_s64() is a signed version of write_u64() 663254721Semaste */ 664254721Semaste int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft, 665254721Semaste s64 val); 666254721Semaste 667254721Semaste /* 668254721Semaste * write() is the generic write callback which maps directly to 669254721Semaste * kernfs write operation and overrides all other operations. 670254721Semaste * Maximum write size is determined by ->max_write_len. Use 671254721Semaste * of_css/cft() to access the associated css and cft. 672254721Semaste */ 673254721Semaste ssize_t (*write)(struct kernfs_open_file *of, 674254721Semaste char *buf, size_t nbytes, loff_t off); 675254721Semaste 676254721Semaste __poll_t (*poll)(struct kernfs_open_file *of, 677254721Semaste struct poll_table_struct *pt); 678254721Semaste 679254721Semaste#ifdef CONFIG_DEBUG_LOCK_ALLOC 680254721Semaste struct lock_class_key lockdep_key; 681254721Semaste#endif 682254721Semaste}; 683254721Semaste 684254721Semaste/* 685254721Semaste * Control Group subsystem type. 
 * See Documentation/admin-guide/cgroup-v1/cgroups.rst for details
 */
struct cgroup_subsys {
	/* css lifecycle callbacks, invoked by the cgroup core */
	struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css);
	int (*css_online)(struct cgroup_subsys_state *css);
	void (*css_offline)(struct cgroup_subsys_state *css);
	void (*css_released)(struct cgroup_subsys_state *css);
	void (*css_free)(struct cgroup_subsys_state *css);
	void (*css_reset)(struct cgroup_subsys_state *css);
	void (*css_rstat_flush)(struct cgroup_subsys_state *css, int cpu);
	int (*css_extra_stat_show)(struct seq_file *seq,
				   struct cgroup_subsys_state *css);
	int (*css_local_stat_show)(struct seq_file *seq,
				   struct cgroup_subsys_state *css);

	/* task migration callbacks (can_/cancel_ pair brackets attach) */
	int (*can_attach)(struct cgroup_taskset *tset);
	void (*cancel_attach)(struct cgroup_taskset *tset);
	void (*attach)(struct cgroup_taskset *tset);
	void (*post_attach)(void);
	/* fork/exit notification callbacks */
	int (*can_fork)(struct task_struct *task,
			struct css_set *cset);
	void (*cancel_fork)(struct task_struct *task, struct css_set *cset);
	void (*fork)(struct task_struct *task);
	void (*exit)(struct task_struct *task);
	void (*release)(struct task_struct *task);
	void (*bind)(struct cgroup_subsys_state *root_css);

	bool early_init:1;

	/*
	 * If %true, the controller, on the default hierarchy, doesn't show
	 * up in "cgroup.controllers" or "cgroup.subtree_control", is
	 * implicitly enabled on all cgroups on the default hierarchy, and
	 * bypasses the "no internal process" constraint.  This is for
	 * utility type controllers which are transparent to userland.
	 *
	 * An implicit controller can be stolen from the default hierarchy
	 * anytime and thus must be okay with offline csses from previous
	 * hierarchies coexisting with csses for the current one.
	 */
	bool implicit_on_dfl:1;

	/*
	 * If %true, the controller supports threaded mode on the default
	 * hierarchy.  In a threaded subtree, both process granularity and
	 * the no-internal-process constraint are ignored and a threaded
	 * controller should be able to handle that.
	 *
	 * Note that as an implicit controller is automatically enabled on
	 * all cgroups on the default hierarchy, it should also be
	 * threaded.  implicit && !threaded is not supported.
	 */
	bool threaded:1;

	/* the following two fields are initialized automatically during boot */
	int id;
	const char *name;

	/* optional, initialized automatically during boot if not set */
	const char *legacy_name;

	/* link to parent, protected by cgroup_lock() */
	struct cgroup_root *root;

	/* idr for css->id */
	struct idr css_idr;

	/*
	 * List of cftypes.  Each entry is the first entry of an array
	 * terminated by zero length name.
	 */
	struct list_head cfts;

	/*
	 * Base cftypes which are automatically registered.  The two can
	 * point to the same array.
	 */
	struct cftype *dfl_cftypes;	/* for the default hierarchy */
	struct cftype *legacy_cftypes;	/* for the legacy hierarchies */

	/*
	 * A subsystem may depend on other subsystems.  When such subsystem
	 * is enabled on a cgroup, the depended-upon subsystems are enabled
	 * together if available.  Subsystems enabled due to dependency are
	 * not visible to userland until explicitly enabled.  The following
	 * specifies the mask of subsystems that this one depends on.
	 */
	unsigned int depends_on;
};

extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem;

/**
 * cgroup_threadgroup_change_begin - threadgroup exclusion for cgroups
 * @tsk: target task
 *
 * Allows cgroup operations to synchronize against threadgroup changes
 * using a percpu_rw_semaphore.
 */
static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk)
{
	percpu_down_read(&cgroup_threadgroup_rwsem);
}

/**
 * cgroup_threadgroup_change_end - threadgroup exclusion for cgroups
 * @tsk: target task
 *
 * Counterpart of cgroup_threadgroup_change_begin().
 */
static inline void cgroup_threadgroup_change_end(struct task_struct *tsk)
{
	percpu_up_read(&cgroup_threadgroup_rwsem);
}

#else	/* CONFIG_CGROUPS */

#define CGROUP_SUBSYS_COUNT 0

/* !CONFIG_CGROUPS stub: keeps the might_sleep() annotation so callers
 * that would sleep under the real rwsem are still checked. */
static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk)
{
	might_sleep();
}

static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {}

#endif	/* CONFIG_CGROUPS */

#ifdef CONFIG_SOCK_CGROUP_DATA

/*
 * sock_cgroup_data is embedded at sock->sk_cgrp_data and contains
 * per-socket cgroup information except for memcg association.
 *
 * On legacy hierarchies, net_prio and net_cls controllers directly
 * set attributes on each sock which can then be tested by the network
 * layer. On the default hierarchy, each sock is associated with the
 * cgroup it was created in and the networking layer can match the
 * cgroup directly.
 */
struct sock_cgroup_data {
	struct cgroup	*cgroup; /* v2 */
#ifdef CONFIG_CGROUP_NET_CLASSID
	u32		classid; /* v1 */
#endif
#ifdef CONFIG_CGROUP_NET_PRIO
	u16		prioidx; /* v1 */
#endif
};

/* v1 net_prio index for @skcd; 1 when net_prio is not built in */
static inline u16 sock_cgroup_prioidx(const struct sock_cgroup_data *skcd)
{
#ifdef CONFIG_CGROUP_NET_PRIO
	/* READ_ONCE pairs with WRITE_ONCE in sock_cgroup_set_prioidx() */
	return READ_ONCE(skcd->prioidx);
#else
	return 1;
#endif
}

/* v1 net_cls classid for @skcd; 0 when net_cls is not built in */
static inline u32 sock_cgroup_classid(const struct sock_cgroup_data *skcd)
{
#ifdef CONFIG_CGROUP_NET_CLASSID
	/* READ_ONCE pairs with WRITE_ONCE in sock_cgroup_set_classid() */
	return READ_ONCE(skcd->classid);
#else
	return 0;
#endif
}

/* store @prioidx into @skcd; no-op when net_prio is not built in */
static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd,
					   u16 prioidx)
{
#ifdef CONFIG_CGROUP_NET_PRIO
	WRITE_ONCE(skcd->prioidx, prioidx);
#endif
}

/* store @classid into @skcd; no-op when net_cls is not built in */
static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd,
					   u32 classid)
{
#ifdef CONFIG_CGROUP_NET_CLASSID
	WRITE_ONCE(skcd->classid, classid);
#endif
}

#else	/* CONFIG_SOCK_CGROUP_DATA */

/* empty placeholder so sock->sk_cgrp_data always exists */
struct sock_cgroup_data {
};

#endif	/* CONFIG_SOCK_CGROUP_DATA */

#endif	/* _LINUX_CGROUP_DEFS_H */