/*
 * include/linux/cgroup.h — C header
 * (extraction metadata: 654 lines | 284 code | 87 blank | 283 comment |
 *  6 complexity | MD5 99c53134e4dbf9d657c546e9bd5613ab)
 * Possible license(s): LGPL-2.0, AGPL-1.0, GPL-2.0
 */
1#ifndef _LINUX_CGROUP_H 2#define _LINUX_CGROUP_H 3/* 4 * cgroup interface 5 * 6 * Copyright (C) 2003 BULL SA 7 * Copyright (C) 2004-2006 Silicon Graphics, Inc. 8 * 9 */ 10 11#include <linux/sched.h> 12#include <linux/cpumask.h> 13#include <linux/nodemask.h> 14#include <linux/rcupdate.h> 15#include <linux/cgroupstats.h> 16#include <linux/prio_heap.h> 17#include <linux/rwsem.h> 18#include <linux/idr.h> 19 20#ifdef CONFIG_CGROUPS 21 22struct cgroupfs_root; 23struct cgroup_subsys; 24struct inode; 25struct cgroup; 26struct css_id; 27 28extern int cgroup_init_early(void); 29extern int cgroup_init(void); 30extern void cgroup_lock(void); 31extern int cgroup_lock_is_held(void); 32extern bool cgroup_lock_live_group(struct cgroup *cgrp); 33extern void cgroup_unlock(void); 34extern void cgroup_fork(struct task_struct *p); 35extern void cgroup_fork_callbacks(struct task_struct *p); 36extern void cgroup_post_fork(struct task_struct *p); 37extern void cgroup_exit(struct task_struct *p, int run_callbacks); 38extern int cgroupstats_build(struct cgroupstats *stats, 39 struct dentry *dentry); 40extern int cgroup_load_subsys(struct cgroup_subsys *ss); 41extern void cgroup_unload_subsys(struct cgroup_subsys *ss); 42 43extern const struct file_operations proc_cgroup_operations; 44 45/* Define the enumeration of all builtin cgroup subsystems */ 46#define SUBSYS(_x) _x ## _subsys_id, 47enum cgroup_subsys_id { 48#include <linux/cgroup_subsys.h> 49 CGROUP_BUILTIN_SUBSYS_COUNT 50}; 51#undef SUBSYS 52/* 53 * This define indicates the maximum number of subsystems that can be loaded 54 * at once. We limit to this many since cgroupfs_root has subsys_bits to keep 55 * track of all of them. 56 */ 57#define CGROUP_SUBSYS_COUNT (BITS_PER_BYTE*sizeof(unsigned long)) 58 59/* Per-subsystem/per-cgroup state maintained by the system. */ 60struct cgroup_subsys_state { 61 /* 62 * The cgroup that this subsystem is attached to. 
Useful 63 * for subsystems that want to know about the cgroup 64 * hierarchy structure 65 */ 66 struct cgroup *cgroup; 67 68 /* 69 * State maintained by the cgroup system to allow subsystems 70 * to be "busy". Should be accessed via css_get(), 71 * css_tryget() and and css_put(). 72 */ 73 74 atomic_t refcnt; 75 76 unsigned long flags; 77 /* ID for this css, if possible */ 78 struct css_id __rcu *id; 79}; 80 81/* bits in struct cgroup_subsys_state flags field */ 82enum { 83 CSS_ROOT, /* This CSS is the root of the subsystem */ 84 CSS_REMOVED, /* This CSS is dead */ 85}; 86 87/* 88 * Call css_get() to hold a reference on the css; it can be used 89 * for a reference obtained via: 90 * - an existing ref-counted reference to the css 91 * - task->cgroups for a locked task 92 */ 93 94extern void __css_get(struct cgroup_subsys_state *css, int count); 95static inline void css_get(struct cgroup_subsys_state *css) 96{ 97 /* We don't need to reference count the root state */ 98 if (!test_bit(CSS_ROOT, &css->flags)) 99 __css_get(css, 1); 100} 101 102static inline bool css_is_removed(struct cgroup_subsys_state *css) 103{ 104 return test_bit(CSS_REMOVED, &css->flags); 105} 106 107/* 108 * Call css_tryget() to take a reference on a css if your existing 109 * (known-valid) reference isn't already ref-counted. Returns false if 110 * the css has been destroyed. 
111 */ 112 113static inline bool css_tryget(struct cgroup_subsys_state *css) 114{ 115 if (test_bit(CSS_ROOT, &css->flags)) 116 return true; 117 while (!atomic_inc_not_zero(&css->refcnt)) { 118 if (test_bit(CSS_REMOVED, &css->flags)) 119 return false; 120 cpu_relax(); 121 } 122 return true; 123} 124 125/* 126 * css_put() should be called to release a reference taken by 127 * css_get() or css_tryget() 128 */ 129 130extern void __css_put(struct cgroup_subsys_state *css, int count); 131static inline void css_put(struct cgroup_subsys_state *css) 132{ 133 if (!test_bit(CSS_ROOT, &css->flags)) 134 __css_put(css, 1); 135} 136 137/* bits in struct cgroup flags field */ 138enum { 139 /* Control Group is dead */ 140 CGRP_REMOVED, 141 /* Control Group has ever had a child cgroup or a task */ 142 CGRP_RELEASABLE, 143 /* Control Group requires release notifications to userspace */ 144 CGRP_NOTIFY_ON_RELEASE, 145 /* 146 * A thread in rmdir() is wating for this cgroup. 147 */ 148 CGRP_WAIT_ON_RMDIR, 149 /* 150 * Clone cgroup values when creating a new child cgroup 151 */ 152 CGRP_CLONE_CHILDREN, 153}; 154 155/* which pidlist file are we talking about? */ 156enum cgroup_filetype { 157 CGROUP_FILE_PROCS, 158 CGROUP_FILE_TASKS, 159}; 160 161/* 162 * A pidlist is a list of pids that virtually represents the contents of one 163 * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists, 164 * a pair (one each for procs, tasks) for each pid namespace that's relevant 165 * to the cgroup. 166 */ 167struct cgroup_pidlist { 168 /* 169 * used to find which pidlist is wanted. doesn't change as long as 170 * this particular list stays in the list. 
171 */ 172 struct { enum cgroup_filetype type; struct pid_namespace *ns; } key; 173 /* array of xids */ 174 pid_t *list; 175 /* how many elements the above list has */ 176 int length; 177 /* how many files are using the current array */ 178 int use_count; 179 /* each of these stored in a list by its cgroup */ 180 struct list_head links; 181 /* pointer to the cgroup we belong to, for list removal purposes */ 182 struct cgroup *owner; 183 /* protects the other fields */ 184 struct rw_semaphore mutex; 185}; 186 187struct cgroup { 188 unsigned long flags; /* "unsigned long" so bitops work */ 189 190 /* 191 * count users of this cgroup. >0 means busy, but doesn't 192 * necessarily indicate the number of tasks in the cgroup 193 */ 194 atomic_t count; 195 196 /* 197 * We link our 'sibling' struct into our parent's 'children'. 198 * Our children link their 'sibling' into our 'children'. 199 */ 200 struct list_head sibling; /* my parent's children */ 201 struct list_head children; /* my children */ 202 203 struct cgroup *parent; /* my parent */ 204 struct dentry __rcu *dentry; /* cgroup fs entry, RCU protected */ 205 206 /* Private pointers for each registered subsystem */ 207 struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; 208 209 struct cgroupfs_root *root; 210 struct cgroup *top_cgroup; 211 212 /* 213 * List of cg_cgroup_links pointing at css_sets with 214 * tasks in this cgroup. Protected by css_set_lock 215 */ 216 struct list_head css_sets; 217 218 /* 219 * Linked list running through all cgroups that can 220 * potentially be reaped by the release agent. Protected by 221 * release_list_lock 222 */ 223 struct list_head release_list; 224 225 /* 226 * list of pidlists, up to two for each namespace (one for procs, one 227 * for tasks); created on demand. 
228 */ 229 struct list_head pidlists; 230 struct mutex pidlist_mutex; 231 232 /* For RCU-protected deletion */ 233 struct rcu_head rcu_head; 234 235 /* List of events which userspace want to receive */ 236 struct list_head event_list; 237 spinlock_t event_list_lock; 238}; 239 240/* 241 * A css_set is a structure holding pointers to a set of 242 * cgroup_subsys_state objects. This saves space in the task struct 243 * object and speeds up fork()/exit(), since a single inc/dec and a 244 * list_add()/del() can bump the reference count on the entire cgroup 245 * set for a task. 246 */ 247 248struct css_set { 249 250 /* Reference count */ 251 atomic_t refcount; 252 253 /* 254 * List running through all cgroup groups in the same hash 255 * slot. Protected by css_set_lock 256 */ 257 struct hlist_node hlist; 258 259 /* 260 * List running through all tasks using this cgroup 261 * group. Protected by css_set_lock 262 */ 263 struct list_head tasks; 264 265 /* 266 * List of cg_cgroup_link objects on link chains from 267 * cgroups referenced from this css_set. Protected by 268 * css_set_lock 269 */ 270 struct list_head cg_links; 271 272 /* 273 * Set of subsystem states, one for each subsystem. This array 274 * is immutable after creation apart from the init_css_set 275 * during subsystem registration (at boot time) and modular subsystem 276 * loading/unloading. 
277 */ 278 struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; 279 280 /* For RCU-protected deletion */ 281 struct rcu_head rcu_head; 282 struct work_struct work; 283}; 284 285/* 286 * cgroup_map_cb is an abstract callback API for reporting map-valued 287 * control files 288 */ 289 290struct cgroup_map_cb { 291 int (*fill)(struct cgroup_map_cb *cb, const char *key, u64 value); 292 void *state; 293}; 294 295/* 296 * struct cftype: handler definitions for cgroup control files 297 * 298 * When reading/writing to a file: 299 * - the cgroup to use is file->f_dentry->d_parent->d_fsdata 300 * - the 'cftype' of the file is file->f_dentry->d_fsdata 301 */ 302 303#define MAX_CFTYPE_NAME 64 304struct cftype { 305 /* 306 * By convention, the name should begin with the name of the 307 * subsystem, followed by a period 308 */ 309 char name[MAX_CFTYPE_NAME]; 310 int private; 311 /* 312 * If not 0, file mode is set to this value, otherwise it will 313 * be figured out automatically 314 */ 315 mode_t mode; 316 317 /* 318 * If non-zero, defines the maximum length of string that can 319 * be passed to write_string; defaults to 64 320 */ 321 size_t max_write_len; 322 323 int (*open)(struct inode *inode, struct file *file); 324 ssize_t (*read)(struct cgroup *cgrp, struct cftype *cft, 325 struct file *file, 326 char __user *buf, size_t nbytes, loff_t *ppos); 327 /* 328 * read_u64() is a shortcut for the common case of returning a 329 * single integer. Use it in place of read() 330 */ 331 u64 (*read_u64)(struct cgroup *cgrp, struct cftype *cft); 332 /* 333 * read_s64() is a signed version of read_u64() 334 */ 335 s64 (*read_s64)(struct cgroup *cgrp, struct cftype *cft); 336 /* 337 * read_map() is used for defining a map of key/value 338 * pairs. It should call cb->fill(cb, key, value) for each 339 * entry. The key/value pairs (and their ordering) should not 340 * change between reboots. 
341 */ 342 int (*read_map)(struct cgroup *cont, struct cftype *cft, 343 struct cgroup_map_cb *cb); 344 /* 345 * read_seq_string() is used for outputting a simple sequence 346 * using seqfile. 347 */ 348 int (*read_seq_string)(struct cgroup *cont, struct cftype *cft, 349 struct seq_file *m); 350 351 ssize_t (*write)(struct cgroup *cgrp, struct cftype *cft, 352 struct file *file, 353 const char __user *buf, size_t nbytes, loff_t *ppos); 354 355 /* 356 * write_u64() is a shortcut for the common case of accepting 357 * a single integer (as parsed by simple_strtoull) from 358 * userspace. Use in place of write(); return 0 or error. 359 */ 360 int (*write_u64)(struct cgroup *cgrp, struct cftype *cft, u64 val); 361 /* 362 * write_s64() is a signed version of write_u64() 363 */ 364 int (*write_s64)(struct cgroup *cgrp, struct cftype *cft, s64 val); 365 366 /* 367 * write_string() is passed a nul-terminated kernelspace 368 * buffer of maximum length determined by max_write_len. 369 * Returns 0 or -ve error code. 370 */ 371 int (*write_string)(struct cgroup *cgrp, struct cftype *cft, 372 const char *buffer); 373 /* 374 * trigger() callback can be used to get some kick from the 375 * userspace, when the actual string written is not important 376 * at all. The private field can be used to determine the 377 * kick type for multiplexing. 378 */ 379 int (*trigger)(struct cgroup *cgrp, unsigned int event); 380 381 int (*release)(struct inode *inode, struct file *file); 382 383 /* 384 * register_event() callback will be used to add new userspace 385 * waiter for changes related to the cftype. Implement it if 386 * you want to provide this functionality. Use eventfd_signal() 387 * on eventfd to send notification to userspace. 388 */ 389 int (*register_event)(struct cgroup *cgrp, struct cftype *cft, 390 struct eventfd_ctx *eventfd, const char *args); 391 /* 392 * unregister_event() callback will be called when userspace 393 * closes the eventfd or on cgroup removing. 
394 * This callback must be implemented, if you want provide 395 * notification functionality. 396 */ 397 void (*unregister_event)(struct cgroup *cgrp, struct cftype *cft, 398 struct eventfd_ctx *eventfd); 399}; 400 401struct cgroup_scanner { 402 struct cgroup *cg; 403 int (*test_task)(struct task_struct *p, struct cgroup_scanner *scan); 404 void (*process_task)(struct task_struct *p, 405 struct cgroup_scanner *scan); 406 struct ptr_heap *heap; 407 void *data; 408}; 409 410/* 411 * Add a new file to the given cgroup directory. Should only be 412 * called by subsystems from within a populate() method 413 */ 414int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, 415 const struct cftype *cft); 416 417/* 418 * Add a set of new files to the given cgroup directory. Should 419 * only be called by subsystems from within a populate() method 420 */ 421int cgroup_add_files(struct cgroup *cgrp, 422 struct cgroup_subsys *subsys, 423 const struct cftype cft[], 424 int count); 425 426int cgroup_is_removed(const struct cgroup *cgrp); 427 428int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen); 429 430int cgroup_task_count(const struct cgroup *cgrp); 431 432/* Return true if cgrp is a descendant of the task's cgroup */ 433int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task); 434 435/* 436 * When the subsys has to access css and may add permanent refcnt to css, 437 * it should take care of racy conditions with rmdir(). Following set of 438 * functions, is for stop/restart rmdir if necessary. 439 * Because these will call css_get/put, "css" should be alive css. 440 * 441 * cgroup_exclude_rmdir(); 442 * ...do some jobs which may access arbitrary empty cgroup 443 * cgroup_release_and_wakeup_rmdir(); 444 * 445 * When someone removes a cgroup while cgroup_exclude_rmdir() holds it, 446 * it sleeps and cgroup_release_and_wakeup_rmdir() will wake him up. 
447 */ 448 449void cgroup_exclude_rmdir(struct cgroup_subsys_state *css); 450void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css); 451 452/* 453 * Control Group subsystem type. 454 * See Documentation/cgroups/cgroups.txt for details 455 */ 456 457struct cgroup_subsys { 458 struct cgroup_subsys_state *(*create)(struct cgroup_subsys *ss, 459 struct cgroup *cgrp); 460 int (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); 461 void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); 462 int (*allow_attach)(struct cgroup *cgrp, struct task_struct *tsk); 463 int (*can_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, 464 struct task_struct *tsk); 465 int (*can_attach_task)(struct cgroup *cgrp, struct task_struct *tsk); 466 void (*cancel_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, 467 struct task_struct *tsk); 468 void (*pre_attach)(struct cgroup *cgrp); 469 void (*attach_task)(struct cgroup *cgrp, struct task_struct *tsk); 470 void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, 471 struct cgroup *old_cgrp, struct task_struct *tsk); 472 void (*fork)(struct cgroup_subsys *ss, struct task_struct *task); 473 void (*exit)(struct cgroup_subsys *ss, struct cgroup *cgrp, 474 struct cgroup *old_cgrp, struct task_struct *task); 475 int (*populate)(struct cgroup_subsys *ss, 476 struct cgroup *cgrp); 477 void (*post_clone)(struct cgroup_subsys *ss, struct cgroup *cgrp); 478 void (*bind)(struct cgroup_subsys *ss, struct cgroup *root); 479 480 int subsys_id; 481 int active; 482 int disabled; 483 int early_init; 484 /* 485 * True if this subsys uses ID. ID is not available before cgroup_init() 486 * (not available in early_init time.) 487 */ 488 bool use_id; 489#define MAX_CGROUP_TYPE_NAMELEN 32 490 const char *name; 491 492 /* 493 * Protects sibling/children links of cgroups in this 494 * hierarchy, plus protects which hierarchy (or none) the 495 * subsystem is a part of (i.e. root/sibling). 
To avoid 496 * potential deadlocks, the following operations should not be 497 * undertaken while holding any hierarchy_mutex: 498 * 499 * - allocating memory 500 * - initiating hotplug events 501 */ 502 struct mutex hierarchy_mutex; 503 struct lock_class_key subsys_key; 504 505 /* 506 * Link to parent, and list entry in parent's children. 507 * Protected by this->hierarchy_mutex and cgroup_lock() 508 */ 509 struct cgroupfs_root *root; 510 struct list_head sibling; 511 /* used when use_id == true */ 512 struct idr idr; 513 spinlock_t id_lock; 514 515 /* should be defined only by modular subsystems */ 516 struct module *module; 517}; 518 519#define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys; 520#include <linux/cgroup_subsys.h> 521#undef SUBSYS 522 523static inline struct cgroup_subsys_state *cgroup_subsys_state( 524 struct cgroup *cgrp, int subsys_id) 525{ 526 return cgrp->subsys[subsys_id]; 527} 528 529/* 530 * function to get the cgroup_subsys_state which allows for extra 531 * rcu_dereference_check() conditions, such as locks used during the 532 * cgroup_subsys::attach() methods. 
533 */ 534#define task_subsys_state_check(task, subsys_id, __c) \ 535 rcu_dereference_check(task->cgroups->subsys[subsys_id], \ 536 lockdep_is_held(&task->alloc_lock) || \ 537 cgroup_lock_is_held() || (__c)) 538 539static inline struct cgroup_subsys_state * 540task_subsys_state(struct task_struct *task, int subsys_id) 541{ 542 return task_subsys_state_check(task, subsys_id, false); 543} 544 545static inline struct cgroup* task_cgroup(struct task_struct *task, 546 int subsys_id) 547{ 548 return task_subsys_state(task, subsys_id)->cgroup; 549} 550 551/* A cgroup_iter should be treated as an opaque object */ 552struct cgroup_iter { 553 struct list_head *cg_link; 554 struct list_head *task; 555}; 556 557/* 558 * To iterate across the tasks in a cgroup: 559 * 560 * 1) call cgroup_iter_start to initialize an iterator 561 * 562 * 2) call cgroup_iter_next() to retrieve member tasks until it 563 * returns NULL or until you want to end the iteration 564 * 565 * 3) call cgroup_iter_end() to destroy the iterator. 566 * 567 * Or, call cgroup_scan_tasks() to iterate through every task in a 568 * cgroup - cgroup_scan_tasks() holds the css_set_lock when calling 569 * the test_task() callback, but not while calling the process_task() 570 * callback. 571 */ 572void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it); 573struct task_struct *cgroup_iter_next(struct cgroup *cgrp, 574 struct cgroup_iter *it); 575void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it); 576int cgroup_scan_tasks(struct cgroup_scanner *scan); 577int cgroup_attach_task(struct cgroup *, struct task_struct *); 578int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); 579 580static inline int cgroup_attach_task_current_cg(struct task_struct *tsk) 581{ 582 return cgroup_attach_task_all(current, tsk); 583} 584 585/* 586 * CSS ID is ID for cgroup_subsys_state structs under subsys. This only works 587 * if cgroup_subsys.use_id == true. 
It can be used for looking up and scanning. 588 * CSS ID is assigned at cgroup allocation (create) automatically 589 * and removed when subsys calls free_css_id() function. This is because 590 * the lifetime of cgroup_subsys_state is subsys's matter. 591 * 592 * Looking up and scanning function should be called under rcu_read_lock(). 593 * Taking cgroup_mutex()/hierarchy_mutex() is not necessary for following calls. 594 * But the css returned by this routine can be "not populated yet" or "being 595 * destroyed". The caller should check css and cgroup's status. 596 */ 597 598/* 599 * Typically Called at ->destroy(), or somewhere the subsys frees 600 * cgroup_subsys_state. 601 */ 602void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css); 603 604/* Find a cgroup_subsys_state which has given ID */ 605 606struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id); 607 608/* 609 * Get a cgroup whose id is greater than or equal to id under tree of root. 610 * Returning a cgroup_subsys_state or NULL. 
611 */ 612struct cgroup_subsys_state *css_get_next(struct cgroup_subsys *ss, int id, 613 struct cgroup_subsys_state *root, int *foundid); 614 615/* Returns true if root is ancestor of cg */ 616bool css_is_ancestor(struct cgroup_subsys_state *cg, 617 const struct cgroup_subsys_state *root); 618 619/* Get id and depth of css */ 620unsigned short css_id(struct cgroup_subsys_state *css); 621unsigned short css_depth(struct cgroup_subsys_state *css); 622struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id); 623 624#else /* !CONFIG_CGROUPS */ 625 626static inline int cgroup_init_early(void) { return 0; } 627static inline int cgroup_init(void) { return 0; } 628static inline void cgroup_fork(struct task_struct *p) {} 629static inline void cgroup_fork_callbacks(struct task_struct *p) {} 630static inline void cgroup_post_fork(struct task_struct *p) {} 631static inline void cgroup_exit(struct task_struct *p, int callbacks) {} 632 633static inline void cgroup_lock(void) {} 634static inline void cgroup_unlock(void) {} 635static inline int cgroupstats_build(struct cgroupstats *stats, 636 struct dentry *dentry) 637{ 638 return -EINVAL; 639} 640 641/* No cgroups - nothing to do */ 642static inline int cgroup_attach_task_all(struct task_struct *from, 643 struct task_struct *t) 644{ 645 return 0; 646} 647static inline int cgroup_attach_task_current_cg(struct task_struct *t) 648{ 649 return 0; 650} 651 652#endif /* !CONFIG_CGROUPS */ 653 654#endif /* _LINUX_CGROUP_H */