linux cgroup (2): mount

kernel: 5.0
The mount flow traced below is the first mount performed after cgroup initialization has completed.
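
For orientation, here is a hedged userspace sketch of the mount(2) calls that enter this path; the mountpoint paths are examples only and must already exist, and the program needs root:

/*
 * Userspace entry points into cgroup_mount(). The filesystem type string
 * selects the branch taken inside the kernel function shown below:
 * "cgroup" -> cgroup1_mount(), "cgroup2" -> the cgrp_dfl_root branch.
 */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
    /* v1: the option string names the requested subsystems */
    if (mount("none", "/sys/fs/cgroup/cpu", "cgroup", 0, "cpu,cpuacct"))
        perror("mount cgroup v1");

    /* v2: the unified hierarchy, no per-subsystem options */
    if (mount("none", "/sys/fs/cgroup/unified", "cgroup2", 0, NULL))
        perror("mount cgroup2");
    return 0;
}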

cgroup_mount

static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                   int flags, const char *unused_dev_name,
                   void *data)
{
    struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
    struct dentry *dentry;
    int ret;

    get_cgroup_ns(ns);

    /* Check if the caller has permission to mount. */
    if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
        put_cgroup_ns(ns);
        return ERR_PTR(-EPERM);
    }

    /*
     * The first time anyone tries to mount a cgroup, enable the list
     * linking each css_set to its tasks and fix up all existing tasks.
     */
    if (!use_task_css_set_links)
        cgroup_enable_task_cg_lists();

    if (fs_type == &cgroup2_fs_type) {
        unsigned int root_flags;

        ret = parse_cgroup_root_flags(data, &root_flags);
        if (ret) {
            put_cgroup_ns(ns);
            return ERR_PTR(ret);
        }

        cgrp_dfl_visible = true;
        cgroup_get_live(&cgrp_dfl_root.cgrp);

        dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
                     CGROUP2_SUPER_MAGIC, ns);
        if (!IS_ERR(dentry))
            apply_cgroup_root_flags(root_flags);
    } else {
        dentry = cgroup1_mount(&cgroup_fs_type, flags, data,
                       CGROUP_SUPER_MAGIC, ns);
    }

    put_cgroup_ns(ns);
    return dentry;
}

cgroup_enable_task_cg_lists

This function only does real work on the first mount. It walks every existing task and, using the task's cg_list field as the link, adds it to the tasks list of its css_set. Tasks forked afterwards inherit the parent's css_set pointer at creation time and are added to that css_set's tasks list themselves (see copy_process -> cgroup_post_fork). A standalone sketch of this linking follows the function below.

static void cgroup_enable_task_cg_lists(void)
{
    struct task_struct *p, *g;

    /*
     * We need tasklist_lock because RCU is not safe against
     * while_each_thread(). Besides, a forking task that has passed
     * cgroup_post_fork() without seeing use_task_css_set_links = 1
     * is not guaranteed to have its child immediately visible in the
     * tasklist if we walk through it with RCU.
     */
    read_lock(&tasklist_lock);
    spin_lock_irq(&css_set_lock);

    if (use_task_css_set_links)
        goto out_unlock;

    use_task_css_set_links = true;

    do_each_thread(g, p) {
        WARN_ON_ONCE(!list_empty(&p->cg_list) ||
                 task_css_set(p) != &init_css_set);

        /*
         * We should check if the process is exiting, otherwise
         * it will race with cgroup_exit() in that the list
         * entry won't be deleted though the process has exited.
         * Do it while holding siglock so that we don't end up
         * racing against cgroup_exit().
         *
         * Interrupts were already disabled while acquiring
         * the css_set_lock, so we do not need to disable it
         * again when acquiring the sighand->siglock here.
         */
        spin_lock(&p->sighand->siglock);
        if (!(p->flags & PF_EXITING)) {
            struct css_set *cset = task_css_set(p);
            /*
             * When a process is forked, its cgroups pointer first
             * points at init_css_set (copy_process -> cgroup_fork).
             * Once the first mount has completed
             * (use_task_css_set_links is true), it points at the
             * parent's css_set instead.
             */
            if (!css_set_populated(cset))
                /*
                 * Mark nr_populated_csets on every cgroup associated
                 * with cset: those cgroups now have tasks attached.
                 */
                css_set_update_populated(cset, true);
            /*
             * The tasks list of a css_set holds tasks in the normal
             * state; mg_tasks holds tasks being migrated.
             */
            list_add_tail(&p->cg_list, &cset->tasks);
            get_css_set(cset);    /* take a reference on the css_set */
            cset->nr_tasks++;
        }
        spin_unlock(&p->sighand->siglock);
    } while_each_thread(g, p);
out_unlock:
    spin_unlock_irq(&css_set_lock);
    read_unlock(&tasklist_lock);
}
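
To make the linking concrete, here is a minimal standalone userspace sketch of the css_set <- task relationship built above. The list_head implementation and the struct layouts are simplified stand-ins, not the kernel's definitions:

#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *prev, *next; };

static void list_init(struct list_head *h) { h->prev = h->next = h; }
static void list_add_tail(struct list_head *n, struct list_head *h)
{
    n->prev = h->prev; n->next = h;
    h->prev->next = n; h->prev = n;
}

struct css_set { struct list_head tasks; int nr_tasks; };
struct task   { const char *comm; struct list_head cg_list; };

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

int main(void)
{
    struct css_set cset = { .nr_tasks = 0 };
    struct task a = { .comm = "init" }, b = { .comm = "kthreadd" };

    list_init(&cset.tasks);
    /* what the do_each_thread() loop does for every live task: */
    list_add_tail(&a.cg_list, &cset.tasks); cset.nr_tasks++;
    list_add_tail(&b.cg_list, &cset.tasks); cset.nr_tasks++;

    /* walk cset->tasks the way the kernel iterates member tasks */
    for (struct list_head *p = cset.tasks.next; p != &cset.tasks; p = p->next)
        printf("%s\n", container_of(p, struct task, cg_list)->comm);
    return 0;
}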

cgroup1_mount(&cgroup_fs_type, flags, data, CGROUP_SUPER_MAGIC, ns)

struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
                 void *data, unsigned long magic,
                 struct cgroup_namespace *ns)
{
    struct super_block *pinned_sb = NULL;
    struct cgroup_sb_opts opts;
    struct cgroup_root *root;
    struct cgroup_subsys *ss;
    struct dentry *dentry;
    int i, ret;
    bool new_root = false;

    cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);

    /* First find the desired set of subsystems */
    /* Parse the mount data passed in from user space: subsystem names, etc. */
    ret = parse_cgroupfs_options(data, &opts);
    if (ret)
        goto out_unlock;

    /*
     * Destruction of cgroup root is asynchronous, so subsystems may
     * still be dying after the previous unmount. Let's drain the
     * dying subsystems. We just need to ensure that the ones
     * unmounted previously finish dying and don't care about new ones
     * starting. Testing ref liveliness is good enough.
     */
    for_each_subsys(ss, i) {
        /*
         * On the first mount every subsystem's root still points at
         * cgrp_dfl_root, so this loop body is skipped entirely.
         */
        if (!(opts.subsys_mask & (1 << i)) ||
            ss->root == &cgrp_dfl_root)
            continue;

        if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
            mutex_unlock(&cgroup_mutex);
            msleep(10);
            ret = restart_syscall();
            goto out_free;
        }
        cgroup_put(&ss->root->cgrp);
    }

    /*
     * Every cgroup_root is linked onto the cgroup_roots list via its
     * root_list member. On the first mount only cgrp_dfl_root exists.
     */
    for_each_root(root) {
        bool name_match = false;

        if (root == &cgrp_dfl_root)
            continue;

        /*
         * If we asked for a name then it must match. Also, if
         * name matches but subsys_mask doesn't, we should fail.
         * Remember whether name matched.
         */
        if (opts.name) {
            if (strcmp(opts.name, root->name))
                continue;
            name_match = true;
        }

        /*
         * If we asked for subsystems (or explicitly for no
         * subsystems) then they must match.
         */
        if ((opts.subsys_mask || opts.none) &&
            (opts.subsys_mask != root->subsys_mask)) {
            if (!name_match)
                continue;
            ret = -EBUSY;
            goto out_unlock;
        }

        if (root->flags ^ opts.flags)
            pr_warn("new mount options do not match the existing superblock, will be ignored\n");

        /*
         * We want to reuse @root whose lifetime is governed by its
         * ->cgrp. Let's check whether @root is alive and keep it
         * that way. As cgroup_kill_sb() can happen anytime, we
         * want to block it by pinning the sb so that @root doesn't
         * get killed before mount is complete.
         *
         * With the sb pinned, tryget_live can reliably indicate
         * whether @root can be reused. If it's being killed,
         * drain it. We can use wait_queue for the wait but this
         * path is super cold. Let's just sleep a bit and retry.
         */
        pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
        if (IS_ERR(pinned_sb) ||
            !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
            mutex_unlock(&cgroup_mutex);
            if (!IS_ERR_OR_NULL(pinned_sb))
                deactivate_super(pinned_sb);
            msleep(10);
            ret = restart_syscall();
            goto out_free;
        }

        ret = 0;
        goto out_unlock;
    }

    /* On the first mount, the two loops above do nothing. */

    /*
     * No such thing, create a new one. name= matching without subsys
     * specification is allowed for already existing hierarchies but we
     * can't create new one without subsys specification.
     */
    if (!opts.subsys_mask && !opts.none) {
        ret = -EINVAL;
        goto out_unlock;
    }

    /* Hierarchies may only be created in the initial cgroup namespace. */
    if (ns != &init_cgroup_ns) {
        ret = -EPERM;
        goto out_unlock;
    }

    root = kzalloc(sizeof(*root), GFP_KERNEL);
    if (!root) {
        ret = -ENOMEM;
        goto out_unlock;
    }
    new_root = true;

    /*
     * The system now holds a second cgroup_root, a second cgroup and a
     * second cgroup_subsys_state, all referencing one another.
     */
    init_cgroup_root(root, &opts);

    ret = cgroup_setup_root(root, opts.subsys_mask, PERCPU_REF_INIT_DEAD);
    if (ret)
        cgroup_free_root(root);

out_unlock:
    mutex_unlock(&cgroup_mutex);
out_free:
    kfree(opts.release_agent);
    kfree(opts.name);

    if (ret)
        return ERR_PTR(ret);

    /* Call kernfs_mount to perform the actual mount. */
    dentry = cgroup_do_mount(&cgroup_fs_type, flags, root,
                 CGROUP_SUPER_MAGIC, ns);

    /*
     * There's a race window after we release cgroup_mutex and before
     * allocating a superblock. Make sure a concurrent process won't
     * be able to re-use the root during this window by delaying the
     * initialization of root refcnt.
     */
    if (new_root) {
        mutex_lock(&cgroup_mutex);
        percpu_ref_reinit(&root->cgrp.self.refcnt);
        mutex_unlock(&cgroup_mutex);
    }

    /*
     * If @pinned_sb, we're reusing an existing root and holding an
     * extra ref on its sb. Mount is complete. Put the extra ref.
     */
    if (pinned_sb)
        deactivate_super(pinned_sb);

    return dentry;
}
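
The for_each_root() loop above decides whether an existing hierarchy can be reused. A hedged standalone sketch of just that decision, with stand-in opts/root structs rather than the kernel's, might look like:

#include <errno.h>
#include <stdio.h>
#include <string.h>

struct opts { const char *name; unsigned int subsys_mask; int none; };
struct root { const char *name; unsigned int subsys_mask; };

/* 1 = reuse this root, 0 = keep scanning, -EBUSY = refuse the mount. */
static int match_root(const struct opts *o, const struct root *r)
{
    int name_match = 0;

    if (o->name) {
        if (strcmp(o->name, r->name))
            return 0;               /* an explicit name must match */
        name_match = 1;
    }
    /* requested subsystems (or explicit "none") must match too */
    if ((o->subsys_mask || o->none) && o->subsys_mask != r->subsys_mask)
        return name_match ? -EBUSY : 0;
    return 1;
}

int main(void)
{
    struct root hier  = { "cpuhier", 1u << 0 };
    struct opts same  = { "cpuhier", 1u << 0, 0 };
    struct opts clash = { "cpuhier", 1u << 1, 0 };

    printf("%d %d\n", match_root(&same, &hier),     /* reuse: 1 */
           match_root(&clash, &hier));              /* refuse: -EBUSY */
    return 0;
}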

cgroup_setup_root(root, opts.subsys_mask, PERCPU_REF_INIT_DEAD);

Unlike the call made during cgroup initialization, subsys_mask is non-zero here, so the requested controllers are actually moved in rebind_subsystems().

int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
{
    LIST_HEAD(tmp_links);
    struct cgroup *root_cgrp = &root->cgrp;
    struct kernfs_syscall_ops *kf_sops;
    struct css_set *cset;
    int i, ret;

    lockdep_assert_held(&cgroup_mutex);

    ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL);
    if (ret < 0)
        goto out;
    root_cgrp->id = ret;
    root_cgrp->ancestor_ids[0] = ret;

    ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
                  ref_flags, GFP_KERNEL);
    if (ret)
        goto out;

    /*
     * We're accessing css_set_count without locking css_set_lock here,
     * but that's OK - it can only be increased by someone holding
     * cgroup_lock, and that's us. Later rebinding may disable
     * controllers on the default hierarchy and thus create new csets,
     * which can't be more than the existing ones. Allocate 2x.
     */
    ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links);
    if (ret)
        goto cancel_ref;

    ret = cgroup_init_root_id(root);
    if (ret)
        goto cancel_ref;

    kf_sops = root == &cgrp_dfl_root ?
        &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;

    root->kf_root = kernfs_create_root(kf_sops,
                       KERNFS_ROOT_CREATE_DEACTIVATED |
                       KERNFS_ROOT_SUPPORT_EXPORTOP,
                       root_cgrp);
    if (IS_ERR(root->kf_root)) {
        ret = PTR_ERR(root->kf_root);
        goto exit_root_id;
    }
    root_cgrp->kn = root->kf_root->kn;

    ret = css_populate_dir(&root_cgrp->self);
    if (ret)
        goto destroy_root;

    ret = rebind_subsystems(root, ss_mask);
    if (ret)
        goto destroy_root;

    ret = cgroup_bpf_inherit(root_cgrp);
    WARN_ON_ONCE(ret);

    trace_cgroup_setup_root(root);

    /*
     * There must be no failure case after here, since rebinding takes
     * care of subsystems' refcounts, which are explicitly dropped in
     * the failure exit path.
     */
    list_add(&root->root_list, &cgroup_roots);
    cgroup_root_count++;

    /*
     * Link the root cgroup in this hierarchy into all the css_set
     * objects.
     */
    spin_lock_irq(&css_set_lock);
    hash_for_each(css_set_table, i, cset, hlist) {
        link_css_set(&tmp_links, cset, root_cgrp);
        if (css_set_populated(cset))
            cgroup_update_populated(root_cgrp, true);
    }
    spin_unlock_irq(&css_set_lock);

    BUG_ON(!list_empty(&root_cgrp->self.children));
    BUG_ON(atomic_read(&root->nr_cgrps) != 1);

    kernfs_activate(root_cgrp->kn);
    ret = 0;
    goto out;

destroy_root:
    kernfs_destroy_root(root->kf_root);
    root->kf_root = NULL;
exit_root_id:
    cgroup_exit_root_id(root);
cancel_ref:
    percpu_ref_exit(&root_cgrp->self.refcnt);
out:
    free_cgrp_cset_links(&tmp_links);
    return ret;
}

rebind_subsystems

This function moves subsystems between cgroup_roots. cgroup_apply_control() runs twice per rebind: once when the subsystem is detached from its source cgroup, and once when it is attached to the destination cgroup. A simplified sketch of the mask bookkeeping follows the function below.

int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
{
    struct cgroup *dcgrp = &dst_root->cgrp;
    struct cgroup_subsys *ss;
    int ssid, i, ret;

    lockdep_assert_held(&cgroup_mutex);

    do_each_subsys_mask(ss, ssid, ss_mask) {
        /*
         * If @ss has non-root csses attached to it, can't move.
         * If @ss is an implicit controller, it is exempt from this
         * rule and can be stolen.
         */
        if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) &&
            !ss->implicit_on_dfl)
            return -EBUSY;

        /* can't move between two non-dummy roots either */
        if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
            return -EBUSY;
    } while_each_subsys_mask();

    do_each_subsys_mask(ss, ssid, ss_mask) {
        struct cgroup_root *src_root = ss->root;
        struct cgroup *scgrp = &src_root->cgrp;
        struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
        struct css_set *cset;

        WARN_ON(!css || cgroup_css(dcgrp, ss));

        /* disable from the source */
        /*
         * Move the subsystem out of cgrp_dfl_root. A cgroup_root's
         * subsys_mask records which subsystems have their css filled
         * in its subsys array.
         */
        src_root->subsys_mask &= ~(1 << ssid);
        WARN_ON(cgroup_apply_control(scgrp));
        /*
         * Remove the subsystem's attribute files from
         * cgrp_dfl_root->cgrp and all of its descendants.
         */
        cgroup_finalize_control(scgrp, 0);

        /* rebind */
        /* Record that the subsystem now belongs to the new cgroup_root. */
        RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
        rcu_assign_pointer(dcgrp->subsys[ssid], css);
        ss->root = dst_root;
        css->cgroup = dcgrp;

        spin_lock_irq(&css_set_lock);
        hash_for_each(css_set_table, i, cset, hlist)
            list_move_tail(&cset->e_cset_node[ss->id],
                       &dcgrp->e_csets[ss->id]);
        spin_unlock_irq(&css_set_lock);

        /* default hierarchy doesn't enable controllers by default */
        dst_root->subsys_mask |= 1 << ssid;
        if (dst_root == &cgrp_dfl_root) {
            static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
        } else {
            dcgrp->subtree_control |= 1 << ssid;
            static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
        }

        ret = cgroup_apply_control(dcgrp);
        if (ret)
            pr_warn("partial failure to rebind %s controller (err=%d)\n",
                ss->name, ret);

        if (ss->bind)
            ss->bind(css);
    } while_each_subsys_mask();

    kernfs_activate(dcgrp->kn);
    return 0;
}
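
As promised above, here is a hedged standalone sketch of the per-subsystem mask bookkeeping that rebind_subsystems() performs; CGROUP_SUBSYS_COUNT and the names[] table are illustrative stand-ins, not kernel definitions:

#include <stdio.h>

#define CGROUP_SUBSYS_COUNT 4

static const char *names[CGROUP_SUBSYS_COUNT] = {
    "cpu", "cpuacct", "memory", "pids",
};

int main(void)
{
    unsigned int ss_mask  = (1u << 0) | (1u << 2);  /* rebind cpu + memory */
    unsigned int src_mask = 0xFu;                   /* all four on the source */
    unsigned int dst_mask = 0u;

    for (int ssid = 0; ssid < CGROUP_SUBSYS_COUNT; ssid++) {
        if (!(ss_mask & (1u << ssid)))
            continue;                      /* what do_each_subsys_mask skips */
        src_mask &= ~(1u << ssid);         /* "disable from the source" */
        dst_mask |= 1u << ssid;            /* enable on the destination */
        printf("rebound %s\n", names[ssid]);
    }
    printf("src_mask=%#x dst_mask=%#x\n", src_mask, dst_mask);
    return 0;
}
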
cgroup_apply_control

static int cgroup_apply_control(struct cgroup *cgrp)
{
    int ret;

    /* Propagate cgrp's enabled subsystems to all of its descendants. */
    cgroup_propagate_control(cgrp);

    /* Create the cftype files for cgrp and all of its descendants. */
    ret = cgroup_apply_control_enable(cgrp);
    if (ret)
        return ret;

    /*
     * At this point, cgroup_e_css_by_mask() results reflect the new csses
     * making the following cgroup_update_dfl_csses() properly update
     * css associations of all tasks in the subtree.
     */
    /* Migrate the affected tasks. */
    ret = cgroup_update_dfl_csses(cgrp);
    if (ret)
        return ret;

    return 0;
}

Summary

Mounting a v1 cgroup hierarchy first creates a new cgroup_root, then migrates the requested subsystems from their source cgroup_root (cgrp_dfl_root on the first mount) to the newly created one.
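
As a simple check of the result (not part of the kernel walkthrough): after a successful v1 mount, /proc/cgroups reports, per controller, the hierarchy ID it is now bound to. A trivial reader:

#include <stdio.h>

int main(void)
{
    /* columns: subsys_name  hierarchy  num_cgroups  enabled */
    FILE *f = fopen("/proc/cgroups", "r");
    char line[256];

    if (!f) { perror("/proc/cgroups"); return 1; }
    while (fgets(line, sizeof(line), f))
        fputs(line, stdout);
    fclose(f);
    return 0;
}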