kernel: 5.0
The following walks through the mount path. This is the first mount performed after cgroup initialization has completed.
cgroup_mount
static struct dentry *cgroup_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) {<!-- --> struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; struct dentry *dentry; int ret; get_cgroup_ns(ns); /* Check if the caller has permission to mount. */ if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {<!-- --> put_cgroup_ns(ns); return ERR_PTR(-EPERM); } /* * The first time anyone tries to mount a cgroup, enable the list * linking each css_set to its tasks and fix up all existing tasks. */ if (!use_task_css_set_links) cgroup_enable_task_cg_lists();// if (fs_type == & amp;cgroup2_fs_type) {<!-- --> unsigned int root_flags; ret = parse_cgroup_root_flags(data, & amp;root_flags); if (ret) {<!-- --> put_cgroup_ns(ns); return ERR_PTR(ret); } cgrp_dfl_visible = true; cgroup_get_live( & amp;cgrp_dfl_root.cgrp); dentry = cgroup_do_mount( & amp;cgroup2_fs_type, flags, & amp;cgrp_dfl_root, CGROUP2_SUPER_MAGIC, ns); if (!IS_ERR(dentry)) apply_cgroup_root_flags(root_flags); } else {<!-- --> dentry = cgroup1_mount( &cgroup_fs_type, flags, data, CGROUP_SUPER_MAGIC, ns); } put_cgroup_ns(ns); return dentry; }
cgroup_enable_task_cg_lists
This function only does real work on the first mount: it links every task that already exists into the tasks list of its css_set, using the task's cg_list field as the list node. Tasks forked afterwards are attached to their parent's css_set at creation time instead (see copy_process --> cgroup_post_fork).
static void cgroup_enable_task_cg_lists(void) {<!-- --> struct task_struct *p, *g; /* * We need tasklist_lock because RCU is not safe against * while_each_thread(). Besides, a forking task that has passed * cgroup_post_fork() without seeing use_task_css_set_links = 1 * is not guaranteed to have its child immediately visible in the * tasklist if we walk through it with RCU. */ read_lock( & amp;tasklist_lock); spin_lock_irq( & amp;css_set_lock); if (use_task_css_set_links) goto out_unlock; use_task_css_set_links = true; do_each_thread(g, p) {<!-- --> WARN_ON_ONCE(!list_empty( & amp;p->cg_list) || task_css_set(p) != & amp;init_css_set); /* * We should check if the process is exiting, otherwise * it will race with cgroup_exit() in that the list * entry won't be deleted though the process has exited. * Do it while holding siglock so that we don't end up * racing against cgroup_exit(). * * Interrupts were already disabled while acquiring * the css_set_lock, so we do not need to disable it * again when acquiring the sighand->siglock here. */ spin_lock( & amp;p->sighand->siglock); if (!(p->flags & amp; PF_EXITING)) {<!-- --> struct css_set *cset = task_css_set(p); /*When forking a process, the cgroups of the new process first point to init_css_set(copy_process --> cgroup_fork) *If the initial mounting has been completed (use_task_css_set_links is true), it points to the cgroups of the parent process */ if (!css_set_populated(cset)) css_set_update_populated(cset, true); //Set the nr_populaed_csets members of all cgroups associated with cset, //Indicates that the cgroup has a task associated with it list_add_tail( & amp;p->cg_list, & amp;cset->tasks);//The tasks linked list of css_set maintains the tasks in the normal state, and mg_tasks maintains the migration status. 
get_css_set(cset); //Increase reference count cset->nr_tasks + + ; } spin_unlock( & amp;p->sighand->siglock); } while_each_thread(g, p); out_unlock: spin_unlock_irq( & amp;css_set_lock); read_unlock( & amp;tasklist_lock); }
cgroup1_mount(&cgroup_fs_type, flags, data, CGROUP_SUPER_MAGIC, ns);
struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, void *data, unsigned long magic, struct cgroup_namespace *ns) {<!-- --> struct super_block *pinned_sb = NULL; struct cgroup_sb_opts opts; struct cgroup_root *root; struct cgroup_subsys *ss; struct dentry *dentry; int i, ret; bool new_root = false; cgroup_lock_and_drain_offline( & amp;cgrp_dfl_root.cgrp); /* First find the desired set of subsystems */ ret = parse_cgroupfs_options(data, & amp;opts);//Parse the data passed in by the user space, including subsystem names, etc. if (ret) goto out_unlock; /* * Destruction of cgroup root is asynchronous, so subsystems may * still be dying after the previous unmount. Let's drain the * dying subsystems. We just need to ensure that the ones * unmounted previously finish dying and don't care about new ones * starting. Testing ref liveliness is good enough. */ for_each_subsys(ss, i) {<!-- --> if (!(opts.subsys_mask & amp; (1 << i)) || //At this time, the roots of all subsystems point to cgrp_dfl_root, so this loop is skipped ss->root == & amp;cgrp_dfl_root) continue; if (!percpu_ref_tryget_live( & amp;ss->root->cgrp.self.refcnt)) {<!-- --> mutex_unlock( & amp;cgroup_mutex); msleep(10); ret = restart_syscall(); goto out_free; } cgroup_put( & amp;ss->root->cgrp); } for_each_root(root) {<!-- --> //Each cgroup_root will be connected to the cgroup_roots linked list using root_list as the connector. At this time, there is only cgrp_dfl_root bool name_match = false; if (root == & amp;cgrp_dfl_root) continue; /* * If we asked for a name then it must match. Also, if * name matches but sybsys_mask doesn't, we should fail. * Remember whether name matched. */ if (opts.name) {<!-- --> if (strcmp(opts.name, root->name)) continue; name_match = true; } /* * If we asked for subsystems (or explicitly for no * subsystems) then they must match. 
*/ if ((opts.subsys_mask || opts.none) & amp; & amp; (opts.subsys_mask != root->subsys_mask)) {<!-- --> if (!name_match) continue; ret = -EBUSY; goto out_unlock; } if (root->flags ^ opts.flags) pr_warn("new mount options do not match the existing superblock, will be ignored\\ "); /* * We want to reuse @root whose lifetime is governed by its * ->cgrp. Let's check whether @root is alive and keep it * that way. As cgroup_kill_sb() can happen anytime, we * want to block it by pinning the sb so that @root doesn't * get killed before mount is complete. * * With the sb pinned, tryget_live can reliably indicate * whether @root can be reused. If it's being killed, * drain it. We can use wait_queue for the wait but this * path is super cold. Let's just sleep a bit and retry. */ pinned_sb = kernfs_pin_sb(root->kf_root, NULL); if (IS_ERR(pinned_sb) || !percpu_ref_tryget_live( & amp;root->cgrp.self.refcnt)) {<!-- --> mutex_unlock( & amp;cgroup_mutex); if (!IS_ERR_OR_NULL(pinned_sb)) deactivate_super(pinned_sb); msleep(10); ret = restart_syscall(); goto out_free; } ret = 0; goto out_unlock; } //When mounting for the first time, the above two loops do nothing. /* * No such thing, create a new one. name= matching without subsys * specification is allowed for already existing hierarchies but we * can't create new one without subsys specification. */ if (!opts.subsys_mask & amp; & amp; !opts.none) {<!-- --> ret = -EINVAL; goto out_unlock; } /* Hierarchies may only be created in the initial cgroup namespace. */ if (ns != & amp;init_cgroup_ns) {<!-- --> ret = -EPERM; goto out_unlock; } root = kzalloc(sizeof(*root), GFP_KERNEL); if (!root) {<!-- --> ret = -ENOMEM; goto out_unlock; } new_root = true; init_cgroup_root(root, & amp;opts); //Now there is a second cgroup_root, a second cgroup, and a second cgoup_subsys_state in the system, and they reference each other. 
ret = cgroup_setup_root(root, opts.subsys_mask, PERCPU_REF_INIT_DEAD); if (ret) cgroup_free_root(root); out_unlock: mutex_unlock( & amp;cgroup_mutex); out_free: kfree(opts.release_agent); kfree(opts.name); if (ret) return ERR_PTR(ret); //Call kernfs_mount to complete the actual mount, dentry = cgroup_do_mount( &cgroup_fs_type, flags, root, CGROUP_SUPER_MAGIC, ns); /* * There's a race window after we release cgroup_mutex and before * allocating a superblock. Make sure a concurrent process won't * be able to re-use the root during this window by delaying the * initialization of root refcnt. */ if (new_root) {<!-- --> mutex_lock( & amp;cgroup_mutex); percpu_ref_reinit( & amp;root->cgrp.self.refcnt); mutex_unlock( & amp;cgroup_mutex); } /* * If @pinned_sb, we're reusing an existing root and holding an * extra ref on its sb. Mount is complete. Put the extra ref. */ if (pinned_sb) deactivate_super(pinned_sb); return dentry; }
cgroup_setup_root(root, opts.subsys_mask, PERCPU_REF_INIT_DEAD);
Unlike the call made during initialization, subsys_mask is non-zero here, so the requested subsystems are actually moved over in rebind_subsystems().
int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags) {<!-- --> LIST_HEAD(tmp_links); struct cgroup *root_cgrp = & amp;root->cgrp; struct kernfs_syscall_ops *kf_sops; struct css_set *cset; int i, ret; lockdep_assert_held( & amp;cgroup_mutex); ret = cgroup_idr_alloc( & amp;root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL); if (ret < 0) goto out; root_cgrp->id = ret; root_cgrp->ancestor_ids[0] = ret; ret = percpu_ref_init( & amp;root_cgrp->self.refcnt, css_release, ref_flags, GFP_KERNEL); if (ret) goto out; /* * We're accessing css_set_count without locking css_set_lock here, * but that's OK - it can only be increased by someone holding * cgroup_lock, and that's us. Later rebinding may disable * controllers on the default hierarchy and thus create new csets, * which can't be more than the existing ones. Allocate 2x. */ ret = allocate_cgrp_cset_links(2 * css_set_count, & amp;tmp_links); if (ret) goto cancel_ref; ret = cgroup_init_root_id(root); if (ret) goto cancel_ref; kf_sops = root == &cgrp_dfl_root ? & amp;cgroup_kf_syscall_ops : & amp;cgroup1_kf_syscall_ops; root->kf_root = kernfs_create_root(kf_sops, KERNFS_ROOT_CREATE_DEACTIVATED | KERNFS_ROOT_SUPPORT_EXPORTOP, root_cgrp); if (IS_ERR(root->kf_root)) {<!-- --> ret = PTR_ERR(root->kf_root); goto exit_root_id; } root_cgrp->kn = root->kf_root->kn; ret = css_populate_dir( & amp;root_cgrp->self); if (ret) goto destroy_root; ret = rebind_subsystems(root, ss_mask); if (ret) goto destroy_root; ret = cgroup_bpf_inherit(root_cgrp); WARN_ON_ONCE(ret); trace_cgroup_setup_root(root); /* * There must be no failure case after here, since rebinding takes * care of subsystems' refcounts, which are explicitly dropped in * the failure exit path. */ list_add( & amp;root->root_list, & amp;cgroup_roots); cgroup_root_count + + ; /* * Link the root cgroup in this hierarchy into all the css_set * objects. 
*/ spin_lock_irq( & amp;css_set_lock); hash_for_each(css_set_table, i, cset, hlist) {<!-- --> link_css_set( & amp;tmp_links, cset, root_cgrp); if (css_set_populated(cset)) cgroup_update_populated(root_cgrp, true); } spin_unlock_irq( & amp;css_set_lock); BUG_ON(!list_empty( & amp;root_cgrp->self.children)); BUG_ON(atomic_read( & amp;root->nr_cgrps) != 1); kernfs_activate(root_cgrp->kn); ret = 0; goto out; destroy_root: kernfs_destroy_root(root->kf_root); root->kf_root = NULL; exit_root_id: cgroup_exit_root_id(root); cancel_ref: percpu_ref_exit(&root_cgrp->self.refcnt); out: free_cgrp_cset_links( & amp;tmp_links); return ret; }
rebind_subsystems
Moves subsystems between cgroup_roots. cgroup_apply_control() is called twice per subsystem because the subsystem is first removed from its original cgroup root and then added to the new one.
int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) {<!-- --> struct cgroup *dcgrp = & amp;dst_root->cgrp; struct cgroup_subsys *ss; int ssid, i, ret; lockdep_assert_held( & amp;cgroup_mutex); do_each_subsys_mask(ss, ssid, ss_mask) {<!-- --> /* * If @ss has non-root csses attached to it, can't move. * If @ss is an implicit controller, it is exempt from this * rule and can be stolen. */ if (css_next_child(NULL, cgroup_css( & amp;ss->root->cgrp, ss)) & amp; & amp; !ss->implicit_on_dfl) return -EBUSY; /* can't move between two non-dummy roots either */ if (ss->root != & amp;cgrp_dfl_root & amp; & amp; dst_root != & amp;cgrp_dfl_root) return -EBUSY; } while_each_subsys_mask(); do_each_subsys_mask(ss, ssid, ss_mask) {<!-- --> struct cgroup_root *src_root = ss->root; struct cgroup *scgrp = & amp;src_root->cgrp; struct cgroup_subsys_state *css = cgroup_css(scgrp, ss); struct css_set *cset; WARN_ON(!css || cgroup_css(dcgrp, ss)); /* disable from the source */ //Move the subsystem out of cgrp_dfl_root. The subsys_mask of cgroup_root indicates which subsystems' css are filled in its subsys array. 
src_root->subsys_mask & amp;= ~(1 << ssid); WARN_ON(cgroup_apply_control(scgrp)); cgroup_finalize_control(scgrp, 0); //Remove the properties file of the specified subsystem from cgrp_dfl_root->cgrp and its descendants /* rebind */ //Indicates that the specified subsystem has added a new cgroup_root RCU_INIT_POINTER(scgrp->subsys[ssid], NULL); rcu_assign_pointer(dcgrp->subsys[ssid], css); ss->root = dst_root; css->cgroup = dcgrp; spin_lock_irq( & amp;css_set_lock); hash_for_each(css_set_table, i, cset, hlist) list_move_tail( & amp;cset->e_cset_node[ss->id], & amp;dcgrp->e_csets[ss->id]); spin_unlock_irq( & amp;css_set_lock); /* default hierarchy doesn't enable controllers by default */ dst_root->subsys_mask |= 1 << ssid; if (dst_root == & amp;cgrp_dfl_root) {<!-- --> static_branch_enable(cgroup_subsys_on_dfl_key[ssid]); } else {<!-- --> dcgrp->subtree_control |= 1 << ssid; static_branch_disable(cgroup_subsys_on_dfl_key[ssid]); } ret = cgroup_apply_control(dcgrp); if (ret) pr_warn("partial failure to rebind %s controller (err=%d)\\ ", ss->name, ret); if (ss->bind) ss->bind(css); } while_each_subsys_mask(); kernfs_activate(dcgrp->kn); return 0; }
cgroup_apply_control
static int cgroup_apply_control(struct cgroup *cgrp) {<!-- --> int ret; cgroup_propagate_control(cgrp); //Synchronize the subsystems of all descendants of cgroup with cgrp ret = cgroup_apply_control_enable(cgrp);//Create cgroup and all descendant cftype nodes if (ret) return ret; /* * At this point, cgroup_e_css_by_mask() results reflect the new csses * making the following cgroup_update_dfl_csses() properly update * css associations of all tasks in the subtree. */ ret = cgroup_update_dfl_csses(cgrp);//Process migration if (ret) return ret; return 0; }
Summary
The mount of cgroup first creates a cgroup_root, and then migrates the specified subsystem from the source cgroup_root to the newly created cgroup_root.