diff options
Diffstat (limited to 'nixpkgs/pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/4.11.patch')
-rw-r--r-- | nixpkgs/pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/4.11.patch | 784 |
1 files changed, 784 insertions, 0 deletions
diff --git a/nixpkgs/pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/4.11.patch b/nixpkgs/pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/4.11.patch new file mode 100644 index 000000000000..38cc0532ba97 --- /dev/null +++ b/nixpkgs/pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/4.11.patch @@ -0,0 +1,784 @@ +commit 827b86ad1dd21feed4c0b99faf6059f245f7dadb +Author: Tejun Heo <tj@kernel.org> +Date: Fri Mar 11 07:31:23 2016 -0500 + + sched: Misc preps for cgroup unified hierarchy interface + + Make the following changes in preparation for the cpu controller + interface implementation for the unified hierarchy. This patch + doesn't cause any functional differences. + + * s/cpu_stats_show()/cpu_cfs_stats_show()/ + + * s/cpu_files/cpu_legacy_files/ + + * Separate out cpuacct_stats_read() from cpuacct_stats_show(). While + at it, make the @val array u64 for consistency. + + Signed-off-by: Tejun Heo <tj@kernel.org> + Cc: Ingo Molnar <mingo@redhat.com> + Cc: Peter Zijlstra <peterz@infradead.org> + Cc: Li Zefan <lizefan@huawei.com> + Cc: Johannes Weiner <hannes@cmpxchg.org> + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 3b31fc05a0f1..a1b95e83fa87 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -7174,7 +7174,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) + return ret; + } + +-static int cpu_stats_show(struct seq_file *sf, void *v) ++static int cpu_cfs_stats_show(struct seq_file *sf, void *v) + { + struct task_group *tg = css_tg(seq_css(sf)); + struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; +@@ -7214,7 +7214,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, + } + #endif /* CONFIG_RT_GROUP_SCHED */ + +-static struct cftype cpu_files[] = { ++static struct cftype cpu_legacy_files[] = { + #ifdef CONFIG_FAIR_GROUP_SCHED + { + .name = "shares", +@@ -7235,7 +7235,7 @@ static struct cftype cpu_files[] = { + }, + { + .name = "stat", +- .seq_show = cpu_stats_show, ++ .seq_show = 
cpu_cfs_stats_show, + }, + #endif + #ifdef CONFIG_RT_GROUP_SCHED +@@ -7261,7 +7261,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { + .fork = cpu_cgroup_fork, + .can_attach = cpu_cgroup_can_attach, + .attach = cpu_cgroup_attach, +- .legacy_cftypes = cpu_files, ++ .legacy_cftypes = cpu_legacy_files, + .early_init = true, + }; + +diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c +index f95ab29a45d0..6151c23f722f 100644 +--- a/kernel/sched/cpuacct.c ++++ b/kernel/sched/cpuacct.c +@@ -276,26 +276,33 @@ static int cpuacct_all_seq_show(struct seq_file *m, void *V) + return 0; + } + +-static int cpuacct_stats_show(struct seq_file *sf, void *v) ++static void cpuacct_stats_read(struct cpuacct *ca, ++ u64 (*val)[CPUACCT_STAT_NSTATS]) + { +- struct cpuacct *ca = css_ca(seq_css(sf)); +- s64 val[CPUACCT_STAT_NSTATS]; + int cpu; +- int stat; + +- memset(val, 0, sizeof(val)); ++ memset(val, 0, sizeof(*val)); ++ + for_each_possible_cpu(cpu) { + u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat; + +- val[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER]; +- val[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE]; +- val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM]; +- val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ]; +- val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ]; ++ (*val)[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER]; ++ (*val)[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE]; ++ (*val)[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM]; ++ (*val)[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ]; ++ (*val)[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ]; + } ++} ++ ++static int cpuacct_stats_show(struct seq_file *sf, void *v) ++{ ++ u64 val[CPUACCT_STAT_NSTATS]; ++ int stat; ++ ++ cpuacct_stats_read(css_ca(seq_css(sf)), &val); + + for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) { +- seq_printf(sf, "%s %lld\n", ++ seq_printf(sf, "%s %llu\n", + cpuacct_stat_desc[stat], + (long long)nsec_to_clock_t(val[stat])); + } + +commit fdb64d002b3a223ce4bb11aa4448a42050470052 +Author: Tejun 
Heo <tj@kernel.org> +Date: Fri Mar 11 07:31:23 2016 -0500 + + sched: Implement interface for cgroup unified hierarchy + + While the cpu controller doesn't have any functional problems, there + are a couple interface issues which can be addressed in the v2 + interface. + + * cpuacct being a separate controller. This separation is artificial + and rather pointless as demonstrated by most use cases co-mounting + the two controllers. It also forces certain information to be + accounted twice. + + * Use of different time units. Writable control knobs use + microseconds, some stat fields use nanoseconds while other cpuacct + stat fields use centiseconds. + + * Control knobs which can't be used in the root cgroup still show up + in the root. + + * Control knob names and semantics aren't consistent with other + controllers. + + This patchset implements cpu controller's interface on the unified + hierarchy which adheres to the controller file conventions described + in Documentation/cgroups/unified-hierarchy.txt. Overall, the + following changes are made. + + * cpuacct is implicitly enabled and disabled by cpu and its information + is reported through "cpu.stat" which now uses microseconds for all + time durations. All time duration fields now have "_usec" appended + to them for clarity. While this doesn't solve the double accounting + immediately, once majority of users switch to v2, cpu can directly + account and report the relevant stats and cpuacct can be disabled on + the unified hierarchy. + + Note that cpuacct.usage_percpu is currently not included in + "cpu.stat". If this information is actually called for, it can be + added later. + + * "cpu.shares" is replaced with "cpu.weight" and operates on the + standard scale defined by CGROUP_WEIGHT_MIN/DFL/MAX (1, 100, 10000). + The weight is scaled to scheduler weight so that 100 maps to 1024 + and the ratio relationship is preserved - if weight is W and its + scaled value is S, W / 100 == S / 1024. 
While the mapped range is a + bit smaller than the original scheduler weight range, the dead zones + on both sides are relatively small and it covers a wider range than the + nice value mappings. This file doesn't make sense in the root + cgroup and isn't created on root. + + * "cpu.cfs_quota_us" and "cpu.cfs_period_us" are replaced by "cpu.max" + which contains both quota and period. + + * "cpu.rt_runtime_us" and "cpu.rt_period_us" are replaced by + "cpu.rt.max" which contains both runtime and period. + + v2: cpu_stats_show() was incorrectly using CONFIG_FAIR_GROUP_SCHED for + CFS bandwidth stats and also using raw division for u64. Use + CONFIG_CFS_BANDWIDTH and do_div() instead. + + The semantics of "cpu.rt.max" is not fully decided yet. Dropped + for now. + + Signed-off-by: Tejun Heo <tj@kernel.org> + Cc: Ingo Molnar <mingo@redhat.com> + Cc: Peter Zijlstra <peterz@infradead.org> + Cc: Li Zefan <lizefan@huawei.com> + Cc: Johannes Weiner <hannes@cmpxchg.org> + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index a1b95e83fa87..f01d56e58a1b 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -7253,6 +7253,139 @@ static struct cftype cpu_legacy_files[] = { + { } /* Terminate */ + }; + ++static int cpu_stats_show(struct seq_file *sf, void *v) ++{ ++ cpuacct_cpu_stats_show(sf); ++ ++#ifdef CONFIG_CFS_BANDWIDTH ++ { ++ struct task_group *tg = css_tg(seq_css(sf)); ++ struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; ++ u64 throttled_usec; ++ ++ throttled_usec = cfs_b->throttled_time; ++ do_div(throttled_usec, NSEC_PER_USEC); ++ ++ seq_printf(sf, "nr_periods %d\n" ++ "nr_throttled %d\n" ++ "throttled_usec %llu\n", ++ cfs_b->nr_periods, cfs_b->nr_throttled, ++ throttled_usec); ++ } ++#endif ++ return 0; ++} ++ ++#ifdef CONFIG_FAIR_GROUP_SCHED ++static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css, ++ struct cftype *cft) ++{ ++ struct task_group *tg = css_tg(css); ++ u64 weight = scale_load_down(tg->shares); ++ ++ return 
DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024); ++} ++ ++static int cpu_weight_write_u64(struct cgroup_subsys_state *css, ++ struct cftype *cftype, u64 weight) ++{ ++ /* ++ * cgroup weight knobs should use the common MIN, DFL and MAX ++ * values which are 1, 100 and 10000 respectively. While it loses ++ * a bit of range on both ends, it maps pretty well onto the shares ++ * value used by scheduler and the round-trip conversions preserve ++ * the original value over the entire range. ++ */ ++ if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX) ++ return -ERANGE; ++ ++ weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL); ++ ++ return sched_group_set_shares(css_tg(css), scale_load(weight)); ++} ++#endif ++ ++static void __maybe_unused cpu_period_quota_print(struct seq_file *sf, ++ long period, long quota) ++{ ++ if (quota < 0) ++ seq_puts(sf, "max"); ++ else ++ seq_printf(sf, "%ld", quota); ++ ++ seq_printf(sf, " %ld\n", period); ++} ++ ++/* caller should put the current value in *@periodp before calling */ ++static int __maybe_unused cpu_period_quota_parse(char *buf, ++ u64 *periodp, u64 *quotap) ++{ ++ char tok[21]; /* U64_MAX */ ++ ++ if (!sscanf(buf, "%s %llu", tok, periodp)) ++ return -EINVAL; ++ ++ *periodp *= NSEC_PER_USEC; ++ ++ if (sscanf(tok, "%llu", quotap)) ++ *quotap *= NSEC_PER_USEC; ++ else if (!strcmp(tok, "max")) ++ *quotap = RUNTIME_INF; ++ else ++ return -EINVAL; ++ ++ return 0; ++} ++ ++#ifdef CONFIG_CFS_BANDWIDTH ++static int cpu_max_show(struct seq_file *sf, void *v) ++{ ++ struct task_group *tg = css_tg(seq_css(sf)); ++ ++ cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg)); ++ return 0; ++} ++ ++static ssize_t cpu_max_write(struct kernfs_open_file *of, ++ char *buf, size_t nbytes, loff_t off) ++{ ++ struct task_group *tg = css_tg(of_css(of)); ++ u64 period = tg_get_cfs_period(tg); ++ u64 quota; ++ int ret; ++ ++ ret = cpu_period_quota_parse(buf, &period, &quota); ++ if (!ret) ++ ret = 
tg_set_cfs_bandwidth(tg, period, quota); ++ return ret ?: nbytes; ++} ++#endif ++ ++static struct cftype cpu_files[] = { ++ { ++ .name = "stat", ++ .flags = CFTYPE_NOT_ON_ROOT, ++ .seq_show = cpu_stats_show, ++ }, ++#ifdef CONFIG_FAIR_GROUP_SCHED ++ { ++ .name = "weight", ++ .flags = CFTYPE_NOT_ON_ROOT, ++ .read_u64 = cpu_weight_read_u64, ++ .write_u64 = cpu_weight_write_u64, ++ }, ++#endif ++#ifdef CONFIG_CFS_BANDWIDTH ++ { ++ .name = "max", ++ .flags = CFTYPE_NOT_ON_ROOT, ++ .seq_show = cpu_max_show, ++ .write = cpu_max_write, ++ }, ++#endif ++ { } /* terminate */ ++}; ++ + struct cgroup_subsys cpu_cgrp_subsys = { + .css_alloc = cpu_cgroup_css_alloc, + .css_online = cpu_cgroup_css_online, +@@ -7262,7 +7395,15 @@ struct cgroup_subsys cpu_cgrp_subsys = { + .can_attach = cpu_cgroup_can_attach, + .attach = cpu_cgroup_attach, + .legacy_cftypes = cpu_legacy_files, ++ .dfl_cftypes = cpu_files, + .early_init = true, ++#ifdef CONFIG_CGROUP_CPUACCT ++ /* ++ * cpuacct is enabled together with cpu on the unified hierarchy ++ * and its stats are reported through "cpu.stat". 
++ */ ++ .depends_on = 1 << cpuacct_cgrp_id, ++#endif + }; + + #endif /* CONFIG_CGROUP_SCHED */ +diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c +index 6151c23f722f..fc1cf13c3af1 100644 +--- a/kernel/sched/cpuacct.c ++++ b/kernel/sched/cpuacct.c +@@ -347,6 +347,31 @@ static struct cftype files[] = { + { } /* terminate */ + }; + ++/* used to print cpuacct stats in cpu.stat on the unified hierarchy */ ++void cpuacct_cpu_stats_show(struct seq_file *sf) ++{ ++ struct cgroup_subsys_state *css; ++ u64 usage, val[CPUACCT_STAT_NSTATS]; ++ ++ css = cgroup_get_e_css(seq_css(sf)->cgroup, &cpuacct_cgrp_subsys); ++ ++ usage = cpuusage_read(css, seq_cft(sf)); ++ cpuacct_stats_read(css_ca(css), &val); ++ ++ val[CPUACCT_STAT_USER] *= TICK_NSEC; ++ val[CPUACCT_STAT_SYSTEM] *= TICK_NSEC; ++ do_div(usage, NSEC_PER_USEC); ++ do_div(val[CPUACCT_STAT_USER], NSEC_PER_USEC); ++ do_div(val[CPUACCT_STAT_SYSTEM], NSEC_PER_USEC); ++ ++ seq_printf(sf, "usage_usec %llu\n" ++ "user_usec %llu\n" ++ "system_usec %llu\n", ++ usage, val[CPUACCT_STAT_USER], val[CPUACCT_STAT_SYSTEM]); ++ ++ css_put(css); ++} ++ + /* + * charge this task's execution time to its accounting group. 
+ * +diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h +index ba72807c73d4..ddf7af466d35 100644 +--- a/kernel/sched/cpuacct.h ++++ b/kernel/sched/cpuacct.h +@@ -2,6 +2,7 @@ + + extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); + extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val); ++extern void cpuacct_cpu_stats_show(struct seq_file *sf); + + #else + +@@ -14,4 +15,8 @@ cpuacct_account_field(struct task_struct *tsk, int index, u64 val) + { + } + ++static inline void cpuacct_cpu_stats_show(struct seq_file *sf) ++{ ++} ++ + #endif + +commit 8dde150866b8c433216105c50b7e889d5242d583 +Author: Tejun Heo <tj@kernel.org> +Date: Fri Aug 5 12:41:01 2016 -0400 + + cgroup: add documentation regarding CPU controller cgroup v2 support + + Signed-off-by: Tejun Heo <tj@kernel.org> + +diff --git a/Documentation/cgroup-v2-cpu.txt b/Documentation/cgroup-v2-cpu.txt +new file mode 100644 +index 000000000000..1ed7032d4472 +--- /dev/null ++++ b/Documentation/cgroup-v2-cpu.txt +@@ -0,0 +1,368 @@ ++ ++ ++CPU Controller on Control Group v2 ++ ++August, 2016 Tejun Heo <tj@kernel.org> ++ ++ ++While most controllers have support for cgroup v2 now, the CPU ++controller support is not upstream yet due to objections from the ++scheduler maintainers on the basic designs of cgroup v2. This ++document explains the current situation as well as an interim ++solution, and details the disagreements and arguments. The latest ++version of this document can be found at the following URL. ++ ++ https://git.kernel.org/cgit/linux/kernel/git/tj/cgroup.git/tree/Documentation/cgroup-v2-cpu.txt?h=cgroup-v2-cpu ++ ++This document was posted to the linux-kernel and cgroup mailing lists. ++Unfortunately, no consensus was reached as of Oct, 2016. The thread ++can be found at the following URL. ++ ++ http://lkml.kernel.org/r/20160805170752.GK2542@mtj.duckdns.org ++ ++ ++CONTENTS ++ ++1. Current Situation and Interim Solution ++2. Disagreements and Arguments ++ 2-1. 
Contentious Restrictions ++ 2-1-1. Process Granularity ++ 2-1-2. No Internal Process Constraint ++ 2-2. Impact on CPU Controller ++ 2-2-1. Impact of Process Granularity ++ 2-2-2. Impact of No Internal Process Constraint ++ 2-3. Arguments for cgroup v2 ++3. Way Forward ++4. References ++ ++ ++1. Current Situation and Interim Solution ++ ++All objections from the scheduler maintainers apply to cgroup v2 core ++design, and there are no known objections to the specifics of the CPU ++controller cgroup v2 interface. The only blocked part is changes to ++expose the CPU controller interface on cgroup v2, which comprises the ++following two patches: ++ ++ [1] sched: Misc preps for cgroup unified hierarchy interface ++ [2] sched: Implement interface for cgroup unified hierarchy ++ ++The necessary changes are superficial and implement the interface ++files on cgroup v2. The combined diffstat is as follows. ++ ++ kernel/sched/core.c | 149 +++++++++++++++++++++++++++++++++++++++++++++++-- ++ kernel/sched/cpuacct.c | 57 ++++++++++++------ ++ kernel/sched/cpuacct.h | 5 + ++ 3 files changed, 189 insertions(+), 22 deletions(-) ++ ++The patches are easy to apply and forward-port. The following git ++branch will always carry the two patches on top of the latest release ++of the upstream kernel. ++ ++ git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git/cgroup-v2-cpu ++ ++There also are versioned branches going back to v4.4. ++ ++ git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git/cgroup-v2-cpu-$KERNEL_VER ++ ++While it's difficult to tell whether the CPU controller support will ++be merged, there are crucial resource control features in cgroup v2 ++that are only possible due to the design choices that are being ++objected to, and every effort will be made to ease enabling the CPU ++controller cgroup v2 support out-of-tree for parties which choose to. ++ ++ ++2. 
Disagreements and Arguments ++ ++There have been several lengthy discussion threads [3][4] on LKML ++around the structural constraints of cgroup v2. The two that affect ++the CPU controller are process granularity and no internal process ++constraint. Both arise primarily from the need for common resource ++domain definition across different resources. ++ ++The common resource domain is a powerful concept in cgroup v2 that ++allows controllers to make basic assumptions about the structural ++organization of processes and controllers inside the cgroup hierarchy, ++and thus solve problems spanning multiple types of resources. The ++prime example for this is page cache writeback: dirty page cache is ++regulated through throttling buffered writers based on memory ++availability, and initiating batched write outs to the disk based on ++IO capacity. Tracking and controlling writeback inside a cgroup thus ++requires the direct cooperation of the memory and the IO controller. ++ ++This easily extends to other areas, such as CPU cycles consumed while ++performing memory reclaim or IO encryption. ++ ++ ++2-1. Contentious Restrictions ++ ++For controllers of different resources to work together, they must ++agree on a common organization. This uniform model across controllers ++imposes two contentious restrictions on the CPU controller: process ++granularity and the no-internal-process constraint. ++ ++ ++ 2-1-1. Process Granularity ++ ++ For memory, because an address space is shared between all threads ++ of a process, the terminal consumer is a process, not a thread. ++ Separating the threads of a single process into different memory ++ control domains doesn't make semantic sense. cgroup v2 ensures ++ that all controllers can agree on the same organization by requiring ++ that threads of the same process belong to the same cgroup. ++ ++ There are other reasons to enforce process granularity. 
One ++ important one is isolating system-level management operations from ++ in-process application operations. The cgroup interface, being a ++ virtual filesystem, is very unfit for multiple independent ++ operations taking place at the same time as most operations have to ++ be multi-step and there is no way to synchronize multiple accessors. ++ See also [5] Documentation/cgroup-v2.txt, "R-2. Thread Granularity" ++ ++ ++ 2-1-2. No Internal Process Constraint ++ ++ cgroup v2 does not allow processes to belong to any cgroup which has ++ child cgroups when resource controllers are enabled on it (the ++ notable exception being the root cgroup itself). This is because, ++ for some resources, a resource domain (cgroup) is not directly ++ comparable to the terminal consumer (process/task) of said resource, ++ and so putting the two into a sibling relationship isn't meaningful. ++ ++ - Differing Control Parameters and Capabilities ++ ++ A cgroup controller has different resource control parameters and ++ capabilities from a terminal consumer, be that a task or process. ++ There are a couple cases where a cgroup control knob can be mapped ++ to a per-task or per-process API but they are exceptions and the ++ mappings aren't obvious even in those cases. ++ ++ For example, task priorities (also known as nice values) set ++ through setpriority(2) are mapped to the CPU controller ++ "cpu.shares" values. However, how exactly the two ranges map and ++ even the fact that they map to each other at all are not obvious. ++ ++ The situation gets further muddled when considering other resource ++ types and control knobs. IO priorities set through ioprio_set(2) ++ cannot be mapped to IO controller weights and most cgroup resource ++ control knobs including the bandwidth control knobs of the CPU ++ controller don't have counterparts in the terminal consumers. 
++ ++ - Anonymous Resource Consumption ++ ++ For CPU, every time slice consumed from inside a cgroup, which ++ comprises most but not all of consumed CPU time for the cgroup, ++ can be clearly attributed to a specific task or process. Because ++ these two types of entities are directly comparable as consumers ++ of CPU time, it's theoretically possible to mix tasks and cgroups ++ on the same tree levels and let them directly compete for the time ++ quota available to their common ancestor. ++ ++ However, the same can't be said for resource types like memory or ++ IO: the memory consumed by the page cache, for example, can be ++ tracked on a per-cgroup level, but due to mismatches in lifetimes ++ of involved objects (page cache can persist long after processes ++ are gone), shared usages and the implementation overhead of ++ tracking persistent state, it can no longer be attributed to ++ individual processes after instantiation. Consequently, any IO ++ incurred by page cache writeback can be attributed to a cgroup, ++ but not to the individual consumers inside the cgroup. ++ ++ For memory and IO, this makes a resource domain (cgroup) an object ++ of a fundamentally different type than a terminal consumer ++ (process). A process can't be a first class object in the resource ++ distribution graph as its total resource consumption can't be ++ described without the containing resource domain. ++ ++ Disallowing processes in internal cgroups avoids competition between ++ cgroups and processes which cannot be meaningfully defined for these ++ resources. All resource control takes place among cgroups and a ++ terminal consumer interacts with the containing cgroup the same way ++ it would with the system without cgroup. ++ ++ Root cgroup is exempt from this constraint, which is in line with ++ how root cgroup is handled in general - it's excluded from cgroup ++ resource accounting and control. 
++ ++ ++Enforcing process granularity and no internal process constraint ++allows all controllers to be on the same footing in terms of resource ++distribution hierarchy. ++ ++ ++2-2. Impact on CPU Controller ++ ++As indicated earlier, the CPU controller's resource distribution graph ++is the simplest. Every schedulable resource consumption can be ++attributed to a specific task. In addition, for weight based control, ++the per-task priority set through setpriority(2) can be translated to ++and from a per-cgroup weight. As such, the CPU controller can treat a ++task and a cgroup symmetrically, allowing support for any tree layout ++of cgroups and tasks. Both process granularity and the no internal ++process constraint restrict how the CPU controller can be used. ++ ++ ++ 2-2-1. Impact of Process Granularity ++ ++ Process granularity prevents tasks belonging to the same process from ++ being assigned to different cgroups. It was pointed out [6] that this ++ excludes the valid use case of hierarchical CPU distribution within ++ processes. ++ ++ To address this issue, the rgroup (resource group) [7][8][9] ++ interface, an extension of the existing setpriority(2) API, was ++ proposed, which is in line with other programmable priority ++ mechanisms and eliminates the risk of in-application configuration ++ and system configuration stepping on each other's toes. ++ Unfortunately, the proposal quickly turned into discussions around ++ cgroup v2 design decisions [4] and no consensus could be reached. ++ ++ ++ 2-2-2. Impact of No Internal Process Constraint ++ ++ The no internal process constraint disallows tasks from competing ++ directly against cgroups. 
Here is an excerpt from Peter Zijlstra ++ pointing out the issue [10] - R, L and A are cgroups; t1, t2, t3 and ++ t4 are tasks: ++ ++ ++ R ++ / | \ ++ t1 t2 A ++ / \ ++ t3 t4 ++ ++ ++ Is fundamentally different from: ++ ++ ++ R ++ / \ ++ L A ++ / \ / \ ++ t1 t2 t3 t4 ++ ++ ++ Because if in the first hierarchy you add a task (t5) to R, all of ++ its A will run at 1/4th of total bandwidth where before it had ++ 1/3rd, whereas with the second example, if you add our t5 to L, A ++ doesn't get any less bandwidth. ++ ++ ++ It is true that the trees are semantically different from each other ++ and the symmetric handling of tasks and cgroups is aesthetically ++ pleasing. However, it isn't clear what the practical usefulness of ++ a layout with direct competition between tasks and cgroups would be, ++ considering that number and behavior of tasks are controlled by each ++ application, and cgroups primarily deal with system level resource ++ distribution; changes in the number of active threads would directly ++ impact resource distribution. Real world use cases of such layouts ++ could not be established during the discussions. ++ ++ ++2-3. Arguments for cgroup v2 ++ ++There are strong demands for comprehensive hierarchical resource ++control across all major resources, and establishing a common resource ++hierarchy is an essential step. As with most engineering decisions, ++common resource hierarchy definition comes with its trade-offs. With ++cgroup v2, the trade-offs are in the form of structural constraints ++which, among others, restrict the CPU controller's space of possible ++configurations. ++ ++However, even with the restrictions, cgroup v2, in combination with ++rgroup, covers most of identified real world use cases while enabling ++new important use cases of resource control across multiple resource ++types that were fundamentally broken previously. 
++ ++Furthermore, for resource control, treating resource domains as ++objects of a different type from terminal consumers has important ++advantages - it can account for resource consumptions which are not ++tied to any specific terminal consumer, be that a task or process, and ++allows decoupling resource distribution controls from in-application ++APIs. Even the CPU controller may benefit from it as the kernel can ++consume significant amount of CPU cycles in interrupt context or tasks ++shared across multiple resource domains (e.g. softirq). ++ ++Finally, it's important to note that enabling cgroup v2 support for ++the CPU controller doesn't block use cases which require the features ++which are not available on cgroup v2. Unlikely, but should anybody ++actually rely on the CPU controller's symmetric handling of tasks and ++cgroups, backward compatibility is and will be maintained by being ++able to disconnect the controller from the cgroup v2 hierarchy and use ++it standalone. This also holds for cpuset which is often used in ++highly customized configurations which might be a poor fit for common ++resource domains. ++ ++The required changes are minimal, the benefits for the target use ++cases are critical and obvious, and use cases which have to use v1 can ++continue to do so. ++ ++ ++3. Way Forward ++ ++cgroup v2 primarily aims to solve the problem of comprehensive ++hierarchical resource control across all major computing resources, ++which is one of the core problems of modern server infrastructure ++engineering. The trade-offs that cgroup v2 took are results of ++pursuing that goal and gaining a better understanding of the nature of ++resource control in the process. ++ ++I believe that real world usages will prove cgroup v2's model right, ++considering the crucial pieces of comprehensive resource control that ++cannot be implemented without common resource domains. 
This is not to ++say that cgroup v2 is fixed in stone and can't be updated; if there is ++an approach which better serves both comprehensive resource control ++and the CPU controller's flexibility, we will surely move towards ++that. It goes without saying that discussions around such approach ++should consider practical aspects of resource control as a whole ++rather than absolutely focusing on a particular controller. ++ ++Until such consensus can be reached, the CPU controller cgroup v2 ++support will be maintained out of the mainline kernel in an easily ++accessible form. If there is anything cgroup developers can do to ++ease the pain, please feel free to contact us on the cgroup mailing ++list at cgroups@vger.kernel.org. ++ ++ ++4. References ++ ++[1] http://lkml.kernel.org/r/20160105164834.GE5995@mtj.duckdns.org ++ [PATCH 1/2] sched: Misc preps for cgroup unified hierarchy interface ++ Tejun Heo <tj@kernel.org> ++ ++[2] http://lkml.kernel.org/r/20160105164852.GF5995@mtj.duckdns.org ++ [PATCH 2/2] sched: Implement interface for cgroup unified hierarchy ++ Tejun Heo <tj@kernel.org> ++ ++[3] http://lkml.kernel.org/r/1438641689-14655-4-git-send-email-tj@kernel.org ++ [PATCH 3/3] sched: Implement interface for cgroup unified hierarchy ++ Tejun Heo <tj@kernel.org> ++ ++[4] http://lkml.kernel.org/r/20160407064549.GH3430@twins.programming.kicks-ass.net ++ Re: [PATCHSET RFC cgroup/for-4.6] cgroup, sched: implement resource group and PRIO_RGRP ++ Peter Zijlstra <peterz@infradead.org> ++ ++[5] https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/Documentation/cgroup-v2.txt ++ Control Group v2 ++ Tejun Heo <tj@kernel.org> ++ ++[6] http://lkml.kernel.org/r/CAPM31RJNy3jgG=DYe6GO=wyL4BPPxwUm1f2S6YXacQmo7viFZA@mail.gmail.com ++ Re: [PATCH 3/3] sched: Implement interface for cgroup unified hierarchy ++ Paul Turner <pjt@google.com> ++ ++[7] http://lkml.kernel.org/r/20160105154503.GC5995@mtj.duckdns.org ++ [RFD] cgroup: thread granularity support for cpu 
controller ++ Tejun Heo <tj@kernel.org> ++ ++[8] http://lkml.kernel.org/r/1457710888-31182-1-git-send-email-tj@kernel.org ++ [PATCHSET RFC cgroup/for-4.6] cgroup, sched: implement resource group and PRIO_RGRP ++ Tejun Heo <tj@kernel.org> ++ ++[9] http://lkml.kernel.org/r/20160311160522.GA24046@htj.duckdns.org ++ Example program for PRIO_RGRP ++ Tejun Heo <tj@kernel.org> ++ ++[10] http://lkml.kernel.org/r/20160407082810.GN3430@twins.programming.kicks-ass.net ++ Re: [PATCHSET RFC cgroup/for-4.6] cgroup, sched: implement resource ++ Peter Zijlstra <peterz@infradead.org> |