aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMichal Hocko <mhocko@suse.com>2018-07-04 11:54:35 +0200
committerMichal Hocko <mhocko@suse.com>2018-07-04 11:54:35 +0200
commitacd6e6b7071663a082ecbff3f0824d0e86884a27 (patch)
tree384bf43b7701143d2b5cb4a6b24e3df7df684d1c
parentab8c3a3d7b2155eb9fb2df30c3d7d275121969e8 (diff)
parent4245153446ca19a32a5061fc2144368a42da5d0f (diff)
downloadmm-akpm/for-next.tar.gz
Merge branch 'akpm/memcg-oom' into akpm/for-nextakpm/for-next
-rw-r--r--Documentation/admin-guide/cgroup-v2.rst74
-rw-r--r--include/linux/cgroup-defs.h5
-rw-r--r--include/linux/memcontrol.h25
-rw-r--r--include/linux/oom.h12
-rw-r--r--kernel/cgroup/cgroup.c13
-rw-r--r--mm/memcontrol.c264
-rw-r--r--mm/oom_kill.c210
7 files changed, 526 insertions, 77 deletions
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 8a2c52d5c53b7..2d4c4cf72008a 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -48,6 +48,7 @@ v1 is available under Documentation/cgroup-v1/.
5-2-1. Memory Interface Files
5-2-2. Usage Guidelines
5-2-3. Memory Ownership
+ 5-2-4. OOM Killer
5-3. IO
5-3-1. IO Interface Files
5-3-2. Writeback
@@ -1069,6 +1070,31 @@ PAGE_SIZE multiple when read back.
high limit is used and monitored properly, this limit's
utility is limited to providing the final safety net.
+ memory.oom_group
+
+ A read-write single value file which exists on non-root
+ cgroups. The default is "0".
+
+ If set, OOM killer will consider the memory cgroup as an
+ indivisible memory consumers and compare it with other memory
+ consumers by it's memory footprint.
+ If such memory cgroup is selected as an OOM victim, all
+ processes belonging to it or it's descendants will be killed.
+
+ This applies to system-wide OOM conditions and reaching
+ the hard memory limit of the cgroup and their ancestor.
+ If OOM condition happens in a descendant cgroup with it's own
+ memory limit, the memory cgroup can't be considered
+ as an OOM victim, and OOM killer will not kill all belonging
+ tasks.
+
+ Also, OOM killer respects the /proc/pid/oom_score_adj value -1000,
+ and will never kill the unkillable task, even if memory.oom_group
+ is set.
+
+ If cgroup-aware OOM killer is not enabled, ENOTSUPP error
+ is returned on attempt to access the file.
+
memory.events
A read-only flat-keyed file which exists on non-root cgroups.
The following entries are defined. Unless specified
@@ -1293,6 +1319,54 @@ to be accessed repeatedly by other cgroups, it may make sense to use
POSIX_FADV_DONTNEED to relinquish the ownership of memory areas
belonging to the affected files to ensure correct memory ownership.
+OOM Killer
+~~~~~~~~~~
+
+Cgroup v2 memory controller implements a cgroup-aware OOM killer.
+It means that it treats cgroups as first class OOM entities.
+
+Cgroup-aware OOM logic is turned off by default and requires
+passing the "groupoom" option on mounting cgroupfs. It can also
+by remounting cgroupfs with the following command::
+
+ # mount -o remount,groupoom $MOUNT_POINT
+
+Under OOM conditions the memory controller tries to make the best
+choice of a victim, looking for a memory cgroup with the largest
+memory footprint, considering leaf cgroups and cgroups with the
+memory.oom_group option set, which are considered to be an indivisible
+memory consumers.
+
+By default, OOM killer will kill the biggest task in the selected
+memory cgroup. A user can change this behavior by enabling
+the per-cgroup memory.oom_group option. If set, it causes
+the OOM killer to kill all processes attached to the cgroup,
+except processes with oom_score_adj set to -1000.
+
+This affects both system- and cgroup-wide OOMs. For a cgroup-wide OOM
+the memory controller considers only cgroups belonging to the sub-tree
+of the OOM'ing cgroup.
+
+Leaf cgroups and cgroups with oom_group option set are compared based
+on their cumulative memory usage. The root cgroup is treated as a
+leaf memory cgroup as well, so it is compared with other leaf memory
+cgroups. Due to internal implementation restrictions the size of
+the root cgroup is the cumulative sum of oom_badness of all its tasks
+(in other words oom_score_adj of each task is obeyed). Relying on
+oom_score_adj (apart from OOM_SCORE_ADJ_MIN) can lead to over- or
+underestimation of the root cgroup consumption and it is therefore
+discouraged. This might change in the future, however.
+
+If there are no cgroups with the enabled memory controller,
+the OOM killer is using the "traditional" process-based approach.
+
+Please, note that memory charges are not migrating if tasks
+are moved between different memory cgroups. Moving tasks with
+significant memory footprint may affect OOM victim selection logic.
+If it's a case, please, consider creating a common ancestor for
+the source and destination memory cgroups and enabling oom_group
+on ancestor layer.
+
IO
--
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index c0e68f903011c..92af04018176e 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -81,6 +81,11 @@ enum {
* Enable cpuset controller in v1 cgroup to use v2 behavior.
*/
CGRP_ROOT_CPUSET_V2_MODE = (1 << 4),
+
+ /*
+ * Enable cgroup-aware OOM killer.
+ */
+ CGRP_GROUP_OOM = (1 << 5),
};
/* cftype->flags */
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 06ba93c78a6fa..90d0a5e64c740 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -35,6 +35,7 @@ struct mem_cgroup;
struct page;
struct mm_struct;
struct kmem_cache;
+struct oom_control;
/* Cgroup-specific page state, on top of universal node page state */
enum memcg_stat_item {
@@ -233,6 +234,13 @@ struct mem_cgroup {
/* OOM-Killer disable */
int oom_kill_disable;
+ /*
+ * Treat the sub-tree as an indivisible memory consumer,
+ * kill all belonging tasks if the memory cgroup selected
+ * as OOM victim.
+ */
+ bool oom_group;
+
/* memory.events */
struct cgroup_file events_file;
@@ -540,6 +548,13 @@ static inline bool task_in_memcg_oom(struct task_struct *p)
bool mem_cgroup_oom_synchronize(bool wait);
+bool mem_cgroup_select_oom_victim(struct oom_control *oc);
+
+static inline bool mem_cgroup_oom_group(struct mem_cgroup *memcg)
+{
+ return memcg->oom_group;
+}
+
#ifdef CONFIG_MEMCG_SWAP
extern int do_swap_account;
#endif
@@ -1075,6 +1090,16 @@ static inline
void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
{
}
+
+static inline bool mem_cgroup_select_oom_victim(struct oom_control *oc)
+{
+ return false;
+}
+
+static inline bool mem_cgroup_oom_group(struct mem_cgroup *memcg)
+{
+ return false;
+}
#endif /* CONFIG_MEMCG */
/* idx can be of type enum memcg_stat_item or node_stat_item */
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 6adac113e96d2..553eb37def7e7 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -10,6 +10,13 @@
#include <linux/sched/coredump.h> /* MMF_* */
#include <linux/mm.h> /* VM_FAULT* */
+
+/*
+ * Special value returned by victim selection functions to indicate
+ * that are inflight OOM victims.
+ */
+#define INFLIGHT_VICTIM ((void *)-1UL)
+
struct zonelist;
struct notifier_block;
struct mem_cgroup;
@@ -40,7 +47,8 @@ struct oom_control {
/* Used by oom implementation, do not set */
unsigned long totalpages;
- struct task_struct *chosen;
+ struct task_struct *chosen_task;
+ struct mem_cgroup *chosen_memcg;
unsigned long chosen_points;
};
@@ -113,6 +121,8 @@ extern void oom_killer_enable(void);
extern struct task_struct *find_lock_task_mm(struct task_struct *p);
+extern int oom_evaluate_task(struct task_struct *task, void *arg);
+
/* sysctls */
extern int sysctl_oom_dump_tasks;
extern int sysctl_oom_kill_allocating_task;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 077370bf89643..de25822590d0a 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1747,6 +1747,9 @@ static int parse_cgroup_root_flags(char *data, unsigned int *root_flags)
if (!strcmp(token, "nsdelegate")) {
*root_flags |= CGRP_ROOT_NS_DELEGATE;
continue;
+ } else if (!strcmp(token, "groupoom")) {
+ *root_flags |= CGRP_GROUP_OOM;
+ continue;
}
pr_err("cgroup2: unknown option \"%s\"\n", token);
@@ -1763,6 +1766,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags)
cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
else
cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
+
+ if (root_flags & CGRP_GROUP_OOM)
+ cgrp_dfl_root.flags |= CGRP_GROUP_OOM;
+ else
+ cgrp_dfl_root.flags &= ~CGRP_GROUP_OOM;
}
}
@@ -1770,6 +1778,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root
{
if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
seq_puts(seq, ",nsdelegate");
+ if (cgrp_dfl_root.flags & CGRP_GROUP_OOM)
+ seq_puts(seq, ",groupoom");
return 0;
}
@@ -5957,7 +5967,8 @@ static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);
static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "nsdelegate\n");
+ return snprintf(buf, PAGE_SIZE, "nsdelegate\n"
+ "groupoom\n");
}
static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b710018bf6539..1a945ad6b5f59 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1048,7 +1048,8 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
* value, the function breaks the iteration loop and returns the value.
* Otherwise, it will iterate over all tasks and return 0.
*
- * This function must not be called for the root memory cgroup.
+ * If memcg is the root memory cgroup, this function will iterate only
+ * over tasks belonging directly to the root memory cgroup.
*/
int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
int (*fn)(struct task_struct *, void *), void *arg)
@@ -1056,8 +1057,6 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
struct mem_cgroup *iter;
int ret = 0;
- BUG_ON(memcg == root_mem_cgroup);
-
for_each_mem_cgroup_tree(iter, memcg) {
struct css_task_iter it;
struct task_struct *task;
@@ -1066,7 +1065,7 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
while (!ret && (task = css_task_iter_next(&it)))
ret = fn(task, arg);
css_task_iter_end(&it);
- if (ret) {
+ if (ret || memcg == root_mem_cgroup) {
mem_cgroup_iter_break(memcg, iter);
break;
}
@@ -2804,6 +2803,224 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg)
return ret;
}
+static long memcg_oom_badness(struct mem_cgroup *memcg,
+ const nodemask_t *nodemask,
+ unsigned long totalpages)
+{
+ long points = 0;
+ int nid;
+ pg_data_t *pgdat;
+
+ for_each_node_state(nid, N_MEMORY) {
+ if (nodemask && !node_isset(nid, *nodemask))
+ continue;
+
+ points += mem_cgroup_node_nr_lru_pages(memcg, nid,
+ LRU_ALL_ANON | BIT(LRU_UNEVICTABLE));
+
+ pgdat = NODE_DATA(nid);
+ points += lruvec_page_state(mem_cgroup_lruvec(pgdat, memcg),
+ NR_SLAB_UNRECLAIMABLE);
+ }
+
+ points += memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) /
+ (PAGE_SIZE / 1024);
+ points += memcg_page_state(memcg, MEMCG_SOCK);
+ points += memcg_page_state(memcg, MEMCG_SWAP);
+
+ return points;
+}
+
+/*
+ * Checks if the given memcg is a valid OOM victim and returns a number,
+ * which means the folowing:
+ * -1: there are inflight OOM victim tasks, belonging to the memcg
+ * 0: memcg is not eligible, e.g. all belonging tasks are protected
+ * by oom_score_adj set to OOM_SCORE_ADJ_MIN
+ * >0: memcg is eligible, and the returned value is an estimation
+ * of the memory footprint
+ */
+static long oom_evaluate_memcg(struct mem_cgroup *memcg,
+ const nodemask_t *nodemask,
+ unsigned long totalpages)
+{
+ struct css_task_iter it;
+ struct task_struct *task;
+ int eligible = 0;
+
+ /*
+ * Root memory cgroup is a special case:
+ * we don't have necessary stats to evaluate it exactly as
+ * leaf memory cgroups, so we approximate it's oom_score
+ * by summing oom_score of all belonging tasks, which are
+ * owners of their mm structs.
+ *
+ * If there are inflight OOM victim tasks inside
+ * the root memcg, we return -1.
+ */
+ if (memcg == root_mem_cgroup) {
+ struct css_task_iter it;
+ struct task_struct *task;
+ long score = 0;
+
+ css_task_iter_start(&memcg->css, 0, &it);
+ while ((task = css_task_iter_next(&it))) {
+ if (tsk_is_oom_victim(task) &&
+ !test_bit(MMF_OOM_SKIP,
+ &task->signal->oom_mm->flags)) {
+ score = -1;
+ break;
+ }
+
+ task_lock(task);
+ if (!task->mm) {
+ task_unlock(task);
+ continue;
+ }
+ task_unlock(task);
+
+ score += oom_badness(task, memcg, nodemask,
+ totalpages);
+ }
+ css_task_iter_end(&it);
+
+ return score;
+ }
+
+ /*
+ * Memcg is OOM eligible if there are OOM killable tasks inside.
+ *
+ * We treat tasks with oom_score_adj set to OOM_SCORE_ADJ_MIN
+ * as unkillable.
+ *
+ * If there are inflight OOM victim tasks inside the memcg,
+ * we return -1.
+ */
+ css_task_iter_start(&memcg->css, 0, &it);
+ while ((task = css_task_iter_next(&it))) {
+ if (!eligible &&
+ task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN)
+ eligible = 1;
+
+ if (tsk_is_oom_victim(task) &&
+ !test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags)) {
+ eligible = -1;
+ break;
+ }
+ }
+ css_task_iter_end(&it);
+
+ if (eligible <= 0)
+ return eligible;
+
+ return memcg_oom_badness(memcg, nodemask, totalpages);
+}
+
+static void select_victim_memcg(struct mem_cgroup *root, struct oom_control *oc)
+{
+ struct mem_cgroup *iter, *group = NULL;
+ long group_score = 0;
+
+ oc->chosen_memcg = NULL;
+ oc->chosen_points = 0;
+
+ /*
+ * If OOM is memcg-wide, and the memcg has the oom_group flag set,
+ * all tasks belonging to the memcg should be killed.
+ * So, we mark the memcg as a victim.
+ */
+ if (oc->memcg && mem_cgroup_oom_group(oc->memcg)) {
+ oc->chosen_memcg = oc->memcg;
+ css_get(&oc->chosen_memcg->css);
+ return;
+ }
+
+ /*
+ * The oom_score is calculated for leaf memory cgroups (including
+ * the root memcg).
+ * Non-leaf oom_group cgroups accumulating score of descendant
+ * leaf memory cgroups.
+ */
+ rcu_read_lock();
+ for_each_mem_cgroup_tree(iter, root) {
+ long score;
+
+ /*
+ * We don't consider non-leaf non-oom_group memory cgroups
+ * as OOM victims.
+ */
+ if (memcg_has_children(iter) && iter != root_mem_cgroup &&
+ !mem_cgroup_oom_group(iter))
+ continue;
+
+ /*
+ * If group is not set or we've ran out of the group's sub-tree,
+ * we should set group and reset group_score.
+ */
+ if (!group || group == root_mem_cgroup ||
+ !mem_cgroup_is_descendant(iter, group)) {
+ group = iter;
+ group_score = 0;
+ }
+
+ if (memcg_has_children(iter) && iter != root_mem_cgroup)
+ continue;
+
+ score = oom_evaluate_memcg(iter, oc->nodemask, oc->totalpages);
+
+ /*
+ * Ignore empty and non-eligible memory cgroups.
+ */
+ if (score == 0)
+ continue;
+
+ /*
+ * If there are inflight OOM victims, we don't need
+ * to look further for new victims.
+ */
+ if (score == -1) {
+ oc->chosen_memcg = INFLIGHT_VICTIM;
+ mem_cgroup_iter_break(root, iter);
+ break;
+ }
+
+ group_score += score;
+
+ if (group_score > oc->chosen_points) {
+ oc->chosen_points = group_score;
+ oc->chosen_memcg = group;
+ }
+ }
+
+ if (oc->chosen_memcg && oc->chosen_memcg != INFLIGHT_VICTIM)
+ css_get(&oc->chosen_memcg->css);
+
+ rcu_read_unlock();
+}
+
+bool mem_cgroup_select_oom_victim(struct oom_control *oc)
+{
+ struct mem_cgroup *root;
+
+ if (mem_cgroup_disabled())
+ return false;
+
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return false;
+
+ if (!(cgrp_dfl_root.flags & CGRP_GROUP_OOM))
+ return false;
+
+ if (oc->memcg)
+ root = oc->memcg;
+ else
+ root = root_mem_cgroup;
+
+ select_victim_memcg(root, oc);
+
+ return oc->chosen_memcg;
+}
+
/*
* Reclaims as many pages from the given memcg as possible.
*
@@ -5446,6 +5663,39 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
return nbytes;
}
+static int memory_oom_group_show(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ bool oom_group = memcg->oom_group;
+
+ if (!(cgrp_dfl_root.flags & CGRP_GROUP_OOM))
+ return -ENOTSUPP;
+
+ seq_printf(m, "%d\n", oom_group);
+
+ return 0;
+}
+
+static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ int oom_group;
+ int err;
+
+ if (!(cgrp_dfl_root.flags & CGRP_GROUP_OOM))
+ return -ENOTSUPP;
+
+ err = kstrtoint(strstrip(buf), 0, &oom_group);
+ if (err)
+ return err;
+
+ memcg->oom_group = oom_group;
+
+ return nbytes;
+}
+
static int memory_events_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
@@ -5577,6 +5827,12 @@ static struct cftype memory_files[] = {
.write = memory_max_write,
},
{
+ .name = "oom_group",
+ .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
+ .seq_show = memory_oom_group_show,
+ .write = memory_oom_group_write,
+ },
+ {
.name = "events",
.flags = CFTYPE_NOT_ON_ROOT,
.file_offset = offsetof(struct mem_cgroup, events_file),
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 84081e77bc51c..6b74142a12597 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -304,7 +304,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)
return CONSTRAINT_NONE;
}
-static int oom_evaluate_task(struct task_struct *task, void *arg)
+int oom_evaluate_task(struct task_struct *task, void *arg)
{
struct oom_control *oc = arg;
unsigned long points;
@@ -338,26 +338,26 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
goto next;
/* Prefer thread group leaders for display purposes */
- if (points == oc->chosen_points && thread_group_leader(oc->chosen))
+ if (points == oc->chosen_points && thread_group_leader(oc->chosen_task))
goto next;
select:
- if (oc->chosen)
- put_task_struct(oc->chosen);
+ if (oc->chosen_task)
+ put_task_struct(oc->chosen_task);
get_task_struct(task);
- oc->chosen = task;
+ oc->chosen_task = task;
oc->chosen_points = points;
next:
return 0;
abort:
- if (oc->chosen)
- put_task_struct(oc->chosen);
- oc->chosen = (void *)-1UL;
+ if (oc->chosen_task)
+ put_task_struct(oc->chosen_task);
+ oc->chosen_task = INFLIGHT_VICTIM;
return 1;
}
/*
* Simple selection loop. We choose the process with the highest number of
- * 'points'. In case scan was aborted, oc->chosen is set to -1.
+ * 'points'. In case scan was aborted, oc->chosen_task is set to -1.
*/
static void select_bad_process(struct oom_control *oc)
{
@@ -835,67 +835,22 @@ static bool task_will_free_mem(struct task_struct *task)
return ret;
}
-static void oom_kill_process(struct oom_control *oc, const char *message)
+static void __oom_kill_process(struct task_struct *victim)
{
- struct task_struct *p = oc->chosen;
- unsigned int points = oc->chosen_points;
- struct task_struct *victim = p;
- struct task_struct *child;
- struct task_struct *t;
+ struct task_struct *p;
struct mm_struct *mm;
- unsigned int victim_points = 0;
- static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
- DEFAULT_RATELIMIT_BURST);
bool can_oom_reap = true;
/*
- * If the task is already exiting, don't alarm the sysadmin or kill
- * its children or threads, just give it access to memory reserves
- * so it can die quickly
+ * __oom_kill_process() is used to kill all tasks belonging to
+ * the selected memory cgroup, so we should check that we're not
+ * trying to kill an unkillable task.
*/
- task_lock(p);
- if (task_will_free_mem(p)) {
- mark_oom_victim(p);
- wake_oom_reaper(p);
- task_unlock(p);
- put_task_struct(p);
+ if (is_global_init(victim) || (victim->flags & PF_KTHREAD) ||
+ victim->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+ put_task_struct(victim);
return;
}
- task_unlock(p);
-
- if (__ratelimit(&oom_rs))
- dump_header(oc, p);
-
- pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
- message, task_pid_nr(p), p->comm, points);
-
- /*
- * If any of p's children has a different mm and is eligible for kill,
- * the one with the highest oom_badness() score is sacrificed for its
- * parent. This attempts to lose the minimal amount of work done while
- * still freeing memory.
- */
- read_lock(&tasklist_lock);
- for_each_thread(p, t) {
- list_for_each_entry(child, &t->children, sibling) {
- unsigned int child_points;
-
- if (process_shares_mm(child, p->mm))
- continue;
- /*
- * oom_badness() returns 0 if the thread is unkillable
- */
- child_points = oom_badness(child,
- oc->memcg, oc->nodemask, oc->totalpages);
- if (child_points > victim_points) {
- put_task_struct(victim);
- victim = child;
- victim_points = child_points;
- get_task_struct(victim);
- }
- }
- }
- read_unlock(&tasklist_lock);
p = find_lock_task_mm(victim);
if (!p) {
@@ -970,6 +925,108 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
}
#undef K
+static void oom_kill_process(struct oom_control *oc, const char *message)
+{
+ struct task_struct *p = oc->chosen_task;
+ unsigned int points = oc->chosen_points;
+ struct task_struct *victim = p;
+ struct task_struct *child;
+ struct task_struct *t;
+ unsigned int victim_points = 0;
+ static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+ /*
+ * If the task is already exiting, don't alarm the sysadmin or kill
+ * its children or threads, just give it access to memory reserves
+ * so it can die quickly
+ */
+ task_lock(p);
+ if (task_will_free_mem(p)) {
+ mark_oom_victim(p);
+ wake_oom_reaper(p);
+ task_unlock(p);
+ put_task_struct(p);
+ return;
+ }
+ task_unlock(p);
+
+ if (__ratelimit(&oom_rs))
+ dump_header(oc, p);
+
+ pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
+ message, task_pid_nr(p), p->comm, points);
+
+ /*
+ * If any of p's children has a different mm and is eligible for kill,
+ * the one with the highest oom_badness() score is sacrificed for its
+ * parent. This attempts to lose the minimal amount of work done while
+ * still freeing memory.
+ */
+ read_lock(&tasklist_lock);
+ for_each_thread(p, t) {
+ list_for_each_entry(child, &t->children, sibling) {
+ unsigned int child_points;
+
+ if (process_shares_mm(child, p->mm))
+ continue;
+ /*
+ * oom_badness() returns 0 if the thread is unkillable
+ */
+ child_points = oom_badness(child,
+ oc->memcg, oc->nodemask, oc->totalpages);
+ if (child_points > victim_points) {
+ put_task_struct(victim);
+ victim = child;
+ victim_points = child_points;
+ get_task_struct(victim);
+ }
+ }
+ }
+ read_unlock(&tasklist_lock);
+
+ __oom_kill_process(victim);
+}
+
+static int oom_kill_memcg_member(struct task_struct *task, void *unused)
+{
+ get_task_struct(task);
+ __oom_kill_process(task);
+ return 0;
+}
+
+static bool oom_kill_memcg_victim(struct oom_control *oc)
+{
+ if (oc->chosen_memcg == NULL || oc->chosen_memcg == INFLIGHT_VICTIM)
+ return oc->chosen_memcg;
+
+ /*
+ * If memory.oom_group is set, kill all tasks belonging to the sub-tree
+ * of the chosen memory cgroup, otherwise kill the task with the biggest
+ * memory footprint.
+ */
+ if (mem_cgroup_oom_group(oc->chosen_memcg)) {
+ mem_cgroup_scan_tasks(oc->chosen_memcg, oom_kill_memcg_member,
+ NULL);
+ /* We have one or more terminating processes at this point. */
+ oc->chosen_task = INFLIGHT_VICTIM;
+ } else {
+ oc->chosen_points = 0;
+ oc->chosen_task = NULL;
+ mem_cgroup_scan_tasks(oc->chosen_memcg, oom_evaluate_task, oc);
+
+ if (oc->chosen_task == NULL ||
+ oc->chosen_task == INFLIGHT_VICTIM)
+ goto out;
+
+ __oom_kill_process(oc->chosen_task);
+ }
+
+out:
+ mem_cgroup_put(oc->chosen_memcg);
+ return oc->chosen_task;
+}
+
/*
* Determines whether the kernel must panic because of the panic_on_oom sysctl.
*/
@@ -1022,6 +1079,7 @@ bool out_of_memory(struct oom_control *oc)
{
unsigned long freed = 0;
enum oom_constraint constraint = CONSTRAINT_NONE;
+ bool delay = false; /* if set, delay next allocation attempt */
if (oom_killer_disabled)
return false;
@@ -1066,27 +1124,37 @@ bool out_of_memory(struct oom_control *oc)
current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
get_task_struct(current);
- oc->chosen = current;
+ oc->chosen_task = current;
oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
return true;
}
+ if (mem_cgroup_select_oom_victim(oc) && oom_kill_memcg_victim(oc)) {
+ delay = true;
+ goto out;
+ }
+
select_bad_process(oc);
/* Found nothing?!?! Either we hang forever, or we panic. */
- if (!oc->chosen && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) {
+ if (!oc->chosen_task && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) {
dump_header(oc, NULL);
panic("Out of memory and no killable processes...\n");
}
- if (oc->chosen && oc->chosen != (void *)-1UL) {
+ if (oc->chosen_task && oc->chosen_task != INFLIGHT_VICTIM) {
oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
"Memory cgroup out of memory");
- /*
- * Give the killed process a good chance to exit before trying
- * to allocate memory again.
- */
- schedule_timeout_killable(1);
+ delay = true;
}
- return !!oc->chosen;
+
+out:
+ /*
+ * Give the killed process a good chance to exit before trying
+ * to allocate memory again.
+ */
+ if (delay)
+ schedule_timeout_killable(1);
+
+ return !!oc->chosen_task;
}
/*