aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-03-11 12:02:50 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2024-03-11 12:02:50 -0700
commite5a3878c947ceef7b6ab68fdc093f3848059842c (patch)
tree0ad530cfbc2f6c1c2b6e98a08988a8c48a522e86 /kernel
parent1ddeeb2a058d7b2a58ed9e820396b4ceb715d529 (diff)
parent3add00be5fe5810d7aa5ec3af8b6a245ef33144b (diff)
downloadlinux-e5a3878c947ceef7b6ab68fdc093f3848059842c.tar.gz
Merge tag 'rcu.next.v6.9' of git://git.kernel.org/pub/scm/linux/kernel/git/boqun/linux
Pull RCU updates from Boqun Feng: - Eliminate deadlocks involving do_exit() and RCU tasks, by Paul: Instead of SRCU read side critical sections, now a percpu list is used in do_exit() for scaning yet-to-exit tasks - Fix a deadlock due to the dependency between workqueue and RCU expedited grace period, reported by Anna-Maria Behnsen and Thomas Gleixner and fixed by Frederic: Now RCU expedited always uses its own kthread worker instead of a workqueue - RCU NOCB updates, code cleanups, unnecessary barrier removals and minor bug fixes - Maintain real-time response in rcu_tasks_postscan() and a minor fix for tasks trace quiescence check - Misc updates, comments and readibility improvement, boot time parameter for lazy RCU and rcutorture improvement - Documentation updates * tag 'rcu.next.v6.9' of git://git.kernel.org/pub/scm/linux/kernel/git/boqun/linux: (34 commits) rcu-tasks: Maintain real-time response in rcu_tasks_postscan() rcu-tasks: Eliminate deadlocks involving do_exit() and RCU tasks rcu-tasks: Maintain lists to eliminate RCU-tasks/do_exit() deadlocks rcu-tasks: Initialize data to eliminate RCU-tasks/do_exit() deadlocks rcu-tasks: Initialize callback lists at rcu_init() time rcu-tasks: Add data to eliminate RCU-tasks/do_exit() deadlocks rcu-tasks: Repair RCU Tasks Trace quiescence check rcu/sync: remove un-used rcu_sync_enter_start function rcutorture: Suppress rtort_pipe_count warnings until after stalls srcu: Improve comments about acceleration leak rcu: Provide a boot time parameter to control lazy RCU rcu: Rename jiffies_till_flush to jiffies_lazy_flush doc: Update checklist.rst discussion of callback execution doc: Clarify use of slab constructors and SLAB_TYPESAFE_BY_RCU context_tracking: Fix kerneldoc headers for __ct_user_{enter,exit}() doc: Add EARLY flag to early-parsed kernel boot parameters doc: Add CONFIG_RCU_STRICT_GRACE_PERIOD to checklist.rst doc: Make checklist.rst note that spinlocks are implied RCU readers doc: Make whatisRCU.rst note that spinlocks are RCU readers doc: Spinlocks are implied RCU readers ...
Diffstat (limited to 'kernel')
-rw-r--r--kernel/context_tracking.c4
-rw-r--r--kernel/fork.c1
-rw-r--r--kernel/rcu/Kconfig13
-rw-r--r--kernel/rcu/rcu.h19
-rw-r--r--kernel/rcu/rcuscale.c6
-rw-r--r--kernel/rcu/rcutorture.c13
-rw-r--r--kernel/rcu/srcutree.c24
-rw-r--r--kernel/rcu/sync.c16
-rw-r--r--kernel/rcu/tasks.h135
-rw-r--r--kernel/rcu/tiny.c1
-rw-r--r--kernel/rcu/tree.c237
-rw-r--r--kernel/rcu/tree.h20
-rw-r--r--kernel/rcu/tree_exp.h83
-rw-r--r--kernel/rcu/tree_nocb.h69
-rw-r--r--kernel/rcu/tree_plugin.h52
15 files changed, 388 insertions, 305 deletions
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 6ef0b35fc28c5..70ae70d038233 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -458,6 +458,8 @@ static __always_inline void context_tracking_recursion_exit(void)
* __ct_user_enter - Inform the context tracking that the CPU is going
* to enter user or guest space mode.
*
+ * @state: userspace context-tracking state to enter.
+ *
* This function must be called right before we switch from the kernel
* to user or guest space, when it's guaranteed the remaining kernel
* instructions to execute won't use any RCU read side critical section
@@ -595,6 +597,8 @@ NOKPROBE_SYMBOL(user_enter_callable);
* __ct_user_exit - Inform the context tracking that the CPU is
* exiting user or guest mode and entering the kernel.
*
+ * @state: userspace context-tracking state being exited from.
+ *
* This function must be called after we entered the kernel from user or
* guest space before any use of RCU read side critical section. This
* potentially include any high level kernel code like syscalls, exceptions,
diff --git a/kernel/fork.c b/kernel/fork.c
index 1af8dfd149ee0..39a5046c2f0bf 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1978,6 +1978,7 @@ static inline void rcu_copy_process(struct task_struct *p)
p->rcu_tasks_holdout = false;
INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
p->rcu_tasks_idle_cpu = -1;
+ INIT_LIST_HEAD(&p->rcu_tasks_exit_list);
#endif /* #ifdef CONFIG_TASKS_RCU */
#ifdef CONFIG_TASKS_TRACE_RCU
p->trc_reader_nesting = 0;
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index bdd7eadb33d8f..e7d2dd2675931 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -314,6 +314,19 @@ config RCU_LAZY
To save power, batch RCU callbacks and flush after delay, memory
pressure, or callback list growing too big.
+ Requires rcu_nocbs=all to be set.
+
+ Use rcutree.enable_rcu_lazy=0 to turn it off at boot time.
+
+config RCU_LAZY_DEFAULT_OFF
+ bool "Turn RCU lazy invocation off by default"
+ depends on RCU_LAZY
+ default n
+ help
+ Allows building the kernel with CONFIG_RCU_LAZY=y yet keep it default
+ off. Boot time param rcutree.enable_rcu_lazy=1 can be used to switch
+ it back on.
+
config RCU_DOUBLE_CHECK_CB_TIME
bool "RCU callback-batch backup time check"
depends on RCU_EXPERT
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index f94f65877f2b6..86fce206560e8 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -528,6 +528,12 @@ struct task_struct *get_rcu_tasks_gp_kthread(void);
struct task_struct *get_rcu_tasks_rude_gp_kthread(void);
#endif // # ifdef CONFIG_TASKS_RUDE_RCU
+#ifdef CONFIG_TASKS_RCU_GENERIC
+void tasks_cblist_init_generic(void);
+#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
+static inline void tasks_cblist_init_generic(void) { }
+#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */
+
#define RCU_SCHEDULER_INACTIVE 0
#define RCU_SCHEDULER_INIT 1
#define RCU_SCHEDULER_RUNNING 2
@@ -543,11 +549,11 @@ enum rcutorture_type {
};
#if defined(CONFIG_RCU_LAZY)
-unsigned long rcu_lazy_get_jiffies_till_flush(void);
-void rcu_lazy_set_jiffies_till_flush(unsigned long j);
+unsigned long rcu_get_jiffies_lazy_flush(void);
+void rcu_set_jiffies_lazy_flush(unsigned long j);
#else
-static inline unsigned long rcu_lazy_get_jiffies_till_flush(void) { return 0; }
-static inline void rcu_lazy_set_jiffies_till_flush(unsigned long j) { }
+static inline unsigned long rcu_get_jiffies_lazy_flush(void) { return 0; }
+static inline void rcu_set_jiffies_lazy_flush(unsigned long j) { }
#endif
#if defined(CONFIG_TREE_RCU)
@@ -623,12 +629,7 @@ int rcu_get_gp_kthreads_prio(void);
void rcu_fwd_progress_check(unsigned long j);
void rcu_force_quiescent_state(void);
extern struct workqueue_struct *rcu_gp_wq;
-#ifdef CONFIG_RCU_EXP_KTHREAD
extern struct kthread_worker *rcu_exp_gp_kworker;
-extern struct kthread_worker *rcu_exp_par_gp_kworker;
-#else /* !CONFIG_RCU_EXP_KTHREAD */
-extern struct workqueue_struct *rcu_par_gp_wq;
-#endif /* CONFIG_RCU_EXP_KTHREAD */
void rcu_gp_slow_register(atomic_t *rgssp);
void rcu_gp_slow_unregister(atomic_t *rgssp);
#endif /* #else #ifdef CONFIG_TINY_RCU */
diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index ffdb30495e3cc..8db4fedaaa1eb 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -764,9 +764,9 @@ kfree_scale_init(void)
if (kfree_by_call_rcu) {
/* do a test to check the timeout. */
- orig_jif = rcu_lazy_get_jiffies_till_flush();
+ orig_jif = rcu_get_jiffies_lazy_flush();
- rcu_lazy_set_jiffies_till_flush(2 * HZ);
+ rcu_set_jiffies_lazy_flush(2 * HZ);
rcu_barrier();
jif_start = jiffies;
@@ -775,7 +775,7 @@ kfree_scale_init(void)
smp_cond_load_relaxed(&rcu_lazy_test1_cb_called, VAL == 1);
- rcu_lazy_set_jiffies_till_flush(orig_jif);
+ rcu_set_jiffies_lazy_flush(orig_jif);
if (WARN_ON_ONCE(jiffies_at_lazy_cb - jif_start < 2 * HZ)) {
pr_alert("ERROR: call_rcu() CBs are not being lazy as expected!\n");
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 7567ca8e743ca..45d6b4c3d199c 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1368,9 +1368,13 @@ rcu_torture_writer(void *arg)
struct rcu_torture *rp;
struct rcu_torture *old_rp;
static DEFINE_TORTURE_RANDOM(rand);
+ unsigned long stallsdone = jiffies;
bool stutter_waited;
unsigned long ulo[NUM_ACTIVE_RCU_POLL_OLDSTATE];
+ // If a new stall test is added, this must be adjusted.
+ if (stall_cpu_holdoff + stall_gp_kthread + stall_cpu)
+ stallsdone += (stall_cpu_holdoff + stall_gp_kthread + stall_cpu + 60) * HZ;
VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
if (!can_expedite)
pr_alert("%s" TORTURE_FLAG
@@ -1576,11 +1580,11 @@ rcu_torture_writer(void *arg)
!atomic_read(&rcu_fwd_cb_nodelay) &&
!cur_ops->slow_gps &&
!torture_must_stop() &&
- boot_ended)
+ boot_ended &&
+ time_after(jiffies, stallsdone))
for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++)
if (list_empty(&rcu_tortures[i].rtort_free) &&
- rcu_access_pointer(rcu_torture_current) !=
- &rcu_tortures[i]) {
+ rcu_access_pointer(rcu_torture_current) != &rcu_tortures[i]) {
tracing_off();
show_rcu_gp_kthreads();
WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count);
@@ -2441,7 +2445,8 @@ static struct notifier_block rcu_torture_stall_block = {
/*
* CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then
- * induces a CPU stall for the time specified by stall_cpu.
+ * induces a CPU stall for the time specified by stall_cpu. If a new
+ * stall test is added, stallsdone in rcu_torture_writer() must be adjusted.
*/
static int rcu_torture_stall(void *args)
{
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 0351a4e83529e..e4d673fc30f42 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -1234,11 +1234,20 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
if (rhp)
rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
/*
- * The snapshot for acceleration must be taken _before_ the read of the
- * current gp sequence used for advancing, otherwise advancing may fail
- * and acceleration may then fail too.
+ * It's crucial to capture the snapshot 's' for acceleration before
+ * reading the current gp_seq that is used for advancing. This is
+ * essential because if the acceleration snapshot is taken after a
+ * failed advancement attempt, there's a risk that a grace period may
+ * conclude and a new one may start in the interim. If the snapshot is
+ * captured after this sequence of events, the acceleration snapshot 's'
+ * could be excessively advanced, leading to acceleration failure.
+ * In such a scenario, an 'acceleration leak' can occur, where new
+ * callbacks become indefinitely stuck in the RCU_NEXT_TAIL segment.
+ * Also note that encountering advancing failures is a normal
+ * occurrence when the grace period for RCU_WAIT_TAIL is in progress.
*
- * This could happen if:
+ * To see this, consider the following events which occur if
+ * rcu_seq_snap() were to be called after advance:
*
* 1) The RCU_WAIT_TAIL segment has callbacks (gp_num = X + 4) and the
* RCU_NEXT_READY_TAIL also has callbacks (gp_num = X + 8).
@@ -1264,6 +1273,13 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
if (rhp) {
rcu_segcblist_advance(&sdp->srcu_cblist,
rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
+ /*
+ * Acceleration can never fail because the base current gp_seq
+ * used for acceleration is <= the value of gp_seq used for
+ * advancing. This means that RCU_NEXT_TAIL segment will
+ * always be able to be emptied by the acceleration into the
+ * RCU_NEXT_READY_TAIL or RCU_WAIT_TAIL segments.
+ */
WARN_ON_ONCE(!rcu_segcblist_accelerate(&sdp->srcu_cblist, s));
}
if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index e550f97779b8d..86df878a2fee8 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -24,22 +24,6 @@ void rcu_sync_init(struct rcu_sync *rsp)
init_waitqueue_head(&rsp->gp_wait);
}
-/**
- * rcu_sync_enter_start - Force readers onto slow path for multiple updates
- * @rsp: Pointer to rcu_sync structure to use for synchronization
- *
- * Must be called after rcu_sync_init() and before first use.
- *
- * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}()
- * pairs turn into NO-OPs.
- */
-void rcu_sync_enter_start(struct rcu_sync *rsp)
-{
- rsp->gp_count++;
- rsp->gp_state = GP_PASSED;
-}
-
-
static void rcu_sync_func(struct rcu_head *rhp);
static void rcu_sync_call(struct rcu_sync *rsp)
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 732ad5b39946a..147b5945d67a0 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -32,6 +32,7 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp);
* @rtp_irq_work: IRQ work queue for deferred wakeups.
* @barrier_q_head: RCU callback for barrier operation.
* @rtp_blkd_tasks: List of tasks blocked as readers.
+ * @rtp_exit_list: List of tasks in the latter portion of do_exit().
* @cpu: CPU number corresponding to this entry.
* @rtpp: Pointer to the rcu_tasks structure.
*/
@@ -46,6 +47,7 @@ struct rcu_tasks_percpu {
struct irq_work rtp_irq_work;
struct rcu_head barrier_q_head;
struct list_head rtp_blkd_tasks;
+ struct list_head rtp_exit_list;
int cpu;
struct rcu_tasks *rtpp;
};
@@ -144,8 +146,6 @@ static struct rcu_tasks rt_name = \
}
#ifdef CONFIG_TASKS_RCU
-/* Track exiting tasks in order to allow them to be waited for. */
-DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu);
/* Report delay in synchronize_srcu() completion in rcu_tasks_postscan(). */
static void tasks_rcu_exit_srcu_stall(struct timer_list *unused);
@@ -240,7 +240,6 @@ static const char *tasks_gp_state_getname(struct rcu_tasks *rtp)
static void cblist_init_generic(struct rcu_tasks *rtp)
{
int cpu;
- unsigned long flags;
int lim;
int shift;
@@ -266,15 +265,15 @@ static void cblist_init_generic(struct rcu_tasks *rtp)
WARN_ON_ONCE(!rtpcp);
if (cpu)
raw_spin_lock_init(&ACCESS_PRIVATE(rtpcp, lock));
- local_irq_save(flags); // serialize initialization
if (rcu_segcblist_empty(&rtpcp->cblist))
rcu_segcblist_init(&rtpcp->cblist);
- local_irq_restore(flags);
INIT_WORK(&rtpcp->rtp_work, rcu_tasks_invoke_cbs_wq);
rtpcp->cpu = cpu;
rtpcp->rtpp = rtp;
if (!rtpcp->rtp_blkd_tasks.next)
INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks);
+ if (!rtpcp->rtp_exit_list.next)
+ INIT_LIST_HEAD(&rtpcp->rtp_exit_list);
}
pr_info("%s: Setting shift to %d and lim to %d rcu_task_cb_adjust=%d.\n", rtp->name,
@@ -851,10 +850,12 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
// number of voluntary context switches, and add that task to the
// holdout list.
// rcu_tasks_postscan():
-// Invoke synchronize_srcu() to ensure that all tasks that were
-// in the process of exiting (and which thus might not know to
-// synchronize with this RCU Tasks grace period) have completed
-// exiting.
+// Gather per-CPU lists of tasks in do_exit() to ensure that all
+// tasks that were in the process of exiting (and which thus might
+// not know to synchronize with this RCU Tasks grace period) have
+// completed exiting. The synchronize_rcu() in rcu_tasks_postgp()
+// will take care of any tasks stuck in the non-preemptible region
+// of do_exit() following its call to exit_tasks_rcu_stop().
// check_all_holdout_tasks(), repeatedly until holdout list is empty:
// Scans the holdout list, attempting to identify a quiescent state
// for each task on the list. If there is a quiescent state, the
@@ -867,8 +868,10 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
// with interrupts disabled.
//
// For each exiting task, the exit_tasks_rcu_start() and
-// exit_tasks_rcu_finish() functions begin and end, respectively, the SRCU
-// read-side critical sections waited for by rcu_tasks_postscan().
+// exit_tasks_rcu_finish() functions add and remove, respectively, the
+// current task to a per-CPU list of tasks that rcu_tasks_postscan() must
+// wait on. This is necessary because rcu_tasks_postscan() must wait on
+// tasks that have already been removed from the global list of tasks.
//
// Pre-grace-period update-side code is ordered before the grace
// via the raw_spin_lock.*rcu_node(). Pre-grace-period read-side code
@@ -932,9 +935,13 @@ static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop)
}
}
+void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func);
+DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks");
+
/* Processing between scanning taskslist and draining the holdout list. */
static void rcu_tasks_postscan(struct list_head *hop)
{
+ int cpu;
int rtsi = READ_ONCE(rcu_task_stall_info);
if (!IS_ENABLED(CONFIG_TINY_RCU)) {
@@ -948,9 +955,9 @@ static void rcu_tasks_postscan(struct list_head *hop)
* this, divide the fragile exit path part in two intersecting
* read side critical sections:
*
- * 1) An _SRCU_ read side starting before calling exit_notify(),
- * which may remove the task from the tasklist, and ending after
- * the final preempt_disable() call in do_exit().
+ * 1) A task_struct list addition before calling exit_notify(),
+ * which may remove the task from the tasklist, with the
+ * removal after the final preempt_disable() call in do_exit().
*
* 2) An _RCU_ read side starting with the final preempt_disable()
* call in do_exit() and ending with the final call to schedule()
@@ -959,7 +966,37 @@ static void rcu_tasks_postscan(struct list_head *hop)
* This handles the part 1). And postgp will handle part 2) with a
* call to synchronize_rcu().
*/
- synchronize_srcu(&tasks_rcu_exit_srcu);
+
+ for_each_possible_cpu(cpu) {
+ unsigned long j = jiffies + 1;
+ struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rcu_tasks.rtpcpu, cpu);
+ struct task_struct *t;
+ struct task_struct *t1;
+ struct list_head tmp;
+
+ raw_spin_lock_irq_rcu_node(rtpcp);
+ list_for_each_entry_safe(t, t1, &rtpcp->rtp_exit_list, rcu_tasks_exit_list) {
+ if (list_empty(&t->rcu_tasks_holdout_list))
+ rcu_tasks_pertask(t, hop);
+
+ // RT kernels need frequent pauses, otherwise
+ // pause at least once per pair of jiffies.
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT) && time_before(jiffies, j))
+ continue;
+
+ // Keep our place in the list while pausing.
+ // Nothing else traverses this list, so adding a
+ // bare list_head is OK.
+ list_add(&tmp, &t->rcu_tasks_exit_list);
+ raw_spin_unlock_irq_rcu_node(rtpcp);
+ cond_resched(); // For CONFIG_PREEMPT=n kernels
+ raw_spin_lock_irq_rcu_node(rtpcp);
+ t1 = list_entry(tmp.next, struct task_struct, rcu_tasks_exit_list);
+ list_del(&tmp);
+ j = jiffies + 1;
+ }
+ raw_spin_unlock_irq_rcu_node(rtpcp);
+ }
if (!IS_ENABLED(CONFIG_TINY_RCU))
del_timer_sync(&tasks_rcu_exit_srcu_stall_timer);
@@ -1027,7 +1064,6 @@ static void rcu_tasks_postgp(struct rcu_tasks *rtp)
*
* In addition, this synchronize_rcu() waits for exiting tasks
* to complete their final preempt_disable() region of execution,
- * cleaning up after synchronize_srcu(&tasks_rcu_exit_srcu),
* enforcing the whole region before tasklist removal until
* the final schedule() with TASK_DEAD state to be an RCU TASKS
* read side critical section.
@@ -1035,9 +1071,6 @@ static void rcu_tasks_postgp(struct rcu_tasks *rtp)
synchronize_rcu();
}
-void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func);
-DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks");
-
static void tasks_rcu_exit_srcu_stall(struct timer_list *unused)
{
#ifndef CONFIG_TINY_RCU
@@ -1118,7 +1151,6 @@ module_param(rcu_tasks_lazy_ms, int, 0444);
static int __init rcu_spawn_tasks_kthread(void)
{
- cblist_init_generic(&rcu_tasks);
rcu_tasks.gp_sleep = HZ / 10;
rcu_tasks.init_fract = HZ / 10;
if (rcu_tasks_lazy_ms >= 0)
@@ -1147,25 +1179,48 @@ struct task_struct *get_rcu_tasks_gp_kthread(void)
EXPORT_SYMBOL_GPL(get_rcu_tasks_gp_kthread);
/*
- * Contribute to protect against tasklist scan blind spot while the
- * task is exiting and may be removed from the tasklist. See
- * corresponding synchronize_srcu() for further details.
+ * Protect against tasklist scan blind spot while the task is exiting and
+ * may be removed from the tasklist. Do this by adding the task to yet
+ * another list.
+ *
+ * Note that the task will remove itself from this list, so there is no
+ * need for get_task_struct(), except in the case where rcu_tasks_pertask()
+ * adds it to the holdout list, in which case rcu_tasks_pertask() supplies
+ * the needed get_task_struct().
*/
-void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu)
+void exit_tasks_rcu_start(void)
{
- current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu);
+ unsigned long flags;
+ struct rcu_tasks_percpu *rtpcp;
+ struct task_struct *t = current;
+
+ WARN_ON_ONCE(!list_empty(&t->rcu_tasks_exit_list));
+ preempt_disable();
+ rtpcp = this_cpu_ptr(rcu_tasks.rtpcpu);
+ t->rcu_tasks_exit_cpu = smp_processor_id();
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ if (!rtpcp->rtp_exit_list.next)
+ INIT_LIST_HEAD(&rtpcp->rtp_exit_list);
+ list_add(&t->rcu_tasks_exit_list, &rtpcp->rtp_exit_list);
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ preempt_enable();
}
/*
- * Contribute to protect against tasklist scan blind spot while the
- * task is exiting and may be removed from the tasklist. See
- * corresponding synchronize_srcu() for further details.
+ * Remove the task from the "yet another list" because do_exit() is now
+ * non-preemptible, allowing synchronize_rcu() to wait beyond this point.
*/
-void exit_tasks_rcu_stop(void) __releases(&tasks_rcu_exit_srcu)
+void exit_tasks_rcu_stop(void)
{
+ unsigned long flags;
+ struct rcu_tasks_percpu *rtpcp;
struct task_struct *t = current;
- __srcu_read_unlock(&tasks_rcu_exit_srcu, t->rcu_tasks_idx);
+ WARN_ON_ONCE(list_empty(&t->rcu_tasks_exit_list));
+ rtpcp = per_cpu_ptr(rcu_tasks.rtpcpu, t->rcu_tasks_exit_cpu);
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ list_del_init(&t->rcu_tasks_exit_list);
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
}
/*
@@ -1282,7 +1337,6 @@ module_param(rcu_tasks_rude_lazy_ms, int, 0444);
static int __init rcu_spawn_tasks_rude_kthread(void)
{
- cblist_init_generic(&rcu_tasks_rude);
rcu_tasks_rude.gp_sleep = HZ / 10;
if (rcu_tasks_rude_lazy_ms >= 0)
rcu_tasks_rude.lazy_jiffies = msecs_to_jiffies(rcu_tasks_rude_lazy_ms);
@@ -1914,7 +1968,6 @@ module_param(rcu_tasks_trace_lazy_ms, int, 0444);
static int __init rcu_spawn_tasks_trace_kthread(void)
{
- cblist_init_generic(&rcu_tasks_trace);
if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) {
rcu_tasks_trace.gp_sleep = HZ / 10;
rcu_tasks_trace.init_fract = HZ / 10;
@@ -2086,6 +2139,24 @@ late_initcall(rcu_tasks_verify_schedule_work);
static void rcu_tasks_initiate_self_tests(void) { }
#endif /* #else #ifdef CONFIG_PROVE_RCU */
+void __init tasks_cblist_init_generic(void)
+{
+ lockdep_assert_irqs_disabled();
+ WARN_ON(num_online_cpus() > 1);
+
+#ifdef CONFIG_TASKS_RCU
+ cblist_init_generic(&rcu_tasks);
+#endif
+
+#ifdef CONFIG_TASKS_RUDE_RCU
+ cblist_init_generic(&rcu_tasks_rude);
+#endif
+
+#ifdef CONFIG_TASKS_TRACE_RCU
+ cblist_init_generic(&rcu_tasks_trace);
+#endif
+}
+
void __init rcu_init_tasks_generic(void)
{
#ifdef CONFIG_TASKS_RCU
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index fec804b790803..705c0d16850aa 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -261,4 +261,5 @@ void __init rcu_init(void)
{
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
rcu_early_boot_tests();
+ tasks_cblist_init_generic();
}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index b2bccfd37c383..d9642dd06c253 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -145,7 +145,7 @@ static int rcu_scheduler_fully_active __read_mostly;
static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
unsigned long gps, unsigned long flags);
-static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
+static struct task_struct *rcu_boost_task(struct rcu_node *rnp);
static void invoke_rcu_core(void);
static void rcu_report_exp_rdp(struct rcu_data *rdp);
static void sync_sched_exp_online_cleanup(int cpu);
@@ -2145,6 +2145,12 @@ static void rcu_do_batch(struct rcu_data *rdp)
* Extract the list of ready callbacks, disabling IRQs to prevent
* races with call_rcu() from interrupt handlers. Leave the
* callback counts, as rcu_barrier() needs to be conservative.
+ *
+ * Callbacks execution is fully ordered against preceding grace period
+ * completion (materialized by rnp->gp_seq update) thanks to the
+ * smp_mb__after_unlock_lock() upon node locking required for callbacks
+ * advancing. In NOCB mode this ordering is then further relayed through
+ * the nocb locking that protects both callbacks advancing and extraction.
*/
rcu_nocb_lock_irqsave(rdp, flags);
WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
@@ -2591,12 +2597,26 @@ static int __init rcu_spawn_core_kthreads(void)
return 0;
}
+static void rcutree_enqueue(struct rcu_data *rdp, struct rcu_head *head, rcu_callback_t func)
+{
+ rcu_segcblist_enqueue(&rdp->cblist, head);
+ if (__is_kvfree_rcu_offset((unsigned long)func))
+ trace_rcu_kvfree_callback(rcu_state.name, head,
+ (unsigned long)func,
+ rcu_segcblist_n_cbs(&rdp->cblist));
+ else
+ trace_rcu_callback(rcu_state.name, head,
+ rcu_segcblist_n_cbs(&rdp->cblist));
+ trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued"));
+}
+
/*
* Handle any core-RCU processing required by a call_rcu() invocation.
*/
-static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head,
- unsigned long flags)
+static void call_rcu_core(struct rcu_data *rdp, struct rcu_head *head,
+ rcu_callback_t func, unsigned long flags)
{
+ rcutree_enqueue(rdp, head, func);
/*
* If called from an extended quiescent state, invoke the RCU
* core in order to force a re-evaluation of RCU's idleness.
@@ -2692,7 +2712,6 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in)
unsigned long flags;
bool lazy;
struct rcu_data *rdp;
- bool was_alldone;
/* Misaligned rcu_head! */
WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1));
@@ -2729,30 +2748,18 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in)
}
check_cb_ovld(rdp);
- if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy))
- return; // Enqueued onto ->nocb_bypass, so just leave.
- // If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock.
- rcu_segcblist_enqueue(&rdp->cblist, head);
- if (__is_kvfree_rcu_offset((unsigned long)func))
- trace_rcu_kvfree_callback(rcu_state.name, head,
- (unsigned long)func,
- rcu_segcblist_n_cbs(&rdp->cblist));
- else
- trace_rcu_callback(rcu_state.name, head,
- rcu_segcblist_n_cbs(&rdp->cblist));
- trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued"));
-
- /* Go handle any RCU core processing required. */
- if (unlikely(rcu_rdp_is_offloaded(rdp))) {
- __call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */
- } else {
- __call_rcu_core(rdp, head, flags);
- local_irq_restore(flags);
- }
+ if (unlikely(rcu_rdp_is_offloaded(rdp)))
+ call_rcu_nocb(rdp, head, func, flags, lazy);
+ else
+ call_rcu_core(rdp, head, func, flags);
+ local_irq_restore(flags);
}
#ifdef CONFIG_RCU_LAZY
+static bool enable_rcu_lazy __read_mostly = !IS_ENABLED(CONFIG_RCU_LAZY_DEFAULT_OFF);
+module_param(enable_rcu_lazy, bool, 0444);
+
/**
* call_rcu_hurry() - Queue RCU callback for invocation after grace period, and
* flush all lazy callbacks (including the new one) to the main ->cblist while
@@ -2778,6 +2785,8 @@ void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func)
__call_rcu_common(head, func, false);
}
EXPORT_SYMBOL_GPL(call_rcu_hurry);
+#else
+#define enable_rcu_lazy false
#endif
/**
@@ -2826,7 +2835,7 @@ EXPORT_SYMBOL_GPL(call_rcu_hurry);
*/
void call_rcu(struct rcu_head *head, rcu_callback_t func)
{
- __call_rcu_common(head, func, IS_ENABLED(CONFIG_RCU_LAZY));
+ __call_rcu_common(head, func, enable_rcu_lazy);
}
EXPORT_SYMBOL_GPL(call_rcu);
@@ -4394,6 +4403,66 @@ rcu_boot_init_percpu_data(int cpu)
rcu_boot_init_nocb_percpu_data(rdp);
}
+struct kthread_worker *rcu_exp_gp_kworker;
+
+static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp)
+{
+ struct kthread_worker *kworker;
+ const char *name = "rcu_exp_par_gp_kthread_worker/%d";
+ struct sched_param param = { .sched_priority = kthread_prio };
+ int rnp_index = rnp - rcu_get_root();
+
+ if (rnp->exp_kworker)
+ return;
+
+ kworker = kthread_create_worker(0, name, rnp_index);
+ if (IS_ERR_OR_NULL(kworker)) {
+ pr_err("Failed to create par gp kworker on %d/%d\n",
+ rnp->grplo, rnp->grphi);
+ return;
+ }
+ WRITE_ONCE(rnp->exp_kworker, kworker);
+
+ if (IS_ENABLED(CONFIG_RCU_EXP_KTHREAD))
+ sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, &param);
+}
+
+static struct task_struct *rcu_exp_par_gp_task(struct rcu_node *rnp)
+{
+ struct kthread_worker *kworker = READ_ONCE(rnp->exp_kworker);
+
+ if (!kworker)
+ return NULL;
+
+ return kworker->task;
+}
+
+static void __init rcu_start_exp_gp_kworker(void)
+{
+ const char *name = "rcu_exp_gp_kthread_worker";
+ struct sched_param param = { .sched_priority = kthread_prio };
+
+ rcu_exp_gp_kworker = kthread_create_worker(0, name);
+ if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) {
+ pr_err("Failed to create %s!\n", name);
+ rcu_exp_gp_kworker = NULL;
+ return;
+ }
+
+ if (IS_ENABLED(CONFIG_RCU_EXP_KTHREAD))
+ sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, &param);
+}
+
+static void rcu_spawn_rnp_kthreads(struct rcu_node *rnp)
+{
+ if (rcu_scheduler_fully_active) {
+ mutex_lock(&rnp->kthread_mutex);
+ rcu_spawn_one_boost_kthread(rnp);
+ rcu_spawn_exp_par_gp_kworker(rnp);
+ mutex_unlock(&rnp->kthread_mutex);
+ }
+}
+
/*
* Invoked early in the CPU-online process, when pretty much all services
* are available. The incoming CPU is not present.
@@ -4442,7 +4511,7 @@ int rcutree_prepare_cpu(unsigned int cpu)
rdp->rcu_iw_gp_seq = rdp->gp_seq - 1;
trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl"));
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- rcu_spawn_one_boost_kthread(rnp);
+ rcu_spawn_rnp_kthreads(rnp);
rcu_spawn_cpu_nocb_kthread(cpu);
WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus + 1);
@@ -4450,13 +4519,64 @@ int rcutree_prepare_cpu(unsigned int cpu)
}
/*
- * Update RCU priority boot kthread affinity for CPU-hotplug changes.
+ * Update kthreads affinity during CPU-hotplug changes.
+ *
+ * Set the per-rcu_node kthread's affinity to cover all CPUs that are
+ * served by the rcu_node in question. The CPU hotplug lock is still
+ * held, so the value of rnp->qsmaskinit will be stable.
+ *
+ * We don't include outgoingcpu in the affinity set, use -1 if there is
+ * no outgoing CPU. If there are no CPUs left in the affinity set,
+ * this function allows the kthread to execute on any CPU.
+ *
+ * Any future concurrent calls are serialized via ->kthread_mutex.
*/
-static void rcutree_affinity_setting(unsigned int cpu, int outgoing)
+static void rcutree_affinity_setting(unsigned int cpu, int outgoingcpu)
{
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ cpumask_var_t cm;
+ unsigned long mask;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+ struct task_struct *task_boost, *task_exp;
+
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ rnp = rdp->mynode;
+
+ task_boost = rcu_boost_task(rnp);
+ task_exp = rcu_exp_par_gp_task(rnp);
+
+ /*
+ * If CPU is the boot one, those tasks are created later from early
+ * initcall since kthreadd must be created first.
+ */
+ if (!task_boost && !task_exp)
+ return;
+
+ if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
+ return;
+
+ mutex_lock(&rnp->kthread_mutex);
+ mask = rcu_rnp_online_cpus(rnp);
+ for_each_leaf_node_possible_cpu(rnp, cpu)
+ if ((mask & leaf_node_cpu_bit(rnp, cpu)) &&
+ cpu != outgoingcpu)
+ cpumask_set_cpu(cpu, cm);
+ cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU));
+ if (cpumask_empty(cm)) {
+ cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU));
+ if (outgoingcpu >= 0)
+ cpumask_clear_cpu(outgoingcpu, cm);
+ }
+
+ if (task_exp)
+ set_cpus_allowed_ptr(task_exp, cm);
+
+ if (task_boost)
+ set_cpus_allowed_ptr(task_boost, cm);
- rcu_boost_kthread_setaffinity(rdp->mynode, outgoing);
+ mutex_unlock(&rnp->kthread_mutex);
+
+ free_cpumask_var(cm);
}
/*
@@ -4640,8 +4760,9 @@ void rcutree_migrate_callbacks(int cpu)
__call_rcu_nocb_wake(my_rdp, true, flags);
} else {
rcu_nocb_unlock(my_rdp); /* irqs remain disabled. */
- raw_spin_unlock_irqrestore_rcu_node(my_rnp, flags);
+ raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */
}
+ local_irq_restore(flags);
if (needwake)
rcu_gp_kthread_wake();
lockdep_assert_irqs_enabled();
@@ -4730,51 +4851,6 @@ static int rcu_pm_notify(struct notifier_block *self,
return NOTIFY_OK;
}
-#ifdef CONFIG_RCU_EXP_KTHREAD
-struct kthread_worker *rcu_exp_gp_kworker;
-struct kthread_worker *rcu_exp_par_gp_kworker;
-
-static void __init rcu_start_exp_gp_kworkers(void)
-{
- const char *par_gp_kworker_name = "rcu_exp_par_gp_kthread_worker";
- const char *gp_kworker_name = "rcu_exp_gp_kthread_worker";
- struct sched_param param = { .sched_priority = kthread_prio };
-
- rcu_exp_gp_kworker = kthread_create_worker(0, gp_kworker_name);
- if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) {
- pr_err("Failed to create %s!\n", gp_kworker_name);
- return;
- }
-
- rcu_exp_par_gp_kworker = kthread_create_worker(0, par_gp_kworker_name);
- if (IS_ERR_OR_NULL(rcu_exp_par_gp_kworker)) {
- pr_err("Failed to create %s!\n", par_gp_kworker_name);
- kthread_destroy_worker(rcu_exp_gp_kworker);
- return;
- }
-
- sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, &param);
- sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_FIFO,
- &param);
-}
-
-static inline void rcu_alloc_par_gp_wq(void)
-{
-}
-#else /* !CONFIG_RCU_EXP_KTHREAD */
-struct workqueue_struct *rcu_par_gp_wq;
-
-static void __init rcu_start_exp_gp_kworkers(void)
-{
-}
-
-static inline void rcu_alloc_par_gp_wq(void)
-{
- rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
- WARN_ON(!rcu_par_gp_wq);
-}
-#endif /* CONFIG_RCU_EXP_KTHREAD */
-
/*
* Spawn the kthreads that handle RCU's grace periods.
*/
@@ -4809,10 +4885,10 @@ static int __init rcu_spawn_gp_kthread(void)
* due to rcu_scheduler_fully_active.
*/
rcu_spawn_cpu_nocb_kthread(smp_processor_id());
- rcu_spawn_one_boost_kthread(rdp->mynode);
+ rcu_spawn_rnp_kthreads(rdp->mynode);
rcu_spawn_core_kthreads();
/* Create kthread worker for expedited GPs */
- rcu_start_exp_gp_kworkers();
+ rcu_start_exp_gp_kworker();
return 0;
}
early_initcall(rcu_spawn_gp_kthread);
@@ -4915,7 +4991,7 @@ static void __init rcu_init_one(void)
init_waitqueue_head(&rnp->exp_wq[2]);
init_waitqueue_head(&rnp->exp_wq[3]);
spin_lock_init(&rnp->exp_lock);
- mutex_init(&rnp->boost_kthread_mutex);
+ mutex_init(&rnp->kthread_mutex);
raw_spin_lock_init(&rnp->exp_poll_lock);
rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED;
INIT_WORK(&rnp->exp_poll_wq, sync_rcu_do_polled_gp);
@@ -5152,7 +5228,6 @@ void __init rcu_init(void)
/* Create workqueue for Tree SRCU and for expedited GPs. */
rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
WARN_ON(!rcu_gp_wq);
- rcu_alloc_par_gp_wq();
/* Fill in default value for rcutree.qovld boot parameter. */
/* -After- the rcu_node ->lock fields are initialized! */
@@ -5165,6 +5240,8 @@ void __init rcu_init(void)
(void)start_poll_synchronize_rcu_expedited();
rcu_test_sync_prims();
+
+ tasks_cblist_init_generic();
}
#include "tree_stall.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index e9821a8422dbe..df48160b3136d 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -21,14 +21,10 @@
#include "rcu_segcblist.h"
-/* Communicate arguments to a workqueue handler. */
+/* Communicate arguments to a kthread worker handler. */
struct rcu_exp_work {
unsigned long rew_s;
-#ifdef CONFIG_RCU_EXP_KTHREAD
struct kthread_work rew_work;
-#else
- struct work_struct rew_work;
-#endif /* CONFIG_RCU_EXP_KTHREAD */
};
/* RCU's kthread states for tracing. */
@@ -72,6 +68,9 @@ struct rcu_node {
/* Online CPUs for next expedited GP. */
/* Any CPU that has ever been online will */
/* have its bit set. */
+ struct kthread_worker *exp_kworker;
+ /* Workers performing per node expedited GP */
+ /* initialization. */
unsigned long cbovldmask;
/* CPUs experiencing callback overload. */
unsigned long ffmask; /* Fully functional CPUs. */
@@ -113,7 +112,7 @@ struct rcu_node {
/* side effect, not as a lock. */
unsigned long boost_time;
/* When to start boosting (jiffies). */
- struct mutex boost_kthread_mutex;
+ struct mutex kthread_mutex;
/* Exclusion for thread spawning and affinity */
/* manipulation. */
struct task_struct *boost_kthread_task;
@@ -467,11 +466,10 @@ static void rcu_init_one_nocb(struct rcu_node *rnp);
static bool wake_nocb_gp(struct rcu_data *rdp, bool force);
static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
unsigned long j, bool lazy);
-static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- bool *was_alldone, unsigned long flags,
- bool lazy);
-static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
- unsigned long flags);
+static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head,
+ rcu_callback_t func, unsigned long flags, bool lazy);
+static void __maybe_unused __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
+ unsigned long flags);
static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level);
static bool do_nocb_deferred_wakeup(struct rcu_data *rdp);
static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 2ac440bc7e10b..6b83537480b12 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -198,10 +198,9 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp,
}
if (rnp->parent == NULL) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- if (wake) {
- smp_mb(); /* EGP done before wake_up(). */
+ if (wake)
swake_up_one_online(&rcu_state.expedited_wq);
- }
+
break;
}
mask = rnp->grpmask;
@@ -419,7 +418,6 @@ retry_ipi:
static void rcu_exp_sel_wait_wake(unsigned long s);
-#ifdef CONFIG_RCU_EXP_KTHREAD
static void sync_rcu_exp_select_node_cpus(struct kthread_work *wp)
{
struct rcu_exp_work *rewp =
@@ -428,9 +426,14 @@ static void sync_rcu_exp_select_node_cpus(struct kthread_work *wp)
__sync_rcu_exp_select_node_cpus(rewp);
}
-static inline bool rcu_gp_par_worker_started(void)
+static inline bool rcu_exp_worker_started(void)
+{
+ return !!READ_ONCE(rcu_exp_gp_kworker);
+}
+
+static inline bool rcu_exp_par_worker_started(struct rcu_node *rnp)
{
- return !!READ_ONCE(rcu_exp_par_gp_kworker);
+ return !!READ_ONCE(rnp->exp_kworker);
}
static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp)
@@ -441,7 +444,7 @@ static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp)
* another work item on the same kthread worker can result in
* deadlock.
*/
- kthread_queue_work(rcu_exp_par_gp_kworker, &rnp->rew.rew_work);
+ kthread_queue_work(READ_ONCE(rnp->exp_kworker), &rnp->rew.rew_work);
}
static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp)
@@ -466,64 +469,6 @@ static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew
kthread_queue_work(rcu_exp_gp_kworker, &rew->rew_work);
}
-static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew)
-{
-}
-#else /* !CONFIG_RCU_EXP_KTHREAD */
-static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
-{
- struct rcu_exp_work *rewp =
- container_of(wp, struct rcu_exp_work, rew_work);
-
- __sync_rcu_exp_select_node_cpus(rewp);
-}
-
-static inline bool rcu_gp_par_worker_started(void)
-{
- return !!READ_ONCE(rcu_par_gp_wq);
-}
-
-static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp)
-{
- int cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1);
-
- INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
- /* If all offline, queue the work on an unbound CPU. */
- if (unlikely(cpu > rnp->grphi - rnp->grplo))
- cpu = WORK_CPU_UNBOUND;
- else
- cpu += rnp->grplo;
- queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work);
-}
-
-static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp)
-{
- flush_work(&rnp->rew.rew_work);
-}
-
-/*
- * Work-queue handler to drive an expedited grace period forward.
- */
-static void wait_rcu_exp_gp(struct work_struct *wp)
-{
- struct rcu_exp_work *rewp;
-
- rewp = container_of(wp, struct rcu_exp_work, rew_work);
- rcu_exp_sel_wait_wake(rewp->rew_s);
-}
-
-static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew)
-{
- INIT_WORK_ONSTACK(&rew->rew_work, wait_rcu_exp_gp);
- queue_work(rcu_gp_wq, &rew->rew_work);
-}
-
-static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew)
-{
- destroy_work_on_stack(&rew->rew_work);
-}
-#endif /* CONFIG_RCU_EXP_KTHREAD */
-
/*
* Select the nodes that the upcoming expedited grace period needs
* to wait for.
@@ -541,7 +486,7 @@ static void sync_rcu_exp_select_cpus(void)
rnp->exp_need_flush = false;
if (!READ_ONCE(rnp->expmask))
continue; /* Avoid early boot non-existent wq. */
- if (!rcu_gp_par_worker_started() ||
+ if (!rcu_exp_par_worker_started(rnp) ||
rcu_scheduler_active != RCU_SCHEDULER_RUNNING ||
rcu_is_last_leaf_node(rnp)) {
/* No worker started yet or last leaf, do direct call. */
@@ -956,7 +901,6 @@ static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp)
*/
void synchronize_rcu_expedited(void)
{
- bool boottime = (rcu_scheduler_active == RCU_SCHEDULER_INIT);
unsigned long flags;
struct rcu_exp_work rew;
struct rcu_node *rnp;
@@ -996,7 +940,7 @@ void synchronize_rcu_expedited(void)
return; /* Someone else did our work for us. */
/* Ensure that load happens before action based on it. */
- if (unlikely(boottime)) {
+ if (unlikely((rcu_scheduler_active == RCU_SCHEDULER_INIT) || !rcu_exp_worker_started())) {
/* Direct call during scheduler init and early_initcalls(). */
rcu_exp_sel_wait_wake(s);
} else {
@@ -1013,9 +957,6 @@ void synchronize_rcu_expedited(void)
/* Let the next expedited grace period start. */
mutex_unlock(&rcu_state.exp_mutex);
-
- if (likely(!boottime))
- synchronize_rcu_expedited_destroy_work(&rew);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 4efbf7333d4e1..3f85577bddd4e 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -256,6 +256,7 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
return __wake_nocb_gp(rdp_gp, rdp, force, flags);
}
+#ifdef CONFIG_RCU_LAZY
/*
* LAZY_FLUSH_JIFFIES decides the maximum amount of time that
* can elapse before lazy callbacks are flushed. Lazy callbacks
@@ -264,21 +265,20 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
* left unsubmitted to RCU after those many jiffies.
*/
#define LAZY_FLUSH_JIFFIES (10 * HZ)
-static unsigned long jiffies_till_flush = LAZY_FLUSH_JIFFIES;
+static unsigned long jiffies_lazy_flush = LAZY_FLUSH_JIFFIES;
-#ifdef CONFIG_RCU_LAZY
// To be called only from test code.
-void rcu_lazy_set_jiffies_till_flush(unsigned long jif)
+void rcu_set_jiffies_lazy_flush(unsigned long jif)
{
- jiffies_till_flush = jif;
+ jiffies_lazy_flush = jif;
}
-EXPORT_SYMBOL(rcu_lazy_set_jiffies_till_flush);
+EXPORT_SYMBOL(rcu_set_jiffies_lazy_flush);
-unsigned long rcu_lazy_get_jiffies_till_flush(void)
+unsigned long rcu_get_jiffies_lazy_flush(void)
{
- return jiffies_till_flush;
+ return jiffies_lazy_flush;
}
-EXPORT_SYMBOL(rcu_lazy_get_jiffies_till_flush);
+EXPORT_SYMBOL(rcu_get_jiffies_lazy_flush);
#endif
/*
@@ -299,7 +299,7 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
*/
if (waketype == RCU_NOCB_WAKE_LAZY &&
rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) {
- mod_timer(&rdp_gp->nocb_timer, jiffies + jiffies_till_flush);
+ mod_timer(&rdp_gp->nocb_timer, jiffies + rcu_get_jiffies_lazy_flush());
WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
} else if (waketype == RCU_NOCB_WAKE_BYPASS) {
mod_timer(&rdp_gp->nocb_timer, jiffies + 2);
@@ -482,7 +482,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
// flush ->nocb_bypass to ->cblist.
if ((ncbs && !bypass_is_lazy && j != READ_ONCE(rdp->nocb_bypass_first)) ||
(ncbs && bypass_is_lazy &&
- (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + jiffies_till_flush))) ||
+ (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + rcu_get_jiffies_lazy_flush()))) ||
ncbs >= qhimark) {
rcu_nocb_lock(rdp);
*was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
@@ -532,9 +532,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
// 2. Both of these conditions are met:
// a. The bypass list previously had only lazy CBs, and:
// b. The new CB is non-lazy.
- if (ncbs && (!bypass_is_lazy || lazy)) {
- local_irq_restore(flags);
- } else {
+ if (!ncbs || (bypass_is_lazy && !lazy)) {
// No-CBs GP kthread might be indefinitely asleep, if so, wake.
rcu_nocb_lock(rdp); // Rare during call_rcu() flood.
if (!rcu_segcblist_pend_cbs(&rdp->cblist)) {
@@ -544,7 +542,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
} else {
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
TPS("FirstBQnoWake"));
- rcu_nocb_unlock_irqrestore(rdp, flags);
+ rcu_nocb_unlock(rdp);
}
}
return true; // Callback already enqueued.
@@ -566,11 +564,12 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
long lazy_len;
long len;
struct task_struct *t;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
// If we are being polled or there is no kthread, just leave.
t = READ_ONCE(rdp->nocb_gp_kthread);
if (rcu_nocb_poll || !t) {
- rcu_nocb_unlock_irqrestore(rdp, flags);
+ rcu_nocb_unlock(rdp);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
TPS("WakeNotPoll"));
return;
@@ -583,17 +582,17 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
rdp->qlen_last_fqs_check = len;
// Only lazy CBs in bypass list
if (lazy_len && bypass_len == lazy_len) {
- rcu_nocb_unlock_irqrestore(rdp, flags);
+ rcu_nocb_unlock(rdp);
wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_LAZY,
TPS("WakeLazy"));
} else if (!irqs_disabled_flags(flags)) {
/* ... if queue was empty ... */
- rcu_nocb_unlock_irqrestore(rdp, flags);
+ rcu_nocb_unlock(rdp);
wake_nocb_gp(rdp, false);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
TPS("WakeEmpty"));
} else {
- rcu_nocb_unlock_irqrestore(rdp, flags);
+ rcu_nocb_unlock(rdp);
wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE,
TPS("WakeEmptyIsDeferred"));
}
@@ -610,20 +609,32 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
smp_mb(); /* Enqueue before timer_pending(). */
if ((rdp->nocb_cb_sleep ||
!rcu_segcblist_ready_cbs(&rdp->cblist)) &&
- !timer_pending(&rdp->nocb_timer)) {
- rcu_nocb_unlock_irqrestore(rdp, flags);
+ !timer_pending(&rdp_gp->nocb_timer)) {
+ rcu_nocb_unlock(rdp);
wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE,
TPS("WakeOvfIsDeferred"));
} else {
- rcu_nocb_unlock_irqrestore(rdp, flags);
+ rcu_nocb_unlock(rdp);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
}
} else {
- rcu_nocb_unlock_irqrestore(rdp, flags);
+ rcu_nocb_unlock(rdp);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
}
}
+static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head,
+ rcu_callback_t func, unsigned long flags, bool lazy)
+{
+ bool was_alldone;
+
+ if (!rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy)) {
+ /* Not enqueued on bypass but locked, do regular enqueue */
+ rcutree_enqueue(rdp, head, func);
+ __call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */
+ }
+}
+
static int nocb_gp_toggle_rdp(struct rcu_data *rdp,
bool *wake_state)
{
@@ -723,7 +734,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
lazy_ncbs = READ_ONCE(rdp->lazy_len);
if (bypass_ncbs && (lazy_ncbs == bypass_ncbs) &&
- (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + jiffies_till_flush) ||
+ (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + rcu_get_jiffies_lazy_flush()) ||
bypass_ncbs > 2 * qhimark)) {
flush_bypass = true;
} else if (bypass_ncbs && (lazy_ncbs != bypass_ncbs) &&
@@ -779,7 +790,6 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
needwake = rdp->nocb_cb_sleep;
WRITE_ONCE(rdp->nocb_cb_sleep, false);
- smp_mb(); /* CB invocation -after- GP end. */
} else {
needwake = false;
}
@@ -933,8 +943,7 @@ static void nocb_cb_wait(struct rcu_data *rdp)
swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
nocb_cb_wait_cond(rdp));
- // VVV Ensure CB invocation follows _sleep test.
- if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^
+ if (READ_ONCE(rdp->nocb_cb_sleep)) {
WARN_ON(signal_pending(current));
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
}
@@ -1383,7 +1392,7 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
rcu_nocb_unlock_irqrestore(rdp, flags);
continue;
}
- WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false));
+ rcu_nocb_try_flush_bypass(rdp, jiffies);
rcu_nocb_unlock_irqrestore(rdp, flags);
wake_nocb_gp(rdp, false);
sc->nr_to_scan -= _count;
@@ -1768,10 +1777,10 @@ static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
return true;
}
-static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- bool *was_alldone, unsigned long flags, bool lazy)
+static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head,
+ rcu_callback_t func, unsigned long flags, bool lazy)
{
- return false;
+ WARN_ON_ONCE(1); /* Should be dead code! */
}
static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 41021080ad258..36a8b5dbf5b52 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1195,14 +1195,13 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
struct sched_param sp;
struct task_struct *t;
- mutex_lock(&rnp->boost_kthread_mutex);
- if (rnp->boost_kthread_task || !rcu_scheduler_fully_active)
- goto out;
+ if (rnp->boost_kthread_task)
+ return;
t = kthread_create(rcu_boost_kthread, (void *)rnp,
"rcub/%d", rnp_index);
if (WARN_ON_ONCE(IS_ERR(t)))
- goto out;
+ return;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
rnp->boost_kthread_task = t;
@@ -1210,48 +1209,11 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
sp.sched_priority = kthread_prio;
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
-
- out:
- mutex_unlock(&rnp->boost_kthread_mutex);
}
-/*
- * Set the per-rcu_node kthread's affinity to cover all CPUs that are
- * served by the rcu_node in question. The CPU hotplug lock is still
- * held, so the value of rnp->qsmaskinit will be stable.
- *
- * We don't include outgoingcpu in the affinity set, use -1 if there is
- * no outgoing CPU. If there are no CPUs left in the affinity set,
- * this function allows the kthread to execute on any CPU.
- *
- * Any future concurrent calls are serialized via ->boost_kthread_mutex.
- */
-static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
+static struct task_struct *rcu_boost_task(struct rcu_node *rnp)
{
- struct task_struct *t = rnp->boost_kthread_task;
- unsigned long mask;
- cpumask_var_t cm;
- int cpu;
-
- if (!t)
- return;
- if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
- return;
- mutex_lock(&rnp->boost_kthread_mutex);
- mask = rcu_rnp_online_cpus(rnp);
- for_each_leaf_node_possible_cpu(rnp, cpu)
- if ((mask & leaf_node_cpu_bit(rnp, cpu)) &&
- cpu != outgoingcpu)
- cpumask_set_cpu(cpu, cm);
- cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU));
- if (cpumask_empty(cm)) {
- cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU));
- if (outgoingcpu >= 0)
- cpumask_clear_cpu(outgoingcpu, cm);
- }
- set_cpus_allowed_ptr(t, cm);
- mutex_unlock(&rnp->boost_kthread_mutex);
- free_cpumask_var(cm);
+ return READ_ONCE(rnp->boost_kthread_task);
}
#else /* #ifdef CONFIG_RCU_BOOST */
@@ -1270,10 +1232,10 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
{
}
-static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
+static struct task_struct *rcu_boost_task(struct rcu_node *rnp)
{
+ return NULL;
}
-
#endif /* #else #ifdef CONFIG_RCU_BOOST */
/*