summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/RCU/trace.txt144
-rw-r--r--Documentation/dontdiff26
-rw-r--r--Documentation/kernel-docs.txt27
-rw-r--r--fs/gfs2/bmap.c11
-rw-r--r--fs/gfs2/glock.c71
-rw-r--r--fs/gfs2/glock.h28
-rw-r--r--fs/gfs2/glops.c1
-rw-r--r--fs/gfs2/incore.h12
-rw-r--r--fs/gfs2/inode.c9
-rw-r--r--fs/gfs2/lock_dlm.c15
-rw-r--r--fs/gfs2/ops_inode.c18
-rw-r--r--fs/gfs2/quota.c13
-rw-r--r--fs/gfs2/rgrp.c57
-rw-r--r--fs/gfs2/rgrp.h1
-rw-r--r--fs/gfs2/xattr.c23
-rw-r--r--include/linux/init_task.h9
-rw-r--r--include/linux/rculist.h5
-rw-r--r--include/linux/rcupdate.h4
-rw-r--r--include/linux/rcutiny.h13
-rw-r--r--include/linux/rcutree.h2
-rw-r--r--include/linux/sched.h11
-rw-r--r--init/Kconfig55
-rw-r--r--kernel/futex.c235
-rw-r--r--kernel/rcutiny.c105
-rw-r--r--kernel/rcutiny_plugin.h433
-rw-r--r--kernel/rcutorture.c270
-rw-r--r--kernel/rcutree.c156
-rw-r--r--kernel/rcutree.h61
-rw-r--r--kernel/rcutree_plugin.h135
-rw-r--r--kernel/rcutree_trace.c12
-rw-r--r--kernel/sched.c69
-rw-r--r--kernel/srcu.c8
-rwxr-xr-xscripts/kernel-doc102
33 files changed, 1485 insertions, 656 deletions
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
index a851118775d8..6a8c73f55b80 100644
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -1,18 +1,22 @@
CONFIG_RCU_TRACE debugfs Files and Formats
-The rcutree implementation of RCU provides debugfs trace output that
-summarizes counters and state. This information is useful for debugging
-RCU itself, and can sometimes also help to debug abuses of RCU.
-The following sections describe the debugfs files and formats.
+The rcutree and rcutiny implementations of RCU provide debugfs trace
+output that summarizes counters and state. This information is useful for
+debugging RCU itself, and can sometimes also help to debug abuses of RCU.
+The following sections describe the debugfs files and formats, first
+for rcutree and next for rcutiny.
-Hierarchical RCU debugfs Files and Formats
+CONFIG_TREE_RCU and CONFIG_TREE_PREEMPT_RCU debugfs Files and Formats
-This implementation of RCU provides three debugfs files under the
+These implementations of RCU provides five debugfs files under the
top-level directory RCU: rcu/rcudata (which displays fields in struct
-rcu_data), rcu/rcugp (which displays grace-period counters), and
-rcu/rcuhier (which displays the struct rcu_node hierarchy).
+rcu_data), rcu/rcudata.csv (which is a .csv spreadsheet version of
+rcu/rcudata), rcu/rcugp (which displays grace-period counters),
+rcu/rcuhier (which displays the struct rcu_node hierarchy), and
+rcu/rcu_pending (which displays counts of the reasons that the
+rcu_pending() function decided that there was core RCU work to do).
The output of "cat rcu/rcudata" looks as follows:
@@ -130,7 +134,8 @@ o "ci" is the number of RCU callbacks that have been invoked for
been registered in absence of CPU-hotplug activity.
o "co" is the number of RCU callbacks that have been orphaned due to
- this CPU going offline.
+ this CPU going offline. These orphaned callbacks have been moved
+ to an arbitrarily chosen online CPU.
o "ca" is the number of RCU callbacks that have been adopted due to
other CPUs going offline. Note that ci+co-ca+ql is the number of
@@ -168,12 +173,12 @@ o "gpnum" is the number of grace periods that have started. It is
The output of "cat rcu/rcuhier" looks as follows, with very long lines:
-c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6 oqlen=0
+c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6
1/1 .>. 0:127 ^0
3/3 .>. 0:35 ^0 0/0 .>. 36:71 ^1 0/0 .>. 72:107 ^2 0/0 .>. 108:127 ^3
3/3f .>. 0:5 ^0 2/3 .>. 6:11 ^1 0/0 .>. 12:17 ^2 0/0 .>. 18:23 ^3 0/0 .>. 24:29 ^4 0/0 .>. 30:35 ^5 0/0 .>. 36:41 ^0 0/0 .>. 42:47 ^1 0/0 .>. 48:53 ^2 0/0 .>. 54:59 ^3 0/0 .>. 60:65 ^4 0/0 .>. 66:71 ^5 0/0 .>. 72:77 ^0 0/0 .>. 78:83 ^1 0/0 .>. 84:89 ^2 0/0 .>. 90:95 ^3 0/0 .>. 96:101 ^4 0/0 .>. 102:107 ^5 0/0 .>. 108:113 ^0 0/0 .>. 114:119 ^1 0/0 .>. 120:125 ^2 0/0 .>. 126:127 ^3
rcu_bh:
-c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0 oqlen=0
+c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0
0/1 .>. 0:127 ^0
0/3 .>. 0:35 ^0 0/0 .>. 36:71 ^1 0/0 .>. 72:107 ^2 0/0 .>. 108:127 ^3
0/3f .>. 0:5 ^0 0/3 .>. 6:11 ^1 0/0 .>. 12:17 ^2 0/0 .>. 18:23 ^3 0/0 .>. 24:29 ^4 0/0 .>. 30:35 ^5 0/0 .>. 36:41 ^0 0/0 .>. 42:47 ^1 0/0 .>. 48:53 ^2 0/0 .>. 54:59 ^3 0/0 .>. 60:65 ^4 0/0 .>. 66:71 ^5 0/0 .>. 72:77 ^0 0/0 .>. 78:83 ^1 0/0 .>. 84:89 ^2 0/0 .>. 90:95 ^3 0/0 .>. 96:101 ^4 0/0 .>. 102:107 ^5 0/0 .>. 108:113 ^0 0/0 .>. 114:119 ^1 0/0 .>. 120:125 ^2 0/0 .>. 126:127 ^3
@@ -212,11 +217,6 @@ o "fqlh" is the number of calls to force_quiescent_state() that
exited immediately (without even being counted in nfqs above)
due to contention on ->fqslock.
-o "oqlen" is the number of callbacks on the "orphan" callback
- list. RCU callbacks are placed on this list by CPUs going
- offline, and are "adopted" either by the CPU helping the outgoing
- CPU or by the next rcu_barrier*() call, whichever comes first.
-
o Each element of the form "1/1 0:127 ^0" represents one struct
rcu_node. Each line represents one level of the hierarchy, from
root to leaves. It is best to think of the rcu_data structures
@@ -326,3 +326,115 @@ o "nn" is the number of times that this CPU needed nothing. Alert
readers will note that the rcu "nn" number for a given CPU very
closely matches the rcu_bh "np" number for that same CPU. This
is due to short-circuit evaluation in rcu_pending().
+
+
+CONFIG_TINY_RCU and CONFIG_TINY_PREEMPT_RCU debugfs Files and Formats
+
+These implementations of RCU provides a single debugfs file under the
+top-level directory RCU, namely rcu/rcudata, which displays fields in
+rcu_bh_ctrlblk, rcu_sched_ctrlblk and, for CONFIG_TINY_PREEMPT_RCU,
+rcu_preempt_ctrlblk.
+
+The output of "cat rcu/rcudata" is as follows:
+
+rcu_preempt: qlen=24 gp=1097669 g197/p197/c197 tasks=...
+ ttb=. btg=no ntb=184 neb=0 nnb=183 j=01f7 bt=0274
+ normal balk: nt=1097669 gt=0 bt=371 b=0 ny=25073378 nos=0
+ exp balk: bt=0 nos=0
+rcu_sched: qlen: 0
+rcu_bh: qlen: 0
+
+This is split into rcu_preempt, rcu_sched, and rcu_bh sections, with the
+rcu_preempt section appearing only in CONFIG_TINY_PREEMPT_RCU builds.
+The last three lines of the rcu_preempt section appear only in
+CONFIG_RCU_BOOST kernel builds. The fields are as follows:
+
+o "qlen" is the number of RCU callbacks currently waiting either
+ for an RCU grace period or waiting to be invoked. This is the
+ only field present for rcu_sched and rcu_bh, due to the
+ short-circuiting of grace period in those two cases.
+
+o "gp" is the number of grace periods that have completed.
+
+o "g197/p197/c197" displays the grace-period state, with the
+ "g" number being the number of grace periods that have started
+ (mod 256), the "p" number being the number of grace periods
+ that the CPU has responded to (also mod 256), and the "c"
+ number being the number of grace periods that have completed
+ (once again mode 256).
+
+ Why have both "gp" and "g"? Because the data flowing into
+ "gp" is only present in a CONFIG_RCU_TRACE kernel.
+
+o "tasks" is a set of bits. The first bit is "T" if there are
+ currently tasks that have recently blocked within an RCU
+ read-side critical section, the second bit is "N" if any of the
+ aforementioned tasks are blocking the current RCU grace period,
+ and the third bit is "E" if any of the aforementioned tasks are
+ blocking the current expedited grace period. Each bit is "."
+ if the corresponding condition does not hold.
+
+o "ttb" is a single bit. It is "B" if any of the blocked tasks
+ need to be priority boosted and "." otherwise.
+
+o "btg" indicates whether boosting has been carried out during
+ the current grace period, with "exp" indicating that boosting
+ is in progress for an expedited grace period, "no" indicating
+ that boosting has not yet started for a normal grace period,
+ "begun" indicating that boosting has bebug for a normal grace
+ period, and "done" indicating that boosting has completed for
+ a normal grace period.
+
+o "ntb" is the total number of tasks subjected to RCU priority boosting
+ periods since boot.
+
+o "neb" is the number of expedited grace periods that have had
+ to resort to RCU priority boosting since boot.
+
+o "nnb" is the number of normal grace periods that have had
+ to resort to RCU priority boosting since boot.
+
+o "j" is the low-order 12 bits of the jiffies counter in hexadecimal.
+
+o "bt" is the low-order 12 bits of the value that the jiffies counter
+ will have at the next time that boosting is scheduled to begin.
+
+o In the line beginning with "normal balk", the fields are as follows:
+
+ o "nt" is the number of times that the system balked from
+ boosting because there were no blocked tasks to boost.
+ Note that the system will balk from boosting even if the
+ grace period is overdue when the currently running task
+ is looping within an RCU read-side critical section.
+ There is no point in boosting in this case, because
+ boosting a running task won't make it run any faster.
+
+ o "gt" is the number of times that the system balked
+ from boosting because, although there were blocked tasks,
+ none of them were preventing the current grace period
+ from completing.
+
+ o "bt" is the number of times that the system balked
+ from boosting because boosting was already in progress.
+
+ o "b" is the number of times that the system balked from
+ boosting because boosting had already completed for
+ the grace period in question.
+
+ o "ny" is the number of times that the system balked from
+ boosting because it was not yet time to start boosting
+ the grace period in question.
+
+ o "nos" is the number of times that the system balked from
+ boosting for inexplicable ("not otherwise specified")
+ reasons. This can actually happen due to races involving
+ increments of the jiffies counter.
+
+o In the line beginning with "exp balk", the fields are as follows:
+
+ o "bt" is the number of times that the system balked from
+ boosting because there were no blocked tasks to boost.
+
+ o "nos" is the number of times that the system balked from
+ boosting for inexplicable ("not otherwise specified")
+ reasons.
diff --git a/Documentation/dontdiff b/Documentation/dontdiff
index d9bcffd59433..470d3dba1a69 100644
--- a/Documentation/dontdiff
+++ b/Documentation/dontdiff
@@ -62,6 +62,10 @@ aic7*reg_print.c*
aic7*seq.h*
aicasm
aicdb.h*
+altivec1.c
+altivec2.c
+altivec4.c
+altivec8.c
asm-offsets.h
asm_offsets.h
autoconf.h*
@@ -76,6 +80,7 @@ btfixupprep
build
bvmlinux
bzImage*
+capflags.c
classlist.h*
comp*.log
compile.h*
@@ -94,6 +99,7 @@ devlist.h*
docproc
elf2ecoff
elfconfig.h*
+evergreen_reg_safe.h
fixdep
flask.h
fore200e_mkfirm
@@ -108,9 +114,16 @@ genksyms
*_gray256.c
ihex2fw
ikconfig.h*
+inat-tables.c
initramfs_data.cpio
initramfs_data.cpio.gz
initramfs_list
+int16.c
+int1.c
+int2.c
+int32.c
+int4.c
+int8.c
kallsyms
kconfig
keywords.c
@@ -140,6 +153,7 @@ mkprep
mktables
mktree
modpost
+modules.builtin
modules.order
modversions.h*
ncscope.*
@@ -153,14 +167,23 @@ pca200e.bin
pca200e_ecd.bin2
piggy.gz
piggyback
+piggy.S
pnmtologo
ppc_defs.h*
pss_boot.h
qconf
+r100_reg_safe.h
+r200_reg_safe.h
+r300_reg_safe.h
+r420_reg_safe.h
+r600_reg_safe.h
raid6altivec*.c
raid6int*.c
raid6tables.c
relocs
+rn50_reg_safe.h
+rs600_reg_safe.h
+rv515_reg_safe.h
series
setup
setup.bin
@@ -169,6 +192,7 @@ sImage
sm_tbl*
split-include
syscalltab.h
+tables.c
tags
tftpboot.img
timeconst.h
@@ -190,6 +214,7 @@ vmlinux
vmlinux-*
vmlinux.aout
vmlinux.lds
+voffset.h
vsyscall.lds
vsyscall_32.lds
wanxlfw.inc
@@ -200,3 +225,4 @@ wakeup.elf
wakeup.lds
zImage*
zconf.hash.c
+zoffset.h
diff --git a/Documentation/kernel-docs.txt b/Documentation/kernel-docs.txt
index 715eaaf1519d..9a8674629a07 100644
--- a/Documentation/kernel-docs.txt
+++ b/Documentation/kernel-docs.txt
@@ -537,7 +537,7 @@
Notes: Further information in
http://www.oreilly.com/catalog/linuxdrive2/
- * Title: "Linux Device Drivers, 3nd Edition"
+ * Title: "Linux Device Drivers, 3rd Edition"
Authors: Jonathan Corbet, Alessandro Rubini, and Greg Kroah-Hartman
Publisher: O'Reilly & Associates.
Date: 2005.
@@ -592,14 +592,6 @@
Pages: 600.
ISBN: 0-13-101908-2
- * Title: "The Design and Implementation of the 4.4 BSD UNIX
- Operating System"
- Author: Marshall Kirk McKusick, Keith Bostic, Michael J. Karels,
- John S. Quarterman.
- Publisher: Addison-Wesley.
- Date: 1996.
- ISBN: 0-201-54979-4
-
* Title: "Programming for the real world - POSIX.4"
Author: Bill O. Gallmeister.
Publisher: O'Reilly & Associates, Inc..
@@ -610,28 +602,13 @@
POSIX. Good reference.
* Title: "UNIX Systems for Modern Architectures: Symmetric
- Multiprocesssing and Caching for Kernel Programmers"
+ Multiprocessing and Caching for Kernel Programmers"
Author: Curt Schimmel.
Publisher: Addison Wesley.
Date: June, 1994.
Pages: 432.
ISBN: 0-201-63338-8
- * Title: "The Design and Implementation of the 4.3 BSD UNIX
- Operating System"
- Author: Samuel J. Leffler, Marshall Kirk McKusick, Michael J.
- Karels, John S. Quarterman.
- Publisher: Addison-Wesley.
- Date: 1989 (reprinted with corrections on October, 1990).
- ISBN: 0-201-06196-1
-
- * Title: "The Design of the UNIX Operating System"
- Author: Maurice J. Bach.
- Publisher: Prentice Hall.
- Date: 1986.
- Pages: 471.
- ISBN: 0-13-201757-1
-
MISCELLANEOUS:
* Name: linux/Documentation
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 5476c066d4ee..3c4039d5eef1 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -763,7 +763,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
int metadata;
unsigned int revokes = 0;
int x;
- int error;
+ int error = 0;
if (!*top)
sm->sm_first = 0;
@@ -780,7 +780,11 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
if (metadata)
revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
- error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh);
+ if (ip != GFS2_I(sdp->sd_rindex))
+ error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh);
+ else if (!sdp->sd_rgrps)
+ error = gfs2_ri_update(ip);
+
if (error)
return error;
@@ -879,7 +883,8 @@ out_rg_gunlock:
out_rlist:
gfs2_rlist_free(&rlist);
out:
- gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh);
+ if (ip != GFS2_I(sdp->sd_rindex))
+ gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh);
return error;
}
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index f92c17704169..08a8beb152e6 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -541,21 +541,6 @@ out_locked:
spin_unlock(&gl->gl_spin);
}
-static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
- unsigned int req_state,
- unsigned int flags)
-{
- int ret = LM_OUT_ERROR;
-
- if (!sdp->sd_lockstruct.ls_ops->lm_lock)
- return req_state == LM_ST_UNLOCKED ? 0 : req_state;
-
- if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
- ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock,
- req_state, flags);
- return ret;
-}
-
/**
* do_xmote - Calls the DLM to change the state of a lock
* @gl: The lock state
@@ -575,13 +560,14 @@ __acquires(&gl->gl_spin)
lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP |
LM_FLAG_PRIORITY);
- BUG_ON(gl->gl_state == target);
- BUG_ON(gl->gl_state == gl->gl_target);
+ GLOCK_BUG_ON(gl, gl->gl_state == target);
+ GLOCK_BUG_ON(gl, gl->gl_state == gl->gl_target);
if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) &&
glops->go_inval) {
set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
do_error(gl, 0); /* Fail queued try locks */
}
+ gl->gl_req = target;
spin_unlock(&gl->gl_spin);
if (glops->go_xmote_th)
glops->go_xmote_th(gl);
@@ -594,15 +580,17 @@ __acquires(&gl->gl_spin)
gl->gl_state == LM_ST_DEFERRED) &&
!(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
lck_flags |= LM_FLAG_TRY_1CB;
- ret = gfs2_lm_lock(sdp, gl, target, lck_flags);
- if (!(ret & LM_OUT_ASYNC)) {
- finish_xmote(gl, ret);
+ if (sdp->sd_lockstruct.ls_ops->lm_lock) {
+ /* lock_dlm */
+ ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags);
+ GLOCK_BUG_ON(gl, ret);
+ } else { /* lock_nolock */
+ finish_xmote(gl, target);
if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
gfs2_glock_put(gl);
- } else {
- GLOCK_BUG_ON(gl, ret != LM_OUT_ASYNC);
}
+
spin_lock(&gl->gl_spin);
}
@@ -951,17 +939,22 @@ int gfs2_glock_wait(struct gfs2_holder *gh)
void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
{
+ struct va_format vaf;
va_list args;
va_start(args, fmt);
+
if (seq) {
struct gfs2_glock_iter *gi = seq->private;
vsprintf(gi->string, fmt, args);
seq_printf(seq, gi->string);
} else {
- printk(KERN_ERR " ");
- vprintk(fmt, args);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ printk(KERN_ERR " %pV", &vaf);
}
+
va_end(args);
}
@@ -1361,24 +1354,28 @@ static int gfs2_should_freeze(const struct gfs2_glock *gl)
* @gl: Pointer to the glock
* @ret: The return value from the dlm
*
+ * The gl_reply field is under the gl_spin lock so that it is ok
+ * to use a bitfield shared with other glock state fields.
*/
void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
{
struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
+ spin_lock(&gl->gl_spin);
gl->gl_reply = ret;
if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) {
- spin_lock(&gl->gl_spin);
if (gfs2_should_freeze(gl)) {
set_bit(GLF_FROZEN, &gl->gl_flags);
spin_unlock(&gl->gl_spin);
return;
}
- spin_unlock(&gl->gl_spin);
}
+
+ spin_unlock(&gl->gl_spin);
set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
+ smp_wmb();
gfs2_glock_hold(gl);
if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
gfs2_glock_put(gl);
@@ -1626,18 +1623,17 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
{
struct task_struct *gh_owner = NULL;
- char buffer[KSYM_SYMBOL_LEN];
char flags_buf[32];
- sprint_symbol(buffer, gh->gh_ip);
if (gh->gh_owner_pid)
gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID);
- gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %s\n",
- state2str(gh->gh_state),
- hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags),
- gh->gh_error,
- gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1,
- gh_owner ? gh_owner->comm : "(ended)", buffer);
+ gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %pS\n",
+ state2str(gh->gh_state),
+ hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags),
+ gh->gh_error,
+ gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1,
+ gh_owner ? gh_owner->comm : "(ended)",
+ (void *)gh->gh_ip);
return 0;
}
@@ -1782,12 +1778,13 @@ int __init gfs2_glock_init(void)
}
#endif
- glock_workqueue = alloc_workqueue("glock_workqueue", WQ_RESCUER |
+ glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM |
WQ_HIGHPRI | WQ_FREEZEABLE, 0);
if (IS_ERR(glock_workqueue))
return PTR_ERR(glock_workqueue);
- gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", WQ_RESCUER |
- WQ_FREEZEABLE, 0);
+ gfs2_delete_workqueue = alloc_workqueue("delete_workqueue",
+ WQ_MEM_RECLAIM | WQ_FREEZEABLE,
+ 0);
if (IS_ERR(gfs2_delete_workqueue)) {
destroy_workqueue(glock_workqueue);
return PTR_ERR(gfs2_delete_workqueue);
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index db1c26d6d220..691851ceb615 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -87,11 +87,10 @@ enum {
#define GL_ASYNC 0x00000040
#define GL_EXACT 0x00000080
#define GL_SKIP 0x00000100
-#define GL_ATIME 0x00000200
#define GL_NOCACHE 0x00000400
/*
- * lm_lock() and lm_async_cb return flags
+ * lm_async_cb return flags
*
* LM_OUT_ST_MASK
* Masks the lower two bits of lock state in the returned value.
@@ -99,15 +98,11 @@ enum {
* LM_OUT_CANCELED
* The lock request was canceled.
*
- * LM_OUT_ASYNC
- * The result of the request will be returned in an LM_CB_ASYNC callback.
- *
*/
#define LM_OUT_ST_MASK 0x00000003
#define LM_OUT_CANCELED 0x00000008
-#define LM_OUT_ASYNC 0x00000080
-#define LM_OUT_ERROR 0x00000100
+#define LM_OUT_ERROR 0x00000004
/*
* lm_recovery_done() messages
@@ -124,25 +119,12 @@ struct lm_lockops {
void (*lm_unmount) (struct gfs2_sbd *sdp);
void (*lm_withdraw) (struct gfs2_sbd *sdp);
void (*lm_put_lock) (struct kmem_cache *cachep, struct gfs2_glock *gl);
- unsigned int (*lm_lock) (struct gfs2_glock *gl,
- unsigned int req_state, unsigned int flags);
+ int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state,
+ unsigned int flags);
void (*lm_cancel) (struct gfs2_glock *gl);
const match_table_t *lm_tokens;
};
-#define LM_FLAG_TRY 0x00000001
-#define LM_FLAG_TRY_1CB 0x00000002
-#define LM_FLAG_NOEXP 0x00000004
-#define LM_FLAG_ANY 0x00000008
-#define LM_FLAG_PRIORITY 0x00000010
-
-#define GL_ASYNC 0x00000040
-#define GL_EXACT 0x00000080
-#define GL_SKIP 0x00000100
-#define GL_NOCACHE 0x00000400
-
-#define GLR_TRYFAILED 13
-
extern struct workqueue_struct *gfs2_delete_workqueue;
static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
{
@@ -212,6 +194,8 @@ int gfs2_glock_nq_num(struct gfs2_sbd *sdp,
int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
+
+__attribute__ ((format(printf, 2, 3)))
void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
/**
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 0d149dcc04e5..263561bf1a50 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -325,7 +325,6 @@ static void trans_go_sync(struct gfs2_glock *gl)
if (gl->gl_state != LM_ST_UNLOCKED &&
test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
- flush_workqueue(gfs2_delete_workqueue);
gfs2_meta_syncfs(sdp);
gfs2_log_shutdown(sdp);
}
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 764fbb49efc8..8d3d2b4a0a7d 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -207,12 +207,14 @@ struct gfs2_glock {
spinlock_t gl_spin;
- unsigned int gl_state;
- unsigned int gl_target;
- unsigned int gl_reply;
+ /* State fields protected by gl_spin */
+ unsigned int gl_state:2, /* Current state */
+ gl_target:2, /* Target state */
+ gl_demote_state:2, /* State requested by remote node */
+ gl_req:2, /* State in last dlm request */
+ gl_reply:8; /* Last reply from the dlm */
+
unsigned int gl_hash;
- unsigned int gl_req;
- unsigned int gl_demote_state; /* state requested by remote node */
unsigned long gl_demote_time; /* time of first demote request */
struct list_head gl_holders;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index e1213f7f9217..14e682dbe8bf 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -916,17 +916,8 @@ static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
if (error)
return error;
- if ((attr->ia_valid & ATTR_SIZE) &&
- attr->ia_size != i_size_read(inode)) {
- error = vmtruncate(inode, attr->ia_size);
- if (error)
- return error;
- }
-
setattr_copy(inode, attr);
mark_inode_dirty(inode);
-
- gfs2_assert_warn(GFS2_SB(inode), !error);
gfs2_trans_add_bh(ip->i_gl, dibh, 1);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 1c09425b45fd..6e493aee28f8 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -146,15 +146,13 @@ static u32 make_flags(const u32 lkid, const unsigned int gfs_flags,
return lkf;
}
-static unsigned int gdlm_lock(struct gfs2_glock *gl,
- unsigned int req_state, unsigned int flags)
+static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
+ unsigned int flags)
{
struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
- int error;
int req;
u32 lkf;
- gl->gl_req = req_state;
req = make_mode(req_state);
lkf = make_flags(gl->gl_lksb.sb_lkid, flags, req);
@@ -162,13 +160,8 @@ static unsigned int gdlm_lock(struct gfs2_glock *gl,
* Submit the actual lock request.
*/
- error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname,
- GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
- if (error == -EAGAIN)
- return 0;
- if (error)
- return LM_OUT_ERROR;
- return LM_OUT_ASYNC;
+ return dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname,
+ GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
}
static void gdlm_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl)
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 12cbea7502c2..1db6b7343229 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -1069,7 +1069,6 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
{
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
- struct buffer_head *dibh;
u32 ouid, ogid, nuid, ngid;
int error;
@@ -1100,25 +1099,10 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
if (error)
goto out_gunlock_q;
- error = gfs2_meta_inode_buffer(ip, &dibh);
+ error = gfs2_setattr_simple(ip, attr);
if (error)
goto out_end_trans;
- if ((attr->ia_valid & ATTR_SIZE) &&
- attr->ia_size != i_size_read(inode)) {
- int error;
-
- error = vmtruncate(inode, attr->ia_size);
- gfs2_assert_warn(sdp, !error);
- }
-
- setattr_copy(inode, attr);
- mark_inode_dirty(inode);
-
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
- gfs2_dinode_out(ip, dibh->b_data);
- brelse(dibh);
-
if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
u64 blocks = gfs2_get_inode_blocks(&ip->i_inode);
gfs2_quota_change(ip, -blocks, ouid, ogid);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index f606baf9ba72..a689901963de 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -666,6 +666,10 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift);
qd->qd_qb.qb_limit = qp->qu_limit;
}
+ if (fdq->d_fieldmask & FS_DQ_BCOUNT) {
+ qp->qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift);
+ qd->qd_qb.qb_value = qp->qu_value;
+ }
}
/* Write the quota into the quota file on disk */
@@ -1509,7 +1513,7 @@ out:
}
/* GFS2 only supports a subset of the XFS fields */
-#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD)
+#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD|FS_DQ_BCOUNT)
static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
struct fs_disk_quota *fdq)
@@ -1569,9 +1573,15 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
if ((fdq->d_fieldmask & FS_DQ_BSOFT) &&
((fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_warn)))
fdq->d_fieldmask ^= FS_DQ_BSOFT;
+
if ((fdq->d_fieldmask & FS_DQ_BHARD) &&
((fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_limit)))
fdq->d_fieldmask ^= FS_DQ_BHARD;
+
+ if ((fdq->d_fieldmask & FS_DQ_BCOUNT) &&
+ ((fdq->d_bcount >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_value)))
+ fdq->d_fieldmask ^= FS_DQ_BCOUNT;
+
if (fdq->d_fieldmask == 0)
goto out_i;
@@ -1620,4 +1630,3 @@ const struct quotactl_ops gfs2_quotactl_ops = {
.get_dqblk = gfs2_get_dqblk,
.set_dqblk = gfs2_set_dqblk,
};
-
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 33c8407b876f..7293ea27020c 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -500,7 +500,7 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp)
for (rgrps = 0;; rgrps++) {
loff_t pos = rgrps * sizeof(struct gfs2_rindex);
- if (pos + sizeof(struct gfs2_rindex) >= i_size_read(inode))
+ if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode))
break;
error = gfs2_internal_read(ip, &ra_state, buf, &pos,
sizeof(struct gfs2_rindex));
@@ -583,7 +583,7 @@ static int read_rindex_entry(struct gfs2_inode *ip,
* Returns: 0 on successful update, error code otherwise
*/
-static int gfs2_ri_update(struct gfs2_inode *ip)
+int gfs2_ri_update(struct gfs2_inode *ip)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct inode *inode = &ip->i_inode;
@@ -614,46 +614,6 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
}
/**
- * gfs2_ri_update_special - Pull in a new resource index from the disk
- *
- * This is a special version that's safe to call from gfs2_inplace_reserve_i.
- * In this case we know that we don't have any resource groups in memory yet.
- *
- * @ip: pointer to the rindex inode
- *
- * Returns: 0 on successful update, error code otherwise
- */
-static int gfs2_ri_update_special(struct gfs2_inode *ip)
-{
- struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- struct inode *inode = &ip->i_inode;
- struct file_ra_state ra_state;
- struct gfs2_rgrpd *rgd;
- unsigned int max_data = 0;
- int error;
-
- file_ra_state_init(&ra_state, inode->i_mapping);
- for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
- /* Ignore partials */
- if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) >
- i_size_read(inode))
- break;
- error = read_rindex_entry(ip, &ra_state);
- if (error) {
- clear_rgrpdi(sdp);
- return error;
- }
- }
- list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
- if (rgd->rd_data > max_data)
- max_data = rgd->rd_data;
- sdp->sd_max_rg_data = max_data;
-
- sdp->sd_rindex_uptodate = 1;
- return 0;
-}
-
-/**
* gfs2_rindex_hold - Grab a lock on the rindex
* @sdp: The GFS2 superblock
* @ri_gh: the glock holder
@@ -1226,16 +1186,25 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
else if (!sdp->sd_rgrps) /* We may not have the rindex read
in, so: */
- error = gfs2_ri_update_special(ip);
+ error = gfs2_ri_update(ip);
if (error)
return error;
}
+try_again:
do {
error = get_local_rgrp(ip, &last_unlinked);
/* If there is no space, flushing the log may release some */
- if (error)
+ if (error) {
+ if (ip == GFS2_I(sdp->sd_rindex) &&
+ !sdp->sd_rindex_uptodate) {
+ error = gfs2_ri_update(ip);
+ if (error)
+ return error;
+ goto try_again;
+ }
gfs2_log_flush(sdp, NULL);
+ }
} while (error && tries++ < 3);
if (error) {
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 0e35c0466f9a..50c2bb04369c 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -48,6 +48,7 @@ extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
extern void gfs2_inplace_release(struct gfs2_inode *ip);
+extern int gfs2_ri_update(struct gfs2_inode *ip);
extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation);
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 30b58f07c8a6..439b61c03262 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -1296,10 +1296,8 @@ fail:
int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
{
- struct inode *inode = &ip->i_inode;
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_ea_location el;
- struct buffer_head *dibh;
int error;
error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el);
@@ -1321,26 +1319,7 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
if (error)
return error;
- error = gfs2_meta_inode_buffer(ip, &dibh);
- if (error)
- goto out_trans_end;
-
- if ((attr->ia_valid & ATTR_SIZE) &&
- attr->ia_size != i_size_read(inode)) {
- int error;
-
- error = vmtruncate(inode, attr->ia_size);
- gfs2_assert_warn(GFS2_SB(inode), !error);
- }
-
- setattr_copy(inode, attr);
- mark_inode_dirty(inode);
-
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
- gfs2_dinode_out(ip, dibh->b_data);
- brelse(dibh);
-
-out_trans_end:
+ error = gfs2_setattr_simple(ip, attr);
gfs2_trans_end(sdp);
return error;
}
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 1f8c06ce0fa6..6b281fae114a 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -83,6 +83,12 @@ extern struct group_info init_groups;
*/
# define CAP_INIT_BSET CAP_FULL_SET
+#ifdef CONFIG_RCU_BOOST
+#define INIT_TASK_RCU_BOOST() \
+ .rcu_boost_mutex = NULL,
+#else
+#define INIT_TASK_RCU_BOOST()
+#endif
#ifdef CONFIG_TREE_PREEMPT_RCU
#define INIT_TASK_RCU_TREE_PREEMPT() \
.rcu_blocked_node = NULL,
@@ -94,7 +100,8 @@ extern struct group_info init_groups;
.rcu_read_lock_nesting = 0, \
.rcu_read_unlock_special = 0, \
.rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry), \
- INIT_TASK_RCU_TREE_PREEMPT()
+ INIT_TASK_RCU_TREE_PREEMPT() \
+ INIT_TASK_RCU_BOOST()
#else
#define INIT_TASK_RCU_PREEMPT(tsk)
#endif
diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index f31ef61f1c65..2dea94fc4402 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -241,11 +241,6 @@ static inline void list_splice_init_rcu(struct list_head *list,
#define list_first_entry_rcu(ptr, type, member) \
list_entry_rcu((ptr)->next, type, member)
-#define __list_for_each_rcu(pos, head) \
- for (pos = rcu_dereference_raw(list_next_rcu(head)); \
- pos != (head); \
- pos = rcu_dereference_raw(list_next_rcu((pos)))
-
/**
* list_for_each_entry_rcu - iterate over rcu list of given type
* @pos: the type * to use as a loop cursor.
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 03cda7bed985..af5614856285 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -47,6 +47,8 @@
extern int rcutorture_runnable; /* for sysctl */
#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
+#define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b))
+#define UINT_CMP_LT(a, b) (UINT_MAX / 2 < (a) - (b))
#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
@@ -66,7 +68,6 @@ extern void call_rcu_sched(struct rcu_head *head,
extern void synchronize_sched(void);
extern void rcu_barrier_bh(void);
extern void rcu_barrier_sched(void);
-extern void synchronize_sched_expedited(void);
extern int sched_expedited_torture_stats(char *page);
static inline void __rcu_read_lock_bh(void)
@@ -118,7 +119,6 @@ static inline int rcu_preempt_depth(void)
#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
/* Internal to kernel */
-extern void rcu_init(void);
extern void rcu_sched_qs(int cpu);
extern void rcu_bh_qs(int cpu);
extern void rcu_check_callbacks(int cpu, int user);
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 13877cb93a60..30ebd7c8d874 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -27,7 +27,9 @@
#include <linux/cache.h>
-#define rcu_init_sched() do { } while (0)
+static inline void rcu_init(void)
+{
+}
#ifdef CONFIG_TINY_RCU
@@ -58,6 +60,11 @@ static inline void synchronize_rcu_bh_expedited(void)
synchronize_sched();
}
+static inline void synchronize_sched_expedited(void)
+{
+ synchronize_sched();
+}
+
#ifdef CONFIG_TINY_RCU
static inline void rcu_preempt_note_context_switch(void)
@@ -125,16 +132,12 @@ static inline void rcu_cpu_stall_reset(void)
}
#ifdef CONFIG_DEBUG_LOCK_ALLOC
-
extern int rcu_scheduler_active __read_mostly;
extern void rcu_scheduler_starting(void);
-
#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
-
static inline void rcu_scheduler_starting(void)
{
}
-
#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
#endif /* __LINUX_RCUTINY_H */
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 95518e628794..3a933482734a 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -30,6 +30,7 @@
#ifndef __LINUX_RCUTREE_H
#define __LINUX_RCUTREE_H
+extern void rcu_init(void);
extern void rcu_note_context_switch(int cpu);
extern int rcu_needs_cpu(int cpu);
extern void rcu_cpu_stall_reset(void);
@@ -47,6 +48,7 @@ static inline void exit_rcu(void)
#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
extern void synchronize_rcu_bh(void);
+extern void synchronize_sched_expedited(void);
extern void synchronize_rcu_expedited(void);
static inline void synchronize_rcu_bh_expedited(void)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a99d735db3df..48c409c279d3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1233,6 +1233,9 @@ struct task_struct {
#ifdef CONFIG_TREE_PREEMPT_RCU
struct rcu_node *rcu_blocked_node;
#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+#ifdef CONFIG_RCU_BOOST
+ struct rt_mutex *rcu_boost_mutex;
+#endif /* #ifdef CONFIG_RCU_BOOST */
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
struct sched_info sched_info;
@@ -1763,7 +1766,8 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
#ifdef CONFIG_PREEMPT_RCU
#define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */
-#define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */
+#define RCU_READ_UNLOCK_BOOSTED (1 << 1) /* boosted while in RCU read-side. */
+#define RCU_READ_UNLOCK_NEED_QS (1 << 2) /* RCU core needs CPU response. */
static inline void rcu_copy_process(struct task_struct *p)
{
@@ -1771,7 +1775,10 @@ static inline void rcu_copy_process(struct task_struct *p)
p->rcu_read_unlock_special = 0;
#ifdef CONFIG_TREE_PREEMPT_RCU
p->rcu_blocked_node = NULL;
-#endif
+#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+#ifdef CONFIG_RCU_BOOST
+ p->rcu_boost_mutex = NULL;
+#endif /* #ifdef CONFIG_RCU_BOOST */
INIT_LIST_HEAD(&p->rcu_node_entry);
}
diff --git a/init/Kconfig b/init/Kconfig
index c9728992a776..526ec1c7456a 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -393,7 +393,6 @@ config PREEMPT_RCU
config RCU_TRACE
bool "Enable tracing for RCU"
- depends on TREE_RCU || TREE_PREEMPT_RCU
help
This option provides tracing in RCU which presents stats
in debugfs for debugging RCU implementation.
@@ -459,6 +458,60 @@ config TREE_RCU_TRACE
TREE_PREEMPT_RCU implementations, permitting Makefile to
trivially select kernel/rcutree_trace.c.
+config RCU_BOOST
+ bool "Enable RCU priority boosting"
+ depends on RT_MUTEXES && TINY_PREEMPT_RCU
+ default n
+ help
+ This option boosts the priority of preempted RCU readers that
+ block the current preemptible RCU grace period for too long.
+ This option also prevents heavy loads from blocking RCU
+ callback invocation for all flavors of RCU.
+
+ Say Y here if you are working with real-time apps or heavy loads
+ Say N here if you are unsure.
+
+config RCU_BOOST_PRIO
+ int "Real-time priority to boost RCU readers to"
+ range 1 99
+ depends on RCU_BOOST
+ default 1
+ help
+ This option specifies the real-time priority to which preempted
+ RCU readers are to be boosted. If you are working with CPU-bound
+ real-time applications, you should specify a priority higher then
+ the highest-priority CPU-bound application.
+
+ Specify the real-time priority, or take the default if unsure.
+
+config RCU_BOOST_DELAY
+ int "Milliseconds to delay boosting after RCU grace-period start"
+ range 0 3000
+ depends on RCU_BOOST
+ default 500
+ help
+ This option specifies the time to wait after the beginning of
+ a given grace period before priority-boosting preempted RCU
+ readers blocking that grace period. Note that any RCU reader
+ blocking an expedited RCU grace period is boosted immediately.
+
+ Accept the default if unsure.
+
+config SRCU_SYNCHRONIZE_DELAY
+ int "Microseconds to delay before waiting for readers"
+ range 0 20
+ default 10
+ help
+ This option controls how long SRCU delays before entering its
+ loop waiting on SRCU readers. The purpose of this loop is
+ to avoid the unconditional context-switch penalty that would
+ otherwise be incurred if there was an active SRCU reader,
+ in a manner similar to adaptive locking schemes. This should
+ be set to be a bit longer than the common-case SRCU read-side
+ critical-section overhead.
+
+ Accept the default if unsure.
+
endmenu # "RCU Subsystem"
config IKCONFIG
diff --git a/kernel/futex.c b/kernel/futex.c
index 40a8777a27d0..3019b92e6917 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -69,6 +69,14 @@ int __read_mostly futex_cmpxchg_enabled;
#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
/*
+ * Futex flags used to encode options to functions and preserve them across
+ * restarts.
+ */
+#define FLAGS_SHARED 0x01
+#define FLAGS_CLOCKRT 0x02
+#define FLAGS_HAS_TIMEOUT 0x04
+
+/*
* Priority Inheritance state:
*/
struct futex_pi_state {
@@ -123,6 +131,12 @@ struct futex_q {
u32 bitset;
};
+static const struct futex_q futex_q_init = {
+ /* list gets initialized in queue_me()*/
+ .key = FUTEX_KEY_INIT,
+ .bitset = FUTEX_BITSET_MATCH_ANY
+};
+
/*
* Hash buckets are shared by all the futex_keys that hash to the same
* location. Each key may have multiple futex_q structures, one for each task
@@ -283,8 +297,7 @@ again:
return 0;
}
-static inline
-void put_futex_key(int fshared, union futex_key *key)
+static inline void put_futex_key(union futex_key *key)
{
drop_futex_key_refs(key);
}
@@ -870,7 +883,8 @@ double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
/*
* Wake up waiters matching bitset queued on this futex (uaddr).
*/
-static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
+static int
+futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
{
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
@@ -881,7 +895,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
if (!bitset)
return -EINVAL;
- ret = get_futex_key(uaddr, fshared, &key);
+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
if (unlikely(ret != 0))
goto out;
@@ -907,7 +921,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
}
spin_unlock(&hb->lock);
- put_futex_key(fshared, &key);
+ put_futex_key(&key);
out:
return ret;
}
@@ -917,7 +931,7 @@ out:
* to this virtual address:
*/
static int
-futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
+futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
int nr_wake, int nr_wake2, int op)
{
union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
@@ -927,10 +941,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
int ret, op_ret;
retry:
- ret = get_futex_key(uaddr1, fshared, &key1);
+ ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1);
if (unlikely(ret != 0))
goto out;
- ret = get_futex_key(uaddr2, fshared, &key2);
+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
if (unlikely(ret != 0))
goto out_put_key1;
@@ -962,11 +976,11 @@ retry_private:
if (ret)
goto out_put_keys;
- if (!fshared)
+ if (!(flags & FLAGS_SHARED))
goto retry_private;
- put_futex_key(fshared, &key2);
- put_futex_key(fshared, &key1);
+ put_futex_key(&key2);
+ put_futex_key(&key1);
goto retry;
}
@@ -996,9 +1010,9 @@ retry_private:
double_unlock_hb(hb1, hb2);
out_put_keys:
- put_futex_key(fshared, &key2);
+ put_futex_key(&key2);
out_put_key1:
- put_futex_key(fshared, &key1);
+ put_futex_key(&key1);
out:
return ret;
}
@@ -1133,13 +1147,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
/**
* futex_requeue() - Requeue waiters from uaddr1 to uaddr2
* @uaddr1: source futex user address
- * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
+ * @flags: futex flags (FLAGS_SHARED, etc.)
* @uaddr2: target futex user address
* @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
* @nr_requeue: number of waiters to requeue (0-INT_MAX)
* @cmpval: @uaddr1 expected value (or %NULL)
* @requeue_pi: if we are attempting to requeue from a non-pi futex to a
- * pi futex (pi to pi requeue is not supported)
+ * pi futex (pi to pi requeue is not supported)
*
* Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
* uaddr2 atomically on behalf of the top waiter.
@@ -1148,9 +1162,9 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
* >=0 - on success, the number of tasks requeued or woken
* <0 - on error
*/
-static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
- int nr_wake, int nr_requeue, u32 *cmpval,
- int requeue_pi)
+static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
+ u32 __user *uaddr2, int nr_wake, int nr_requeue,
+ u32 *cmpval, int requeue_pi)
{
union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
int drop_count = 0, task_count = 0, ret;
@@ -1191,10 +1205,10 @@ retry:
pi_state = NULL;
}
- ret = get_futex_key(uaddr1, fshared, &key1);
+ ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1);
if (unlikely(ret != 0))
goto out;
- ret = get_futex_key(uaddr2, fshared, &key2);
+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
if (unlikely(ret != 0))
goto out_put_key1;
@@ -1216,11 +1230,11 @@ retry_private:
if (ret)
goto out_put_keys;
- if (!fshared)
+ if (!(flags & FLAGS_SHARED))
goto retry_private;
- put_futex_key(fshared, &key2);
- put_futex_key(fshared, &key1);
+ put_futex_key(&key2);
+ put_futex_key(&key1);
goto retry;
}
if (curval != *cmpval) {
@@ -1260,8 +1274,8 @@ retry_private:
break;
case -EFAULT:
double_unlock_hb(hb1, hb2);
- put_futex_key(fshared, &key2);
- put_futex_key(fshared, &key1);
+ put_futex_key(&key2);
+ put_futex_key(&key1);
ret = fault_in_user_writeable(uaddr2);
if (!ret)
goto retry;
@@ -1269,8 +1283,8 @@ retry_private:
case -EAGAIN:
/* The owner was exiting, try again. */
double_unlock_hb(hb1, hb2);
- put_futex_key(fshared, &key2);
- put_futex_key(fshared, &key1);
+ put_futex_key(&key2);
+ put_futex_key(&key1);
cond_resched();
goto retry;
default:
@@ -1352,9 +1366,9 @@ out_unlock:
drop_futex_key_refs(&key1);
out_put_keys:
- put_futex_key(fshared, &key2);
+ put_futex_key(&key2);
out_put_key1:
- put_futex_key(fshared, &key1);
+ put_futex_key(&key1);
out:
if (pi_state != NULL)
free_pi_state(pi_state);
@@ -1494,7 +1508,7 @@ static void unqueue_me_pi(struct futex_q *q)
* private futexes.
*/
static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
- struct task_struct *newowner, int fshared)
+ struct task_struct *newowner)
{
u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
struct futex_pi_state *pi_state = q->pi_state;
@@ -1587,20 +1601,11 @@ handle_fault:
goto retry;
}
-/*
- * In case we must use restart_block to restart a futex_wait,
- * we encode in the 'flags' shared capability
- */
-#define FLAGS_SHARED 0x01
-#define FLAGS_CLOCKRT 0x02
-#define FLAGS_HAS_TIMEOUT 0x04
-
static long futex_wait_restart(struct restart_block *restart);
/**
* fixup_owner() - Post lock pi_state and corner case management
* @uaddr: user address of the futex
- * @fshared: whether the futex is shared (1) or not (0)
* @q: futex_q (contains pi_state and access to the rt_mutex)
* @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
*
@@ -1613,8 +1618,7 @@ static long futex_wait_restart(struct restart_block *restart);
* 0 - success, lock not taken
* <0 - on error (-EFAULT)
*/
-static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
- int locked)
+static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
{
struct task_struct *owner;
int ret = 0;
@@ -1625,7 +1629,7 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
* did a lock-steal - fix up the PI-state in that case:
*/
if (q->pi_state->owner != current)
- ret = fixup_pi_state_owner(uaddr, q, current, fshared);
+ ret = fixup_pi_state_owner(uaddr, q, current);
goto out;
}
@@ -1652,7 +1656,7 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
* lock. Fix the state up.
*/
owner = rt_mutex_owner(&q->pi_state->pi_mutex);
- ret = fixup_pi_state_owner(uaddr, q, owner, fshared);
+ ret = fixup_pi_state_owner(uaddr, q, owner);
goto out;
}
@@ -1715,7 +1719,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
* futex_wait_setup() - Prepare to wait on a futex
* @uaddr: the futex userspace address
* @val: the expected value
- * @fshared: whether the futex is shared (1) or not (0)
+ * @flags: futex flags (FLAGS_SHARED, etc.)
* @q: the associated futex_q
* @hb: storage for hash_bucket pointer to be returned to caller
*
@@ -1728,7 +1732,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
* 0 - uaddr contains val and hb has been locked
* <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked
*/
-static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
+static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
struct futex_q *q, struct futex_hash_bucket **hb)
{
u32 uval;
@@ -1752,8 +1756,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
* rare, but normal.
*/
retry:
- q->key = FUTEX_KEY_INIT;
- ret = get_futex_key(uaddr, fshared, &q->key);
+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key);
if (unlikely(ret != 0))
return ret;
@@ -1769,10 +1772,10 @@ retry_private:
if (ret)
goto out;
- if (!fshared)
+ if (!(flags & FLAGS_SHARED))
goto retry_private;
- put_futex_key(fshared, &q->key);
+ put_futex_key(&q->key);
goto retry;
}
@@ -1783,32 +1786,29 @@ retry_private:
out:
if (ret)
- put_futex_key(fshared, &q->key);
+ put_futex_key(&q->key);
return ret;
}
-static int futex_wait(u32 __user *uaddr, int fshared,
- u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
+static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
+ ktime_t *abs_time, u32 bitset)
{
struct hrtimer_sleeper timeout, *to = NULL;
struct restart_block *restart;
struct futex_hash_bucket *hb;
- struct futex_q q;
+ struct futex_q q = futex_q_init;
int ret;
if (!bitset)
return -EINVAL;
-
- q.pi_state = NULL;
q.bitset = bitset;
- q.rt_waiter = NULL;
- q.requeue_pi_key = NULL;
if (abs_time) {
to = &timeout;
- hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
- CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
+ CLOCK_REALTIME : CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS);
hrtimer_init_sleeper(to, current);
hrtimer_set_expires_range_ns(&to->timer, *abs_time,
current->timer_slack_ns);
@@ -1819,7 +1819,7 @@ retry:
* Prepare to wait on uaddr. On success, holds hb lock and increments
* q.key refs.
*/
- ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
+ ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
if (ret)
goto out;
@@ -1852,12 +1852,7 @@ retry:
restart->futex.val = val;
restart->futex.time = abs_time->tv64;
restart->futex.bitset = bitset;
- restart->futex.flags = FLAGS_HAS_TIMEOUT;
-
- if (fshared)
- restart->futex.flags |= FLAGS_SHARED;
- if (clockrt)
- restart->futex.flags |= FLAGS_CLOCKRT;
+ restart->futex.flags = flags;
ret = -ERESTART_RESTARTBLOCK;
@@ -1873,7 +1868,6 @@ out:
static long futex_wait_restart(struct restart_block *restart)
{
u32 __user *uaddr = restart->futex.uaddr;
- int fshared = 0;
ktime_t t, *tp = NULL;
if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
@@ -1881,11 +1875,9 @@ static long futex_wait_restart(struct restart_block *restart)
tp = &t;
}
restart->fn = do_no_restart_syscall;
- if (restart->futex.flags & FLAGS_SHARED)
- fshared = 1;
- return (long)futex_wait(uaddr, fshared, restart->futex.val, tp,
- restart->futex.bitset,
- restart->futex.flags & FLAGS_CLOCKRT);
+
+ return (long)futex_wait(uaddr, restart->futex.flags,
+ restart->futex.val, tp, restart->futex.bitset);
}
@@ -1895,12 +1887,12 @@ static long futex_wait_restart(struct restart_block *restart)
* if there are waiters then it will block, it does PI, etc. (Due to
* races the kernel might see a 0 value of the futex too.)
*/
-static int futex_lock_pi(u32 __user *uaddr, int fshared,
- int detect, ktime_t *time, int trylock)
+static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect,
+ ktime_t *time, int trylock)
{
struct hrtimer_sleeper timeout, *to = NULL;
struct futex_hash_bucket *hb;
- struct futex_q q;
+ struct futex_q q = futex_q_init;
int res, ret;
if (refill_pi_state_cache())
@@ -1914,12 +1906,8 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
hrtimer_set_expires(&to->timer, *time);
}
- q.pi_state = NULL;
- q.rt_waiter = NULL;
- q.requeue_pi_key = NULL;
retry:
- q.key = FUTEX_KEY_INIT;
- ret = get_futex_key(uaddr, fshared, &q.key);
+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key);
if (unlikely(ret != 0))
goto out;
@@ -1941,7 +1929,7 @@ retry_private:
* exit to complete.
*/
queue_unlock(&q, hb);
- put_futex_key(fshared, &q.key);
+ put_futex_key(&q.key);
cond_resched();
goto retry;
default:
@@ -1971,7 +1959,7 @@ retry_private:
* Fixup the pi_state owner and possibly acquire the lock if we
* haven't already.
*/
- res = fixup_owner(uaddr, fshared, &q, !ret);
+ res = fixup_owner(uaddr, &q, !ret);
/*
* If fixup_owner() returned an error, proprogate that. If it acquired
* the lock, clear our -ETIMEDOUT or -EINTR.
@@ -1995,7 +1983,7 @@ out_unlock_put_key:
queue_unlock(&q, hb);
out_put_key:
- put_futex_key(fshared, &q.key);
+ put_futex_key(&q.key);
out:
if (to)
destroy_hrtimer_on_stack(&to->timer);
@@ -2008,10 +1996,10 @@ uaddr_faulted:
if (ret)
goto out_put_key;
- if (!fshared)
+ if (!(flags & FLAGS_SHARED))
goto retry_private;
- put_futex_key(fshared, &q.key);
+ put_futex_key(&q.key);
goto retry;
}
@@ -2020,7 +2008,7 @@ uaddr_faulted:
* This is the in-kernel slowpath: we look up the PI state (if any),
* and do the rt-mutex unlock.
*/
-static int futex_unlock_pi(u32 __user *uaddr, int fshared)
+static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
{
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
@@ -2038,7 +2026,7 @@ retry:
if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
return -EPERM;
- ret = get_futex_key(uaddr, fshared, &key);
+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
if (unlikely(ret != 0))
goto out;
@@ -2093,14 +2081,14 @@ retry:
out_unlock:
spin_unlock(&hb->lock);
- put_futex_key(fshared, &key);
+ put_futex_key(&key);
out:
return ret;
pi_faulted:
spin_unlock(&hb->lock);
- put_futex_key(fshared, &key);
+ put_futex_key(&key);
ret = fault_in_user_writeable(uaddr);
if (!ret)
@@ -2160,7 +2148,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
/**
* futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
* @uaddr: the futex we initially wait on (non-pi)
- * @fshared: whether the futexes are shared (1) or not (0). They must be
+ * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
* the same type, no requeueing from private to shared, etc.
* @val: the expected value of uaddr
* @abs_time: absolute timeout
@@ -2198,16 +2186,16 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
* 0 - On success
* <0 - On error
*/
-static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
+static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
u32 val, ktime_t *abs_time, u32 bitset,
- int clockrt, u32 __user *uaddr2)
+ u32 __user *uaddr2)
{
struct hrtimer_sleeper timeout, *to = NULL;
struct rt_mutex_waiter rt_waiter;
struct rt_mutex *pi_mutex = NULL;
struct futex_hash_bucket *hb;
- union futex_key key2;
- struct futex_q q;
+ union futex_key key2 = FUTEX_KEY_INIT;
+ struct futex_q q = futex_q_init;
int res, ret;
if (!bitset)
@@ -2215,8 +2203,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
if (abs_time) {
to = &timeout;
- hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
- CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
+ CLOCK_REALTIME : CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS);
hrtimer_init_sleeper(to, current);
hrtimer_set_expires_range_ns(&to->timer, *abs_time,
current->timer_slack_ns);
@@ -2229,12 +2218,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
debug_rt_mutex_init_waiter(&rt_waiter);
rt_waiter.task = NULL;
- key2 = FUTEX_KEY_INIT;
- ret = get_futex_key(uaddr2, fshared, &key2);
+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
if (unlikely(ret != 0))
goto out;
- q.pi_state = NULL;
q.bitset = bitset;
q.rt_waiter = &rt_waiter;
q.requeue_pi_key = &key2;
@@ -2243,7 +2230,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
* Prepare to wait on uaddr. On success, increments q.key (key1) ref
* count.
*/
- ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
+ ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
if (ret)
goto out_key2;
@@ -2273,8 +2260,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
*/
if (q.pi_state && (q.pi_state->owner != current)) {
spin_lock(q.lock_ptr);
- ret = fixup_pi_state_owner(uaddr2, &q, current,
- fshared);
+ ret = fixup_pi_state_owner(uaddr2, &q, current);
spin_unlock(q.lock_ptr);
}
} else {
@@ -2293,7 +2279,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
* Fixup the pi_state owner and possibly acquire the lock if we
* haven't already.
*/
- res = fixup_owner(uaddr2, fshared, &q, !ret);
+ res = fixup_owner(uaddr2, &q, !ret);
/*
* If fixup_owner() returned an error, proprogate that. If it
* acquired the lock, clear -ETIMEDOUT or -EINTR.
@@ -2324,9 +2310,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
}
out_put_keys:
- put_futex_key(fshared, &q.key);
+ put_futex_key(&q.key);
out_key2:
- put_futex_key(fshared, &key2);
+ put_futex_key(&key2);
out:
if (to) {
@@ -2551,58 +2537,57 @@ void exit_robust_list(struct task_struct *curr)
long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
u32 __user *uaddr2, u32 val2, u32 val3)
{
- int clockrt, ret = -ENOSYS;
- int cmd = op & FUTEX_CMD_MASK;
- int fshared = 0;
+ int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK;
+ unsigned int flags = 0;
if (!(op & FUTEX_PRIVATE_FLAG))
- fshared = 1;
+ flags |= FLAGS_SHARED;
- clockrt = op & FUTEX_CLOCK_REALTIME;
- if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
- return -ENOSYS;
+ if (op & FUTEX_CLOCK_REALTIME) {
+ flags |= FLAGS_CLOCKRT;
+ if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
+ return -ENOSYS;
+ }
switch (cmd) {
case FUTEX_WAIT:
val3 = FUTEX_BITSET_MATCH_ANY;
case FUTEX_WAIT_BITSET:
- ret = futex_wait(uaddr, fshared, val, timeout, val3, clockrt);
+ ret = futex_wait(uaddr, flags, val, timeout, val3);
break;
case FUTEX_WAKE:
val3 = FUTEX_BITSET_MATCH_ANY;
case FUTEX_WAKE_BITSET:
- ret = futex_wake(uaddr, fshared, val, val3);
+ ret = futex_wake(uaddr, flags, val, val3);
break;
case FUTEX_REQUEUE:
- ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0);
+ ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
break;
case FUTEX_CMP_REQUEUE:
- ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
- 0);
+ ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
break;
case FUTEX_WAKE_OP:
- ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);
+ ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
break;
case FUTEX_LOCK_PI:
if (futex_cmpxchg_enabled)
- ret = futex_lock_pi(uaddr, fshared, val, timeout, 0);
+ ret = futex_lock_pi(uaddr, flags, val, timeout, 0);
break;
case FUTEX_UNLOCK_PI:
if (futex_cmpxchg_enabled)
- ret = futex_unlock_pi(uaddr, fshared);
+ ret = futex_unlock_pi(uaddr, flags);
break;
case FUTEX_TRYLOCK_PI:
if (futex_cmpxchg_enabled)
- ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1);
+ ret = futex_lock_pi(uaddr, flags, 0, timeout, 1);
break;
case FUTEX_WAIT_REQUEUE_PI:
val3 = FUTEX_BITSET_MATCH_ANY;
- ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3,
- clockrt, uaddr2);
+ ret = futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
+ uaddr2);
break;
case FUTEX_CMP_REQUEUE_PI:
- ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
- 1);
+ ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
break;
default:
ret = -ENOSYS;
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index d806735342ac..034493724749 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -36,31 +36,16 @@
#include <linux/time.h>
#include <linux/cpu.h>
-/* Global control variables for rcupdate callback mechanism. */
-struct rcu_ctrlblk {
- struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
- struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
- struct rcu_head **curtail; /* ->next pointer of last CB. */
-};
-
-/* Definition for rcupdate control block. */
-static struct rcu_ctrlblk rcu_sched_ctrlblk = {
- .donetail = &rcu_sched_ctrlblk.rcucblist,
- .curtail = &rcu_sched_ctrlblk.rcucblist,
-};
-
-static struct rcu_ctrlblk rcu_bh_ctrlblk = {
- .donetail = &rcu_bh_ctrlblk.rcucblist,
- .curtail = &rcu_bh_ctrlblk.rcucblist,
-};
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-int rcu_scheduler_active __read_mostly;
-EXPORT_SYMBOL_GPL(rcu_scheduler_active);
-#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */
+static struct task_struct *rcu_kthread_task;
+static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
+static unsigned long have_rcu_kthread_work;
+static void invoke_rcu_kthread(void);
/* Forward declarations for rcutiny_plugin.h. */
-static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
+struct rcu_ctrlblk;
+static void rcu_process_callbacks(struct rcu_ctrlblk *rcp);
+static int rcu_kthread(void *arg);
static void __call_rcu(struct rcu_head *head,
void (*func)(struct rcu_head *rcu),
struct rcu_ctrlblk *rcp);
@@ -123,7 +108,7 @@ void rcu_sched_qs(int cpu)
{
if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
rcu_qsctr_help(&rcu_bh_ctrlblk))
- raise_softirq(RCU_SOFTIRQ);
+ invoke_rcu_kthread();
}
/*
@@ -132,7 +117,7 @@ void rcu_sched_qs(int cpu)
void rcu_bh_qs(int cpu)
{
if (rcu_qsctr_help(&rcu_bh_ctrlblk))
- raise_softirq(RCU_SOFTIRQ);
+ invoke_rcu_kthread();
}
/*
@@ -152,13 +137,14 @@ void rcu_check_callbacks(int cpu, int user)
}
/*
- * Helper function for rcu_process_callbacks() that operates on the
- * specified rcu_ctrlkblk structure.
+ * Invoke the RCU callbacks on the specified rcu_ctrlkblk structure
+ * whose grace period has elapsed.
*/
-static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
+static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
{
struct rcu_head *next, *list;
unsigned long flags;
+ RCU_TRACE(int cb_count = 0);
/* If no RCU callbacks ready to invoke, just return. */
if (&rcp->rcucblist == rcp->donetail)
@@ -180,19 +166,58 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
next = list->next;
prefetch(next);
debug_rcu_head_unqueue(list);
+ local_bh_disable();
list->func(list);
+ local_bh_enable();
list = next;
+ RCU_TRACE(cb_count++);
}
+ RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
}
/*
- * Invoke any callbacks whose grace period has completed.
+ * This kthread invokes RCU callbacks whose grace periods have
+ * elapsed. It is awakened as needed, and takes the place of the
+ * RCU_SOFTIRQ that was used previously for this purpose.
+ * This is a kthread, but it is never stopped, at least not until
+ * the system goes down.
*/
-static void rcu_process_callbacks(struct softirq_action *unused)
+static int rcu_kthread(void *arg)
{
- __rcu_process_callbacks(&rcu_sched_ctrlblk);
- __rcu_process_callbacks(&rcu_bh_ctrlblk);
- rcu_preempt_process_callbacks();
+ unsigned long work;
+ unsigned long morework;
+ unsigned long flags;
+
+ for (;;) {
+ wait_event(rcu_kthread_wq, have_rcu_kthread_work != 0);
+ morework = rcu_boost();
+ local_irq_save(flags);
+ work = have_rcu_kthread_work;
+ have_rcu_kthread_work = morework;
+ local_irq_restore(flags);
+ if (work) {
+ rcu_process_callbacks(&rcu_sched_ctrlblk);
+ rcu_process_callbacks(&rcu_bh_ctrlblk);
+ rcu_preempt_process_callbacks();
+ }
+ schedule_timeout_interruptible(1); /* Leave CPU for others. */
+ }
+
+ return 0; /* Not reached, but needed to shut gcc up. */
+}
+
+/*
+ * Wake up rcu_kthread() to process callbacks now eligible for invocation
+ * or to boost readers.
+ */
+static void invoke_rcu_kthread(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ have_rcu_kthread_work = 1;
+ wake_up(&rcu_kthread_wq);
+ local_irq_restore(flags);
}
/*
@@ -230,6 +255,7 @@ static void __call_rcu(struct rcu_head *head,
local_irq_save(flags);
*rcp->curtail = head;
rcp->curtail = &head->next;
+ RCU_TRACE(rcp->qlen++);
local_irq_restore(flags);
}
@@ -282,7 +308,16 @@ void rcu_barrier_sched(void)
}
EXPORT_SYMBOL_GPL(rcu_barrier_sched);
-void __init rcu_init(void)
+/*
+ * Spawn the kthread that invokes RCU callbacks.
+ */
+static int __init rcu_spawn_kthreads(void)
{
- open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
+ struct sched_param sp;
+
+ rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
+ sp.sched_priority = RCU_BOOST_PRIO;
+ sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
+ return 0;
}
+early_initcall(rcu_spawn_kthreads);
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 6ceca4f745ff..015abaea962a 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -22,6 +22,40 @@
* Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
*/
+#include <linux/kthread.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#ifdef CONFIG_RCU_TRACE
+#define RCU_TRACE(stmt) stmt
+#else /* #ifdef CONFIG_RCU_TRACE */
+#define RCU_TRACE(stmt)
+#endif /* #else #ifdef CONFIG_RCU_TRACE */
+
+/* Global control variables for rcupdate callback mechanism. */
+struct rcu_ctrlblk {
+ struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
+ struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
+ struct rcu_head **curtail; /* ->next pointer of last CB. */
+ RCU_TRACE(long qlen); /* Number of pending CBs. */
+};
+
+/* Definition for rcupdate control block. */
+static struct rcu_ctrlblk rcu_sched_ctrlblk = {
+ .donetail = &rcu_sched_ctrlblk.rcucblist,
+ .curtail = &rcu_sched_ctrlblk.rcucblist,
+};
+
+static struct rcu_ctrlblk rcu_bh_ctrlblk = {
+ .donetail = &rcu_bh_ctrlblk.rcucblist,
+ .curtail = &rcu_bh_ctrlblk.rcucblist,
+};
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+int rcu_scheduler_active __read_mostly;
+EXPORT_SYMBOL_GPL(rcu_scheduler_active);
+#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
#ifdef CONFIG_TINY_PREEMPT_RCU
#include <linux/delay.h>
@@ -46,17 +80,45 @@ struct rcu_preempt_ctrlblk {
struct list_head *gp_tasks;
/* Pointer to the first task blocking the */
/* current grace period, or NULL if there */
- /* is not such task. */
+ /* is no such task. */
struct list_head *exp_tasks;
/* Pointer to first task blocking the */
/* current expedited grace period, or NULL */
/* if there is no such task. If there */
/* is no current expedited grace period, */
/* then there cannot be any such task. */
+#ifdef CONFIG_RCU_BOOST
+ struct list_head *boost_tasks;
+ /* Pointer to first task that needs to be */
+ /* priority-boosted, or NULL if no priority */
+ /* boosting is needed. If there is no */
+ /* current or expedited grace period, there */
+ /* can be no such task. */
+#endif /* #ifdef CONFIG_RCU_BOOST */
u8 gpnum; /* Current grace period. */
u8 gpcpu; /* Last grace period blocked by the CPU. */
u8 completed; /* Last grace period completed. */
/* If all three are equal, RCU is idle. */
+#ifdef CONFIG_RCU_BOOST
+ s8 boosted_this_gp; /* Has boosting already happened? */
+ unsigned long boost_time; /* When to start boosting (jiffies) */
+#endif /* #ifdef CONFIG_RCU_BOOST */
+#ifdef CONFIG_RCU_TRACE
+ unsigned long n_grace_periods;
+#ifdef CONFIG_RCU_BOOST
+ unsigned long n_tasks_boosted;
+ unsigned long n_exp_boosts;
+ unsigned long n_normal_boosts;
+ unsigned long n_normal_balk_blkd_tasks;
+ unsigned long n_normal_balk_gp_tasks;
+ unsigned long n_normal_balk_boost_tasks;
+ unsigned long n_normal_balk_boosted;
+ unsigned long n_normal_balk_notyet;
+ unsigned long n_normal_balk_nos;
+ unsigned long n_exp_balk_blkd_tasks;
+ unsigned long n_exp_balk_nos;
+#endif /* #ifdef CONFIG_RCU_BOOST */
+#endif /* #ifdef CONFIG_RCU_TRACE */
};
static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
@@ -122,6 +184,210 @@ static int rcu_preempt_gp_in_progress(void)
}
/*
+ * Advance a ->blkd_tasks-list pointer to the next entry, instead
+ * returning NULL if at the end of the list.
+ */
+static struct list_head *rcu_next_node_entry(struct task_struct *t)
+{
+ struct list_head *np;
+
+ np = t->rcu_node_entry.next;
+ if (np == &rcu_preempt_ctrlblk.blkd_tasks)
+ np = NULL;
+ return np;
+}
+
+#ifdef CONFIG_RCU_TRACE
+
+#ifdef CONFIG_RCU_BOOST
+static void rcu_initiate_boost_trace(void);
+static void rcu_initiate_exp_boost_trace(void);
+#endif /* #ifdef CONFIG_RCU_BOOST */
+
+/*
+ * Dump additional statistice for TINY_PREEMPT_RCU.
+ */
+static void show_tiny_preempt_stats(struct seq_file *m)
+{
+ seq_printf(m, "rcu_preempt: qlen=%ld gp=%lu g%u/p%u/c%u tasks=%c%c%c\n",
+ rcu_preempt_ctrlblk.rcb.qlen,
+ rcu_preempt_ctrlblk.n_grace_periods,
+ rcu_preempt_ctrlblk.gpnum,
+ rcu_preempt_ctrlblk.gpcpu,
+ rcu_preempt_ctrlblk.completed,
+ "T."[list_empty(&rcu_preempt_ctrlblk.blkd_tasks)],
+ "N."[!rcu_preempt_ctrlblk.gp_tasks],
+ "E."[!rcu_preempt_ctrlblk.exp_tasks]);
+#ifdef CONFIG_RCU_BOOST
+ seq_printf(m, " ttb=%c btg=",
+ "B."[!rcu_preempt_ctrlblk.boost_tasks]);
+ switch (rcu_preempt_ctrlblk.boosted_this_gp) {
+ case -1:
+ seq_puts(m, "exp");
+ break;
+ case 0:
+ seq_puts(m, "no");
+ break;
+ case 1:
+ seq_puts(m, "begun");
+ break;
+ case 2:
+ seq_puts(m, "done");
+ break;
+ default:
+ seq_printf(m, "?%d?", rcu_preempt_ctrlblk.boosted_this_gp);
+ }
+ seq_printf(m, " ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
+ rcu_preempt_ctrlblk.n_tasks_boosted,
+ rcu_preempt_ctrlblk.n_exp_boosts,
+ rcu_preempt_ctrlblk.n_normal_boosts,
+ (int)(jiffies & 0xffff),
+ (int)(rcu_preempt_ctrlblk.boost_time & 0xffff));
+ seq_printf(m, " %s: nt=%lu gt=%lu bt=%lu b=%lu ny=%lu nos=%lu\n",
+ "normal balk",
+ rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks,
+ rcu_preempt_ctrlblk.n_normal_balk_gp_tasks,
+ rcu_preempt_ctrlblk.n_normal_balk_boost_tasks,
+ rcu_preempt_ctrlblk.n_normal_balk_boosted,
+ rcu_preempt_ctrlblk.n_normal_balk_notyet,
+ rcu_preempt_ctrlblk.n_normal_balk_nos);
+ seq_printf(m, " exp balk: bt=%lu nos=%lu\n",
+ rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks,
+ rcu_preempt_ctrlblk.n_exp_balk_nos);
+#endif /* #ifdef CONFIG_RCU_BOOST */
+}
+
+#endif /* #ifdef CONFIG_RCU_TRACE */
+
+#ifdef CONFIG_RCU_BOOST
+
+#include "rtmutex_common.h"
+
+/*
+ * Carry out RCU priority boosting on the task indicated by ->boost_tasks,
+ * and advance ->boost_tasks to the next task in the ->blkd_tasks list.
+ */
+static int rcu_boost(void)
+{
+ unsigned long flags;
+ struct rt_mutex mtx;
+ struct list_head *np;
+ struct task_struct *t;
+
+ if (rcu_preempt_ctrlblk.boost_tasks == NULL)
+ return 0; /* Nothing to boost. */
+ raw_local_irq_save(flags);
+ rcu_preempt_ctrlblk.boosted_this_gp++;
+ t = container_of(rcu_preempt_ctrlblk.boost_tasks, struct task_struct,
+ rcu_node_entry);
+ np = rcu_next_node_entry(t);
+ rt_mutex_init_proxy_locked(&mtx, t);
+ t->rcu_boost_mutex = &mtx;
+ t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
+ raw_local_irq_restore(flags);
+ rt_mutex_lock(&mtx);
+ RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++);
+ rcu_preempt_ctrlblk.boosted_this_gp++;
+ rt_mutex_unlock(&mtx);
+ return rcu_preempt_ctrlblk.boost_tasks != NULL;
+}
+
+/*
+ * Check to see if it is now time to start boosting RCU readers blocking
+ * the current grace period, and, if so, tell the rcu_kthread_task to
+ * start boosting them. If there is an expedited boost in progress,
+ * we wait for it to complete.
+ *
+ * If there are no blocked readers blocking the current grace period,
+ * return 0 to let the caller know, otherwise return 1. Note that this
+ * return value is independent of whether or not boosting was done.
+ */
+static int rcu_initiate_boost(void)
+{
+ if (!rcu_preempt_blocked_readers_cgp()) {
+ RCU_TRACE(rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks++);
+ return 0;
+ }
+ if (rcu_preempt_ctrlblk.gp_tasks != NULL &&
+ rcu_preempt_ctrlblk.boost_tasks == NULL &&
+ rcu_preempt_ctrlblk.boosted_this_gp == 0 &&
+ ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) {
+ rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks;
+ invoke_rcu_kthread();
+ RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
+ } else
+ RCU_TRACE(rcu_initiate_boost_trace());
+ return 1;
+}
+
+/*
+ * Initiate boosting for an expedited grace period.
+ */
+static void rcu_initiate_expedited_boost(void)
+{
+ unsigned long flags;
+
+ raw_local_irq_save(flags);
+ if (!list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) {
+ rcu_preempt_ctrlblk.boost_tasks =
+ rcu_preempt_ctrlblk.blkd_tasks.next;
+ rcu_preempt_ctrlblk.boosted_this_gp = -1;
+ invoke_rcu_kthread();
+ RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
+ } else
+ RCU_TRACE(rcu_initiate_exp_boost_trace());
+ raw_local_irq_restore(flags);
+}
+
+#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000);
+
+/*
+ * Do priority-boost accounting for the start of a new grace period.
+ */
+static void rcu_preempt_boost_start_gp(void)
+{
+ rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
+ if (rcu_preempt_ctrlblk.boosted_this_gp > 0)
+ rcu_preempt_ctrlblk.boosted_this_gp = 0;
+}
+
+#else /* #ifdef CONFIG_RCU_BOOST */
+
+/*
+ * If there is no RCU priority boosting, we don't boost.
+ */
+static int rcu_boost(void)
+{
+ return 0;
+}
+
+/*
+ * If there is no RCU priority boosting, we don't initiate boosting,
+ * but we do indicate whether there are blocked readers blocking the
+ * current grace period.
+ */
+static int rcu_initiate_boost(void)
+{
+ return rcu_preempt_blocked_readers_cgp();
+}
+
+/*
+ * If there is no RCU priority boosting, we don't initiate expedited boosting.
+ */
+static void rcu_initiate_expedited_boost(void)
+{
+}
+
+/*
+ * If there is no RCU priority boosting, nothing to do at grace-period start.
+ */
+static void rcu_preempt_boost_start_gp(void)
+{
+}
+
+#endif /* else #ifdef CONFIG_RCU_BOOST */
+
+/*
* Record a preemptible-RCU quiescent state for the specified CPU. Note
* that this just means that the task currently running on the CPU is
* in a quiescent state. There might be any number of tasks blocked
@@ -148,11 +414,14 @@ static void rcu_preempt_cpu_qs(void)
rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum;
current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
+ /* If there is no GP then there is nothing more to do. */
+ if (!rcu_preempt_gp_in_progress())
+ return;
/*
- * If there is no GP, or if blocked readers are still blocking GP,
- * then there is nothing more to do.
+ * Check up on boosting. If there are no readers blocking the
+ * current grace period, leave.
*/
- if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp())
+ if (rcu_initiate_boost())
return;
/* Advance callbacks. */
@@ -164,9 +433,9 @@ static void rcu_preempt_cpu_qs(void)
if (!rcu_preempt_blocked_readers_any())
rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail;
- /* If there are done callbacks, make RCU_SOFTIRQ process them. */
+ /* If there are done callbacks, cause them to be invoked. */
if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
- raise_softirq(RCU_SOFTIRQ);
+ invoke_rcu_kthread();
}
/*
@@ -178,12 +447,16 @@ static void rcu_preempt_start_gp(void)
/* Official start of GP. */
rcu_preempt_ctrlblk.gpnum++;
+ RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++);
/* Any blocked RCU readers block new GP. */
if (rcu_preempt_blocked_readers_any())
rcu_preempt_ctrlblk.gp_tasks =
rcu_preempt_ctrlblk.blkd_tasks.next;
+ /* Set up for RCU priority boosting. */
+ rcu_preempt_boost_start_gp();
+
/* If there is no running reader, CPU is done with GP. */
if (!rcu_preempt_running_reader())
rcu_preempt_cpu_qs();
@@ -304,14 +577,16 @@ static void rcu_read_unlock_special(struct task_struct *t)
*/
empty = !rcu_preempt_blocked_readers_cgp();
empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
- np = t->rcu_node_entry.next;
- if (np == &rcu_preempt_ctrlblk.blkd_tasks)
- np = NULL;
+ np = rcu_next_node_entry(t);
list_del(&t->rcu_node_entry);
if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
rcu_preempt_ctrlblk.gp_tasks = np;
if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
rcu_preempt_ctrlblk.exp_tasks = np;
+#ifdef CONFIG_RCU_BOOST
+ if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks)
+ rcu_preempt_ctrlblk.boost_tasks = np;
+#endif /* #ifdef CONFIG_RCU_BOOST */
INIT_LIST_HEAD(&t->rcu_node_entry);
/*
@@ -331,6 +606,14 @@ static void rcu_read_unlock_special(struct task_struct *t)
if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL)
rcu_report_exp_done();
}
+#ifdef CONFIG_RCU_BOOST
+ /* Unboost self if was boosted. */
+ if (special & RCU_READ_UNLOCK_BOOSTED) {
+ t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED;
+ rt_mutex_unlock(t->rcu_boost_mutex);
+ t->rcu_boost_mutex = NULL;
+ }
+#endif /* #ifdef CONFIG_RCU_BOOST */
local_irq_restore(flags);
}
@@ -374,7 +657,7 @@ static void rcu_preempt_check_callbacks(void)
rcu_preempt_cpu_qs();
if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
rcu_preempt_ctrlblk.rcb.donetail)
- raise_softirq(RCU_SOFTIRQ);
+ invoke_rcu_kthread();
if (rcu_preempt_gp_in_progress() &&
rcu_cpu_blocking_cur_gp() &&
rcu_preempt_running_reader())
@@ -383,7 +666,7 @@ static void rcu_preempt_check_callbacks(void)
/*
* TINY_PREEMPT_RCU has an extra callback-list tail pointer to
- * update, so this is invoked from __rcu_process_callbacks() to
+ * update, so this is invoked from rcu_process_callbacks() to
* handle that case. Of course, it is invoked for all flavors of
* RCU, but RCU callbacks can appear only on one of the lists, and
* neither ->nexttail nor ->donetail can possibly be NULL, so there
@@ -400,7 +683,7 @@ static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
*/
static void rcu_preempt_process_callbacks(void)
{
- __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
+ rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
}
/*
@@ -417,6 +700,7 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
local_irq_save(flags);
*rcu_preempt_ctrlblk.nexttail = head;
rcu_preempt_ctrlblk.nexttail = &head->next;
+ RCU_TRACE(rcu_preempt_ctrlblk.rcb.qlen++);
rcu_preempt_start_gp(); /* checks to see if GP needed. */
local_irq_restore(flags);
}
@@ -532,6 +816,7 @@ void synchronize_rcu_expedited(void)
/* Wait for tail of ->blkd_tasks list to drain. */
if (rcu_preempted_readers_exp())
+ rcu_initiate_expedited_boost();
wait_event(sync_rcu_preempt_exp_wq,
!rcu_preempted_readers_exp());
@@ -572,6 +857,27 @@ void exit_rcu(void)
#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
+#ifdef CONFIG_RCU_TRACE
+
+/*
+ * Because preemptible RCU does not exist, it is not necessary to
+ * dump out its statistics.
+ */
+static void show_tiny_preempt_stats(struct seq_file *m)
+{
+}
+
+#endif /* #ifdef CONFIG_RCU_TRACE */
+
+/*
+ * Because preemptible RCU does not exist, it is never necessary to
+ * boost preempted RCU readers.
+ */
+static int rcu_boost(void)
+{
+ return 0;
+}
+
/*
* Because preemptible RCU does not exist, it never has any callbacks
* to check.
@@ -599,17 +905,116 @@ static void rcu_preempt_process_callbacks(void)
#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
-
#include <linux/kernel_stat.h>
/*
* During boot, we forgive RCU lockdep issues. After this function is
* invoked, we start taking RCU lockdep issues seriously.
*/
-void rcu_scheduler_starting(void)
+void __init rcu_scheduler_starting(void)
{
WARN_ON(nr_context_switches() > 0);
rcu_scheduler_active = 1;
}
#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
+#ifdef CONFIG_RCU_BOOST
+#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
+#else /* #ifdef CONFIG_RCU_BOOST */
+#define RCU_BOOST_PRIO 1
+#endif /* #else #ifdef CONFIG_RCU_BOOST */
+
+#ifdef CONFIG_RCU_TRACE
+
+#ifdef CONFIG_RCU_BOOST
+
+static void rcu_initiate_boost_trace(void)
+{
+ if (rcu_preempt_ctrlblk.gp_tasks == NULL)
+ rcu_preempt_ctrlblk.n_normal_balk_gp_tasks++;
+ else if (rcu_preempt_ctrlblk.boost_tasks != NULL)
+ rcu_preempt_ctrlblk.n_normal_balk_boost_tasks++;
+ else if (rcu_preempt_ctrlblk.boosted_this_gp != 0)
+ rcu_preempt_ctrlblk.n_normal_balk_boosted++;
+ else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))
+ rcu_preempt_ctrlblk.n_normal_balk_notyet++;
+ else
+ rcu_preempt_ctrlblk.n_normal_balk_nos++;
+}
+
+static void rcu_initiate_exp_boost_trace(void)
+{
+ if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
+ rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks++;
+ else
+ rcu_preempt_ctrlblk.n_exp_balk_nos++;
+}
+
+#endif /* #ifdef CONFIG_RCU_BOOST */
+
+static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
+{
+ unsigned long flags;
+
+ raw_local_irq_save(flags);
+ rcp->qlen -= n;
+ raw_local_irq_restore(flags);
+}
+
+/*
+ * Dump statistics for TINY_RCU, such as they are.
+ */
+static int show_tiny_stats(struct seq_file *m, void *unused)
+{
+ show_tiny_preempt_stats(m);
+ seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen);
+ seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen);
+ return 0;
+}
+
+static int show_tiny_stats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, show_tiny_stats, NULL);
+}
+
+static const struct file_operations show_tiny_stats_fops = {
+ .owner = THIS_MODULE,
+ .open = show_tiny_stats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static struct dentry *rcudir;
+
+static int __init rcutiny_trace_init(void)
+{
+ struct dentry *retval;
+
+ rcudir = debugfs_create_dir("rcu", NULL);
+ if (!rcudir)
+ goto free_out;
+ retval = debugfs_create_file("rcudata", 0444, rcudir,
+ NULL, &show_tiny_stats_fops);
+ if (!retval)
+ goto free_out;
+ return 0;
+free_out:
+ debugfs_remove_recursive(rcudir);
+ return 1;
+}
+
+static void __exit rcutiny_trace_cleanup(void)
+{
+ debugfs_remove_recursive(rcudir);
+}
+
+module_init(rcutiny_trace_init);
+module_exit(rcutiny_trace_cleanup);
+
+MODULE_AUTHOR("Paul E. McKenney");
+MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");
+MODULE_LICENSE("GPL");
+
+#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 9d8e8fb2515f..89613f97ff26 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -47,6 +47,7 @@
#include <linux/srcu.h>
#include <linux/slab.h>
#include <asm/byteorder.h>
+#include <linux/sched.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
@@ -64,6 +65,9 @@ static int irqreader = 1; /* RCU readers from irq (timers). */
static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */
static int fqs_holdoff = 0; /* Hold time within burst (us). */
static int fqs_stutter = 3; /* Wait time between bursts (s). */
+static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
+static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
+static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
static char *torture_type = "rcu"; /* What RCU implementation to torture. */
module_param(nreaders, int, 0444);
@@ -88,6 +92,12 @@ module_param(fqs_holdoff, int, 0444);
MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
module_param(fqs_stutter, int, 0444);
MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
+module_param(test_boost, int, 0444);
+MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
+module_param(test_boost_interval, int, 0444);
+MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds.");
+module_param(test_boost_duration, int, 0444);
+MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds.");
module_param(torture_type, charp, 0444);
MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
@@ -109,6 +119,7 @@ static struct task_struct *stats_task;
static struct task_struct *shuffler_task;
static struct task_struct *stutter_task;
static struct task_struct *fqs_task;
+static struct task_struct *boost_tasks[NR_CPUS];
#define RCU_TORTURE_PIPE_LEN 10
@@ -134,6 +145,12 @@ static atomic_t n_rcu_torture_alloc_fail;
static atomic_t n_rcu_torture_free;
static atomic_t n_rcu_torture_mberror;
static atomic_t n_rcu_torture_error;
+static long n_rcu_torture_boost_ktrerror;
+static long n_rcu_torture_boost_rterror;
+static long n_rcu_torture_boost_allocerror;
+static long n_rcu_torture_boost_afferror;
+static long n_rcu_torture_boost_failure;
+static long n_rcu_torture_boosts;
static long n_rcu_torture_timers;
static struct list_head rcu_torture_removed;
static cpumask_var_t shuffle_tmp_mask;
@@ -147,6 +164,16 @@ static int stutter_pause_test;
#endif
int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
+#ifdef CONFIG_RCU_BOOST
+#define rcu_can_boost() 1
+#else /* #ifdef CONFIG_RCU_BOOST */
+#define rcu_can_boost() 0
+#endif /* #else #ifdef CONFIG_RCU_BOOST */
+
+static unsigned long boost_starttime; /* jiffies of next boost test start. */
+DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
+ /* and boost task create/destroy. */
+
/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */
#define FULLSTOP_DONTSTOP 0 /* Normal operation. */
@@ -277,6 +304,7 @@ struct rcu_torture_ops {
void (*fqs)(void);
int (*stats)(char *page);
int irq_capable;
+ int can_boost;
char *name;
};
@@ -366,6 +394,7 @@ static struct rcu_torture_ops rcu_ops = {
.fqs = rcu_force_quiescent_state,
.stats = NULL,
.irq_capable = 1,
+ .can_boost = rcu_can_boost(),
.name = "rcu"
};
@@ -408,6 +437,7 @@ static struct rcu_torture_ops rcu_sync_ops = {
.fqs = rcu_force_quiescent_state,
.stats = NULL,
.irq_capable = 1,
+ .can_boost = rcu_can_boost(),
.name = "rcu_sync"
};
@@ -424,6 +454,7 @@ static struct rcu_torture_ops rcu_expedited_ops = {
.fqs = rcu_force_quiescent_state,
.stats = NULL,
.irq_capable = 1,
+ .can_boost = rcu_can_boost(),
.name = "rcu_expedited"
};
@@ -684,6 +715,110 @@ static struct rcu_torture_ops sched_expedited_ops = {
};
/*
+ * RCU torture priority-boost testing. Runs one real-time thread per
+ * CPU for moderate bursts, repeatedly registering RCU callbacks and
+ * spinning waiting for them to be invoked. If a given callback takes
+ * too long to be invoked, we assume that priority inversion has occurred.
+ */
+
+struct rcu_boost_inflight {
+ struct rcu_head rcu;
+ int inflight;
+};
+
+static void rcu_torture_boost_cb(struct rcu_head *head)
+{
+ struct rcu_boost_inflight *rbip =
+ container_of(head, struct rcu_boost_inflight, rcu);
+
+ smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */
+ rbip->inflight = 0;
+}
+
+static int rcu_torture_boost(void *arg)
+{
+ unsigned long call_rcu_time;
+ unsigned long endtime;
+ unsigned long oldstarttime;
+ struct rcu_boost_inflight rbi = { .inflight = 0 };
+ struct sched_param sp;
+
+ VERBOSE_PRINTK_STRING("rcu_torture_boost started");
+
+ /* Set real-time priority. */
+ sp.sched_priority = 1;
+ if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) {
+ VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!");
+ n_rcu_torture_boost_rterror++;
+ }
+
+ /* Each pass through the following loop does one boost-test cycle. */
+ do {
+ /* Wait for the next test interval. */
+ oldstarttime = boost_starttime;
+ while (jiffies - oldstarttime > ULONG_MAX / 2) {
+ schedule_timeout_uninterruptible(1);
+ rcu_stutter_wait("rcu_torture_boost");
+ if (kthread_should_stop() ||
+ fullstop != FULLSTOP_DONTSTOP)
+ goto checkwait;
+ }
+
+ /* Do one boost-test interval. */
+ endtime = oldstarttime + test_boost_duration * HZ;
+ call_rcu_time = jiffies;
+ while (jiffies - endtime > ULONG_MAX / 2) {
+ /* If we don't have a callback in flight, post one. */
+ if (!rbi.inflight) {
+ smp_mb(); /* RCU core before ->inflight = 1. */
+ rbi.inflight = 1;
+ call_rcu(&rbi.rcu, rcu_torture_boost_cb);
+ if (jiffies - call_rcu_time >
+ test_boost_duration * HZ - HZ / 2) {
+ VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed");
+ n_rcu_torture_boost_failure++;
+ }
+ call_rcu_time = jiffies;
+ }
+ cond_resched();
+ rcu_stutter_wait("rcu_torture_boost");
+ if (kthread_should_stop() ||
+ fullstop != FULLSTOP_DONTSTOP)
+ goto checkwait;
+ }
+
+ /*
+ * Set the start time of the next test interval.
+ * Yes, this is vulnerable to long delays, but such
+ * delays simply cause a false negative for the next
+ * interval. Besides, we are running at RT priority,
+ * so delays should be relatively rare.
+ */
+ while (oldstarttime == boost_starttime) {
+ if (mutex_trylock(&boost_mutex)) {
+ boost_starttime = jiffies +
+ test_boost_interval * HZ;
+ n_rcu_torture_boosts++;
+ mutex_unlock(&boost_mutex);
+ break;
+ }
+ schedule_timeout_uninterruptible(1);
+ }
+
+ /* Go do the stutter. */
+checkwait: rcu_stutter_wait("rcu_torture_boost");
+ } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
+
+ /* Clean up and exit. */
+ VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping");
+ rcutorture_shutdown_absorb("rcu_torture_boost");
+ while (!kthread_should_stop() || rbi.inflight)
+ schedule_timeout_uninterruptible(1);
+ smp_mb(); /* order accesses to ->inflight before stack-frame death. */
+ return 0;
+}
+
+/*
* RCU torture force-quiescent-state kthread. Repeatedly induces
* bursts of calls to force_quiescent_state(), increasing the probability
* of occurrence of some important types of race conditions.
@@ -933,7 +1068,8 @@ rcu_torture_printk(char *page)
cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
cnt += sprintf(&page[cnt],
"rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d "
- "rtmbe: %d nt: %ld",
+ "rtmbe: %d rtbke: %ld rtbre: %ld rtbae: %ld rtbafe: %ld "
+ "rtbf: %ld rtb: %ld nt: %ld",
rcu_torture_current,
rcu_torture_current_version,
list_empty(&rcu_torture_freelist),
@@ -941,8 +1077,19 @@ rcu_torture_printk(char *page)
atomic_read(&n_rcu_torture_alloc_fail),
atomic_read(&n_rcu_torture_free),
atomic_read(&n_rcu_torture_mberror),
+ n_rcu_torture_boost_ktrerror,
+ n_rcu_torture_boost_rterror,
+ n_rcu_torture_boost_allocerror,
+ n_rcu_torture_boost_afferror,
+ n_rcu_torture_boost_failure,
+ n_rcu_torture_boosts,
n_rcu_torture_timers);
- if (atomic_read(&n_rcu_torture_mberror) != 0)
+ if (atomic_read(&n_rcu_torture_mberror) != 0 ||
+ n_rcu_torture_boost_ktrerror != 0 ||
+ n_rcu_torture_boost_rterror != 0 ||
+ n_rcu_torture_boost_allocerror != 0 ||
+ n_rcu_torture_boost_afferror != 0 ||
+ n_rcu_torture_boost_failure != 0)
cnt += sprintf(&page[cnt], " !!!");
cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
if (i > 1) {
@@ -1094,22 +1241,91 @@ rcu_torture_stutter(void *arg)
}
static inline void
-rcu_torture_print_module_parms(char *tag)
+rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
{
printk(KERN_ALERT "%s" TORTURE_FLAG
"--- %s: nreaders=%d nfakewriters=%d "
"stat_interval=%d verbose=%d test_no_idle_hz=%d "
"shuffle_interval=%d stutter=%d irqreader=%d "
- "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n",
+ "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
+ "test_boost=%d/%d test_boost_interval=%d "
+ "test_boost_duration=%d\n",
torture_type, tag, nrealreaders, nfakewriters,
stat_interval, verbose, test_no_idle_hz, shuffle_interval,
- stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter);
+ stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
+ test_boost, cur_ops->can_boost,
+ test_boost_interval, test_boost_duration);
}
-static struct notifier_block rcutorture_nb = {
+static struct notifier_block rcutorture_shutdown_nb = {
.notifier_call = rcutorture_shutdown_notify,
};
+static void rcutorture_booster_cleanup(int cpu)
+{
+ struct task_struct *t;
+
+ if (boost_tasks[cpu] == NULL)
+ return;
+ mutex_lock(&boost_mutex);
+ VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task");
+ t = boost_tasks[cpu];
+ boost_tasks[cpu] = NULL;
+ mutex_unlock(&boost_mutex);
+
+ /* This must be outside of the mutex, otherwise deadlock! */
+ kthread_stop(t);
+}
+
+static int rcutorture_booster_init(int cpu)
+{
+ int retval;
+
+ if (boost_tasks[cpu] != NULL)
+ return 0; /* Already created, nothing more to do. */
+
+ /* Don't allow time recalculation while creating a new task. */
+ mutex_lock(&boost_mutex);
+ VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task");
+ boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL,
+ "rcu_torture_boost");
+ if (IS_ERR(boost_tasks[cpu])) {
+ retval = PTR_ERR(boost_tasks[cpu]);
+ VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed");
+ n_rcu_torture_boost_ktrerror++;
+ boost_tasks[cpu] = NULL;
+ mutex_unlock(&boost_mutex);
+ return retval;
+ }
+ kthread_bind(boost_tasks[cpu], cpu);
+ wake_up_process(boost_tasks[cpu]);
+ mutex_unlock(&boost_mutex);
+ return 0;
+}
+
+static int rcutorture_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ long cpu = (long)hcpu;
+
+ switch (action) {
+ case CPU_ONLINE:
+ case CPU_DOWN_FAILED:
+ (void)rcutorture_booster_init(cpu);
+ break;
+ case CPU_DOWN_PREPARE:
+ rcutorture_booster_cleanup(cpu);
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block rcutorture_cpu_nb = {
+ .notifier_call = rcutorture_cpu_notify,
+};
+
static void
rcu_torture_cleanup(void)
{
@@ -1127,7 +1343,7 @@ rcu_torture_cleanup(void)
}
fullstop = FULLSTOP_RMMOD;
mutex_unlock(&fullstop_mutex);
- unregister_reboot_notifier(&rcutorture_nb);
+ unregister_reboot_notifier(&rcutorture_shutdown_nb);
if (stutter_task) {
VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
kthread_stop(stutter_task);
@@ -1184,6 +1400,12 @@ rcu_torture_cleanup(void)
kthread_stop(fqs_task);
}
fqs_task = NULL;
+ if ((test_boost == 1 && cur_ops->can_boost) ||
+ test_boost == 2) {
+ unregister_cpu_notifier(&rcutorture_cpu_nb);
+ for_each_possible_cpu(i)
+ rcutorture_booster_cleanup(i);
+ }
/* Wait for all RCU callbacks to fire. */
@@ -1195,9 +1417,9 @@ rcu_torture_cleanup(void)
if (cur_ops->cleanup)
cur_ops->cleanup();
if (atomic_read(&n_rcu_torture_error))
- rcu_torture_print_module_parms("End of test: FAILURE");
+ rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
else
- rcu_torture_print_module_parms("End of test: SUCCESS");
+ rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
}
static int __init
@@ -1242,7 +1464,7 @@ rcu_torture_init(void)
nrealreaders = nreaders;
else
nrealreaders = 2 * num_online_cpus();
- rcu_torture_print_module_parms("Start of test");
+ rcu_torture_print_module_parms(cur_ops, "Start of test");
fullstop = FULLSTOP_DONTSTOP;
/* Set up the freelist. */
@@ -1263,6 +1485,12 @@ rcu_torture_init(void)
atomic_set(&n_rcu_torture_free, 0);
atomic_set(&n_rcu_torture_mberror, 0);
atomic_set(&n_rcu_torture_error, 0);
+ n_rcu_torture_boost_ktrerror = 0;
+ n_rcu_torture_boost_rterror = 0;
+ n_rcu_torture_boost_allocerror = 0;
+ n_rcu_torture_boost_afferror = 0;
+ n_rcu_torture_boost_failure = 0;
+ n_rcu_torture_boosts = 0;
for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
atomic_set(&rcu_torture_wcount[i], 0);
for_each_possible_cpu(cpu) {
@@ -1376,7 +1604,27 @@ rcu_torture_init(void)
goto unwind;
}
}
- register_reboot_notifier(&rcutorture_nb);
+ if (test_boost_interval < 1)
+ test_boost_interval = 1;
+ if (test_boost_duration < 2)
+ test_boost_duration = 2;
+ if ((test_boost == 1 && cur_ops->can_boost) ||
+ test_boost == 2) {
+ int retval;
+
+ boost_starttime = jiffies + test_boost_interval * HZ;
+ register_cpu_notifier(&rcutorture_cpu_nb);
+ for_each_possible_cpu(i) {
+ if (cpu_is_offline(i))
+ continue; /* Heuristic: CPU can go offline. */
+ retval = rcutorture_booster_init(i);
+ if (retval < 0) {
+ firsterr = retval;
+ goto unwind;
+ }
+ }
+ }
+ register_reboot_notifier(&rcutorture_shutdown_nb);
mutex_unlock(&fullstop_mutex);
return 0;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index ccdc04c47981..d0ddfea6579d 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -67,9 +67,6 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
.gpnum = -300, \
.completed = -300, \
.onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \
- .orphan_cbs_list = NULL, \
- .orphan_cbs_tail = &structname.orphan_cbs_list, \
- .orphan_qlen = 0, \
.fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \
.n_force_qs = 0, \
.n_force_qs_ngp = 0, \
@@ -620,9 +617,17 @@ static void __init check_cpu_stall_init(void)
static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
{
if (rdp->gpnum != rnp->gpnum) {
- rdp->qs_pending = 1;
- rdp->passed_quiesc = 0;
+ /*
+ * If the current grace period is waiting for this CPU,
+ * set up to detect a quiescent state, otherwise don't
+ * go looking for one.
+ */
rdp->gpnum = rnp->gpnum;
+ if (rnp->qsmask & rdp->grpmask) {
+ rdp->qs_pending = 1;
+ rdp->passed_quiesc = 0;
+ } else
+ rdp->qs_pending = 0;
}
}
@@ -681,6 +686,24 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
/* Remember that we saw this grace-period completion. */
rdp->completed = rnp->completed;
+
+ /*
+ * If we were in an extended quiescent state, we may have
+ * missed some grace periods that others CPUs handled on
+ * our behalf. Catch up with this state to avoid noting
+ * spurious new grace periods. If another grace period
+ * has started, then rnp->gpnum will have advanced, so
+ * we will detect this later on.
+ */
+ if (ULONG_CMP_LT(rdp->gpnum, rdp->completed))
+ rdp->gpnum = rdp->completed;
+
+ /*
+ * If RCU does not need a quiescent state from this CPU,
+ * then make sure that this CPU doesn't go looking for one.
+ */
+ if ((rnp->qsmask & rdp->grpmask) == 0)
+ rdp->qs_pending = 0;
}
}
@@ -984,53 +1007,31 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
#ifdef CONFIG_HOTPLUG_CPU
/*
- * Move a dying CPU's RCU callbacks to the ->orphan_cbs_list for the
- * specified flavor of RCU. The callbacks will be adopted by the next
- * _rcu_barrier() invocation or by the CPU_DEAD notifier, whichever
- * comes first. Because this is invoked from the CPU_DYING notifier,
- * irqs are already disabled.
+ * Move a dying CPU's RCU callbacks to online CPU's callback list.
+ * Synchronization is not required because this function executes
+ * in stop_machine() context.
*/
-static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
+static void rcu_send_cbs_to_online(struct rcu_state *rsp)
{
int i;
+ /* current DYING CPU is cleared in the cpu_online_mask */
+ int receive_cpu = cpumask_any(cpu_online_mask);
struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
+ struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);
if (rdp->nxtlist == NULL)
return; /* irqs disabled, so comparison is stable. */
- raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
- *rsp->orphan_cbs_tail = rdp->nxtlist;
- rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL];
+
+ *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
+ receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+ receive_rdp->qlen += rdp->qlen;
+ receive_rdp->n_cbs_adopted += rdp->qlen;
+ rdp->n_cbs_orphaned += rdp->qlen;
+
rdp->nxtlist = NULL;
for (i = 0; i < RCU_NEXT_SIZE; i++)
rdp->nxttail[i] = &rdp->nxtlist;
- rsp->orphan_qlen += rdp->qlen;
- rdp->n_cbs_orphaned += rdp->qlen;
rdp->qlen = 0;
- raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
-}
-
-/*
- * Adopt previously orphaned RCU callbacks.
- */
-static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
-{
- unsigned long flags;
- struct rcu_data *rdp;
-
- raw_spin_lock_irqsave(&rsp->onofflock, flags);
- rdp = this_cpu_ptr(rsp->rda);
- if (rsp->orphan_cbs_list == NULL) {
- raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
- return;
- }
- *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
- rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail;
- rdp->qlen += rsp->orphan_qlen;
- rdp->n_cbs_adopted += rsp->orphan_qlen;
- rsp->orphan_cbs_list = NULL;
- rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
- rsp->orphan_qlen = 0;
- raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
}
/*
@@ -1081,8 +1082,6 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
raw_spin_unlock_irqrestore(&rnp->lock, flags);
if (need_report & RCU_OFL_TASKS_EXP_GP)
rcu_report_exp_rnp(rsp, rnp);
-
- rcu_adopt_orphan_cbs(rsp);
}
/*
@@ -1100,11 +1099,7 @@ static void rcu_offline_cpu(int cpu)
#else /* #ifdef CONFIG_HOTPLUG_CPU */
-static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
-{
-}
-
-static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
+static void rcu_send_cbs_to_online(struct rcu_state *rsp)
{
}
@@ -1440,22 +1435,11 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
*/
local_irq_save(flags);
rdp = this_cpu_ptr(rsp->rda);
- rcu_process_gp_end(rsp, rdp);
- check_for_new_grace_period(rsp, rdp);
/* Add the callback to our list. */
*rdp->nxttail[RCU_NEXT_TAIL] = head;
rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
- /* Start a new grace period if one not already started. */
- if (!rcu_gp_in_progress(rsp)) {
- unsigned long nestflag;
- struct rcu_node *rnp_root = rcu_get_root(rsp);
-
- raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
- rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */
- }
-
/*
* Force the grace period if too many callbacks or too long waiting.
* Enforce hysteresis, and don't invoke force_quiescent_state()
@@ -1464,12 +1448,27 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
* is the only one waiting for a grace period to complete.
*/
if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
- rdp->blimit = LONG_MAX;
- if (rsp->n_force_qs == rdp->n_force_qs_snap &&
- *rdp->nxttail[RCU_DONE_TAIL] != head)
- force_quiescent_state(rsp, 0);
- rdp->n_force_qs_snap = rsp->n_force_qs;
- rdp->qlen_last_fqs_check = rdp->qlen;
+
+ /* Are we ignoring a completed grace period? */
+ rcu_process_gp_end(rsp, rdp);
+ check_for_new_grace_period(rsp, rdp);
+
+ /* Start a new grace period if one not already started. */
+ if (!rcu_gp_in_progress(rsp)) {
+ unsigned long nestflag;
+ struct rcu_node *rnp_root = rcu_get_root(rsp);
+
+ raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
+ rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */
+ } else {
+ /* Give the grace period a kick. */
+ rdp->blimit = LONG_MAX;
+ if (rsp->n_force_qs == rdp->n_force_qs_snap &&
+ *rdp->nxttail[RCU_DONE_TAIL] != head)
+ force_quiescent_state(rsp, 0);
+ rdp->n_force_qs_snap = rsp->n_force_qs;
+ rdp->qlen_last_fqs_check = rdp->qlen;
+ }
} else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
force_quiescent_state(rsp, 1);
local_irq_restore(flags);
@@ -1699,13 +1698,12 @@ static void _rcu_barrier(struct rcu_state *rsp,
* decrement rcu_barrier_cpu_count -- otherwise the first CPU
* might complete its grace period before all of the other CPUs
* did their increment, causing this function to return too
- * early.
+ * early. Note that on_each_cpu() disables irqs, which prevents
+ * any CPUs from coming online or going offline until each online
+ * CPU has queued its RCU-barrier callback.
*/
atomic_set(&rcu_barrier_cpu_count, 1);
- preempt_disable(); /* stop CPU_DYING from filling orphan_cbs_list */
- rcu_adopt_orphan_cbs(rsp);
on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1);
- preempt_enable(); /* CPU_DYING can again fill orphan_cbs_list */
if (atomic_dec_and_test(&rcu_barrier_cpu_count))
complete(&rcu_barrier_completion);
wait_for_completion(&rcu_barrier_completion);
@@ -1831,18 +1829,13 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
case CPU_DYING:
case CPU_DYING_FROZEN:
/*
- * preempt_disable() in _rcu_barrier() prevents stop_machine(),
- * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);"
- * returns, all online cpus have queued rcu_barrier_func().
- * The dying CPU clears its cpu_online_mask bit and
- * moves all of its RCU callbacks to ->orphan_cbs_list
- * in the context of stop_machine(), so subsequent calls
- * to _rcu_barrier() will adopt these callbacks and only
- * then queue rcu_barrier_func() on all remaining CPUs.
+ * The whole machine is "stopped" except this CPU, so we can
+ * touch any data without introducing corruption. We send the
+ * dying CPU's callbacks to an arbitrarily chosen online CPU.
*/
- rcu_send_cbs_to_orphanage(&rcu_bh_state);
- rcu_send_cbs_to_orphanage(&rcu_sched_state);
- rcu_preempt_send_cbs_to_orphanage();
+ rcu_send_cbs_to_online(&rcu_bh_state);
+ rcu_send_cbs_to_online(&rcu_sched_state);
+ rcu_preempt_send_cbs_to_online();
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
@@ -1880,8 +1873,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
{
int i;
- for (i = NUM_RCU_LVLS - 1; i >= 0; i--)
+ for (i = NUM_RCU_LVLS - 1; i > 0; i--)
rsp->levelspread[i] = CONFIG_RCU_FANOUT;
+ rsp->levelspread[0] = RCU_FANOUT_LEAF;
}
#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
static void __init rcu_init_levelspread(struct rcu_state *rsp)
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 91d4170c5c13..e8f057e44e3e 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -31,46 +31,51 @@
/*
* Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
* In theory, it should be possible to add more levels straightforwardly.
- * In practice, this has not been tested, so there is probably some
- * bug somewhere.
+ * In practice, this did work well going from three levels to four.
+ * Of course, your mileage may vary.
*/
#define MAX_RCU_LVLS 4
-#define RCU_FANOUT (CONFIG_RCU_FANOUT)
-#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT)
-#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT)
-#define RCU_FANOUT_FOURTH (RCU_FANOUT_CUBE * RCU_FANOUT)
-
-#if NR_CPUS <= RCU_FANOUT
+#if CONFIG_RCU_FANOUT > 16
+#define RCU_FANOUT_LEAF 16
+#else /* #if CONFIG_RCU_FANOUT > 16 */
+#define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT)
+#endif /* #else #if CONFIG_RCU_FANOUT > 16 */
+#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
+#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
+#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
+#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
+
+#if NR_CPUS <= RCU_FANOUT_1
# define NUM_RCU_LVLS 1
# define NUM_RCU_LVL_0 1
# define NUM_RCU_LVL_1 (NR_CPUS)
# define NUM_RCU_LVL_2 0
# define NUM_RCU_LVL_3 0
# define NUM_RCU_LVL_4 0
-#elif NR_CPUS <= RCU_FANOUT_SQ
+#elif NR_CPUS <= RCU_FANOUT_2
# define NUM_RCU_LVLS 2
# define NUM_RCU_LVL_0 1
-# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
+# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
# define NUM_RCU_LVL_2 (NR_CPUS)
# define NUM_RCU_LVL_3 0
# define NUM_RCU_LVL_4 0
-#elif NR_CPUS <= RCU_FANOUT_CUBE
+#elif NR_CPUS <= RCU_FANOUT_3
# define NUM_RCU_LVLS 3
# define NUM_RCU_LVL_0 1
-# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ)
-# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
-# define NUM_RCU_LVL_3 NR_CPUS
+# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
+# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
+# define NUM_RCU_LVL_3 (NR_CPUS)
# define NUM_RCU_LVL_4 0
-#elif NR_CPUS <= RCU_FANOUT_FOURTH
+#elif NR_CPUS <= RCU_FANOUT_4
# define NUM_RCU_LVLS 4
# define NUM_RCU_LVL_0 1
-# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_CUBE)
-# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ)
-# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
-# define NUM_RCU_LVL_4 NR_CPUS
+# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
+# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
+# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
+# define NUM_RCU_LVL_4 (NR_CPUS)
#else
# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
-#endif /* #if (NR_CPUS) <= RCU_FANOUT */
+#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
@@ -203,8 +208,8 @@ struct rcu_data {
long qlen_last_fqs_check;
/* qlen at last check for QS forcing */
unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
- unsigned long n_cbs_orphaned; /* RCU cbs sent to orphanage. */
- unsigned long n_cbs_adopted; /* RCU cbs adopted from orphanage. */
+ unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */
+ unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */
unsigned long n_force_qs_snap;
/* did other CPU force QS recently? */
long blimit; /* Upper limit on a processed batch */
@@ -309,15 +314,7 @@ struct rcu_state {
/* End of fields guarded by root rcu_node's lock. */
raw_spinlock_t onofflock; /* exclude on/offline and */
- /* starting new GP. Also */
- /* protects the following */
- /* orphan_cbs fields. */
- struct rcu_head *orphan_cbs_list; /* list of rcu_head structs */
- /* orphaned by all CPUs in */
- /* a given leaf rcu_node */
- /* going offline. */
- struct rcu_head **orphan_cbs_tail; /* And tail pointer. */
- long orphan_qlen; /* Number of orphaned cbs. */
+ /* starting new GP. */
raw_spinlock_t fqslock; /* Only one task forcing */
/* quiescent states. */
unsigned long jiffies_force_qs; /* Time at which to invoke */
@@ -390,7 +387,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp);
static int rcu_preempt_pending(int cpu);
static int rcu_preempt_needs_cpu(int cpu);
static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
-static void rcu_preempt_send_cbs_to_orphanage(void);
+static void rcu_preempt_send_cbs_to_online(void);
static void __init __rcu_init_preempt(void);
static void rcu_needs_cpu_flush(void);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 71a4147473f9..a3638710dc67 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -25,6 +25,7 @@
*/
#include <linux/delay.h>
+#include <linux/stop_machine.h>
/*
* Check the RCU kernel configuration parameters and print informative
@@ -773,11 +774,11 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
}
/*
- * Move preemptable RCU's callbacks to ->orphan_cbs_list.
+ * Move preemptable RCU's callbacks from dying CPU to other online CPU.
*/
-static void rcu_preempt_send_cbs_to_orphanage(void)
+static void rcu_preempt_send_cbs_to_online(void)
{
- rcu_send_cbs_to_orphanage(&rcu_preempt_state);
+ rcu_send_cbs_to_online(&rcu_preempt_state);
}
/*
@@ -1001,7 +1002,7 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
/*
* Because there is no preemptable RCU, there are no callbacks to move.
*/
-static void rcu_preempt_send_cbs_to_orphanage(void)
+static void rcu_preempt_send_cbs_to_online(void)
{
}
@@ -1014,6 +1015,132 @@ static void __init __rcu_init_preempt(void)
#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
+#ifndef CONFIG_SMP
+
+void synchronize_sched_expedited(void)
+{
+ cond_resched();
+}
+EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
+
+#else /* #ifndef CONFIG_SMP */
+
+static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
+static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
+
+static int synchronize_sched_expedited_cpu_stop(void *data)
+{
+ /*
+ * There must be a full memory barrier on each affected CPU
+ * between the time that try_stop_cpus() is called and the
+ * time that it returns.
+ *
+ * In the current initial implementation of cpu_stop, the
+ * above condition is already met when the control reaches
+ * this point and the following smp_mb() is not strictly
+ * necessary. Do smp_mb() anyway for documentation and
+ * robustness against future implementation changes.
+ */
+ smp_mb(); /* See above comment block. */
+ return 0;
+}
+
+/*
+ * Wait for an rcu-sched grace period to elapse, but use "big hammer"
+ * approach to force grace period to end quickly. This consumes
+ * significant time on all CPUs, and is thus not recommended for
+ * any sort of common-case code.
+ *
+ * Note that it is illegal to call this function while holding any
+ * lock that is acquired by a CPU-hotplug notifier. Failing to
+ * observe this restriction will result in deadlock.
+ *
+ * This implementation can be thought of as an application of ticket
+ * locking to RCU, with sync_sched_expedited_started and
+ * sync_sched_expedited_done taking on the roles of the halves
+ * of the ticket-lock word. Each task atomically increments
+ * sync_sched_expedited_started upon entry, snapshotting the old value,
+ * then attempts to stop all the CPUs. If this succeeds, then each
+ * CPU will have executed a context switch, resulting in an RCU-sched
+ * grace period. We are then done, so we use atomic_cmpxchg() to
+ * update sync_sched_expedited_done to match our snapshot -- but
+ * only if someone else has not already advanced past our snapshot.
+ *
+ * On the other hand, if try_stop_cpus() fails, we check the value
+ * of sync_sched_expedited_done. If it has advanced past our
+ * initial snapshot, then someone else must have forced a grace period
+ * some time after we took our snapshot. In this case, our work is
+ * done for us, and we can simply return. Otherwise, we try again,
+ * but keep our initial snapshot for purposes of checking for someone
+ * doing our work for us.
+ *
+ * If we fail too many times in a row, we fall back to synchronize_sched().
+ */
+void synchronize_sched_expedited(void)
+{
+ int firstsnap, s, snap, trycount = 0;
+
+ /* Note that atomic_inc_return() implies full memory barrier. */
+ firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started);
+ get_online_cpus();
+
+ /*
+ * Each pass through the following loop attempts to force a
+ * context switch on each CPU.
+ */
+ while (try_stop_cpus(cpu_online_mask,
+ synchronize_sched_expedited_cpu_stop,
+ NULL) == -EAGAIN) {
+ put_online_cpus();
+
+ /* No joy, try again later. Or just synchronize_sched(). */
+ if (trycount++ < 10)
+ udelay(trycount * num_online_cpus());
+ else {
+ synchronize_sched();
+ return;
+ }
+
+ /* Check to see if someone else did our work for us. */
+ s = atomic_read(&sync_sched_expedited_done);
+ if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) {
+ smp_mb(); /* ensure test happens before caller kfree */
+ return;
+ }
+
+ /*
+ * Refetching sync_sched_expedited_started allows later
+ * callers to piggyback on our grace period. We subtract
+ * 1 to get the same token that the last incrementer got.
+ * We retry after they started, so our grace period works
+ * for them, and they started after our first try, so their
+ * grace period works for us.
+ */
+ get_online_cpus();
+ snap = atomic_read(&sync_sched_expedited_started) - 1;
+ smp_mb(); /* ensure read is before try_stop_cpus(). */
+ }
+
+ /*
+ * Everyone up to our most recent fetch is covered by our grace
+ * period. Update the counter, but only if our work is still
+ * relevant -- which it won't be if someone who started later
+ * than we did beat us to the punch.
+ */
+ do {
+ s = atomic_read(&sync_sched_expedited_done);
+ if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) {
+ smp_mb(); /* ensure test happens before caller kfree */
+ break;
+ }
+ } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s);
+
+ put_online_cpus();
+}
+EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
+
+#endif /* #else #ifndef CONFIG_SMP */
+
#if !defined(CONFIG_RCU_FAST_NO_HZ)
/*
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index d15430b9d122..c8e97853b970 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -166,13 +166,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
gpnum = rsp->gpnum;
seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
- "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n",
+ "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
rsp->completed, gpnum, rsp->signaled,
(long)(rsp->jiffies_force_qs - jiffies),
(int)(jiffies & 0xffff),
rsp->n_force_qs, rsp->n_force_qs_ngp,
rsp->n_force_qs - rsp->n_force_qs_ngp,
- rsp->n_force_qs_lh, rsp->orphan_qlen);
+ rsp->n_force_qs_lh);
for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
if (rnp->level != level) {
seq_puts(m, "\n");
@@ -300,7 +300,7 @@ static const struct file_operations rcu_pending_fops = {
static struct dentry *rcudir;
-static int __init rcuclassic_trace_init(void)
+static int __init rcutree_trace_init(void)
{
struct dentry *retval;
@@ -337,14 +337,14 @@ free_out:
return 1;
}
-static void __exit rcuclassic_trace_cleanup(void)
+static void __exit rcutree_trace_cleanup(void)
{
debugfs_remove_recursive(rcudir);
}
-module_init(rcuclassic_trace_init);
-module_exit(rcuclassic_trace_cleanup);
+module_init(rcutree_trace_init);
+module_exit(rcutree_trace_cleanup);
MODULE_AUTHOR("Paul E. McKenney");
MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation");
diff --git a/kernel/sched.c b/kernel/sched.c
index c68cead94dd7..260132961a99 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -9532,72 +9532,3 @@ struct cgroup_subsys cpuacct_subsys = {
};
#endif /* CONFIG_CGROUP_CPUACCT */
-#ifndef CONFIG_SMP
-
-void synchronize_sched_expedited(void)
-{
- barrier();
-}
-EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
-
-#else /* #ifndef CONFIG_SMP */
-
-static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
-
-static int synchronize_sched_expedited_cpu_stop(void *data)
-{
- /*
- * There must be a full memory barrier on each affected CPU
- * between the time that try_stop_cpus() is called and the
- * time that it returns.
- *
- * In the current initial implementation of cpu_stop, the
- * above condition is already met when the control reaches
- * this point and the following smp_mb() is not strictly
- * necessary. Do smp_mb() anyway for documentation and
- * robustness against future implementation changes.
- */
- smp_mb(); /* See above comment block. */
- return 0;
-}
-
-/*
- * Wait for an rcu-sched grace period to elapse, but use "big hammer"
- * approach to force grace period to end quickly. This consumes
- * significant time on all CPUs, and is thus not recommended for
- * any sort of common-case code.
- *
- * Note that it is illegal to call this function while holding any
- * lock that is acquired by a CPU-hotplug notifier. Failing to
- * observe this restriction will result in deadlock.
- */
-void synchronize_sched_expedited(void)
-{
- int snap, trycount = 0;
-
- smp_mb(); /* ensure prior mod happens before capturing snap. */
- snap = atomic_read(&synchronize_sched_expedited_count) + 1;
- get_online_cpus();
- while (try_stop_cpus(cpu_online_mask,
- synchronize_sched_expedited_cpu_stop,
- NULL) == -EAGAIN) {
- put_online_cpus();
- if (trycount++ < 10)
- udelay(trycount * num_online_cpus());
- else {
- synchronize_sched();
- return;
- }
- if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
- smp_mb(); /* ensure test happens before caller kfree */
- return;
- }
- get_online_cpus();
- }
- atomic_inc(&synchronize_sched_expedited_count);
- smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
- put_online_cpus();
-}
-EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
-
-#endif /* #else #ifndef CONFIG_SMP */
diff --git a/kernel/srcu.c b/kernel/srcu.c
index c71e07500536..98d8c1e80edb 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -31,6 +31,7 @@
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/smp.h>
+#include <linux/delay.h>
#include <linux/srcu.h>
static int init_srcu_struct_fields(struct srcu_struct *sp)
@@ -203,9 +204,14 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
* all srcu_read_lock() calls using the old counters have completed.
* Their corresponding critical sections might well be still
* executing, but the srcu_read_lock() primitives themselves
- * will have finished executing.
+ * will have finished executing. We initially give readers
+ * an arbitrarily chosen 10 microseconds to get out of their
+ * SRCU read-side critical sections, then loop waiting 1/HZ
+ * seconds per iteration.
*/
+ if (srcu_readers_active_idx(sp, idx))
+ udelay(CONFIG_SRCU_SYNCHRONIZE_DELAY);
while (srcu_readers_active_idx(sp, idx))
schedule_timeout_interruptible(1);
diff --git a/scripts/kernel-doc b/scripts/kernel-doc
index 39580a5dc5df..9f85012acf0d 100755
--- a/scripts/kernel-doc
+++ b/scripts/kernel-doc
@@ -155,6 +155,8 @@ use strict;
# '@parameter' - name of a parameter
# '%CONST' - name of a constant.
+## init lots of data
+
my $errors = 0;
my $warnings = 0;
my $anon_struct_union = 0;
@@ -218,21 +220,14 @@ my %highlights_list = ( $type_constant, "\$1",
$type_param, "\$1" );
my $blankline_list = "";
-sub usage {
- print "Usage: $0 [ -v ] [ -docbook | -html | -text | -man | -list ]\n";
- print " [ -no-doc-sections ]\n";
- print " [ -function funcname [ -function funcname ...] ]\n";
- print " [ -nofunction funcname [ -nofunction funcname ...] ]\n";
- print " c source file(s) > outputfile\n";
- print " -v : verbose output, more warnings & other info listed\n";
- exit 1;
-}
-
# read arguments
if ($#ARGV == -1) {
usage();
}
+my $kernelversion;
+my $dohighlight = "";
+
my $verbose = 0;
my $output_mode = "man";
my $no_doc_sections = 0;
@@ -245,7 +240,7 @@ my $man_date = ('January', 'February', 'March', 'April', 'May', 'June',
'November', 'December')[(localtime)[4]] .
" " . ((localtime)[5]+1900);
-# Essentially these are globals
+# Essentially these are globals.
# They probably want to be tidied up, made more localised or something.
# CAVEAT EMPTOR! Some of the others I localised may not want to be, which
# could cause "use of undefined value" or other bugs.
@@ -353,6 +348,18 @@ while ($ARGV[0] =~ m/^-(.*)/) {
}
}
+# continue execution near EOF;
+
+sub usage {
+ print "Usage: $0 [ -v ] [ -docbook | -html | -text | -man | -list ]\n";
+ print " [ -no-doc-sections ]\n";
+ print " [ -function funcname [ -function funcname ...] ]\n";
+ print " [ -nofunction funcname [ -nofunction funcname ...] ]\n";
+ print " c source file(s) > outputfile\n";
+ print " -v : verbose output, more warnings & other info listed\n";
+ exit 1;
+}
+
# get kernel version from env
sub get_kernel_version() {
my $version = 'unknown kernel version';
@@ -362,15 +369,6 @@ sub get_kernel_version() {
}
return $version;
}
-my $kernelversion = get_kernel_version();
-
-# generate a sequence of code that will splice in highlighting information
-# using the s// operator.
-my $dohighlight = "";
-foreach my $pattern (keys %highlights) {
-# print STDERR "scanning pattern:$pattern, highlight:($highlights{$pattern})\n";
- $dohighlight .= "\$contents =~ s:$pattern:$highlights{$pattern}:gs;\n";
-}
##
# dumps section contents to arrays/hashes intended for that purpose.
@@ -1851,34 +1849,6 @@ sub dump_function($$) {
});
}
-sub process_file($);
-
-# Read the file that maps relative names to absolute names for
-# separate source and object directories and for shadow trees.
-if (open(SOURCE_MAP, "<.tmp_filelist.txt")) {
- my ($relname, $absname);
- while(<SOURCE_MAP>) {
- chop();
- ($relname, $absname) = (split())[0..1];
- $relname =~ s:^/+::;
- $source_map{$relname} = $absname;
- }
- close(SOURCE_MAP);
-}
-
-foreach (@ARGV) {
- chomp;
- process_file($_);
-}
-if ($verbose && $errors) {
- print STDERR "$errors errors\n";
-}
-if ($verbose && $warnings) {
- print STDERR "$warnings warnings\n";
-}
-
-exit($errors);
-
sub reset_state {
$function = "";
%constants = ();
@@ -2285,3 +2255,39 @@ sub process_file($) {
}
}
}
+
+
+$kernelversion = get_kernel_version();
+
+# generate a sequence of code that will splice in highlighting information
+# using the s// operator.
+foreach my $pattern (keys %highlights) {
+# print STDERR "scanning pattern:$pattern, highlight:($highlights{$pattern})\n";
+ $dohighlight .= "\$contents =~ s:$pattern:$highlights{$pattern}:gs;\n";
+}
+
+# Read the file that maps relative names to absolute names for
+# separate source and object directories and for shadow trees.
+if (open(SOURCE_MAP, "<.tmp_filelist.txt")) {
+ my ($relname, $absname);
+ while(<SOURCE_MAP>) {
+ chop();
+ ($relname, $absname) = (split())[0..1];
+ $relname =~ s:^/+::;
+ $source_map{$relname} = $absname;
+ }
+ close(SOURCE_MAP);
+}
+
+foreach (@ARGV) {
+ chomp;
+ process_file($_);
+}
+if ($verbose && $errors) {
+ print STDERR "$errors errors\n";
+}
+if ($verbose && $warnings) {
+ print STDERR "$warnings warnings\n";
+}
+
+exit($errors);