author     Linus Torvalds <torvalds@linux-foundation.org>  2023-04-16 05:31:34 +0300
committer  Linus Torvalds <torvalds@linux-foundation.org>  2023-04-19 03:05:28 +0300
commit     577e6a7fd50d519c201d20968b6a027a6563dc4c (patch)
tree       ad8e4f8a75f78b627bf7ce28aabff48cac32bb54 /arch/x86/lib
parent     3639a535587d7aac449cdce9710dfdc97a3c8c8e (diff)
download   linux-577e6a7fd50d519c201d20968b6a027a6563dc4c.tar.xz
x86: inline the 'rep movs' in user copies for the FSRM case
This does the same thing for the user copies as commit 0db7058e8e23 ("x86/clear_user: Make it faster") did for clear_user(). In other words, it inlines the "rep movs" case when X86_FEATURE_FSRM is set, avoiding the function call entirely.

In order to do that, it makes the calling convention for the out-of-line case ("copy_user_generic_unrolled") match the 'rep movs' calling convention, although it does also end up clobbering a number of additional registers.

Also, to simplify code sharing in the low-level assembly with the __copy_user_nocache() function (that uses the normal C calling convention), we end up with a kind of mixed return value for the low-level asm code: it will return the result in both %rcx (to work as an alternative for the 'rep movs' case), _and_ in %rax (for the nocache case).

We could avoid this by wrapping __copy_user_nocache() callers in an inline asm, but since the cost is just an extra register copy, it's probably not worth it.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
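The caller-side half of this change lives outside arch/x86/lib, so it is not part of the diffstat below. As a rough sketch of the idea only (assuming the kernel's usual ALTERNATIVE()/ALT_NOT() patching helpers; the actual code in arch/x86/include/asm/uaccess_64.h may differ in detail), the inlined FSRM path looks roughly like:

static __always_inline __must_check unsigned long
copy_user_generic(void *to, const void *from, unsigned long len)
{
	stac();
	/*
	 * On FSRM hardware the alternative patches in a bare 'rep movsb';
	 * otherwise we call the out-of-line helper, which now uses the
	 * same register convention (%rdi dst, %rsi src, %rcx count,
	 * uncopied bytes coming back in %rcx).
	 */
	asm volatile(
		"1:\n\t"
		ALTERNATIVE("rep movsb",
			    "call copy_user_generic_unrolled",
			    ALT_NOT(X86_FEATURE_FSRM))
		"2:\n"
		_ASM_EXTABLE_UA(1b, 2b)
		: "+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT
		: : "memory", "rax", "rdx", "r8", "r9", "r10", "r11");
	clac();
	return len;
}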
Diffstat (limited to 'arch/x86/lib')
-rw-r--r--  arch/x86/lib/copy_user_64.S  55
1 file changed, 21 insertions(+), 34 deletions(-)
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 818f2f728294..16a743f11b11 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -45,13 +45,29 @@
* Input:
* rdi destination
* rsi source
- * rdx count
+ * rcx count
*
* Output:
- * eax uncopied bytes or 0 if successful.
+ * rcx uncopied bytes or 0 if successful.
+ *
+ * NOTE! The calling convention is very intentionally the same as
+ * for 'rep movs', so that we can rewrite the function call with
+ * just a plain 'rep movs' on machines that have FSRM.
+ *
+ * HOWEVER! This function ends up having a lot of the code common
+ * with __copy_user_nocache(), which is a normal C function, and
+ * has a similar calling convention, but gets the 'count' in %rdx,
+ * and returns the result in %rax.
+ *
+ * To share as much code as possible, we end up returning the
+ * result in *both* %rcx/%rax, and we also move the initial count
+ * into %rdx.
+ *
+ * We can clobber rdx/rsi/rdi and r8-r11
*/
SYM_FUNC_START(copy_user_generic_unrolled)
- cmpl $8,%edx
+ movl %ecx,%edx
+ cmpl $8,%ecx
jb .Lcopy_user_short_string_bytes
ALIGN_DESTINATION
movl %edx,%ecx
@@ -104,37 +120,6 @@ SYM_FUNC_END(copy_user_generic_unrolled)
EXPORT_SYMBOL(copy_user_generic_unrolled)
/*
- * Some CPUs support FSRM for Fast Short REP MOVS.
- *
- * Only 4GB of copy is supported. This shouldn't be a problem
- * because the kernel normally only writes from/to page sized chunks
- * even if user space passed a longer buffer.
- * And more would be dangerous because both Intel and AMD have
- * errata with rep movsq > 4GB. If someone feels the need to fix
- * this please consider this.
- *
- * Input:
- * rdi destination
- * rsi source
- * rdx count
- *
- * Output:
- * eax uncopied bytes or 0 if successful.
- */
-SYM_FUNC_START(copy_user_fast_string)
- movl %edx,%ecx
-1: rep movsb
- xorl %eax,%eax
- RET
-
-12: movl %ecx,%eax /* ecx is zerorest also */
- RET
-
- _ASM_EXTABLE_CPY(1b, 12b)
-SYM_FUNC_END(copy_user_fast_string)
-EXPORT_SYMBOL(copy_user_fast_string)
-
-/*
* Try to copy last bytes and clear the rest if needed.
* Since protection fault in copy_from/to_user is not a normal situation,
* it is not necessary to optimize tail handling.
@@ -160,6 +145,7 @@ SYM_CODE_START_LOCAL(.Lcopy_user_handle_tail)
3:
movl %edx,%eax
+ movl %edx,%ecx
RET
_ASM_EXTABLE_CPY(1b, 2b)
@@ -203,6 +189,7 @@ SYM_CODE_START_LOCAL(copy_user_short_string)
decl %ecx
jnz 21b
23: xor %eax,%eax
+ xor %ecx,%ecx
RET
40: leal (%rdx,%rcx,8),%edx
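For reference, the new comment block in the diff above explains why the shared low-level code now reports the result in both %rcx and %rax, and the commit message mentions (and rejects) the alternative: wrapping __copy_user_nocache() callers in inline asm so only %rcx would be needed. A purely illustrative sketch of what such a wrapper might have looked like, assuming a hypothetical nocache helper that takes its count in %rcx (the name __copy_user_nocache_rcx and the exact constraints are not from the patch):

static __always_inline unsigned long
copy_user_nocache_inline(void *dst, const void *src, unsigned long len)
{
	/*
	 * Hypothetical: call a nocache copy routine that uses the
	 * 'rep movs' register convention (%rdi dst, %rsi src, %rcx count)
	 * and reports the uncopied byte count back in %rcx.
	 */
	asm volatile("call __copy_user_nocache_rcx"
		     : "+c" (len), "+D" (dst), "+S" (src), ASM_CALL_CONSTRAINT
		     : : "memory", "rax", "rdx", "r8", "r9", "r10", "r11");
	return len;
}

Instead of adding wrappers like this at every call site, the patch keeps __copy_user_nocache() as a normal C function and simply sets both %ecx and %eax in the shared tail paths, which the commit message judges to cost only an extra register copy.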