summaryrefslogtreecommitdiff
path: root/arch/loongarch/lib/copy_user.S
diff options
context:
space:
mode:
authorHuacai Chen <chenhuacai@loongson.cn>2022-12-10 17:39:59 +0300
committerHuacai Chen <chenhuacai@loongson.cn>2022-12-14 03:36:11 +0300
commita275a82dcd4024c75337db15d59ed039c31e21da (patch)
tree98fa3db04d230544ded2dff4c39a262aef6e88b9 /arch/loongarch/lib/copy_user.S
parent19e5eb15b00c5841b4b9bd9777af2865a40d2f39 (diff)
downloadlinux-a275a82dcd4024c75337db15d59ed039c31e21da.tar.xz
LoongArch: Use alternative to optimize libraries
Use the alternative to optimize common libraries according whether CPU has UAL (hardware unaligned access support) feature, including memset(), memcopy(), memmove(), copy_user() and clear_user(). We have tested UnixBench on a Loongson-3A5000 quad-core machine (1.6GHz): 1, One copy, before patch: System Benchmarks Index Values BASELINE RESULT INDEX Dhrystone 2 using register variables 116700.0 9566582.0 819.8 Double-Precision Whetstone 55.0 2805.3 510.1 Execl Throughput 43.0 2120.0 493.0 File Copy 1024 bufsize 2000 maxblocks 3960.0 209833.0 529.9 File Copy 256 bufsize 500 maxblocks 1655.0 89400.0 540.2 File Copy 4096 bufsize 8000 maxblocks 5800.0 320036.0 551.8 Pipe Throughput 12440.0 340624.0 273.8 Pipe-based Context Switching 4000.0 109939.1 274.8 Process Creation 126.0 4728.7 375.3 Shell Scripts (1 concurrent) 42.4 2223.1 524.3 Shell Scripts (8 concurrent) 6.0 883.1 1471.9 System Call Overhead 15000.0 518639.1 345.8 ======== System Benchmarks Index Score 500.2 2, One copy, after patch: System Benchmarks Index Values BASELINE RESULT INDEX Dhrystone 2 using register variables 116700.0 9567674.7 819.9 Double-Precision Whetstone 55.0 2805.5 510.1 Execl Throughput 43.0 2392.7 556.4 File Copy 1024 bufsize 2000 maxblocks 3960.0 417804.0 1055.1 File Copy 256 bufsize 500 maxblocks 1655.0 112909.5 682.2 File Copy 4096 bufsize 8000 maxblocks 5800.0 1255207.4 2164.2 Pipe Throughput 12440.0 555712.0 446.7 Pipe-based Context Switching 4000.0 99964.5 249.9 Process Creation 126.0 5192.5 412.1 Shell Scripts (1 concurrent) 42.4 2302.4 543.0 Shell Scripts (8 concurrent) 6.0 919.6 1532.6 System Call Overhead 15000.0 511159.3 340.8 ======== System Benchmarks Index Score 640.1 3, Four copies, before patch: System Benchmarks Index Values BASELINE RESULT INDEX Dhrystone 2 using register variables 116700.0 38268610.5 3279.2 Double-Precision Whetstone 55.0 11222.2 2040.4 Execl Throughput 43.0 7892.0 1835.3 File Copy 1024 bufsize 2000 maxblocks 3960.0 235149.6 593.8 File Copy 256 bufsize 500 maxblocks 1655.0 74959.6 452.9 File Copy 4096 bufsize 8000 maxblocks 5800.0 545048.5 939.7 Pipe Throughput 12440.0 1337359.0 1075.0 Pipe-based Context Switching 4000.0 473663.9 1184.2 Process Creation 126.0 17491.2 1388.2 Shell Scripts (1 concurrent) 42.4 6865.7 1619.3 Shell Scripts (8 concurrent) 6.0 1015.9 1693.1 System Call Overhead 15000.0 1899535.2 1266.4 ======== System Benchmarks Index Score 1278.3 4, Four copies, after patch: System Benchmarks Index Values BASELINE RESULT INDEX Dhrystone 2 using register variables 116700.0 38272815.5 3279.6 Double-Precision Whetstone 55.0 11222.8 2040.5 Execl Throughput 43.0 8839.2 2055.6 File Copy 1024 bufsize 2000 maxblocks 3960.0 313912.9 792.7 File Copy 256 bufsize 500 maxblocks 1655.0 80976.1 489.3 File Copy 4096 bufsize 8000 maxblocks 5800.0 1176594.3 2028.6 Pipe Throughput 12440.0 2100941.9 1688.9 Pipe-based Context Switching 4000.0 476696.4 1191.7 Process Creation 126.0 18394.7 1459.9 Shell Scripts (1 concurrent) 42.4 7172.2 1691.6 Shell Scripts (8 concurrent) 6.0 1058.3 1763.9 System Call Overhead 15000.0 1874714.7 1249.8 ======== System Benchmarks Index Score 1488.8 Signed-off-by: Jun Yi <yijun@loongson.cn> Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
Diffstat (limited to 'arch/loongarch/lib/copy_user.S')
-rw-r--r--arch/loongarch/lib/copy_user.S91
1 files changed, 86 insertions, 5 deletions
diff --git a/arch/loongarch/lib/copy_user.S b/arch/loongarch/lib/copy_user.S
index 61933d964da0..55ac6020a1ad 100644
--- a/arch/loongarch/lib/copy_user.S
+++ b/arch/loongarch/lib/copy_user.S
@@ -3,26 +3,38 @@
* Copyright (C) 2020-2022 Loongson Technology Corporation Limited
*/
+#include <asm/alternative-asm.h>
#include <asm/asm.h>
#include <asm/asmmacro.h>
#include <asm/asm-extable.h>
+#include <asm/cpu.h>
#include <asm/export.h>
#include <asm/regdef.h>
-.irp to, 0
+.irp to, 0, 1, 2, 3, 4, 5, 6, 7
.L_fixup_handle_\to\():
addi.d a0, a2, (\to) * (-8)
jr ra
.endr
+SYM_FUNC_START(__copy_user)
+ /*
+ * Some CPUs support hardware unaligned access
+ */
+ ALTERNATIVE "b __copy_user_generic", \
+ "b __copy_user_fast", CPU_FEATURE_UAL
+SYM_FUNC_END(__copy_user)
+
+EXPORT_SYMBOL(__copy_user)
+
/*
- * unsigned long __copy_user(void *to, const void *from, size_t n)
+ * unsigned long __copy_user_generic(void *to, const void *from, size_t n)
*
* a0: to
* a1: from
* a2: n
*/
-SYM_FUNC_START(__copy_user)
+SYM_FUNC_START(__copy_user_generic)
beqz a2, 3f
1: ld.b t0, a1, 0
@@ -37,6 +49,75 @@ SYM_FUNC_START(__copy_user)
_asm_extable 1b, .L_fixup_handle_0
_asm_extable 2b, .L_fixup_handle_0
-SYM_FUNC_END(__copy_user)
+SYM_FUNC_END(__copy_user_generic)
-EXPORT_SYMBOL(__copy_user)
+/*
+ * unsigned long __copy_user_fast(void *to, const void *from, unsigned long n)
+ *
+ * a0: to
+ * a1: from
+ * a2: n
+ */
+SYM_FUNC_START(__copy_user_fast)
+ beqz a2, 19f
+
+ ori a3, zero, 64
+ blt a2, a3, 17f
+
+ /* copy 64 bytes at a time */
+1: ld.d t0, a1, 0
+2: ld.d t1, a1, 8
+3: ld.d t2, a1, 16
+4: ld.d t3, a1, 24
+5: ld.d t4, a1, 32
+6: ld.d t5, a1, 40
+7: ld.d t6, a1, 48
+8: ld.d t7, a1, 56
+9: st.d t0, a0, 0
+10: st.d t1, a0, 8
+11: st.d t2, a0, 16
+12: st.d t3, a0, 24
+13: st.d t4, a0, 32
+14: st.d t5, a0, 40
+15: st.d t6, a0, 48
+16: st.d t7, a0, 56
+
+ addi.d a0, a0, 64
+ addi.d a1, a1, 64
+ addi.d a2, a2, -64
+ bge a2, a3, 1b
+
+ beqz a2, 19f
+
+ /* copy the remaining bytes */
+17: ld.b t0, a1, 0
+18: st.b t0, a0, 0
+ addi.d a0, a0, 1
+ addi.d a1, a1, 1
+ addi.d a2, a2, -1
+ bgt a2, zero, 17b
+
+ /* return */
+19: move a0, a2
+ jr ra
+
+ /* fixup and ex_table */
+ _asm_extable 1b, .L_fixup_handle_0
+ _asm_extable 2b, .L_fixup_handle_1
+ _asm_extable 3b, .L_fixup_handle_2
+ _asm_extable 4b, .L_fixup_handle_3
+ _asm_extable 5b, .L_fixup_handle_4
+ _asm_extable 6b, .L_fixup_handle_5
+ _asm_extable 7b, .L_fixup_handle_6
+ _asm_extable 8b, .L_fixup_handle_7
+ _asm_extable 9b, .L_fixup_handle_0
+ _asm_extable 10b, .L_fixup_handle_1
+ _asm_extable 11b, .L_fixup_handle_2
+ _asm_extable 12b, .L_fixup_handle_3
+ _asm_extable 13b, .L_fixup_handle_4
+ _asm_extable 14b, .L_fixup_handle_5
+ _asm_extable 15b, .L_fixup_handle_6
+ _asm_extable 16b, .L_fixup_handle_7
+ _asm_extable 17b, .L_fixup_handle_0
+ _asm_extable 18b, .L_fixup_handle_0
+SYM_FUNC_END(__copy_user_fast)