summaryrefslogtreecommitdiff
path: root/arch/arm/crypto/chacha-glue.c
diff options
context:
space:
mode:
authorArd Biesheuvel <ardb@kernel.org>2020-11-03 19:28:09 +0300
committerHerbert Xu <herbert@gondor.apana.org.au>2020-11-13 12:38:44 +0300
commit86cd97ec4b943af35562a74688bc4e909b32c3d1 (patch)
tree4ff0341af0c4e597ceb25153719f3a5c9a5c8c42 /arch/arm/crypto/chacha-glue.c
parentec3c5b32fccc537e7184dcf39387b2e57d12f517 (diff)
downloadlinux-86cd97ec4b943af35562a74688bc4e909b32c3d1.tar.xz
crypto: arm/chacha-neon - optimize for non-block size multiples
The current NEON based ChaCha implementation for ARM is optimized for multiples of 4x the ChaCha block size (64 bytes). This makes sense for block encryption, but given that ChaCha is also often used in the context of networking, it makes sense to consider arbitrary length inputs as well. For example, WireGuard typically uses 1420 byte packets, and performing ChaCha encryption involves 5 invocations of chacha_4block_xor_neon() and 3 invocations of chacha_block_xor_neon(), where the last one also involves a memcpy() using a buffer on the stack to process the final chunk of 1420 % 64 == 12 bytes. Let's optimize for this case as well, by letting chacha_4block_xor_neon() deal with any input size between 64 and 256 bytes, using NEON permutation instructions and overlapping loads and stores. This way, the 140 byte tail of a 1420 byte input buffer can simply be processed in one go. This results in the following performance improvements for 1420 byte blocks, without significant impact on power-of-2 input sizes. (Note that Raspberry Pi is widely used in combination with a 32-bit kernel, even though the core is 64-bit capable) Cortex-A8 (BeagleBone) : 7% Cortex-A15 (Calxeda Midway) : 21% Cortex-A53 (Raspberry Pi 3) : 3% Cortex-A72 (Raspberry Pi 4) : 19% Cc: Eric Biggers <ebiggers@google.com> Cc: "Jason A . Donenfeld" <Jason@zx2c4.com> Signed-off-by: Ard Biesheuvel <ardb@kernel.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'arch/arm/crypto/chacha-glue.c')
-rw-r--r--arch/arm/crypto/chacha-glue.c34
1 files changed, 17 insertions, 17 deletions
diff --git a/arch/arm/crypto/chacha-glue.c b/arch/arm/crypto/chacha-glue.c
index 59da6c0b63b6..7b5cf8430c6d 100644
--- a/arch/arm/crypto/chacha-glue.c
+++ b/arch/arm/crypto/chacha-glue.c
@@ -23,7 +23,7 @@
asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
int nrounds);
asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
- int nrounds);
+ int nrounds, unsigned int nbytes);
asmlinkage void hchacha_block_arm(const u32 *state, u32 *out, int nrounds);
asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
@@ -42,24 +42,24 @@ static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
{
u8 buf[CHACHA_BLOCK_SIZE];
- while (bytes >= CHACHA_BLOCK_SIZE * 4) {
- chacha_4block_xor_neon(state, dst, src, nrounds);
- bytes -= CHACHA_BLOCK_SIZE * 4;
- src += CHACHA_BLOCK_SIZE * 4;
- dst += CHACHA_BLOCK_SIZE * 4;
- state[12] += 4;
- }
- while (bytes >= CHACHA_BLOCK_SIZE) {
- chacha_block_xor_neon(state, dst, src, nrounds);
- bytes -= CHACHA_BLOCK_SIZE;
- src += CHACHA_BLOCK_SIZE;
- dst += CHACHA_BLOCK_SIZE;
- state[12]++;
+ while (bytes > CHACHA_BLOCK_SIZE) {
+ unsigned int l = min(bytes, CHACHA_BLOCK_SIZE * 4U);
+
+ chacha_4block_xor_neon(state, dst, src, nrounds, l);
+ bytes -= l;
+ src += l;
+ dst += l;
+ state[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
}
if (bytes) {
- memcpy(buf, src, bytes);
- chacha_block_xor_neon(state, buf, buf, nrounds);
- memcpy(dst, buf, bytes);
+ const u8 *s = src;
+ u8 *d = dst;
+
+ if (bytes != CHACHA_BLOCK_SIZE)
+ s = d = memcpy(buf, src, bytes);
+ chacha_block_xor_neon(state, d, s, nrounds);
+ if (d != dst)
+ memcpy(dst, buf, bytes);
}
}