| author    | Ard Biesheuvel <ardb@kernel.org>                                           | 2023-01-16 14:01:48 +0300 |
|-----------|----------------------------------------------------------------------------|---------------------------|
| committer | Russell King (Oracle) <rmk+kernel@armlinux.org.uk>                         | 2023-01-18 18:04:51 +0300 |
| commit    | b575b5a1e625b589ba1b1eb36c05fcca588cbc85 (patch)                           |                           |
| tree      | 6f6fe162d68873370effbe60f51494837e63bc3a /arch/arm/crypto/ghash-ce-core.S |                           |
| parent    | cdc3116f191ad4250f992d750b2d6f72fb98afde (diff)                            |                           |
| download  | linux-b575b5a1e625b589ba1b1eb36c05fcca588cbc85.tar.xz                      |                           |
ARM: 9286/1: crypto: Implement fused AES-CTR/GHASH version of GCM
On 32-bit ARM, AES in GCM mode takes full advantage of the ARMv8 Crypto
Extensions when available, resulting in a performance of 6-7 cycles per
byte for typical IPsec frames on cores such as Cortex-A53, using the
generic GCM template encapsulating the accelerated AES-CTR and GHASH
implementations.
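
For reference, this is the composition the sentence above refers to: before this patch, a "gcm(aes)" request is served by the generic gcm template wrapping the accelerated ctr(aes) and ghash drivers. The sketch below shows how such an AEAD is driven through the kernel crypto API from the caller's side; the function name, key length, tag size and zero associated-data length are illustrative only, not taken from the patch.

```c
/*
 * Minimal sketch of driving a GCM AEAD through the kernel crypto API.
 * The key length, 16-byte tag and empty AAD are illustrative only.
 */
#include <crypto/aead.h>
#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/scatterlist.h>

static int gcm_encrypt_one(const u8 *key, unsigned int keylen, u8 *iv,
			   struct scatterlist *src, struct scatterlist *dst,
			   unsigned int len)
{
	struct crypto_aead *tfm;
	struct aead_request *req;
	DECLARE_CRYPTO_WAIT(wait);
	int err;

	tfm = crypto_alloc_aead("gcm(aes)", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_aead_setkey(tfm, key, keylen) ?:
	      crypto_aead_setauthsize(tfm, 16);		/* 128-bit tag */
	if (err)
		goto out_free_tfm;

	req = aead_request_alloc(tfm, GFP_KERNEL);
	if (!req) {
		err = -ENOMEM;
		goto out_free_tfm;
	}

	aead_request_set_callback(req, 0, crypto_req_done, &wait);
	aead_request_set_ad(req, 0);			/* no associated data */
	aead_request_set_crypt(req, src, dst, len, iv);	/* 12-byte GCM IV */

	err = crypto_wait_req(crypto_aead_encrypt(req), &wait);

	aead_request_free(req);
out_free_tfm:
	crypto_free_aead(tfm);
	return err;
}
```

The fused driver introduced by this patch sits behind the same AEAD interface, so crypto API users like the sketch above only see the throughput change.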
At such high rates, any time spent copying data or doing other poorly
optimized work in the generic layer hurts disproportionately, and we can
get a significant performance improvement by combining the optimized
AES-CTR and GHASH implementations into a single GCM driver.
On Cortex-A53, this results in a performance improvement of around 75%,
and AES-256-GCM-128 with RFC4106 encapsulation runs in 4 cycles per
byte.
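For a rough sense of scale (an assumed clock, not a figure from the commit): at 1.0 GHz, 4 cycles per byte works out to about 1.0e9 / 4 = 250 MB/s of single-core AEAD throughput, against roughly 140-170 MB/s for the earlier 6-7 cycles per byte.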
Note that this code takes advantage of the fact that kernel mode NEON is
now supported in softirq context as well, and therefore does not provide
a non-NEON fallback path at all. (AEADs are only callable in process or
softirq context.)
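
To illustrate what that means for the calling side, here is a sketch of the bracketing pattern the paragraph describes: with kernel-mode NEON usable in softirq context, the AEAD entry points can wrap the NEON routines in kernel_neon_begin()/kernel_neon_end() unconditionally rather than keeping a scalar fallback. Only the pmull_gcm_encrypt() prototype comes from this patch; the wrapper name is hypothetical and not the actual glue code.

```c
#include <linux/linkage.h>	/* asmlinkage */
#include <linux/types.h>
#include <asm/neon.h>		/* kernel_neon_begin(), kernel_neon_end() */

struct gcm_key;

/* Prototype as documented in the .S file added by this patch. */
asmlinkage void pmull_gcm_encrypt(int blocks, u64 dg[], const char *src,
				  struct gcm_key const *k, char *dst,
				  char *iv, int rounds, u32 counter);

/* Hypothetical wrapper illustrating the calling pattern. */
static void gcm_encrypt_blocks(int blocks, u64 dg[], const char *src,
			       struct gcm_key const *k, char *dst,
			       char *iv, int rounds, u32 counter)
{
	/*
	 * Safe from both process and softirq context now that kernel-mode
	 * NEON may be used in softirqs, so no non-NEON path is provided.
	 */
	kernel_neon_begin();
	pmull_gcm_encrypt(blocks, dg, src, k, dst, iv, rounds, counter);
	kernel_neon_end();
}
```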
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Diffstat (limited to 'arch/arm/crypto/ghash-ce-core.S')
-rw-r--r-- | arch/arm/crypto/ghash-ce-core.S | 382
1 file changed, 369 insertions(+), 13 deletions(-)
diff --git a/arch/arm/crypto/ghash-ce-core.S b/arch/arm/crypto/ghash-ce-core.S
index 9f51e3fa4526..858c0d66798b 100644
--- a/arch/arm/crypto/ghash-ce-core.S
+++ b/arch/arm/crypto/ghash-ce-core.S
@@ -2,7 +2,8 @@
 /*
  * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
  *
- * Copyright (C) 2015 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2015 - 2017 Linaro Ltd.
+ * Copyright (C) 2023 Google LLC. <ardb@google.com>
  */
 
 #include <linux/linkage.h>
@@ -44,7 +45,7 @@
 	t2q		.req	q7
 	t3q		.req	q8
 	t4q		.req	q9
-	T2		.req	q9
+	XH2		.req	q9
 
 	s1l		.req	d20
 	s1h		.req	d21
@@ -80,7 +81,7 @@
 	XL2		.req	q5
 	XM2		.req	q6
-	XH2		.req	q7
+	T2		.req	q7
 	T3		.req	q8
 
 	XL2_L		.req	d10
@@ -192,9 +193,10 @@
 	vshr.u64	XL, XL, #1
 	.endm
 
-	.macro		ghash_update, pn
+	.macro		ghash_update, pn, enc, aggregate=1, head=1
 	vld1.64		{XL}, [r1]
 
+	.if		\head
 	/* do the head block first, if supplied */
 	ldr		ip, [sp]
 	teq		ip, #0
@@ -202,13 +204,32 @@
 	vld1.64		{T1}, [ip]
 	teq		r0, #0
 	b		3f
+	.endif
 
0:	.ifc		\pn, p64
+	.if		\aggregate
 	tst		r0, #3			// skip until #blocks is a
 	bne		2f			// round multiple of 4
 
 	vld1.8		{XL2-XM2}, [r2]!
-1:	vld1.8		{T3-T2}, [r2]!
+1:	vld1.8		{T2-T3}, [r2]!
+
+	.ifnb		\enc
+	\enc\()_4x	XL2, XM2, T2, T3
+
+	add		ip, r3, #16
+	vld1.64		{HH}, [ip, :128]!
+	vld1.64		{HH3-HH4}, [ip, :128]
+
+	veor		SHASH2_p64, SHASH_L, SHASH_H
+	veor		SHASH2_H, HH_L, HH_H
+	veor		HH34_L, HH3_L, HH3_H
+	veor		HH34_H, HH4_L, HH4_H
+
+	vmov.i8		MASK, #0xe1
+	vshl.u64	MASK, MASK, #57
+	.endif
+
 	vrev64.8	XL2, XL2
 	vrev64.8	XM2, XM2
@@ -218,8 +239,8 @@
 	veor		XL2_H, XL2_H, XL_L
 	veor		XL, XL, T1
 
-	vrev64.8	T3, T3
-	vrev64.8	T1, T2
+	vrev64.8	T1, T3
+	vrev64.8	T3, T2
 
 	vmull.p64	XH, HH4_H, XL_H		// a1 * b1
 	veor		XL2_H, XL2_H, XL_H
@@ -267,14 +288,22 @@
 	b		1b
 	.endif
+	.endif
+
+2:	vld1.8		{T1}, [r2]!
+
+	.ifnb		\enc
+	\enc\()_1x	T1
+	veor		SHASH2_p64, SHASH_L, SHASH_H
+	vmov.i8		MASK, #0xe1
+	vshl.u64	MASK, MASK, #57
+	.endif
 
-2:	vld1.64		{T1}, [r2]!
 	subs		r0, r0, #1
 
3:	/* multiply XL by SHASH in GF(2^128) */
-#ifndef CONFIG_CPU_BIG_ENDIAN
 	vrev64.8	T1, T1
-#endif
+
 	vext.8		IN1, T1, T1, #8
 	veor		T1_L, T1_L, XL_H
 	veor		XL, XL, IN1
@@ -293,9 +322,6 @@
 	veor		XL, XL, T1
 
 	bne		0b
-
-	vst1.64		{XL}, [r1]
-
-	bx		lr
 	.endm
 
 /*
@@ -316,6 +342,9 @@ ENTRY(pmull_ghash_update_p64)
 	vshl.u64	MASK, MASK, #57
 
 	ghash_update	p64
+	vst1.64		{XL}, [r1]
+
+	bx		lr
 ENDPROC(pmull_ghash_update_p64)
 
 ENTRY(pmull_ghash_update_p8)
@@ -336,4 +365,331 @@ ENTRY(pmull_ghash_update_p8)
 	vmov.i64	k48, #0xffffffffffff
 
 	ghash_update	p8
+	vst1.64		{XL}, [r1]
+
+	bx		lr
 ENDPROC(pmull_ghash_update_p8)
+
+	e0		.req	q9
+	e1		.req	q10
+	e2		.req	q11
+	e3		.req	q12
+	e0l		.req	d18
+	e0h		.req	d19
+	e2l		.req	d22
+	e2h		.req	d23
+	e3l		.req	d24
+	e3h		.req	d25
+	ctr		.req	q13
+	ctr0		.req	d26
+	ctr1		.req	d27
+
+	ek0		.req	q14
+	ek1		.req	q15
+
+	.macro		round, rk:req, regs:vararg
+	.irp		r, \regs
+	aese.8		\r, \rk
+	aesmc.8		\r, \r
+	.endr
+	.endm
+
+	.macro		aes_encrypt, rkp, rounds, regs:vararg
+	vld1.8		{ek0-ek1}, [\rkp, :128]!
+	cmp		\rounds, #12
+	blt		.L\@			// AES-128
+
+	round		ek0, \regs
+	vld1.8		{ek0}, [\rkp, :128]!
+	round		ek1, \regs
+	vld1.8		{ek1}, [\rkp, :128]!
+
+	beq		.L\@			// AES-192
+
+	round		ek0, \regs
+	vld1.8		{ek0}, [\rkp, :128]!
+	round		ek1, \regs
+	vld1.8		{ek1}, [\rkp, :128]!
+
+.L\@:	.rept		4
+	round		ek0, \regs
+	vld1.8		{ek0}, [\rkp, :128]!
+	round		ek1, \regs
+	vld1.8		{ek1}, [\rkp, :128]!
+	.endr
+
+	round		ek0, \regs
+	vld1.8		{ek0}, [\rkp, :128]
+
+	.irp		r, \regs
+	aese.8		\r, ek1
+	.endr
+	.irp		r, \regs
+	veor		\r, \r, ek0
+	.endr
+	.endm
+
+pmull_aes_encrypt:
+	add		ip, r5, #4
+	vld1.8		{ctr0}, [r5]		// load 12 byte IV
+	vld1.8		{ctr1}, [ip]
+	rev		r8, r7
+	vext.8		ctr1, ctr1, ctr1, #4
+	add		r7, r7, #1
+	vmov.32		ctr1[1], r8
+	vmov		e0, ctr
+
+	add		ip, r3, #64
+	aes_encrypt	ip, r6, e0
+	bx		lr
+ENDPROC(pmull_aes_encrypt)
+
+pmull_aes_encrypt_4x:
+	add		ip, r5, #4
+	vld1.8		{ctr0}, [r5]
+	vld1.8		{ctr1}, [ip]
+	rev		r8, r7
+	vext.8		ctr1, ctr1, ctr1, #4
+	add		r7, r7, #1
+	vmov.32		ctr1[1], r8
+	rev		ip, r7
+	vmov		e0, ctr
+	add		r7, r7, #1
+	vmov.32		ctr1[1], ip
+	rev		r8, r7
+	vmov		e1, ctr
+	add		r7, r7, #1
+	vmov.32		ctr1[1], r8
+	rev		ip, r7
+	vmov		e2, ctr
+	add		r7, r7, #1
+	vmov.32		ctr1[1], ip
+	vmov		e3, ctr
+
+	add		ip, r3, #64
+	aes_encrypt	ip, r6, e0, e1, e2, e3
+	bx		lr
+ENDPROC(pmull_aes_encrypt_4x)
+
+pmull_aes_encrypt_final:
+	add		ip, r5, #4
+	vld1.8		{ctr0}, [r5]
+	vld1.8		{ctr1}, [ip]
+	rev		r8, r7
+	vext.8		ctr1, ctr1, ctr1, #4
+	mov		r7, #1 << 24		// BE #1 for the tag
+	vmov.32		ctr1[1], r8
+	vmov		e0, ctr
+	vmov.32		ctr1[1], r7
+	vmov		e1, ctr
+
+	add		ip, r3, #64
+	aes_encrypt	ip, r6, e0, e1
+	bx		lr
+ENDPROC(pmull_aes_encrypt_final)
+
+	.macro		enc_1x, in0
+	bl		pmull_aes_encrypt
+	veor		\in0, \in0, e0
+	vst1.8		{\in0}, [r4]!
+	.endm
+
+	.macro		dec_1x, in0
+	bl		pmull_aes_encrypt
+	veor		e0, e0, \in0
+	vst1.8		{e0}, [r4]!
+	.endm
+
+	.macro		enc_4x, in0, in1, in2, in3
+	bl		pmull_aes_encrypt_4x
+
+	veor		\in0, \in0, e0
+	veor		\in1, \in1, e1
+	veor		\in2, \in2, e2
+	veor		\in3, \in3, e3
+
+	vst1.8		{\in0-\in1}, [r4]!
+	vst1.8		{\in2-\in3}, [r4]!
+	.endm
+
+	.macro		dec_4x, in0, in1, in2, in3
+	bl		pmull_aes_encrypt_4x
+
+	veor		e0, e0, \in0
+	veor		e1, e1, \in1
+	veor		e2, e2, \in2
+	veor		e3, e3, \in3
+
+	vst1.8		{e0-e1}, [r4]!
+	vst1.8		{e2-e3}, [r4]!
+	.endm
+
+	/*
+	 * void pmull_gcm_encrypt(int blocks, u64 dg[], const char *src,
+	 *			  struct gcm_key const *k, char *dst,
+	 *			  char *iv, int rounds, u32 counter)
+	 */
+ENTRY(pmull_gcm_encrypt)
+	push		{r4-r8, lr}
+	ldrd		r4, r5, [sp, #24]
+	ldrd		r6, r7, [sp, #32]
+
+	vld1.64		{SHASH}, [r3]
+
+	ghash_update	p64, enc, head=0
+	vst1.64		{XL}, [r1]
+
+	pop		{r4-r8, pc}
+ENDPROC(pmull_gcm_encrypt)
+
+	/*
+	 * void pmull_gcm_decrypt(int blocks, u64 dg[], const char *src,
+	 *			  struct gcm_key const *k, char *dst,
+	 *			  char *iv, int rounds, u32 counter)
+	 */
+ENTRY(pmull_gcm_decrypt)
+	push		{r4-r8, lr}
+	ldrd		r4, r5, [sp, #24]
+	ldrd		r6, r7, [sp, #32]
+
+	vld1.64		{SHASH}, [r3]
+
+	ghash_update	p64, dec, head=0
+	vst1.64		{XL}, [r1]
+
+	pop		{r4-r8, pc}
+ENDPROC(pmull_gcm_decrypt)
+
+	/*
+	 * void pmull_gcm_enc_final(int bytes, u64 dg[], char *tag,
+	 *			    struct gcm_key const *k, char *head,
+	 *			    char *iv, int rounds, u32 counter)
+	 */
+ENTRY(pmull_gcm_enc_final)
+	push		{r4-r8, lr}
+	ldrd		r4, r5, [sp, #24]
+	ldrd		r6, r7, [sp, #32]
+
+	bl		pmull_aes_encrypt_final
+
+	cmp		r0, #0
+	beq		.Lenc_final
+
+	mov_l		ip, .Lpermute
+	sub		r4, r4, #16
+	add		r8, ip, r0
+	add		ip, ip, #32
+	add		r4, r4, r0
+	sub		ip, ip, r0
+
+	vld1.8		{e3}, [r8]		// permute vector for key stream
+	vld1.8		{e2}, [ip]		// permute vector for ghash input
+
+	vtbl.8		e3l, {e0}, e3l
+	vtbl.8		e3h, {e0}, e3h
+
+	vld1.8		{e0}, [r4]		// encrypt tail block
+	veor		e0, e0, e3
+	vst1.8		{e0}, [r4]
+
+	vtbl.8		T1_L, {e0}, e2l
+	vtbl.8		T1_H, {e0}, e2h
+
+	vld1.64		{XL}, [r1]
+.Lenc_final:
+	vld1.64		{SHASH}, [r3, :128]
+	vmov.i8		MASK, #0xe1
+	veor		SHASH2_p64, SHASH_L, SHASH_H
+	vshl.u64	MASK, MASK, #57
+	mov		r0, #1
+	bne		3f			// process head block first
+	ghash_update	p64, aggregate=0, head=0
+
+	vrev64.8	XL, XL
+	vext.8		XL, XL, XL, #8
+	veor		XL, XL, e1
+
+	sub		r2, r2, #16		// rewind src pointer
+	vst1.8		{XL}, [r2]		// store tag
+
+	pop		{r4-r8, pc}
+ENDPROC(pmull_gcm_enc_final)
+
+	/*
+	 * int pmull_gcm_dec_final(int bytes, u64 dg[], char *tag,
+	 *			   struct gcm_key const *k, char *head,
+	 *			   char *iv, int rounds, u32 counter,
+	 *			   const char *otag, int authsize)
+	 */
+ENTRY(pmull_gcm_dec_final)
+	push		{r4-r8, lr}
+	ldrd		r4, r5, [sp, #24]
+	ldrd		r6, r7, [sp, #32]
+
+	bl		pmull_aes_encrypt_final
+
+	cmp		r0, #0
+	beq		.Ldec_final
+
+	mov_l		ip, .Lpermute
+	sub		r4, r4, #16
+	add		r8, ip, r0
+	add		ip, ip, #32
+	add		r4, r4, r0
+	sub		ip, ip, r0
+
+	vld1.8		{e3}, [r8]		// permute vector for key stream
+	vld1.8		{e2}, [ip]		// permute vector for ghash input
+
+	vtbl.8		e3l, {e0}, e3l
+	vtbl.8		e3h, {e0}, e3h
+
+	vld1.8		{e0}, [r4]
+
+	vtbl.8		T1_L, {e0}, e2l
+	vtbl.8		T1_H, {e0}, e2h
+
+	veor		e0, e0, e3
+	vst1.8		{e0}, [r4]
+
+	vld1.64		{XL}, [r1]
+.Ldec_final:
+	vld1.64		{SHASH}, [r3]
+	vmov.i8		MASK, #0xe1
+	veor		SHASH2_p64, SHASH_L, SHASH_H
+	vshl.u64	MASK, MASK, #57
+	mov		r0, #1
+	bne		3f			// process head block first
+	ghash_update	p64, aggregate=0, head=0
+
+	vrev64.8	XL, XL
+	vext.8		XL, XL, XL, #8
+	veor		XL, XL, e1
+
+	mov_l		ip, .Lpermute
+	ldrd		r2, r3, [sp, #40]	// otag and authsize
+	vld1.8		{T1}, [r2]
+	add		ip, ip, r3
+	vceq.i8		T1, T1, XL		// compare tags
+	vmvn		T1, T1			// 0 for eq, -1 for ne
+
+	vld1.8		{e0}, [ip]
+	vtbl.8		XL_L, {T1}, e0l		// keep authsize bytes only
+	vtbl.8		XL_H, {T1}, e0h
+
+	vpmin.s8	XL_L, XL_L, XL_H	// take the minimum s8 across the vector
+	vpmin.s8	XL_L, XL_L, XL_L
+	vmov.32		r0, XL_L[0]		// fail if != 0x0
+
+	pop		{r4-r8, pc}
+ENDPROC(pmull_gcm_dec_final)
+
+	.section	".rodata", "a", %progbits
+	.align		5
+.Lpermute:
+	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff