From 4d6d6a2c850f89bc9283d02519cb536baba72032 Mon Sep 17 00:00:00 2001 From: Johannes Goetzfried Date: Wed, 11 Jul 2012 19:37:37 +0200 Subject: crypto: cast5 - add x86_64/avx assembler implementation This patch adds a x86_64/avx assembler implementation of the Cast5 block cipher. The implementation processes sixteen blocks in parallel (four 4 block chunk AVX operations). The table-lookups are done in general-purpose registers. For small blocksizes the functions from the generic module are called. A good performance increase is provided for blocksizes greater or equal to 128B. Patch has been tested with tcrypt and automated filesystem tests. Tcrypt benchmark results: Intel Core i5-2500 CPU (fam:6, model:42, step:7) cast5-avx-x86_64 vs. cast5-generic 64bit key: size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec 16B 0.99x 0.99x 1.00x 1.00x 1.02x 1.01x 64B 1.00x 1.00x 0.98x 1.00x 1.01x 1.02x 256B 2.03x 2.01x 0.95x 2.11x 2.12x 2.13x 1024B 2.30x 2.24x 0.95x 2.29x 2.35x 2.35x 8192B 2.31x 2.27x 0.95x 2.31x 2.39x 2.39x 128bit key: size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec 16B 0.99x 0.99x 1.00x 1.00x 1.01x 1.01x 64B 1.00x 1.00x 0.98x 1.01x 1.02x 1.01x 256B 2.17x 2.13x 0.96x 2.19x 2.19x 2.19x 1024B 2.29x 2.32x 0.95x 2.34x 2.37x 2.38x 8192B 2.35x 2.32x 0.95x 2.35x 2.39x 2.39x Signed-off-by: Johannes Goetzfried Signed-off-by: Herbert Xu --- crypto/Kconfig | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'crypto/Kconfig') diff --git a/crypto/Kconfig b/crypto/Kconfig index a3238051b03e..cda97fcaa822 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -692,6 +692,20 @@ config CRYPTO_CAST5 The CAST5 encryption algorithm (synonymous with CAST-128) is described in RFC2144. +config CRYPTO_CAST5_AVX_X86_64 + tristate "CAST5 (CAST-128) cipher algorithm (x86_64/AVX)" + depends on X86 && 64BIT + select CRYPTO_ALGAPI + select CRYPTO_CRYPTD + select CRYPTO_ABLK_HELPER_X86 + select CRYPTO_CAST5 + help + The CAST5 encryption algorithm (synonymous with CAST-128) is + described in RFC2144. + + This module provides the Cast5 cipher algorithm that processes + sixteen blocks parallel using the AVX instruction set. + config CRYPTO_CAST6 tristate "CAST6 (CAST-256) cipher algorithm" select CRYPTO_ALGAPI -- cgit v1.2.3 From 4ea1277d301eb776e321684cd4ea95116b4e8847 Mon Sep 17 00:00:00 2001 From: Johannes Goetzfried Date: Wed, 11 Jul 2012 19:38:57 +0200 Subject: crypto: cast6 - add x86_64/avx assembler implementation This patch adds a x86_64/avx assembler implementation of the Cast6 block cipher. The implementation processes eight blocks in parallel (two 4 block chunk AVX operations). The table-lookups are done in general-purpose registers. For small blocksizes the functions from the generic module are called. A good performance increase is provided for blocksizes greater or equal to 128B. Patch has been tested with tcrypt and automated filesystem tests. Tcrypt benchmark results: Intel Core i5-2500 CPU (fam:6, model:42, step:7) cast6-avx-x86_64 vs. cast6-generic 128bit key: (lrw:256bit) (xts:256bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.97x 1.00x 1.01x 1.01x 0.99x 0.97x 0.98x 1.01x 0.96x 0.98x 64B 0.98x 0.99x 1.02x 1.01x 0.99x 1.00x 1.01x 0.99x 1.00x 0.99x 256B 1.77x 1.84x 0.99x 1.85x 1.77x 1.77x 1.70x 1.74x 1.69x 1.72x 1024B 1.93x 1.95x 0.99x 1.96x 1.93x 1.93x 1.84x 1.85x 1.89x 1.87x 8192B 1.91x 1.95x 0.99x 1.97x 1.95x 1.91x 1.86x 1.87x 1.93x 1.90x 256bit key: (lrw:384bit) (xts:512bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.97x 0.99x 1.02x 1.01x 0.98x 0.99x 1.00x 1.00x 0.98x 0.98x 64B 0.98x 0.99x 1.01x 1.00x 1.00x 1.00x 1.01x 1.01x 0.97x 1.00x 256B 1.77x 1.83x 1.00x 1.86x 1.79x 1.78x 1.70x 1.76x 1.71x 1.69x 1024B 1.92x 1.95x 0.99x 1.96x 1.93x 1.93x 1.83x 1.86x 1.89x 1.87x 8192B 1.94x 1.95x 0.99x 1.97x 1.95x 1.95x 1.87x 1.87x 1.93x 1.91x Signed-off-by: Johannes Goetzfried Signed-off-by: Herbert Xu --- arch/x86/crypto/Makefile | 2 + arch/x86/crypto/cast6-avx-x86_64-asm_64.S | 335 +++++++++++++++ arch/x86/crypto/cast6_avx_glue.c | 648 ++++++++++++++++++++++++++++++ crypto/Kconfig | 17 + crypto/testmgr.c | 60 +++ 5 files changed, 1062 insertions(+) create mode 100644 arch/x86/crypto/cast6-avx-x86_64-asm_64.S create mode 100644 arch/x86/crypto/cast6_avx_glue.c (limited to 'crypto/Kconfig') diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 565e82b00142..5bacb4a226ac 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -13,6 +13,7 @@ obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o +obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o @@ -34,6 +35,7 @@ serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o +cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S new file mode 100644 index 000000000000..d258ce0d2e06 --- /dev/null +++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S @@ -0,0 +1,335 @@ +/* + * Cast6 Cipher 8-way parallel algorithm (AVX/x86_64) + * + * Copyright (C) 2012 Johannes Goetzfried + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * + */ + +.file "cast6-avx-x86_64-asm_64.S" +.text + +.extern cast6_s1 +.extern cast6_s2 +.extern cast6_s3 +.extern cast6_s4 + +/* structure of crypto context */ +#define km 0 +#define kr (12*4*4) + +/* s-boxes */ +#define s1 cast6_s1 +#define s2 cast6_s2 +#define s3 cast6_s3 +#define s4 cast6_s4 + +/********************************************************************** + 8-way AVX cast6 + **********************************************************************/ +#define CTX %rdi + +#define RA1 %xmm0 +#define RB1 %xmm1 +#define RC1 %xmm2 +#define RD1 %xmm3 + +#define RA2 %xmm4 +#define RB2 %xmm5 +#define RC2 %xmm6 +#define RD2 %xmm7 + +#define RX %xmm8 + +#define RKM %xmm9 +#define RKRF %xmm10 +#define RKRR %xmm11 + +#define RTMP %xmm12 +#define RMASK %xmm13 +#define R32 %xmm14 + +#define RID1 %rax +#define RID1b %al +#define RID2 %rbx +#define RID2b %bl + +#define RGI1 %rdx +#define RGI1bl %dl +#define RGI1bh %dh +#define RGI2 %rcx +#define RGI2bl %cl +#define RGI2bh %ch + +#define RFS1 %r8 +#define RFS1d %r8d +#define RFS2 %r9 +#define RFS2d %r9d +#define RFS3 %r10 +#define RFS3d %r10d + + +#define lookup_32bit(src, dst, op1, op2, op3) \ + movb src ## bl, RID1b; \ + movb src ## bh, RID2b; \ + movl s1(, RID1, 4), dst ## d; \ + op1 s2(, RID2, 4), dst ## d; \ + shrq $16, src; \ + movb src ## bl, RID1b; \ + movb src ## bh, RID2b; \ + op2 s3(, RID1, 4), dst ## d; \ + op3 s4(, RID2, 4), dst ## d; + +#define F(a, x, op0, op1, op2, op3) \ + op0 a, RKM, x; \ + vpslld RKRF, x, RTMP; \ + vpsrld RKRR, x, x; \ + vpor RTMP, x, x; \ + \ + vpshufb RMASK, x, x; \ + vmovq x, RGI1; \ + vpsrldq $8, x, x; \ + vmovq x, RGI2; \ + \ + lookup_32bit(RGI1, RFS1, op1, op2, op3); \ + shrq $16, RGI1; \ + lookup_32bit(RGI1, RFS2, op1, op2, op3); \ + shlq $32, RFS2; \ + orq RFS1, RFS2; \ + \ + lookup_32bit(RGI2, RFS1, op1, op2, op3); \ + shrq $16, RGI2; \ + lookup_32bit(RGI2, RFS3, op1, op2, op3); \ + shlq $32, RFS3; \ + orq RFS1, RFS3; \ + \ + vmovq RFS2, x; \ + vpinsrq $1, RFS3, x, x; + +#define F1(b, x) F(b, x, vpaddd, xorl, subl, addl) +#define F2(b, x) F(b, x, vpxor, subl, addl, xorl) +#define F3(b, x) F(b, x, vpsubd, addl, xorl, subl) + +#define qop(in, out, x, f) \ + F ## f(in ## 1, x); \ + vpxor out ## 1, x, out ## 1; \ + F ## f(in ## 2, x); \ + vpxor out ## 2, x, out ## 2; \ + +#define Q(n) \ + vbroadcastss (km+(4*(4*n+0)))(CTX), RKM; \ + vpinsrb $0, (kr+(4*n+0))(CTX), RKRF, RKRF; \ + vpsubq RKRF, R32, RKRR; \ + qop(RD, RC, RX, 1); \ + \ + vbroadcastss (km+(4*(4*n+1)))(CTX), RKM; \ + vpinsrb $0, (kr+(4*n+1))(CTX), RKRF, RKRF; \ + vpsubq RKRF, R32, RKRR; \ + qop(RC, RB, RX, 2); \ + \ + vbroadcastss (km+(4*(4*n+2)))(CTX), RKM; \ + vpinsrb $0, (kr+(4*n+2))(CTX), RKRF, RKRF; \ + vpsubq RKRF, R32, RKRR; \ + qop(RB, RA, RX, 3); \ + \ + vbroadcastss (km+(4*(4*n+3)))(CTX), RKM; \ + vpinsrb $0, (kr+(4*n+3))(CTX), RKRF, RKRF; \ + vpsubq RKRF, R32, RKRR; \ + qop(RA, RD, RX, 1); + +#define QBAR(n) \ + vbroadcastss (km+(4*(4*n+3)))(CTX), RKM; \ + vpinsrb $0, (kr+(4*n+3))(CTX), RKRF, RKRF; \ + vpsubq RKRF, R32, RKRR; \ + qop(RA, RD, RX, 1); \ + \ + vbroadcastss (km+(4*(4*n+2)))(CTX), RKM; \ + vpinsrb $0, (kr+(4*n+2))(CTX), RKRF, RKRF; \ + vpsubq RKRF, R32, RKRR; \ + qop(RB, RA, RX, 3); \ + \ + vbroadcastss (km+(4*(4*n+1)))(CTX), RKM; \ + vpinsrb $0, (kr+(4*n+1))(CTX), RKRF, RKRF; \ + vpsubq RKRF, R32, RKRR; \ + qop(RC, RB, RX, 2); \ + \ + vbroadcastss (km+(4*(4*n+0)))(CTX), RKM; \ + vpinsrb $0, (kr+(4*n+0))(CTX), RKRF, RKRF; \ + vpsubq RKRF, R32, RKRR; \ + qop(RD, RC, RX, 1); + + +#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ + vpunpckldq x1, x0, t0; \ + vpunpckhdq x1, x0, t2; \ + vpunpckldq x3, x2, t1; \ + vpunpckhdq x3, x2, x3; \ + \ + vpunpcklqdq t1, t0, x0; \ + vpunpckhqdq t1, t0, x1; \ + vpunpcklqdq x3, t2, x2; \ + vpunpckhqdq x3, t2, x3; + +#define inpack_blocks(in, x0, x1, x2, x3, t0, t1, t2) \ + vmovdqu (0*4*4)(in), x0; \ + vmovdqu (1*4*4)(in), x1; \ + vmovdqu (2*4*4)(in), x2; \ + vmovdqu (3*4*4)(in), x3; \ + vpshufb RMASK, x0, x0; \ + vpshufb RMASK, x1, x1; \ + vpshufb RMASK, x2, x2; \ + vpshufb RMASK, x3, x3; \ + \ + transpose_4x4(x0, x1, x2, x3, t0, t1, t2) + +#define outunpack_blocks(out, x0, x1, x2, x3, t0, t1, t2) \ + transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ + \ + vpshufb RMASK, x0, x0; \ + vpshufb RMASK, x1, x1; \ + vpshufb RMASK, x2, x2; \ + vpshufb RMASK, x3, x3; \ + vmovdqu x0, (0*4*4)(out); \ + vmovdqu x1, (1*4*4)(out); \ + vmovdqu x2, (2*4*4)(out); \ + vmovdqu x3, (3*4*4)(out); + +#define outunpack_xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \ + transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ + \ + vpshufb RMASK, x0, x0; \ + vpshufb RMASK, x1, x1; \ + vpshufb RMASK, x2, x2; \ + vpshufb RMASK, x3, x3; \ + vpxor (0*4*4)(out), x0, x0; \ + vmovdqu x0, (0*4*4)(out); \ + vpxor (1*4*4)(out), x1, x1; \ + vmovdqu x1, (1*4*4)(out); \ + vpxor (2*4*4)(out), x2, x2; \ + vmovdqu x2, (2*4*4)(out); \ + vpxor (3*4*4)(out), x3, x3; \ + vmovdqu x3, (3*4*4)(out); + +.align 16 +.Lbswap_mask: + .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 +.L32_mask: + .byte 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0, 0, 0, 0, 0 + +.align 16 +.global __cast6_enc_blk_8way +.type __cast6_enc_blk_8way,@function; + +__cast6_enc_blk_8way: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + * %rcx: bool, if true: xor output + */ + + pushq %rbx; + pushq %rcx; + + vmovdqu .Lbswap_mask, RMASK; + vmovdqu .L32_mask, R32; + vpxor RKRF, RKRF, RKRF; + + leaq (4*4*4)(%rdx), %rax; + inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKM); + inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM); + + xorq RID1, RID1; + xorq RID2, RID2; + + Q(0); + Q(1); + Q(2); + Q(3); + Q(4); + Q(5); + QBAR(6); + QBAR(7); + QBAR(8); + QBAR(9); + QBAR(10); + QBAR(11); + + popq %rcx; + popq %rbx; + + leaq (4*4*4)(%rsi), %rax; + + testb %cl, %cl; + jnz __enc_xor8; + + outunpack_blocks(%rsi, RA1, RB1, RC1, RD1, RTMP, RX, RKM); + outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM); + + ret; + +__enc_xor8: + outunpack_xor_blocks(%rsi, RA1, RB1, RC1, RD1, RTMP, RX, RKM); + outunpack_xor_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM); + + ret; + +.align 16 +.global cast6_dec_blk_8way +.type cast6_dec_blk_8way,@function; + +cast6_dec_blk_8way: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + pushq %rbx; + + vmovdqu .Lbswap_mask, RMASK; + vmovdqu .L32_mask, R32; + vpxor RKRF, RKRF, RKRF; + + leaq (4*4*4)(%rdx), %rax; + inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKM); + inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM); + + xorq RID1, RID1; + xorq RID2, RID2; + + Q(11); + Q(10); + Q(9); + Q(8); + Q(7); + Q(6); + QBAR(5); + QBAR(4); + QBAR(3); + QBAR(2); + QBAR(1); + QBAR(0); + + popq %rbx; + + leaq (4*4*4)(%rsi), %rax; + outunpack_blocks(%rsi, RA1, RB1, RC1, RD1, RTMP, RX, RKM); + outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM); + + ret; diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c new file mode 100644 index 000000000000..15e5f85a5011 --- /dev/null +++ b/arch/x86/crypto/cast6_avx_glue.c @@ -0,0 +1,648 @@ +/* + * Glue Code for the AVX assembler implemention of the Cast6 Cipher + * + * Copyright (C) 2012 Johannes Goetzfried + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CAST6_PARALLEL_BLOCKS 8 + +asmlinkage void __cast6_enc_blk_8way(struct cast6_ctx *ctx, u8 *dst, + const u8 *src, bool xor); +asmlinkage void cast6_dec_blk_8way(struct cast6_ctx *ctx, u8 *dst, + const u8 *src); + +static inline void cast6_enc_blk_xway(struct cast6_ctx *ctx, u8 *dst, + const u8 *src) +{ + __cast6_enc_blk_8way(ctx, dst, src, false); +} + +static inline void cast6_enc_blk_xway_xor(struct cast6_ctx *ctx, u8 *dst, + const u8 *src) +{ + __cast6_enc_blk_8way(ctx, dst, src, true); +} + +static inline void cast6_dec_blk_xway(struct cast6_ctx *ctx, u8 *dst, + const u8 *src) +{ + cast6_dec_blk_8way(ctx, dst, src); +} + + +static void cast6_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src) +{ + u128 ivs[CAST6_PARALLEL_BLOCKS - 1]; + unsigned int j; + + for (j = 0; j < CAST6_PARALLEL_BLOCKS - 1; j++) + ivs[j] = src[j]; + + cast6_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src); + + for (j = 0; j < CAST6_PARALLEL_BLOCKS - 1; j++) + u128_xor(dst + (j + 1), dst + (j + 1), ivs + j); +} + +static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv) +{ + be128 ctrblk; + + u128_to_be128(&ctrblk, iv); + u128_inc(iv); + + __cast6_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); + u128_xor(dst, src, (u128 *)&ctrblk); +} + +static void cast6_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src, + u128 *iv) +{ + be128 ctrblks[CAST6_PARALLEL_BLOCKS]; + unsigned int i; + + for (i = 0; i < CAST6_PARALLEL_BLOCKS; i++) { + if (dst != src) + dst[i] = src[i]; + + u128_to_be128(&ctrblks[i], iv); + u128_inc(iv); + } + + cast6_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks); +} + +static const struct common_glue_ctx cast6_enc = { + .num_funcs = 2, + .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = CAST6_PARALLEL_BLOCKS, + .fn_u = { .ecb = GLUE_FUNC_CAST(cast6_enc_blk_xway) } + }, { + .num_blocks = 1, + .fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_encrypt) } + } } +}; + +static const struct common_glue_ctx cast6_ctr = { + .num_funcs = 2, + .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = CAST6_PARALLEL_BLOCKS, + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_crypt_ctr_xway) } + }, { + .num_blocks = 1, + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_crypt_ctr) } + } } +}; + +static const struct common_glue_ctx cast6_dec = { + .num_funcs = 2, + .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = CAST6_PARALLEL_BLOCKS, + .fn_u = { .ecb = GLUE_FUNC_CAST(cast6_dec_blk_xway) } + }, { + .num_blocks = 1, + .fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_decrypt) } + } } +}; + +static const struct common_glue_ctx cast6_dec_cbc = { + .num_funcs = 2, + .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = CAST6_PARALLEL_BLOCKS, + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(cast6_decrypt_cbc_xway) } + }, { + .num_blocks = 1, + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__cast6_decrypt) } + } } +}; + +static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + return glue_ecb_crypt_128bit(&cast6_enc, desc, dst, src, nbytes); +} + +static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + return glue_ecb_crypt_128bit(&cast6_dec, desc, dst, src, nbytes); +} + +static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(__cast6_encrypt), desc, + dst, src, nbytes); +} + +static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + return glue_cbc_decrypt_128bit(&cast6_dec_cbc, desc, dst, src, + nbytes); +} + +static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + return glue_ctr_crypt_128bit(&cast6_ctr, desc, dst, src, nbytes); +} + +static inline bool cast6_fpu_begin(bool fpu_enabled, unsigned int nbytes) +{ + return glue_fpu_begin(CAST6_BLOCK_SIZE, CAST6_PARALLEL_BLOCKS, + NULL, fpu_enabled, nbytes); +} + +static inline void cast6_fpu_end(bool fpu_enabled) +{ + glue_fpu_end(fpu_enabled); +} + +struct crypt_priv { + struct cast6_ctx *ctx; + bool fpu_enabled; +}; + +static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +{ + const unsigned int bsize = CAST6_BLOCK_SIZE; + struct crypt_priv *ctx = priv; + int i; + + ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes); + + if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) { + cast6_enc_blk_xway(ctx->ctx, srcdst, srcdst); + return; + } + + for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) + __cast6_encrypt(ctx->ctx, srcdst, srcdst); +} + +static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +{ + const unsigned int bsize = CAST6_BLOCK_SIZE; + struct crypt_priv *ctx = priv; + int i; + + ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes); + + if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) { + cast6_dec_blk_xway(ctx->ctx, srcdst, srcdst); + return; + } + + for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) + __cast6_decrypt(ctx->ctx, srcdst, srcdst); +} + +struct cast6_lrw_ctx { + struct lrw_table_ctx lrw_table; + struct cast6_ctx cast6_ctx; +}; + +static int lrw_cast6_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen) +{ + struct cast6_lrw_ctx *ctx = crypto_tfm_ctx(tfm); + int err; + + err = __cast6_setkey(&ctx->cast6_ctx, key, keylen - CAST6_BLOCK_SIZE, + &tfm->crt_flags); + if (err) + return err; + + return lrw_init_table(&ctx->lrw_table, key + keylen - CAST6_BLOCK_SIZE); +} + +static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct cast6_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[CAST6_PARALLEL_BLOCKS]; + struct crypt_priv crypt_ctx = { + .ctx = &ctx->cast6_ctx, + .fpu_enabled = false, + }; + struct lrw_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .table_ctx = &ctx->lrw_table, + .crypt_ctx = &crypt_ctx, + .crypt_fn = encrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + ret = lrw_crypt(desc, dst, src, nbytes, &req); + cast6_fpu_end(crypt_ctx.fpu_enabled); + + return ret; +} + +static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct cast6_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[CAST6_PARALLEL_BLOCKS]; + struct crypt_priv crypt_ctx = { + .ctx = &ctx->cast6_ctx, + .fpu_enabled = false, + }; + struct lrw_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .table_ctx = &ctx->lrw_table, + .crypt_ctx = &crypt_ctx, + .crypt_fn = decrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + ret = lrw_crypt(desc, dst, src, nbytes, &req); + cast6_fpu_end(crypt_ctx.fpu_enabled); + + return ret; +} + +static void lrw_exit_tfm(struct crypto_tfm *tfm) +{ + struct cast6_lrw_ctx *ctx = crypto_tfm_ctx(tfm); + + lrw_free_table(&ctx->lrw_table); +} + +struct cast6_xts_ctx { + struct cast6_ctx tweak_ctx; + struct cast6_ctx crypt_ctx; +}; + +static int xts_cast6_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen) +{ + struct cast6_xts_ctx *ctx = crypto_tfm_ctx(tfm); + u32 *flags = &tfm->crt_flags; + int err; + + /* key consists of keys of equal size concatenated, therefore + * the length must be even + */ + if (keylen % 2) { + *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + return -EINVAL; + } + + /* first half of xts-key is for crypt */ + err = __cast6_setkey(&ctx->crypt_ctx, key, keylen / 2, flags); + if (err) + return err; + + /* second half of xts-key is for tweak */ + return __cast6_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2, + flags); +} + +static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct cast6_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[CAST6_PARALLEL_BLOCKS]; + struct crypt_priv crypt_ctx = { + .ctx = &ctx->crypt_ctx, + .fpu_enabled = false, + }; + struct xts_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .tweak_ctx = &ctx->tweak_ctx, + .tweak_fn = XTS_TWEAK_CAST(__cast6_encrypt), + .crypt_ctx = &crypt_ctx, + .crypt_fn = encrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + ret = xts_crypt(desc, dst, src, nbytes, &req); + cast6_fpu_end(crypt_ctx.fpu_enabled); + + return ret; +} + +static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct cast6_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[CAST6_PARALLEL_BLOCKS]; + struct crypt_priv crypt_ctx = { + .ctx = &ctx->crypt_ctx, + .fpu_enabled = false, + }; + struct xts_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .tweak_ctx = &ctx->tweak_ctx, + .tweak_fn = XTS_TWEAK_CAST(__cast6_encrypt), + .crypt_ctx = &crypt_ctx, + .crypt_fn = decrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + ret = xts_crypt(desc, dst, src, nbytes, &req); + cast6_fpu_end(crypt_ctx.fpu_enabled); + + return ret; +} + +static struct crypto_alg cast6_algs[10] = { { + .cra_name = "__ecb-cast6-avx", + .cra_driver_name = "__driver-ecb-cast6-avx", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = CAST6_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct cast6_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = CAST6_MIN_KEY_SIZE, + .max_keysize = CAST6_MAX_KEY_SIZE, + .setkey = cast6_setkey, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, + }, +}, { + .cra_name = "__cbc-cast6-avx", + .cra_driver_name = "__driver-cbc-cast6-avx", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = CAST6_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct cast6_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = CAST6_MIN_KEY_SIZE, + .max_keysize = CAST6_MAX_KEY_SIZE, + .setkey = cast6_setkey, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, + }, +}, { + .cra_name = "__ctr-cast6-avx", + .cra_driver_name = "__driver-ctr-cast6-avx", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct cast6_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = CAST6_MIN_KEY_SIZE, + .max_keysize = CAST6_MAX_KEY_SIZE, + .ivsize = CAST6_BLOCK_SIZE, + .setkey = cast6_setkey, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + }, + }, +}, { + .cra_name = "__lrw-cast6-avx", + .cra_driver_name = "__driver-lrw-cast6-avx", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = CAST6_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct cast6_lrw_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_exit = lrw_exit_tfm, + .cra_u = { + .blkcipher = { + .min_keysize = CAST6_MIN_KEY_SIZE + + CAST6_BLOCK_SIZE, + .max_keysize = CAST6_MAX_KEY_SIZE + + CAST6_BLOCK_SIZE, + .ivsize = CAST6_BLOCK_SIZE, + .setkey = lrw_cast6_setkey, + .encrypt = lrw_encrypt, + .decrypt = lrw_decrypt, + }, + }, +}, { + .cra_name = "__xts-cast6-avx", + .cra_driver_name = "__driver-xts-cast6-avx", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = CAST6_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct cast6_xts_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = CAST6_MIN_KEY_SIZE * 2, + .max_keysize = CAST6_MAX_KEY_SIZE * 2, + .ivsize = CAST6_BLOCK_SIZE, + .setkey = xts_cast6_setkey, + .encrypt = xts_encrypt, + .decrypt = xts_decrypt, + }, + }, +}, { + .cra_name = "ecb(cast6)", + .cra_driver_name = "ecb-cast6-avx", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = CAST6_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = CAST6_MIN_KEY_SIZE, + .max_keysize = CAST6_MAX_KEY_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}, { + .cra_name = "cbc(cast6)", + .cra_driver_name = "cbc-cast6-avx", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = CAST6_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = CAST6_MIN_KEY_SIZE, + .max_keysize = CAST6_MAX_KEY_SIZE, + .ivsize = CAST6_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = __ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}, { + .cra_name = "ctr(cast6)", + .cra_driver_name = "ctr-cast6-avx", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = CAST6_MIN_KEY_SIZE, + .max_keysize = CAST6_MAX_KEY_SIZE, + .ivsize = CAST6_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_encrypt, + .geniv = "chainiv", + }, + }, +}, { + .cra_name = "lrw(cast6)", + .cra_driver_name = "lrw-cast6-avx", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = CAST6_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = CAST6_MIN_KEY_SIZE + + CAST6_BLOCK_SIZE, + .max_keysize = CAST6_MAX_KEY_SIZE + + CAST6_BLOCK_SIZE, + .ivsize = CAST6_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}, { + .cra_name = "xts(cast6)", + .cra_driver_name = "xts-cast6-avx", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = CAST6_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = CAST6_MIN_KEY_SIZE * 2, + .max_keysize = CAST6_MAX_KEY_SIZE * 2, + .ivsize = CAST6_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +} }; + +static int __init cast6_init(void) +{ + u64 xcr0; + + if (!cpu_has_avx || !cpu_has_osxsave) { + pr_info("AVX instructions are not detected.\n"); + return -ENODEV; + } + + xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { + pr_info("AVX detected but unusable.\n"); + return -ENODEV; + } + + return crypto_register_algs(cast6_algs, ARRAY_SIZE(cast6_algs)); +} + +static void __exit cast6_exit(void) +{ + crypto_unregister_algs(cast6_algs, ARRAY_SIZE(cast6_algs)); +} + +module_init(cast6_init); +module_exit(cast6_exit); + +MODULE_DESCRIPTION("Cast6 Cipher Algorithm, AVX optimized"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("cast6"); diff --git a/crypto/Kconfig b/crypto/Kconfig index cda97fcaa822..fe8ed62efe2f 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -713,6 +713,23 @@ config CRYPTO_CAST6 The CAST6 encryption algorithm (synonymous with CAST-256) is described in RFC2612. +config CRYPTO_CAST6_AVX_X86_64 + tristate "CAST6 (CAST-256) cipher algorithm (x86_64/AVX)" + depends on X86 && 64BIT + select CRYPTO_ALGAPI + select CRYPTO_CRYPTD + select CRYPTO_ABLK_HELPER_X86 + select CRYPTO_GLUE_HELPER_X86 + select CRYPTO_CAST6 + select CRYPTO_LRW + select CRYPTO_XTS + help + The CAST6 encryption algorithm (synonymous with CAST-256) is + described in RFC2612. + + This module provides the Cast6 cipher algorithm that processes + eight blocks parallel using the AVX instruction set. + config CRYPTO_DES tristate "DES and Triple DES EDE cipher algorithms" select CRYPTO_ALGAPI diff --git a/crypto/testmgr.c b/crypto/testmgr.c index cff3c1c3f83c..575b57c3244b 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -1548,6 +1548,21 @@ static const struct alg_test_desc alg_test_descs[] = { } } } + }, { + .alg = "__cbc-cast6-avx", + .test = alg_test_null, + .suite = { + .cipher = { + .enc = { + .vecs = NULL, + .count = 0 + }, + .dec = { + .vecs = NULL, + .count = 0 + } + } + } }, { .alg = "__cbc-serpent-avx", .test = alg_test_null, @@ -1624,6 +1639,21 @@ static const struct alg_test_desc alg_test_descs[] = { } } } + }, { + .alg = "__driver-cbc-cast6-avx", + .test = alg_test_null, + .suite = { + .cipher = { + .enc = { + .vecs = NULL, + .count = 0 + }, + .dec = { + .vecs = NULL, + .count = 0 + } + } + } }, { .alg = "__driver-cbc-serpent-avx", .test = alg_test_null, @@ -1700,6 +1730,21 @@ static const struct alg_test_desc alg_test_descs[] = { } } } + }, { + .alg = "__driver-ecb-cast6-avx", + .test = alg_test_null, + .suite = { + .cipher = { + .enc = { + .vecs = NULL, + .count = 0 + }, + .dec = { + .vecs = NULL, + .count = 0 + } + } + } }, { .alg = "__driver-ecb-serpent-avx", .test = alg_test_null, @@ -2026,6 +2071,21 @@ static const struct alg_test_desc alg_test_descs[] = { } } } + }, { + .alg = "cryptd(__driver-ecb-cast6-avx)", + .test = alg_test_null, + .suite = { + .cipher = { + .enc = { + .vecs = NULL, + .count = 0 + }, + .dec = { + .vecs = NULL, + .count = 0 + } + } + } }, { .alg = "cryptd(__driver-ecb-serpent-avx)", .test = alg_test_null, -- cgit v1.2.3 From 35a1fc1873dd6deac6c005ead85424a8ee28183a Mon Sep 17 00:00:00 2001 From: Seth Jennings Date: Thu, 19 Jul 2012 09:42:41 -0500 Subject: powerpc/crypto: add 842 crypto driver This patch add the 842 cryptographic API driver that submits compression requests to the 842 hardware compression accelerator driver (nx-compress). If the hardware accelerator goes offline for any reason (dynamic disable, migration, etc...), this driver will use LZO as a software failover for all future compression requests. For decompression requests, the 842 hardware driver contains a software implementation of the 842 decompressor to support the decompression of data that was compressed before the accelerator went offline. Signed-off-by: Robert Jennings Signed-off-by: Seth Jennings Signed-off-by: Herbert Xu --- crypto/842.c | 183 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ crypto/Kconfig | 9 +++ crypto/Makefile | 1 + 3 files changed, 193 insertions(+) create mode 100644 crypto/842.c (limited to 'crypto/Kconfig') diff --git a/crypto/842.c b/crypto/842.c new file mode 100644 index 000000000000..144767db44e5 --- /dev/null +++ b/crypto/842.c @@ -0,0 +1,183 @@ +/* + * Cryptographic API for the 842 compression algorithm. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Copyright (C) IBM Corporation, 2011 + * + * Authors: Robert Jennings + * Seth Jennings + */ + +#include +#include +#include +#include +#include +#include +#include + +static int nx842_uselzo; + +struct nx842_ctx { + void *nx842_wmem; /* working memory for 842/lzo */ +}; + +enum nx842_crypto_type { + NX842_CRYPTO_TYPE_842, + NX842_CRYPTO_TYPE_LZO +}; + +#define NX842_SENTINEL 0xdeadbeef + +struct nx842_crypto_header { + unsigned int sentinel; /* debug */ + enum nx842_crypto_type type; +}; + +static int nx842_init(struct crypto_tfm *tfm) +{ + struct nx842_ctx *ctx = crypto_tfm_ctx(tfm); + int wmemsize; + + wmemsize = max_t(int, nx842_get_workmem_size(), LZO1X_MEM_COMPRESS); + ctx->nx842_wmem = kmalloc(wmemsize, GFP_NOFS); + if (!ctx->nx842_wmem) + return -ENOMEM; + + return 0; +} + +static void nx842_exit(struct crypto_tfm *tfm) +{ + struct nx842_ctx *ctx = crypto_tfm_ctx(tfm); + + kfree(ctx->nx842_wmem); +} + +static void nx842_reset_uselzo(unsigned long data) +{ + nx842_uselzo = 0; +} + +static DEFINE_TIMER(failover_timer, nx842_reset_uselzo, 0, 0); + +static int nx842_crypto_compress(struct crypto_tfm *tfm, const u8 *src, + unsigned int slen, u8 *dst, unsigned int *dlen) +{ + struct nx842_ctx *ctx = crypto_tfm_ctx(tfm); + struct nx842_crypto_header *hdr; + unsigned int tmp_len = *dlen; + size_t lzodlen; /* needed for lzo */ + int err; + + *dlen = 0; + hdr = (struct nx842_crypto_header *)dst; + hdr->sentinel = NX842_SENTINEL; /* debug */ + dst += sizeof(struct nx842_crypto_header); + tmp_len -= sizeof(struct nx842_crypto_header); + lzodlen = tmp_len; + + if (likely(!nx842_uselzo)) { + err = nx842_compress(src, slen, dst, &tmp_len, ctx->nx842_wmem); + + if (likely(!err)) { + hdr->type = NX842_CRYPTO_TYPE_842; + *dlen = tmp_len + sizeof(struct nx842_crypto_header); + return 0; + } + + /* hardware failed */ + nx842_uselzo = 1; + + /* set timer to check for hardware again in 1 second */ + mod_timer(&failover_timer, jiffies + msecs_to_jiffies(1000)); + } + + /* no hardware, use lzo */ + err = lzo1x_1_compress(src, slen, dst, &lzodlen, ctx->nx842_wmem); + if (err != LZO_E_OK) + return -EINVAL; + + hdr->type = NX842_CRYPTO_TYPE_LZO; + *dlen = lzodlen + sizeof(struct nx842_crypto_header); + return 0; +} + +static int nx842_crypto_decompress(struct crypto_tfm *tfm, const u8 *src, + unsigned int slen, u8 *dst, unsigned int *dlen) +{ + struct nx842_ctx *ctx = crypto_tfm_ctx(tfm); + struct nx842_crypto_header *hdr; + unsigned int tmp_len = *dlen; + size_t lzodlen; /* needed for lzo */ + int err; + + *dlen = 0; + hdr = (struct nx842_crypto_header *)src; + + if (unlikely(hdr->sentinel != NX842_SENTINEL)) + return -EINVAL; + + src += sizeof(struct nx842_crypto_header); + slen -= sizeof(struct nx842_crypto_header); + + if (likely(hdr->type == NX842_CRYPTO_TYPE_842)) { + err = nx842_decompress(src, slen, dst, &tmp_len, + ctx->nx842_wmem); + if (err) + return -EINVAL; + *dlen = tmp_len; + } else if (hdr->type == NX842_CRYPTO_TYPE_LZO) { + lzodlen = tmp_len; + err = lzo1x_decompress_safe(src, slen, dst, &lzodlen); + if (err != LZO_E_OK) + return -EINVAL; + *dlen = lzodlen; + } else + return -EINVAL; + + return 0; +} + +static struct crypto_alg alg = { + .cra_name = "842", + .cra_flags = CRYPTO_ALG_TYPE_COMPRESS, + .cra_ctxsize = sizeof(struct nx842_ctx), + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(alg.cra_list), + .cra_init = nx842_init, + .cra_exit = nx842_exit, + .cra_u = { .compress = { + .coa_compress = nx842_crypto_compress, + .coa_decompress = nx842_crypto_decompress } } +}; + +static int __init nx842_mod_init(void) +{ + del_timer(&failover_timer); + return crypto_register_alg(&alg); +} + +static void __exit nx842_mod_exit(void) +{ + crypto_unregister_alg(&alg); +} + +module_init(nx842_mod_init); +module_exit(nx842_mod_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("842 Compression Algorithm"); diff --git a/crypto/Kconfig b/crypto/Kconfig index fe8ed62efe2f..cbcc0e2eeda0 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -1039,6 +1039,15 @@ config CRYPTO_LZO help This is the LZO algorithm. +config CRYPTO_842 + tristate "842 compression algorithm" + depends on CRYPTO_DEV_NX_COMPRESS + # 842 uses lzo if the hardware becomes unavailable + select LZO_COMPRESS + select LZO_DECOMPRESS + help + This is the 842 algorithm. + comment "Random Number Generation" config CRYPTO_ANSI_CPRNG diff --git a/crypto/Makefile b/crypto/Makefile index 396966d2d849..a301ad2b258c 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -82,6 +82,7 @@ obj-$(CONFIG_CRYPTO_MICHAEL_MIC) += michael_mic.o obj-$(CONFIG_CRYPTO_CRC32C) += crc32c.o obj-$(CONFIG_CRYPTO_AUTHENC) += authenc.o authencesn.o obj-$(CONFIG_CRYPTO_LZO) += lzo.o +obj-$(CONFIG_CRYPTO_842) += 842.o obj-$(CONFIG_CRYPTO_RNG2) += rng.o obj-$(CONFIG_CRYPTO_RNG2) += krng.o obj-$(CONFIG_CRYPTO_ANSI_CPRNG) += ansi_cprng.o -- cgit v1.2.3 From 023af608254add7ba037cd634cc5f2fb21ff6420 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Sun, 22 Jul 2012 18:18:37 +0300 Subject: crypto: aesni_intel - improve lrw and xts performance by utilizing parallel AES-NI hardware pipelines Use parallel LRW and XTS encryption facilities to better utilize AES-NI hardware pipelines and gain extra performance. Tcrypt benchmark results (async), old vs new ratios: Intel Core i5-2450M CPU (fam: 6, model: 42, step: 7) aes:128bit lrw:256bit xts:256bit size lrw-enc lrw-dec xts-dec xts-dec 16B 0.99x 1.00x 1.22x 1.19x 64B 1.38x 1.50x 1.58x 1.61x 256B 2.04x 2.02x 2.27x 2.29x 1024B 2.56x 2.54x 2.89x 2.92x 8192B 2.85x 2.99x 3.40x 3.23x aes:192bit lrw:320bit xts:384bit size lrw-enc lrw-dec xts-dec xts-dec 16B 1.08x 1.08x 1.16x 1.17x 64B 1.48x 1.54x 1.59x 1.65x 256B 2.18x 2.17x 2.29x 2.28x 1024B 2.67x 2.67x 2.87x 3.05x 8192B 2.93x 2.84x 3.28x 3.33x aes:256bit lrw:348bit xts:512bit size lrw-enc lrw-dec xts-dec xts-dec 16B 1.07x 1.07x 1.18x 1.19x 64B 1.56x 1.56x 1.70x 1.71x 256B 2.22x 2.24x 2.46x 2.46x 1024B 2.76x 2.77x 3.13x 3.05x 8192B 2.99x 3.05x 3.40x 3.30x Cc: Huang Ying Signed-off-by: Jussi Kivilinna Reviewed-by: Kim Phillips Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_glue.c | 253 ++++++++++++++++++++++++++++++++----- crypto/Kconfig | 2 + 2 files changed, 220 insertions(+), 35 deletions(-) (limited to 'crypto/Kconfig') diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 648347a05773..7c04d0da709b 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -28,6 +28,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -41,18 +44,10 @@ #define HAS_CTR #endif -#if defined(CONFIG_CRYPTO_LRW) || defined(CONFIG_CRYPTO_LRW_MODULE) -#define HAS_LRW -#endif - #if defined(CONFIG_CRYPTO_PCBC) || defined(CONFIG_CRYPTO_PCBC_MODULE) #define HAS_PCBC #endif -#if defined(CONFIG_CRYPTO_XTS) || defined(CONFIG_CRYPTO_XTS_MODULE) -#define HAS_XTS -#endif - /* This data is stored at the end of the crypto_tfm struct. * It's a type of per "session" data storage location. * This needs to be 16 byte aligned. @@ -79,6 +74,16 @@ struct aesni_hash_subkey_req_data { #define AES_BLOCK_MASK (~(AES_BLOCK_SIZE-1)) #define RFC4106_HASH_SUBKEY_SIZE 16 +struct aesni_lrw_ctx { + struct lrw_table_ctx lrw_table; + u8 raw_aes_ctx[sizeof(struct crypto_aes_ctx) + AESNI_ALIGN - 1]; +}; + +struct aesni_xts_ctx { + u8 raw_tweak_ctx[sizeof(struct crypto_aes_ctx) + AESNI_ALIGN - 1]; + u8 raw_crypt_ctx[sizeof(struct crypto_aes_ctx) + AESNI_ALIGN - 1]; +}; + asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, unsigned int key_len); asmlinkage void aesni_enc(struct crypto_aes_ctx *ctx, u8 *out, @@ -398,13 +403,6 @@ static int ablk_rfc3686_ctr_init(struct crypto_tfm *tfm) #endif #endif -#ifdef HAS_LRW -static int ablk_lrw_init(struct crypto_tfm *tfm) -{ - return ablk_init_common(tfm, "fpu(lrw(__driver-aes-aesni))"); -} -#endif - #ifdef HAS_PCBC static int ablk_pcbc_init(struct crypto_tfm *tfm) { @@ -412,12 +410,160 @@ static int ablk_pcbc_init(struct crypto_tfm *tfm) } #endif -#ifdef HAS_XTS -static int ablk_xts_init(struct crypto_tfm *tfm) +static void lrw_xts_encrypt_callback(void *ctx, u8 *blks, unsigned int nbytes) { - return ablk_init_common(tfm, "fpu(xts(__driver-aes-aesni))"); + aesni_ecb_enc(ctx, blks, blks, nbytes); +} + +static void lrw_xts_decrypt_callback(void *ctx, u8 *blks, unsigned int nbytes) +{ + aesni_ecb_dec(ctx, blks, blks, nbytes); +} + +static int lrw_aesni_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen) +{ + struct aesni_lrw_ctx *ctx = crypto_tfm_ctx(tfm); + int err; + + err = aes_set_key_common(tfm, ctx->raw_aes_ctx, key, + keylen - AES_BLOCK_SIZE); + if (err) + return err; + + return lrw_init_table(&ctx->lrw_table, key + keylen - AES_BLOCK_SIZE); +} + +static void lrw_aesni_exit_tfm(struct crypto_tfm *tfm) +{ + struct aesni_lrw_ctx *ctx = crypto_tfm_ctx(tfm); + + lrw_free_table(&ctx->lrw_table); +} + +static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct aesni_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[8]; + struct lrw_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .table_ctx = &ctx->lrw_table, + .crypt_ctx = aes_ctx(ctx->raw_aes_ctx), + .crypt_fn = lrw_xts_encrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + + kernel_fpu_begin(); + ret = lrw_crypt(desc, dst, src, nbytes, &req); + kernel_fpu_end(); + + return ret; +} + +static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct aesni_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[8]; + struct lrw_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .table_ctx = &ctx->lrw_table, + .crypt_ctx = aes_ctx(ctx->raw_aes_ctx), + .crypt_fn = lrw_xts_decrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + + kernel_fpu_begin(); + ret = lrw_crypt(desc, dst, src, nbytes, &req); + kernel_fpu_end(); + + return ret; +} + +static int xts_aesni_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen) +{ + struct aesni_xts_ctx *ctx = crypto_tfm_ctx(tfm); + u32 *flags = &tfm->crt_flags; + int err; + + /* key consists of keys of equal size concatenated, therefore + * the length must be even + */ + if (keylen % 2) { + *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + return -EINVAL; + } + + /* first half of xts-key is for crypt */ + err = aes_set_key_common(tfm, ctx->raw_crypt_ctx, key, keylen / 2); + if (err) + return err; + + /* second half of xts-key is for tweak */ + return aes_set_key_common(tfm, ctx->raw_tweak_ctx, key + keylen / 2, + keylen / 2); +} + + +static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[8]; + struct xts_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .tweak_ctx = aes_ctx(ctx->raw_tweak_ctx), + .tweak_fn = XTS_TWEAK_CAST(aesni_enc), + .crypt_ctx = aes_ctx(ctx->raw_crypt_ctx), + .crypt_fn = lrw_xts_encrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + + kernel_fpu_begin(); + ret = xts_crypt(desc, dst, src, nbytes, &req); + kernel_fpu_end(); + + return ret; +} + +static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[8]; + struct xts_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .tweak_ctx = aes_ctx(ctx->raw_tweak_ctx), + .tweak_fn = XTS_TWEAK_CAST(aesni_enc), + .crypt_ctx = aes_ctx(ctx->raw_crypt_ctx), + .crypt_fn = lrw_xts_decrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + + kernel_fpu_begin(); + ret = xts_crypt(desc, dst, src, nbytes, &req); + kernel_fpu_end(); + + return ret; } -#endif #ifdef CONFIG_X86_64 static int rfc4106_init(struct crypto_tfm *tfm) @@ -1035,10 +1181,10 @@ static struct crypto_alg aesni_algs[] = { { }, #endif #endif -#ifdef HAS_LRW +#ifdef HAS_PCBC }, { - .cra_name = "lrw(aes)", - .cra_driver_name = "lrw-aes-aesni", + .cra_name = "pcbc(aes)", + .cra_driver_name = "pcbc-aes-aesni", .cra_priority = 400, .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, .cra_blocksize = AES_BLOCK_SIZE, @@ -1046,12 +1192,12 @@ static struct crypto_alg aesni_algs[] = { { .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, - .cra_init = ablk_lrw_init, + .cra_init = ablk_pcbc_init, .cra_exit = ablk_exit, .cra_u = { .ablkcipher = { - .min_keysize = AES_MIN_KEY_SIZE + AES_BLOCK_SIZE, - .max_keysize = AES_MAX_KEY_SIZE + AES_BLOCK_SIZE, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, .ivsize = AES_BLOCK_SIZE, .setkey = ablk_set_key, .encrypt = ablk_encrypt, @@ -1059,10 +1205,50 @@ static struct crypto_alg aesni_algs[] = { { }, }, #endif -#ifdef HAS_PCBC }, { - .cra_name = "pcbc(aes)", - .cra_driver_name = "pcbc-aes-aesni", + .cra_name = "__lrw-aes-aesni", + .cra_driver_name = "__driver-lrw-aes-aesni", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct aesni_lrw_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_exit = lrw_aesni_exit_tfm, + .cra_u = { + .blkcipher = { + .min_keysize = AES_MIN_KEY_SIZE + AES_BLOCK_SIZE, + .max_keysize = AES_MAX_KEY_SIZE + AES_BLOCK_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = lrw_aesni_setkey, + .encrypt = lrw_encrypt, + .decrypt = lrw_decrypt, + }, + }, +}, { + .cra_name = "__xts-aes-aesni", + .cra_driver_name = "__driver-xts-aes-aesni", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct aesni_xts_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = 2 * AES_MIN_KEY_SIZE, + .max_keysize = 2 * AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = xts_aesni_setkey, + .encrypt = xts_encrypt, + .decrypt = xts_decrypt, + }, + }, +}, { + .cra_name = "lrw(aes)", + .cra_driver_name = "lrw-aes-aesni", .cra_priority = 400, .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, .cra_blocksize = AES_BLOCK_SIZE, @@ -1070,20 +1256,18 @@ static struct crypto_alg aesni_algs[] = { { .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, - .cra_init = ablk_pcbc_init, + .cra_init = ablk_init, .cra_exit = ablk_exit, .cra_u = { .ablkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, + .min_keysize = AES_MIN_KEY_SIZE + AES_BLOCK_SIZE, + .max_keysize = AES_MAX_KEY_SIZE + AES_BLOCK_SIZE, .ivsize = AES_BLOCK_SIZE, .setkey = ablk_set_key, .encrypt = ablk_encrypt, .decrypt = ablk_decrypt, }, }, -#endif -#ifdef HAS_XTS }, { .cra_name = "xts(aes)", .cra_driver_name = "xts-aes-aesni", @@ -1094,7 +1278,7 @@ static struct crypto_alg aesni_algs[] = { { .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, - .cra_init = ablk_xts_init, + .cra_init = ablk_init, .cra_exit = ablk_exit, .cra_u = { .ablkcipher = { @@ -1106,7 +1290,6 @@ static struct crypto_alg aesni_algs[] = { { .decrypt = ablk_decrypt, }, }, -#endif } }; diff --git a/crypto/Kconfig b/crypto/Kconfig index cbcc0e2eeda0..213fb37be51f 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -564,6 +564,8 @@ config CRYPTO_AES_NI_INTEL select CRYPTO_CRYPTD select CRYPTO_ABLK_HELPER_X86 select CRYPTO_ALGAPI + select CRYPTO_LRW + select CRYPTO_XTS help Use Intel AES-NI instructions for AES algorithm. -- cgit v1.2.3 From f0be44f4fb1faee42635ca5ea06dc9c3e820a35d Mon Sep 17 00:00:00 2001 From: David McCullough Date: Fri, 7 Sep 2012 04:17:02 +0800 Subject: arm/crypto: Add optimized AES and SHA1 routines Add assembler versions of AES and SHA1 for ARM platforms. This has provided up to a 50% improvement in IPsec/TCP throughout for tunnels using AES128/SHA1. Platform CPU SPeed Endian Before (bps) After (bps) Improvement IXP425 533 MHz big 11217042 15566294 ~38% KS8695 166 MHz little 3828549 5795373 ~51% Signed-off-by: David McCullough Signed-off-by: Herbert Xu --- arch/arm/Makefile | 1 + arch/arm/crypto/Makefile | 9 + arch/arm/crypto/aes-armv4.S | 1112 ++++++++++++++++++++++++++++++++++++ arch/arm/crypto/aes_glue.c | 108 ++++ arch/arm/crypto/sha1-armv4-large.S | 503 ++++++++++++++++ arch/arm/crypto/sha1_glue.c | 179 ++++++ crypto/Kconfig | 33 ++ 7 files changed, 1945 insertions(+) create mode 100644 arch/arm/crypto/Makefile create mode 100644 arch/arm/crypto/aes-armv4.S create mode 100644 arch/arm/crypto/aes_glue.c create mode 100644 arch/arm/crypto/sha1-armv4-large.S create mode 100644 arch/arm/crypto/sha1_glue.c (limited to 'crypto/Kconfig') diff --git a/arch/arm/Makefile b/arch/arm/Makefile index 30eae87ead6d..46038d756bf6 100644 --- a/arch/arm/Makefile +++ b/arch/arm/Makefile @@ -255,6 +255,7 @@ core-$(CONFIG_VFP) += arch/arm/vfp/ # If we have a machine-specific directory, then include it in the build. core-y += arch/arm/kernel/ arch/arm/mm/ arch/arm/common/ core-y += arch/arm/net/ +core-y += arch/arm/crypto/ core-y += $(machdirs) $(platdirs) drivers-$(CONFIG_OPROFILE) += arch/arm/oprofile/ diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile new file mode 100644 index 000000000000..a2c83851bc90 --- /dev/null +++ b/arch/arm/crypto/Makefile @@ -0,0 +1,9 @@ +# +# Arch-specific CryptoAPI modules. +# + +obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o +obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o + +aes-arm-y := aes-armv4.o aes_glue.o +sha1-arm-y := sha1-armv4-large.o sha1_glue.o diff --git a/arch/arm/crypto/aes-armv4.S b/arch/arm/crypto/aes-armv4.S new file mode 100644 index 000000000000..e59b1d505d6c --- /dev/null +++ b/arch/arm/crypto/aes-armv4.S @@ -0,0 +1,1112 @@ +#define __ARM_ARCH__ __LINUX_ARM_ARCH__ +@ ==================================================================== +@ Written by Andy Polyakov for the OpenSSL +@ project. The module is, however, dual licensed under OpenSSL and +@ CRYPTOGAMS licenses depending on where you obtain it. For further +@ details see http://www.openssl.org/~appro/cryptogams/. +@ ==================================================================== + +@ AES for ARMv4 + +@ January 2007. +@ +@ Code uses single 1K S-box and is >2 times faster than code generated +@ by gcc-3.4.1. This is thanks to unique feature of ARMv4 ISA, which +@ allows to merge logical or arithmetic operation with shift or rotate +@ in one instruction and emit combined result every cycle. The module +@ is endian-neutral. The performance is ~42 cycles/byte for 128-bit +@ key [on single-issue Xscale PXA250 core]. + +@ May 2007. +@ +@ AES_set_[en|de]crypt_key is added. + +@ July 2010. +@ +@ Rescheduling for dual-issue pipeline resulted in 12% improvement on +@ Cortex A8 core and ~25 cycles per byte processed with 128-bit key. + +@ February 2011. +@ +@ Profiler-assisted and platform-specific optimization resulted in 16% +@ improvement on Cortex A8 core and ~21.5 cycles per byte. + +@ A little glue here to select the correct code below for the ARM CPU +@ that is being targetted. + +.text +.code 32 + +.type AES_Te,%object +.align 5 +AES_Te: +.word 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d +.word 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554 +.word 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d +.word 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a +.word 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87 +.word 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b +.word 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea +.word 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b +.word 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a +.word 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f +.word 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108 +.word 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f +.word 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e +.word 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5 +.word 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d +.word 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f +.word 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e +.word 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb +.word 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce +.word 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497 +.word 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c +.word 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed +.word 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b +.word 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a +.word 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16 +.word 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594 +.word 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81 +.word 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3 +.word 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a +.word 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504 +.word 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163 +.word 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d +.word 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f +.word 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739 +.word 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47 +.word 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395 +.word 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f +.word 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883 +.word 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c +.word 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76 +.word 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e +.word 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4 +.word 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6 +.word 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b +.word 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7 +.word 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0 +.word 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25 +.word 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818 +.word 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72 +.word 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651 +.word 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21 +.word 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85 +.word 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa +.word 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12 +.word 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0 +.word 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9 +.word 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133 +.word 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7 +.word 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920 +.word 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a +.word 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17 +.word 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8 +.word 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11 +.word 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a +@ Te4[256] +.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 +.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 +.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 +.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 +.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc +.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 +.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a +.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 +.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 +.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 +.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b +.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf +.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 +.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 +.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 +.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 +.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 +.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 +.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 +.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb +.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c +.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 +.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 +.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 +.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 +.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a +.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e +.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e +.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 +.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf +.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 +.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 +@ rcon[] +.word 0x01000000, 0x02000000, 0x04000000, 0x08000000 +.word 0x10000000, 0x20000000, 0x40000000, 0x80000000 +.word 0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0 +.size AES_Te,.-AES_Te + +@ void AES_encrypt(const unsigned char *in, unsigned char *out, +@ const AES_KEY *key) { +.global AES_encrypt +.type AES_encrypt,%function +.align 5 +AES_encrypt: + sub r3,pc,#8 @ AES_encrypt + stmdb sp!,{r1,r4-r12,lr} + mov r12,r0 @ inp + mov r11,r2 + sub r10,r3,#AES_encrypt-AES_Te @ Te +#if __ARM_ARCH__<7 + ldrb r0,[r12,#3] @ load input data in endian-neutral + ldrb r4,[r12,#2] @ manner... + ldrb r5,[r12,#1] + ldrb r6,[r12,#0] + orr r0,r0,r4,lsl#8 + ldrb r1,[r12,#7] + orr r0,r0,r5,lsl#16 + ldrb r4,[r12,#6] + orr r0,r0,r6,lsl#24 + ldrb r5,[r12,#5] + ldrb r6,[r12,#4] + orr r1,r1,r4,lsl#8 + ldrb r2,[r12,#11] + orr r1,r1,r5,lsl#16 + ldrb r4,[r12,#10] + orr r1,r1,r6,lsl#24 + ldrb r5,[r12,#9] + ldrb r6,[r12,#8] + orr r2,r2,r4,lsl#8 + ldrb r3,[r12,#15] + orr r2,r2,r5,lsl#16 + ldrb r4,[r12,#14] + orr r2,r2,r6,lsl#24 + ldrb r5,[r12,#13] + ldrb r6,[r12,#12] + orr r3,r3,r4,lsl#8 + orr r3,r3,r5,lsl#16 + orr r3,r3,r6,lsl#24 +#else + ldr r0,[r12,#0] + ldr r1,[r12,#4] + ldr r2,[r12,#8] + ldr r3,[r12,#12] +#ifdef __ARMEL__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +#endif +#endif + bl _armv4_AES_encrypt + + ldr r12,[sp],#4 @ pop out +#if __ARM_ARCH__>=7 +#ifdef __ARMEL__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +#endif + str r0,[r12,#0] + str r1,[r12,#4] + str r2,[r12,#8] + str r3,[r12,#12] +#else + mov r4,r0,lsr#24 @ write output in endian-neutral + mov r5,r0,lsr#16 @ manner... + mov r6,r0,lsr#8 + strb r4,[r12,#0] + strb r5,[r12,#1] + mov r4,r1,lsr#24 + strb r6,[r12,#2] + mov r5,r1,lsr#16 + strb r0,[r12,#3] + mov r6,r1,lsr#8 + strb r4,[r12,#4] + strb r5,[r12,#5] + mov r4,r2,lsr#24 + strb r6,[r12,#6] + mov r5,r2,lsr#16 + strb r1,[r12,#7] + mov r6,r2,lsr#8 + strb r4,[r12,#8] + strb r5,[r12,#9] + mov r4,r3,lsr#24 + strb r6,[r12,#10] + mov r5,r3,lsr#16 + strb r2,[r12,#11] + mov r6,r3,lsr#8 + strb r4,[r12,#12] + strb r5,[r12,#13] + strb r6,[r12,#14] + strb r3,[r12,#15] +#endif +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r12,pc} +#else + ldmia sp!,{r4-r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + .word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +.size AES_encrypt,.-AES_encrypt + +.type _armv4_AES_encrypt,%function +.align 2 +_armv4_AES_encrypt: + str lr,[sp,#-4]! @ push lr + ldmia r11!,{r4-r7} + eor r0,r0,r4 + ldr r12,[r11,#240-16] + eor r1,r1,r5 + eor r2,r2,r6 + eor r3,r3,r7 + sub r12,r12,#1 + mov lr,#255 + + and r7,lr,r0 + and r8,lr,r0,lsr#8 + and r9,lr,r0,lsr#16 + mov r0,r0,lsr#24 +.Lenc_loop: + ldr r4,[r10,r7,lsl#2] @ Te3[s0>>0] + and r7,lr,r1,lsr#16 @ i0 + ldr r5,[r10,r8,lsl#2] @ Te2[s0>>8] + and r8,lr,r1 + ldr r6,[r10,r9,lsl#2] @ Te1[s0>>16] + and r9,lr,r1,lsr#8 + ldr r0,[r10,r0,lsl#2] @ Te0[s0>>24] + mov r1,r1,lsr#24 + + ldr r7,[r10,r7,lsl#2] @ Te1[s1>>16] + ldr r8,[r10,r8,lsl#2] @ Te3[s1>>0] + ldr r9,[r10,r9,lsl#2] @ Te2[s1>>8] + eor r0,r0,r7,ror#8 + ldr r1,[r10,r1,lsl#2] @ Te0[s1>>24] + and r7,lr,r2,lsr#8 @ i0 + eor r5,r5,r8,ror#8 + and r8,lr,r2,lsr#16 @ i1 + eor r6,r6,r9,ror#8 + and r9,lr,r2 + ldr r7,[r10,r7,lsl#2] @ Te2[s2>>8] + eor r1,r1,r4,ror#24 + ldr r8,[r10,r8,lsl#2] @ Te1[s2>>16] + mov r2,r2,lsr#24 + + ldr r9,[r10,r9,lsl#2] @ Te3[s2>>0] + eor r0,r0,r7,ror#16 + ldr r2,[r10,r2,lsl#2] @ Te0[s2>>24] + and r7,lr,r3 @ i0 + eor r1,r1,r8,ror#8 + and r8,lr,r3,lsr#8 @ i1 + eor r6,r6,r9,ror#16 + and r9,lr,r3,lsr#16 @ i2 + ldr r7,[r10,r7,lsl#2] @ Te3[s3>>0] + eor r2,r2,r5,ror#16 + ldr r8,[r10,r8,lsl#2] @ Te2[s3>>8] + mov r3,r3,lsr#24 + + ldr r9,[r10,r9,lsl#2] @ Te1[s3>>16] + eor r0,r0,r7,ror#24 + ldr r7,[r11],#16 + eor r1,r1,r8,ror#16 + ldr r3,[r10,r3,lsl#2] @ Te0[s3>>24] + eor r2,r2,r9,ror#8 + ldr r4,[r11,#-12] + eor r3,r3,r6,ror#8 + + ldr r5,[r11,#-8] + eor r0,r0,r7 + ldr r6,[r11,#-4] + and r7,lr,r0 + eor r1,r1,r4 + and r8,lr,r0,lsr#8 + eor r2,r2,r5 + and r9,lr,r0,lsr#16 + eor r3,r3,r6 + mov r0,r0,lsr#24 + + subs r12,r12,#1 + bne .Lenc_loop + + add r10,r10,#2 + + ldrb r4,[r10,r7,lsl#2] @ Te4[s0>>0] + and r7,lr,r1,lsr#16 @ i0 + ldrb r5,[r10,r8,lsl#2] @ Te4[s0>>8] + and r8,lr,r1 + ldrb r6,[r10,r9,lsl#2] @ Te4[s0>>16] + and r9,lr,r1,lsr#8 + ldrb r0,[r10,r0,lsl#2] @ Te4[s0>>24] + mov r1,r1,lsr#24 + + ldrb r7,[r10,r7,lsl#2] @ Te4[s1>>16] + ldrb r8,[r10,r8,lsl#2] @ Te4[s1>>0] + ldrb r9,[r10,r9,lsl#2] @ Te4[s1>>8] + eor r0,r7,r0,lsl#8 + ldrb r1,[r10,r1,lsl#2] @ Te4[s1>>24] + and r7,lr,r2,lsr#8 @ i0 + eor r5,r8,r5,lsl#8 + and r8,lr,r2,lsr#16 @ i1 + eor r6,r9,r6,lsl#8 + and r9,lr,r2 + ldrb r7,[r10,r7,lsl#2] @ Te4[s2>>8] + eor r1,r4,r1,lsl#24 + ldrb r8,[r10,r8,lsl#2] @ Te4[s2>>16] + mov r2,r2,lsr#24 + + ldrb r9,[r10,r9,lsl#2] @ Te4[s2>>0] + eor r0,r7,r0,lsl#8 + ldrb r2,[r10,r2,lsl#2] @ Te4[s2>>24] + and r7,lr,r3 @ i0 + eor r1,r1,r8,lsl#16 + and r8,lr,r3,lsr#8 @ i1 + eor r6,r9,r6,lsl#8 + and r9,lr,r3,lsr#16 @ i2 + ldrb r7,[r10,r7,lsl#2] @ Te4[s3>>0] + eor r2,r5,r2,lsl#24 + ldrb r8,[r10,r8,lsl#2] @ Te4[s3>>8] + mov r3,r3,lsr#24 + + ldrb r9,[r10,r9,lsl#2] @ Te4[s3>>16] + eor r0,r7,r0,lsl#8 + ldr r7,[r11,#0] + ldrb r3,[r10,r3,lsl#2] @ Te4[s3>>24] + eor r1,r1,r8,lsl#8 + ldr r4,[r11,#4] + eor r2,r2,r9,lsl#16 + ldr r5,[r11,#8] + eor r3,r6,r3,lsl#24 + ldr r6,[r11,#12] + + eor r0,r0,r7 + eor r1,r1,r4 + eor r2,r2,r5 + eor r3,r3,r6 + + sub r10,r10,#2 + ldr pc,[sp],#4 @ pop and return +.size _armv4_AES_encrypt,.-_armv4_AES_encrypt + +.global private_AES_set_encrypt_key +.type private_AES_set_encrypt_key,%function +.align 5 +private_AES_set_encrypt_key: +_armv4_AES_set_encrypt_key: + sub r3,pc,#8 @ AES_set_encrypt_key + teq r0,#0 + moveq r0,#-1 + beq .Labrt + teq r2,#0 + moveq r0,#-1 + beq .Labrt + + teq r1,#128 + beq .Lok + teq r1,#192 + beq .Lok + teq r1,#256 + movne r0,#-1 + bne .Labrt + +.Lok: stmdb sp!,{r4-r12,lr} + sub r10,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024 @ Te4 + + mov r12,r0 @ inp + mov lr,r1 @ bits + mov r11,r2 @ key + +#if __ARM_ARCH__<7 + ldrb r0,[r12,#3] @ load input data in endian-neutral + ldrb r4,[r12,#2] @ manner... + ldrb r5,[r12,#1] + ldrb r6,[r12,#0] + orr r0,r0,r4,lsl#8 + ldrb r1,[r12,#7] + orr r0,r0,r5,lsl#16 + ldrb r4,[r12,#6] + orr r0,r0,r6,lsl#24 + ldrb r5,[r12,#5] + ldrb r6,[r12,#4] + orr r1,r1,r4,lsl#8 + ldrb r2,[r12,#11] + orr r1,r1,r5,lsl#16 + ldrb r4,[r12,#10] + orr r1,r1,r6,lsl#24 + ldrb r5,[r12,#9] + ldrb r6,[r12,#8] + orr r2,r2,r4,lsl#8 + ldrb r3,[r12,#15] + orr r2,r2,r5,lsl#16 + ldrb r4,[r12,#14] + orr r2,r2,r6,lsl#24 + ldrb r5,[r12,#13] + ldrb r6,[r12,#12] + orr r3,r3,r4,lsl#8 + str r0,[r11],#16 + orr r3,r3,r5,lsl#16 + str r1,[r11,#-12] + orr r3,r3,r6,lsl#24 + str r2,[r11,#-8] + str r3,[r11,#-4] +#else + ldr r0,[r12,#0] + ldr r1,[r12,#4] + ldr r2,[r12,#8] + ldr r3,[r12,#12] +#ifdef __ARMEL__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +#endif + str r0,[r11],#16 + str r1,[r11,#-12] + str r2,[r11,#-8] + str r3,[r11,#-4] +#endif + + teq lr,#128 + bne .Lnot128 + mov r12,#10 + str r12,[r11,#240-16] + add r6,r10,#256 @ rcon + mov lr,#255 + +.L128_loop: + and r5,lr,r3,lsr#24 + and r7,lr,r3,lsr#16 + ldrb r5,[r10,r5] + and r8,lr,r3,lsr#8 + ldrb r7,[r10,r7] + and r9,lr,r3 + ldrb r8,[r10,r8] + orr r5,r5,r7,lsl#24 + ldrb r9,[r10,r9] + orr r5,r5,r8,lsl#16 + ldr r4,[r6],#4 @ rcon[i++] + orr r5,r5,r9,lsl#8 + eor r5,r5,r4 + eor r0,r0,r5 @ rk[4]=rk[0]^... + eor r1,r1,r0 @ rk[5]=rk[1]^rk[4] + str r0,[r11],#16 + eor r2,r2,r1 @ rk[6]=rk[2]^rk[5] + str r1,[r11,#-12] + eor r3,r3,r2 @ rk[7]=rk[3]^rk[6] + str r2,[r11,#-8] + subs r12,r12,#1 + str r3,[r11,#-4] + bne .L128_loop + sub r2,r11,#176 + b .Ldone + +.Lnot128: +#if __ARM_ARCH__<7 + ldrb r8,[r12,#19] + ldrb r4,[r12,#18] + ldrb r5,[r12,#17] + ldrb r6,[r12,#16] + orr r8,r8,r4,lsl#8 + ldrb r9,[r12,#23] + orr r8,r8,r5,lsl#16 + ldrb r4,[r12,#22] + orr r8,r8,r6,lsl#24 + ldrb r5,[r12,#21] + ldrb r6,[r12,#20] + orr r9,r9,r4,lsl#8 + orr r9,r9,r5,lsl#16 + str r8,[r11],#8 + orr r9,r9,r6,lsl#24 + str r9,[r11,#-4] +#else + ldr r8,[r12,#16] + ldr r9,[r12,#20] +#ifdef __ARMEL__ + rev r8,r8 + rev r9,r9 +#endif + str r8,[r11],#8 + str r9,[r11,#-4] +#endif + + teq lr,#192 + bne .Lnot192 + mov r12,#12 + str r12,[r11,#240-24] + add r6,r10,#256 @ rcon + mov lr,#255 + mov r12,#8 + +.L192_loop: + and r5,lr,r9,lsr#24 + and r7,lr,r9,lsr#16 + ldrb r5,[r10,r5] + and r8,lr,r9,lsr#8 + ldrb r7,[r10,r7] + and r9,lr,r9 + ldrb r8,[r10,r8] + orr r5,r5,r7,lsl#24 + ldrb r9,[r10,r9] + orr r5,r5,r8,lsl#16 + ldr r4,[r6],#4 @ rcon[i++] + orr r5,r5,r9,lsl#8 + eor r9,r5,r4 + eor r0,r0,r9 @ rk[6]=rk[0]^... + eor r1,r1,r0 @ rk[7]=rk[1]^rk[6] + str r0,[r11],#24 + eor r2,r2,r1 @ rk[8]=rk[2]^rk[7] + str r1,[r11,#-20] + eor r3,r3,r2 @ rk[9]=rk[3]^rk[8] + str r2,[r11,#-16] + subs r12,r12,#1 + str r3,[r11,#-12] + subeq r2,r11,#216 + beq .Ldone + + ldr r7,[r11,#-32] + ldr r8,[r11,#-28] + eor r7,r7,r3 @ rk[10]=rk[4]^rk[9] + eor r9,r8,r7 @ rk[11]=rk[5]^rk[10] + str r7,[r11,#-8] + str r9,[r11,#-4] + b .L192_loop + +.Lnot192: +#if __ARM_ARCH__<7 + ldrb r8,[r12,#27] + ldrb r4,[r12,#26] + ldrb r5,[r12,#25] + ldrb r6,[r12,#24] + orr r8,r8,r4,lsl#8 + ldrb r9,[r12,#31] + orr r8,r8,r5,lsl#16 + ldrb r4,[r12,#30] + orr r8,r8,r6,lsl#24 + ldrb r5,[r12,#29] + ldrb r6,[r12,#28] + orr r9,r9,r4,lsl#8 + orr r9,r9,r5,lsl#16 + str r8,[r11],#8 + orr r9,r9,r6,lsl#24 + str r9,[r11,#-4] +#else + ldr r8,[r12,#24] + ldr r9,[r12,#28] +#ifdef __ARMEL__ + rev r8,r8 + rev r9,r9 +#endif + str r8,[r11],#8 + str r9,[r11,#-4] +#endif + + mov r12,#14 + str r12,[r11,#240-32] + add r6,r10,#256 @ rcon + mov lr,#255 + mov r12,#7 + +.L256_loop: + and r5,lr,r9,lsr#24 + and r7,lr,r9,lsr#16 + ldrb r5,[r10,r5] + and r8,lr,r9,lsr#8 + ldrb r7,[r10,r7] + and r9,lr,r9 + ldrb r8,[r10,r8] + orr r5,r5,r7,lsl#24 + ldrb r9,[r10,r9] + orr r5,r5,r8,lsl#16 + ldr r4,[r6],#4 @ rcon[i++] + orr r5,r5,r9,lsl#8 + eor r9,r5,r4 + eor r0,r0,r9 @ rk[8]=rk[0]^... + eor r1,r1,r0 @ rk[9]=rk[1]^rk[8] + str r0,[r11],#32 + eor r2,r2,r1 @ rk[10]=rk[2]^rk[9] + str r1,[r11,#-28] + eor r3,r3,r2 @ rk[11]=rk[3]^rk[10] + str r2,[r11,#-24] + subs r12,r12,#1 + str r3,[r11,#-20] + subeq r2,r11,#256 + beq .Ldone + + and r5,lr,r3 + and r7,lr,r3,lsr#8 + ldrb r5,[r10,r5] + and r8,lr,r3,lsr#16 + ldrb r7,[r10,r7] + and r9,lr,r3,lsr#24 + ldrb r8,[r10,r8] + orr r5,r5,r7,lsl#8 + ldrb r9,[r10,r9] + orr r5,r5,r8,lsl#16 + ldr r4,[r11,#-48] + orr r5,r5,r9,lsl#24 + + ldr r7,[r11,#-44] + ldr r8,[r11,#-40] + eor r4,r4,r5 @ rk[12]=rk[4]^... + ldr r9,[r11,#-36] + eor r7,r7,r4 @ rk[13]=rk[5]^rk[12] + str r4,[r11,#-16] + eor r8,r8,r7 @ rk[14]=rk[6]^rk[13] + str r7,[r11,#-12] + eor r9,r9,r8 @ rk[15]=rk[7]^rk[14] + str r8,[r11,#-8] + str r9,[r11,#-4] + b .L256_loop + +.Ldone: mov r0,#0 + ldmia sp!,{r4-r12,lr} +.Labrt: tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + .word 0xe12fff1e @ interoperable with Thumb ISA:-) +.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key + +.global private_AES_set_decrypt_key +.type private_AES_set_decrypt_key,%function +.align 5 +private_AES_set_decrypt_key: + str lr,[sp,#-4]! @ push lr +#if 0 + @ kernel does both of these in setkey so optimise this bit out by + @ expecting the key to already have the enc_key work done (see aes_glue.c) + bl _armv4_AES_set_encrypt_key +#else + mov r0,#0 +#endif + teq r0,#0 + ldrne lr,[sp],#4 @ pop lr + bne .Labrt + + stmdb sp!,{r4-r12} + + ldr r12,[r2,#240] @ AES_set_encrypt_key preserves r2, + mov r11,r2 @ which is AES_KEY *key + mov r7,r2 + add r8,r2,r12,lsl#4 + +.Linv: ldr r0,[r7] + ldr r1,[r7,#4] + ldr r2,[r7,#8] + ldr r3,[r7,#12] + ldr r4,[r8] + ldr r5,[r8,#4] + ldr r6,[r8,#8] + ldr r9,[r8,#12] + str r0,[r8],#-16 + str r1,[r8,#16+4] + str r2,[r8,#16+8] + str r3,[r8,#16+12] + str r4,[r7],#16 + str r5,[r7,#-12] + str r6,[r7,#-8] + str r9,[r7,#-4] + teq r7,r8 + bne .Linv + ldr r0,[r11,#16]! @ prefetch tp1 + mov r7,#0x80 + mov r8,#0x1b + orr r7,r7,#0x8000 + orr r8,r8,#0x1b00 + orr r7,r7,r7,lsl#16 + orr r8,r8,r8,lsl#16 + sub r12,r12,#1 + mvn r9,r7 + mov r12,r12,lsl#2 @ (rounds-1)*4 + +.Lmix: and r4,r0,r7 + and r1,r0,r9 + sub r4,r4,r4,lsr#7 + and r4,r4,r8 + eor r1,r4,r1,lsl#1 @ tp2 + + and r4,r1,r7 + and r2,r1,r9 + sub r4,r4,r4,lsr#7 + and r4,r4,r8 + eor r2,r4,r2,lsl#1 @ tp4 + + and r4,r2,r7 + and r3,r2,r9 + sub r4,r4,r4,lsr#7 + and r4,r4,r8 + eor r3,r4,r3,lsl#1 @ tp8 + + eor r4,r1,r2 + eor r5,r0,r3 @ tp9 + eor r4,r4,r3 @ tpe + eor r4,r4,r1,ror#24 + eor r4,r4,r5,ror#24 @ ^= ROTATE(tpb=tp9^tp2,8) + eor r4,r4,r2,ror#16 + eor r4,r4,r5,ror#16 @ ^= ROTATE(tpd=tp9^tp4,16) + eor r4,r4,r5,ror#8 @ ^= ROTATE(tp9,24) + + ldr r0,[r11,#4] @ prefetch tp1 + str r4,[r11],#4 + subs r12,r12,#1 + bne .Lmix + + mov r0,#0 +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r12,pc} +#else + ldmia sp!,{r4-r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + .word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key + +.type AES_Td,%object +.align 5 +AES_Td: +.word 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96 +.word 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393 +.word 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25 +.word 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f +.word 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1 +.word 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6 +.word 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da +.word 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844 +.word 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd +.word 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4 +.word 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45 +.word 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94 +.word 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7 +.word 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a +.word 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5 +.word 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c +.word 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1 +.word 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a +.word 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75 +.word 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051 +.word 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46 +.word 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff +.word 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77 +.word 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb +.word 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000 +.word 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e +.word 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927 +.word 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a +.word 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e +.word 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16 +.word 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d +.word 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8 +.word 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd +.word 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34 +.word 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163 +.word 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120 +.word 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d +.word 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0 +.word 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422 +.word 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef +.word 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36 +.word 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4 +.word 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662 +.word 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5 +.word 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3 +.word 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b +.word 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8 +.word 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6 +.word 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6 +.word 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0 +.word 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815 +.word 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f +.word 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df +.word 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f +.word 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e +.word 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713 +.word 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89 +.word 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c +.word 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf +.word 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86 +.word 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f +.word 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541 +.word 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190 +.word 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742 +@ Td4[256] +.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 +.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb +.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 +.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb +.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d +.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e +.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 +.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 +.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 +.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 +.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda +.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 +.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a +.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 +.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 +.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b +.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea +.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 +.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 +.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e +.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 +.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b +.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 +.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 +.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 +.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f +.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d +.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef +.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 +.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 +.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 +.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d +.size AES_Td,.-AES_Td + +@ void AES_decrypt(const unsigned char *in, unsigned char *out, +@ const AES_KEY *key) { +.global AES_decrypt +.type AES_decrypt,%function +.align 5 +AES_decrypt: + sub r3,pc,#8 @ AES_decrypt + stmdb sp!,{r1,r4-r12,lr} + mov r12,r0 @ inp + mov r11,r2 + sub r10,r3,#AES_decrypt-AES_Td @ Td +#if __ARM_ARCH__<7 + ldrb r0,[r12,#3] @ load input data in endian-neutral + ldrb r4,[r12,#2] @ manner... + ldrb r5,[r12,#1] + ldrb r6,[r12,#0] + orr r0,r0,r4,lsl#8 + ldrb r1,[r12,#7] + orr r0,r0,r5,lsl#16 + ldrb r4,[r12,#6] + orr r0,r0,r6,lsl#24 + ldrb r5,[r12,#5] + ldrb r6,[r12,#4] + orr r1,r1,r4,lsl#8 + ldrb r2,[r12,#11] + orr r1,r1,r5,lsl#16 + ldrb r4,[r12,#10] + orr r1,r1,r6,lsl#24 + ldrb r5,[r12,#9] + ldrb r6,[r12,#8] + orr r2,r2,r4,lsl#8 + ldrb r3,[r12,#15] + orr r2,r2,r5,lsl#16 + ldrb r4,[r12,#14] + orr r2,r2,r6,lsl#24 + ldrb r5,[r12,#13] + ldrb r6,[r12,#12] + orr r3,r3,r4,lsl#8 + orr r3,r3,r5,lsl#16 + orr r3,r3,r6,lsl#24 +#else + ldr r0,[r12,#0] + ldr r1,[r12,#4] + ldr r2,[r12,#8] + ldr r3,[r12,#12] +#ifdef __ARMEL__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +#endif +#endif + bl _armv4_AES_decrypt + + ldr r12,[sp],#4 @ pop out +#if __ARM_ARCH__>=7 +#ifdef __ARMEL__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +#endif + str r0,[r12,#0] + str r1,[r12,#4] + str r2,[r12,#8] + str r3,[r12,#12] +#else + mov r4,r0,lsr#24 @ write output in endian-neutral + mov r5,r0,lsr#16 @ manner... + mov r6,r0,lsr#8 + strb r4,[r12,#0] + strb r5,[r12,#1] + mov r4,r1,lsr#24 + strb r6,[r12,#2] + mov r5,r1,lsr#16 + strb r0,[r12,#3] + mov r6,r1,lsr#8 + strb r4,[r12,#4] + strb r5,[r12,#5] + mov r4,r2,lsr#24 + strb r6,[r12,#6] + mov r5,r2,lsr#16 + strb r1,[r12,#7] + mov r6,r2,lsr#8 + strb r4,[r12,#8] + strb r5,[r12,#9] + mov r4,r3,lsr#24 + strb r6,[r12,#10] + mov r5,r3,lsr#16 + strb r2,[r12,#11] + mov r6,r3,lsr#8 + strb r4,[r12,#12] + strb r5,[r12,#13] + strb r6,[r12,#14] + strb r3,[r12,#15] +#endif +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r12,pc} +#else + ldmia sp!,{r4-r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + .word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +.size AES_decrypt,.-AES_decrypt + +.type _armv4_AES_decrypt,%function +.align 2 +_armv4_AES_decrypt: + str lr,[sp,#-4]! @ push lr + ldmia r11!,{r4-r7} + eor r0,r0,r4 + ldr r12,[r11,#240-16] + eor r1,r1,r5 + eor r2,r2,r6 + eor r3,r3,r7 + sub r12,r12,#1 + mov lr,#255 + + and r7,lr,r0,lsr#16 + and r8,lr,r0,lsr#8 + and r9,lr,r0 + mov r0,r0,lsr#24 +.Ldec_loop: + ldr r4,[r10,r7,lsl#2] @ Td1[s0>>16] + and r7,lr,r1 @ i0 + ldr r5,[r10,r8,lsl#2] @ Td2[s0>>8] + and r8,lr,r1,lsr#16 + ldr r6,[r10,r9,lsl#2] @ Td3[s0>>0] + and r9,lr,r1,lsr#8 + ldr r0,[r10,r0,lsl#2] @ Td0[s0>>24] + mov r1,r1,lsr#24 + + ldr r7,[r10,r7,lsl#2] @ Td3[s1>>0] + ldr r8,[r10,r8,lsl#2] @ Td1[s1>>16] + ldr r9,[r10,r9,lsl#2] @ Td2[s1>>8] + eor r0,r0,r7,ror#24 + ldr r1,[r10,r1,lsl#2] @ Td0[s1>>24] + and r7,lr,r2,lsr#8 @ i0 + eor r5,r8,r5,ror#8 + and r8,lr,r2 @ i1 + eor r6,r9,r6,ror#8 + and r9,lr,r2,lsr#16 + ldr r7,[r10,r7,lsl#2] @ Td2[s2>>8] + eor r1,r1,r4,ror#8 + ldr r8,[r10,r8,lsl#2] @ Td3[s2>>0] + mov r2,r2,lsr#24 + + ldr r9,[r10,r9,lsl#2] @ Td1[s2>>16] + eor r0,r0,r7,ror#16 + ldr r2,[r10,r2,lsl#2] @ Td0[s2>>24] + and r7,lr,r3,lsr#16 @ i0 + eor r1,r1,r8,ror#24 + and r8,lr,r3,lsr#8 @ i1 + eor r6,r9,r6,ror#8 + and r9,lr,r3 @ i2 + ldr r7,[r10,r7,lsl#2] @ Td1[s3>>16] + eor r2,r2,r5,ror#8 + ldr r8,[r10,r8,lsl#2] @ Td2[s3>>8] + mov r3,r3,lsr#24 + + ldr r9,[r10,r9,lsl#2] @ Td3[s3>>0] + eor r0,r0,r7,ror#8 + ldr r7,[r11],#16 + eor r1,r1,r8,ror#16 + ldr r3,[r10,r3,lsl#2] @ Td0[s3>>24] + eor r2,r2,r9,ror#24 + + ldr r4,[r11,#-12] + eor r0,r0,r7 + ldr r5,[r11,#-8] + eor r3,r3,r6,ror#8 + ldr r6,[r11,#-4] + and r7,lr,r0,lsr#16 + eor r1,r1,r4 + and r8,lr,r0,lsr#8 + eor r2,r2,r5 + and r9,lr,r0 + eor r3,r3,r6 + mov r0,r0,lsr#24 + + subs r12,r12,#1 + bne .Ldec_loop + + add r10,r10,#1024 + + ldr r5,[r10,#0] @ prefetch Td4 + ldr r6,[r10,#32] + ldr r4,[r10,#64] + ldr r5,[r10,#96] + ldr r6,[r10,#128] + ldr r4,[r10,#160] + ldr r5,[r10,#192] + ldr r6,[r10,#224] + + ldrb r0,[r10,r0] @ Td4[s0>>24] + ldrb r4,[r10,r7] @ Td4[s0>>16] + and r7,lr,r1 @ i0 + ldrb r5,[r10,r8] @ Td4[s0>>8] + and r8,lr,r1,lsr#16 + ldrb r6,[r10,r9] @ Td4[s0>>0] + and r9,lr,r1,lsr#8 + + ldrb r7,[r10,r7] @ Td4[s1>>0] + ldrb r1,[r10,r1,lsr#24] @ Td4[s1>>24] + ldrb r8,[r10,r8] @ Td4[s1>>16] + eor r0,r7,r0,lsl#24 + ldrb r9,[r10,r9] @ Td4[s1>>8] + eor r1,r4,r1,lsl#8 + and r7,lr,r2,lsr#8 @ i0 + eor r5,r5,r8,lsl#8 + and r8,lr,r2 @ i1 + ldrb r7,[r10,r7] @ Td4[s2>>8] + eor r6,r6,r9,lsl#8 + ldrb r8,[r10,r8] @ Td4[s2>>0] + and r9,lr,r2,lsr#16 + + ldrb r2,[r10,r2,lsr#24] @ Td4[s2>>24] + eor r0,r0,r7,lsl#8 + ldrb r9,[r10,r9] @ Td4[s2>>16] + eor r1,r8,r1,lsl#16 + and r7,lr,r3,lsr#16 @ i0 + eor r2,r5,r2,lsl#16 + and r8,lr,r3,lsr#8 @ i1 + ldrb r7,[r10,r7] @ Td4[s3>>16] + eor r6,r6,r9,lsl#16 + ldrb r8,[r10,r8] @ Td4[s3>>8] + and r9,lr,r3 @ i2 + + ldrb r9,[r10,r9] @ Td4[s3>>0] + ldrb r3,[r10,r3,lsr#24] @ Td4[s3>>24] + eor r0,r0,r7,lsl#16 + ldr r7,[r11,#0] + eor r1,r1,r8,lsl#8 + ldr r4,[r11,#4] + eor r2,r9,r2,lsl#8 + ldr r5,[r11,#8] + eor r3,r6,r3,lsl#24 + ldr r6,[r11,#12] + + eor r0,r0,r7 + eor r1,r1,r4 + eor r2,r2,r5 + eor r3,r3,r6 + + sub r10,r10,#1024 + ldr pc,[sp],#4 @ pop and return +.size _armv4_AES_decrypt,.-_armv4_AES_decrypt +.asciz "AES for ARMv4, CRYPTOGAMS by " +.align 2 diff --git a/arch/arm/crypto/aes_glue.c b/arch/arm/crypto/aes_glue.c new file mode 100644 index 000000000000..59f7877ead6a --- /dev/null +++ b/arch/arm/crypto/aes_glue.c @@ -0,0 +1,108 @@ +/* + * Glue Code for the asm optimized version of the AES Cipher Algorithm + */ + +#include +#include +#include + +#define AES_MAXNR 14 + +typedef struct { + unsigned int rd_key[4 *(AES_MAXNR + 1)]; + int rounds; +} AES_KEY; + +struct AES_CTX { + AES_KEY enc_key; + AES_KEY dec_key; +}; + +asmlinkage void AES_encrypt(const u8 *in, u8 *out, AES_KEY *ctx); +asmlinkage void AES_decrypt(const u8 *in, u8 *out, AES_KEY *ctx); +asmlinkage int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key); +asmlinkage int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key); + +static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ + struct AES_CTX *ctx = crypto_tfm_ctx(tfm); + AES_encrypt(src, dst, &ctx->enc_key); +} + +static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ + struct AES_CTX *ctx = crypto_tfm_ctx(tfm); + AES_decrypt(src, dst, &ctx->dec_key); +} + +static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, + unsigned int key_len) +{ + struct AES_CTX *ctx = crypto_tfm_ctx(tfm); + + switch (key_len) { + case AES_KEYSIZE_128: + key_len = 128; + break; + case AES_KEYSIZE_192: + key_len = 192; + break; + case AES_KEYSIZE_256: + key_len = 256; + break; + default: + tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + return -EINVAL; + } + + if (private_AES_set_encrypt_key(in_key, key_len, &ctx->enc_key) == -1) { + tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + return -EINVAL; + } + /* private_AES_set_decrypt_key expects an encryption key as input */ + ctx->dec_key = ctx->enc_key; + if (private_AES_set_decrypt_key(in_key, key_len, &ctx->dec_key) == -1) { + tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + return -EINVAL; + } + return 0; +} + +static struct crypto_alg aes_alg = { + .cra_name = "aes", + .cra_driver_name = "aes-asm", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_TYPE_CIPHER, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct AES_CTX), + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(aes_alg.cra_list), + .cra_u = { + .cipher = { + .cia_min_keysize = AES_MIN_KEY_SIZE, + .cia_max_keysize = AES_MAX_KEY_SIZE, + .cia_setkey = aes_set_key, + .cia_encrypt = aes_encrypt, + .cia_decrypt = aes_decrypt + } + } +}; + +static int __init aes_init(void) +{ + return crypto_register_alg(&aes_alg); +} + +static void __exit aes_fini(void) +{ + crypto_unregister_alg(&aes_alg); +} + +module_init(aes_init); +module_exit(aes_fini); + +MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm (ASM)"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("aes"); +MODULE_ALIAS("aes-asm"); +MODULE_AUTHOR("David McCullough "); diff --git a/arch/arm/crypto/sha1-armv4-large.S b/arch/arm/crypto/sha1-armv4-large.S new file mode 100644 index 000000000000..7050ab133b9d --- /dev/null +++ b/arch/arm/crypto/sha1-armv4-large.S @@ -0,0 +1,503 @@ +#define __ARM_ARCH__ __LINUX_ARM_ARCH__ +@ ==================================================================== +@ Written by Andy Polyakov for the OpenSSL +@ project. The module is, however, dual licensed under OpenSSL and +@ CRYPTOGAMS licenses depending on where you obtain it. For further +@ details see http://www.openssl.org/~appro/cryptogams/. +@ ==================================================================== + +@ sha1_block procedure for ARMv4. +@ +@ January 2007. + +@ Size/performance trade-off +@ ==================================================================== +@ impl size in bytes comp cycles[*] measured performance +@ ==================================================================== +@ thumb 304 3212 4420 +@ armv4-small 392/+29% 1958/+64% 2250/+96% +@ armv4-compact 740/+89% 1552/+26% 1840/+22% +@ armv4-large 1420/+92% 1307/+19% 1370/+34%[***] +@ full unroll ~5100/+260% ~1260/+4% ~1300/+5% +@ ==================================================================== +@ thumb = same as 'small' but in Thumb instructions[**] and +@ with recurring code in two private functions; +@ small = detached Xload/update, loops are folded; +@ compact = detached Xload/update, 5x unroll; +@ large = interleaved Xload/update, 5x unroll; +@ full unroll = interleaved Xload/update, full unroll, estimated[!]; +@ +@ [*] Manually counted instructions in "grand" loop body. Measured +@ performance is affected by prologue and epilogue overhead, +@ i-cache availability, branch penalties, etc. +@ [**] While each Thumb instruction is twice smaller, they are not as +@ diverse as ARM ones: e.g., there are only two arithmetic +@ instructions with 3 arguments, no [fixed] rotate, addressing +@ modes are limited. As result it takes more instructions to do +@ the same job in Thumb, therefore the code is never twice as +@ small and always slower. +@ [***] which is also ~35% better than compiler generated code. Dual- +@ issue Cortex A8 core was measured to process input block in +@ ~990 cycles. + +@ August 2010. +@ +@ Rescheduling for dual-issue pipeline resulted in 13% improvement on +@ Cortex A8 core and in absolute terms ~870 cycles per input block +@ [or 13.6 cycles per byte]. + +@ February 2011. +@ +@ Profiler-assisted and platform-specific optimization resulted in 10% +@ improvement on Cortex A8 core and 12.2 cycles per byte. + +.text + +.global sha1_block_data_order +.type sha1_block_data_order,%function + +.align 2 +sha1_block_data_order: + stmdb sp!,{r4-r12,lr} + add r2,r1,r2,lsl#6 @ r2 to point at the end of r1 + ldmia r0,{r3,r4,r5,r6,r7} +.Lloop: + ldr r8,.LK_00_19 + mov r14,sp + sub sp,sp,#15*4 + mov r5,r5,ror#30 + mov r6,r6,ror#30 + mov r7,r7,ror#30 @ [6] +.L_00_15: +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r7,r8,r7,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r5,r6 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r7,r7,r3,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r7,r8,r7,ror#2 @ E+=K_00_19 + eor r10,r5,r6 @ F_xx_xx + add r7,r7,r3,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r4,r10,ror#2 + add r7,r7,r9 @ E+=X[i] + eor r10,r10,r6,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r7,r7,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r6,r8,r6,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r4,r5 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r6,r8,r6,ror#2 @ E+=K_00_19 + eor r10,r4,r5 @ F_xx_xx + add r6,r6,r7,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r3,r10,ror#2 + add r6,r6,r9 @ E+=X[i] + eor r10,r10,r5,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r6,r6,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r5,r8,r5,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r3,r4 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r5,r8,r5,ror#2 @ E+=K_00_19 + eor r10,r3,r4 @ F_xx_xx + add r5,r5,r6,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r7,r10,ror#2 + add r5,r5,r9 @ E+=X[i] + eor r10,r10,r4,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r5,r5,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r4,r8,r4,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r7,r3 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r4,r8,r4,ror#2 @ E+=K_00_19 + eor r10,r7,r3 @ F_xx_xx + add r4,r4,r5,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r6,r10,ror#2 + add r4,r4,r9 @ E+=X[i] + eor r10,r10,r3,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r4,r4,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r3,r8,r3,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r6,r7 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r3,r8,r3,ror#2 @ E+=K_00_19 + eor r10,r6,r7 @ F_xx_xx + add r3,r3,r4,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r5,r10,ror#2 + add r3,r3,r9 @ E+=X[i] + eor r10,r10,r7,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r3,r3,r10 @ E+=F_00_19(B,C,D) + teq r14,sp + bne .L_00_15 @ [((11+4)*5+2)*3] +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r7,r8,r7,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r5,r6 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r7,r7,r3,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r7,r8,r7,ror#2 @ E+=K_00_19 + eor r10,r5,r6 @ F_xx_xx + add r7,r7,r3,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r4,r10,ror#2 + add r7,r7,r9 @ E+=X[i] + eor r10,r10,r6,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r7,r7,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r6,r8,r6,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r4,r5 @ F_xx_xx + mov r9,r9,ror#31 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r3,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r6,r6,r9 @ E+=X[i] + eor r10,r10,r5,ror#2 @ F_00_19(B,C,D) + add r6,r6,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r5,r8,r5,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r3,r4 @ F_xx_xx + mov r9,r9,ror#31 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r7,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r5,r5,r9 @ E+=X[i] + eor r10,r10,r4,ror#2 @ F_00_19(B,C,D) + add r5,r5,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r4,r8,r4,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r7,r3 @ F_xx_xx + mov r9,r9,ror#31 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r6,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r4,r4,r9 @ E+=X[i] + eor r10,r10,r3,ror#2 @ F_00_19(B,C,D) + add r4,r4,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r3,r8,r3,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r6,r7 @ F_xx_xx + mov r9,r9,ror#31 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r5,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r3,r3,r9 @ E+=X[i] + eor r10,r10,r7,ror#2 @ F_00_19(B,C,D) + add r3,r3,r10 @ E+=F_00_19(B,C,D) + + ldr r8,.LK_20_39 @ [+15+16*4] + sub sp,sp,#25*4 + cmn sp,#0 @ [+3], clear carry to denote 20_39 +.L_20_39_or_60_79: + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r7,r8,r7,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r5,r6 @ F_xx_xx + mov r9,r9,ror#31 + add r7,r7,r3,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r4,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r7,r7,r9 @ E+=X[i] + add r7,r7,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r6,r8,r6,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r4,r5 @ F_xx_xx + mov r9,r9,ror#31 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r3,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r6,r6,r9 @ E+=X[i] + add r6,r6,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r5,r8,r5,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r3,r4 @ F_xx_xx + mov r9,r9,ror#31 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r7,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r5,r5,r9 @ E+=X[i] + add r5,r5,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r4,r8,r4,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r7,r3 @ F_xx_xx + mov r9,r9,ror#31 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r6,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r4,r4,r9 @ E+=X[i] + add r4,r4,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r3,r8,r3,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r6,r7 @ F_xx_xx + mov r9,r9,ror#31 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r5,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r3,r3,r9 @ E+=X[i] + add r3,r3,r10 @ E+=F_20_39(B,C,D) + teq r14,sp @ preserve carry + bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4] + bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes + + ldr r8,.LK_40_59 + sub sp,sp,#20*4 @ [+2] +.L_40_59: + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r7,r8,r7,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r5,r6 @ F_xx_xx + mov r9,r9,ror#31 + add r7,r7,r3,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r4,r10,ror#2 @ F_xx_xx + and r11,r5,r6 @ F_xx_xx + add r7,r7,r9 @ E+=X[i] + add r7,r7,r10 @ E+=F_40_59(B,C,D) + add r7,r7,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r6,r8,r6,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r4,r5 @ F_xx_xx + mov r9,r9,ror#31 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r3,r10,ror#2 @ F_xx_xx + and r11,r4,r5 @ F_xx_xx + add r6,r6,r9 @ E+=X[i] + add r6,r6,r10 @ E+=F_40_59(B,C,D) + add r6,r6,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r5,r8,r5,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r3,r4 @ F_xx_xx + mov r9,r9,ror#31 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r7,r10,ror#2 @ F_xx_xx + and r11,r3,r4 @ F_xx_xx + add r5,r5,r9 @ E+=X[i] + add r5,r5,r10 @ E+=F_40_59(B,C,D) + add r5,r5,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r4,r8,r4,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r7,r3 @ F_xx_xx + mov r9,r9,ror#31 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r6,r10,ror#2 @ F_xx_xx + and r11,r7,r3 @ F_xx_xx + add r4,r4,r9 @ E+=X[i] + add r4,r4,r10 @ E+=F_40_59(B,C,D) + add r4,r4,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r3,r8,r3,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r6,r7 @ F_xx_xx + mov r9,r9,ror#31 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r5,r10,ror#2 @ F_xx_xx + and r11,r6,r7 @ F_xx_xx + add r3,r3,r9 @ E+=X[i] + add r3,r3,r10 @ E+=F_40_59(B,C,D) + add r3,r3,r11,ror#2 + teq r14,sp + bne .L_40_59 @ [+((12+5)*5+2)*4] + + ldr r8,.LK_60_79 + sub sp,sp,#20*4 + cmp sp,#0 @ set carry to denote 60_79 + b .L_20_39_or_60_79 @ [+4], spare 300 bytes +.L_done: + add sp,sp,#80*4 @ "deallocate" stack frame + ldmia r0,{r8,r9,r10,r11,r12} + add r3,r8,r3 + add r4,r9,r4 + add r5,r10,r5,ror#2 + add r6,r11,r6,ror#2 + add r7,r12,r7,ror#2 + stmia r0,{r3,r4,r5,r6,r7} + teq r1,r2 + bne .Lloop @ [+18], total 1307 + +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r12,pc} +#else + ldmia sp!,{r4-r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + .word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +.align 2 +.LK_00_19: .word 0x5a827999 +.LK_20_39: .word 0x6ed9eba1 +.LK_40_59: .word 0x8f1bbcdc +.LK_60_79: .word 0xca62c1d6 +.size sha1_block_data_order,.-sha1_block_data_order +.asciz "SHA1 block transform for ARMv4, CRYPTOGAMS by " +.align 2 diff --git a/arch/arm/crypto/sha1_glue.c b/arch/arm/crypto/sha1_glue.c new file mode 100644 index 000000000000..76cd976230bc --- /dev/null +++ b/arch/arm/crypto/sha1_glue.c @@ -0,0 +1,179 @@ +/* + * Cryptographic API. + * Glue code for the SHA1 Secure Hash Algorithm assembler implementation + * + * This file is based on sha1_generic.c and sha1_ssse3_glue.c + * + * Copyright (c) Alan Smithee. + * Copyright (c) Andrew McDonald + * Copyright (c) Jean-Francois Dive + * Copyright (c) Mathias Krause + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + */ + +#include +#include +#include +#include +#include +#include +#include + +struct SHA1_CTX { + uint32_t h0,h1,h2,h3,h4; + u64 count; + u8 data[SHA1_BLOCK_SIZE]; +}; + +asmlinkage void sha1_block_data_order(struct SHA1_CTX *digest, + const unsigned char *data, unsigned int rounds); + + +static int sha1_init(struct shash_desc *desc) +{ + struct SHA1_CTX *sctx = shash_desc_ctx(desc); + memset(sctx, 0, sizeof(*sctx)); + sctx->h0 = SHA1_H0; + sctx->h1 = SHA1_H1; + sctx->h2 = SHA1_H2; + sctx->h3 = SHA1_H3; + sctx->h4 = SHA1_H4; + return 0; +} + + +static int __sha1_update(struct SHA1_CTX *sctx, const u8 *data, + unsigned int len, unsigned int partial) +{ + unsigned int done = 0; + + sctx->count += len; + + if (partial) { + done = SHA1_BLOCK_SIZE - partial; + memcpy(sctx->data + partial, data, done); + sha1_block_data_order(sctx, sctx->data, 1); + } + + if (len - done >= SHA1_BLOCK_SIZE) { + const unsigned int rounds = (len - done) / SHA1_BLOCK_SIZE; + sha1_block_data_order(sctx, data + done, rounds); + done += rounds * SHA1_BLOCK_SIZE; + } + + memcpy(sctx->data, data + done, len - done); + return 0; +} + + +static int sha1_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + struct SHA1_CTX *sctx = shash_desc_ctx(desc); + unsigned int partial = sctx->count % SHA1_BLOCK_SIZE; + int res; + + /* Handle the fast case right here */ + if (partial + len < SHA1_BLOCK_SIZE) { + sctx->count += len; + memcpy(sctx->data + partial, data, len); + return 0; + } + res = __sha1_update(sctx, data, len, partial); + return res; +} + + +/* Add padding and return the message digest. */ +static int sha1_final(struct shash_desc *desc, u8 *out) +{ + struct SHA1_CTX *sctx = shash_desc_ctx(desc); + unsigned int i, index, padlen; + __be32 *dst = (__be32 *)out; + __be64 bits; + static const u8 padding[SHA1_BLOCK_SIZE] = { 0x80, }; + + bits = cpu_to_be64(sctx->count << 3); + + /* Pad out to 56 mod 64 and append length */ + index = sctx->count % SHA1_BLOCK_SIZE; + padlen = (index < 56) ? (56 - index) : ((SHA1_BLOCK_SIZE+56) - index); + /* We need to fill a whole block for __sha1_update() */ + if (padlen <= 56) { + sctx->count += padlen; + memcpy(sctx->data + index, padding, padlen); + } else { + __sha1_update(sctx, padding, padlen, index); + } + __sha1_update(sctx, (const u8 *)&bits, sizeof(bits), 56); + + /* Store state in digest */ + for (i = 0; i < 5; i++) + dst[i] = cpu_to_be32(((u32 *)sctx)[i]); + + /* Wipe context */ + memset(sctx, 0, sizeof(*sctx)); + return 0; +} + + +static int sha1_export(struct shash_desc *desc, void *out) +{ + struct SHA1_CTX *sctx = shash_desc_ctx(desc); + memcpy(out, sctx, sizeof(*sctx)); + return 0; +} + + +static int sha1_import(struct shash_desc *desc, const void *in) +{ + struct SHA1_CTX *sctx = shash_desc_ctx(desc); + memcpy(sctx, in, sizeof(*sctx)); + return 0; +} + + +static struct shash_alg alg = { + .digestsize = SHA1_DIGEST_SIZE, + .init = sha1_init, + .update = sha1_update, + .final = sha1_final, + .export = sha1_export, + .import = sha1_import, + .descsize = sizeof(struct SHA1_CTX), + .statesize = sizeof(struct SHA1_CTX), + .base = { + .cra_name = "sha1", + .cra_driver_name= "sha1-asm", + .cra_priority = 150, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA1_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + + +static int __init sha1_mod_init(void) +{ + return crypto_register_shash(&alg); +} + + +static void __exit sha1_mod_fini(void) +{ + crypto_unregister_shash(&alg); +} + + +module_init(sha1_mod_init); +module_exit(sha1_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm (ARM)"); +MODULE_ALIAS("sha1"); +MODULE_AUTHOR("David McCullough "); diff --git a/crypto/Kconfig b/crypto/Kconfig index 213fb37be51f..27307981f88e 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -433,6 +433,15 @@ config CRYPTO_SHA1_SSSE3 using Supplemental SSE3 (SSSE3) instructions or Advanced Vector Extensions (AVX), when available. +config CRYPTO_SHA1_ARM + tristate "SHA1 digest algorithm (ARM-asm)" + depends on ARM + select CRYPTO_SHA1 + select CRYPTO_HASH + help + SHA-1 secure hash standard (FIPS 180-1/DFIPS 180-2) implemented + using optimized ARM assembler. + config CRYPTO_SHA256 tristate "SHA224 and SHA256 digest algorithm" select CRYPTO_HASH @@ -590,6 +599,30 @@ config CRYPTO_AES_NI_INTEL ECB, CBC, LRW, PCBC, XTS. The 64 bit version has additional acceleration for CTR. +config CRYPTO_AES_ARM + tristate "AES cipher algorithms (ARM-asm)" + depends on ARM + select CRYPTO_ALGAPI + select CRYPTO_AES + help + Use optimized AES assembler routines for ARM platforms. + + AES cipher algorithms (FIPS-197). AES uses the Rijndael + algorithm. + + Rijndael appears to be consistently a very good performer in + both hardware and software across a wide range of computing + environments regardless of its use in feedback or non-feedback + modes. Its key setup time is excellent, and its key agility is + good. Rijndael's very low memory requirements make it very well + suited for restricted-space environments, in which it also + demonstrates excellent performance. Rijndael's operations are + among the easiest to defend against power and timing attacks. + + The AES specifies three key sizes: 128, 192 and 256 bits + + See for more information. + config CRYPTO_ANUBIS tristate "Anubis cipher algorithm" select CRYPTO_ALGAPI -- cgit v1.2.3