summaryrefslogtreecommitdiff
path: root/arch/arm64
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2018-10-26 02:43:35 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2018-10-26 02:43:35 +0300
commit62606c224d72a98c35d21a849f95cccf95b0a252 (patch)
tree6f6f3466451edf9baa2ea8b5f9fc558aa555c69a /arch/arm64
parent24ed334f33666f2ae929ccc08f72e7e72e353c64 (diff)
parenta1c6fd4308d37f072e939a2782f24214115fc7e8 (diff)
downloadlinux-62606c224d72a98c35d21a849f95cccf95b0a252.tar.xz
Merge branch 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
Pull crypto updates from Herbert Xu: "API: - Remove VLA usage - Add cryptostat user-space interface - Add notifier for new crypto algorithms Algorithms: - Add OFB mode - Remove speck Drivers: - Remove x86/sha*-mb as they are buggy - Remove pcbc(aes) from x86/aesni - Improve performance of arm/ghash-ce by up to 85% - Implement CTS-CBC in arm64/aes-blk, faster by up to 50% - Remove PMULL based arm64/crc32 driver - Use PMULL in arm64/crct10dif - Add aes-ctr support in s5p-sss - Add caam/qi2 driver Others: - Pick better transform if one becomes available in crc-t10dif" * 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (124 commits) crypto: chelsio - Update ntx queue received from cxgb4 crypto: ccree - avoid implicit enum conversion crypto: caam - add SPDX license identifier to all files crypto: caam/qi - simplify CGR allocation, freeing crypto: mxs-dcp - make symbols 'sha1_null_hash' and 'sha256_null_hash' static crypto: arm64/aes-blk - ensure XTS mask is always loaded crypto: testmgr - fix sizeof() on COMP_BUF_SIZE crypto: chtls - remove set but not used variable 'csk' crypto: axis - fix platform_no_drv_owner.cocci warnings crypto: x86/aes-ni - fix build error following fpu template removal crypto: arm64/aes - fix handling sub-block CTS-CBC inputs crypto: caam/qi2 - avoid double export crypto: mxs-dcp - Fix AES issues crypto: mxs-dcp - Fix SHA null hashes and output length crypto: mxs-dcp - Implement sha import/export crypto: aegis/generic - fix for big endian systems crypto: morus/generic - fix for big endian systems crypto: lrw - fix rebase error after out of bounds fix crypto: cavium/nitrox - use pci_alloc_irq_vectors() while enabling MSI-X. crypto: cavium/nitrox - NITROX command queue changes. ...
Diffstat (limited to 'arch/arm64')
-rw-r--r--arch/arm64/configs/defconfig2
-rw-r--r--arch/arm64/crypto/Kconfig11
-rw-r--r--arch/arm64/crypto/Makefile6
-rw-r--r--arch/arm64/crypto/aes-ce.S5
-rw-r--r--arch/arm64/crypto/aes-glue.c217
-rw-r--r--arch/arm64/crypto/aes-modes.S416
-rw-r--r--arch/arm64/crypto/aes-neon.S6
-rw-r--r--arch/arm64/crypto/crc32-ce-core.S287
-rw-r--r--arch/arm64/crypto/crc32-ce-glue.c244
-rw-r--r--arch/arm64/crypto/crct10dif-ce-core.S314
-rw-r--r--arch/arm64/crypto/crct10dif-ce-glue.c14
-rw-r--r--arch/arm64/crypto/speck-neon-core.S352
-rw-r--r--arch/arm64/crypto/speck-neon-glue.c282
13 files changed, 672 insertions, 1484 deletions
diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index db8d364f8476..3d165b4cdd2a 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -698,6 +698,7 @@ CONFIG_MEMTEST=y
CONFIG_SECURITY=y
CONFIG_CRYPTO_ECHAINIV=y
CONFIG_CRYPTO_ANSI_CPRNG=y
+CONFIG_CRYPTO_DEV_FSL_DPAA2_CAAM=y
CONFIG_ARM64_CRYPTO=y
CONFIG_CRYPTO_SHA1_ARM64_CE=y
CONFIG_CRYPTO_SHA2_ARM64_CE=y
@@ -706,7 +707,6 @@ CONFIG_CRYPTO_SHA3_ARM64=m
CONFIG_CRYPTO_SM3_ARM64_CE=m
CONFIG_CRYPTO_GHASH_ARM64_CE=y
CONFIG_CRYPTO_CRCT10DIF_ARM64_CE=m
-CONFIG_CRYPTO_CRC32_ARM64_CE=m
CONFIG_CRYPTO_AES_ARM64_CE_CCM=y
CONFIG_CRYPTO_AES_ARM64_CE_BLK=y
CONFIG_CRYPTO_CHACHA20_NEON=m
diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index e3fdb0fd6f70..a5606823ed4d 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -66,11 +66,6 @@ config CRYPTO_CRCT10DIF_ARM64_CE
depends on KERNEL_MODE_NEON && CRC_T10DIF
select CRYPTO_HASH
-config CRYPTO_CRC32_ARM64_CE
- tristate "CRC32 and CRC32C digest algorithms using ARMv8 extensions"
- depends on CRC32
- select CRYPTO_HASH
-
config CRYPTO_AES_ARM64
tristate "AES core cipher using scalar instructions"
select CRYPTO_AES
@@ -119,10 +114,4 @@ config CRYPTO_AES_ARM64_BS
select CRYPTO_AES_ARM64
select CRYPTO_SIMD
-config CRYPTO_SPECK_NEON
- tristate "NEON accelerated Speck cipher algorithms"
- depends on KERNEL_MODE_NEON
- select CRYPTO_BLKCIPHER
- select CRYPTO_SPECK
-
endif
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index bcafd016618e..f476fede09ba 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -32,9 +32,6 @@ ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o
obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM64_CE) += crct10dif-ce.o
crct10dif-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o
-obj-$(CONFIG_CRYPTO_CRC32_ARM64_CE) += crc32-ce.o
-crc32-ce-y:= crc32-ce-core.o crc32-ce-glue.o
-
obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o
aes-ce-cipher-y := aes-ce-core.o aes-ce-glue.o
@@ -56,9 +53,6 @@ sha512-arm64-y := sha512-glue.o sha512-core.o
obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o
chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o
-obj-$(CONFIG_CRYPTO_SPECK_NEON) += speck-neon.o
-speck-neon-y := speck-neon-core.o speck-neon-glue.o
-
obj-$(CONFIG_CRYPTO_AES_ARM64) += aes-arm64.o
aes-arm64-y := aes-cipher-core.o aes-cipher-glue.o
diff --git a/arch/arm64/crypto/aes-ce.S b/arch/arm64/crypto/aes-ce.S
index 623e74ed1c67..143070510809 100644
--- a/arch/arm64/crypto/aes-ce.S
+++ b/arch/arm64/crypto/aes-ce.S
@@ -17,6 +17,11 @@
.arch armv8-a+crypto
+ xtsmask .req v16
+
+ .macro xts_reload_mask, tmp
+ .endm
+
/* preload all round keys */
.macro load_round_keys, rounds, rk
cmp \rounds, #12
diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c
index adcb83eb683c..1e676625ef33 100644
--- a/arch/arm64/crypto/aes-glue.c
+++ b/arch/arm64/crypto/aes-glue.c
@@ -15,6 +15,7 @@
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
+#include <crypto/scatterwalk.h>
#include <linux/module.h>
#include <linux/cpufeature.h>
#include <crypto/xts.h>
@@ -31,6 +32,8 @@
#define aes_ecb_decrypt ce_aes_ecb_decrypt
#define aes_cbc_encrypt ce_aes_cbc_encrypt
#define aes_cbc_decrypt ce_aes_cbc_decrypt
+#define aes_cbc_cts_encrypt ce_aes_cbc_cts_encrypt
+#define aes_cbc_cts_decrypt ce_aes_cbc_cts_decrypt
#define aes_ctr_encrypt ce_aes_ctr_encrypt
#define aes_xts_encrypt ce_aes_xts_encrypt
#define aes_xts_decrypt ce_aes_xts_decrypt
@@ -45,6 +48,8 @@ MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions");
#define aes_ecb_decrypt neon_aes_ecb_decrypt
#define aes_cbc_encrypt neon_aes_cbc_encrypt
#define aes_cbc_decrypt neon_aes_cbc_decrypt
+#define aes_cbc_cts_encrypt neon_aes_cbc_cts_encrypt
+#define aes_cbc_cts_decrypt neon_aes_cbc_cts_decrypt
#define aes_ctr_encrypt neon_aes_ctr_encrypt
#define aes_xts_encrypt neon_aes_xts_encrypt
#define aes_xts_decrypt neon_aes_xts_decrypt
@@ -63,30 +68,41 @@ MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
/* defined in aes-modes.S */
-asmlinkage void aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[],
+asmlinkage void aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[],
int rounds, int blocks);
-asmlinkage void aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[],
+asmlinkage void aes_ecb_decrypt(u8 out[], u8 const in[], u32 const rk[],
int rounds, int blocks);
-asmlinkage void aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[],
+asmlinkage void aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[],
int rounds, int blocks, u8 iv[]);
-asmlinkage void aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
+asmlinkage void aes_cbc_decrypt(u8 out[], u8 const in[], u32 const rk[],
int rounds, int blocks, u8 iv[]);
-asmlinkage void aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
+asmlinkage void aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
+ int rounds, int bytes, u8 const iv[]);
+asmlinkage void aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
+ int rounds, int bytes, u8 const iv[]);
+
+asmlinkage void aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[],
int rounds, int blocks, u8 ctr[]);
-asmlinkage void aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[],
- int rounds, int blocks, u8 const rk2[], u8 iv[],
+asmlinkage void aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[],
+ int rounds, int blocks, u32 const rk2[], u8 iv[],
int first);
-asmlinkage void aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[],
- int rounds, int blocks, u8 const rk2[], u8 iv[],
+asmlinkage void aes_xts_decrypt(u8 out[], u8 const in[], u32 const rk1[],
+ int rounds, int blocks, u32 const rk2[], u8 iv[],
int first);
asmlinkage void aes_mac_update(u8 const in[], u32 const rk[], int rounds,
int blocks, u8 dg[], int enc_before,
int enc_after);
+struct cts_cbc_req_ctx {
+ struct scatterlist sg_src[2];
+ struct scatterlist sg_dst[2];
+ struct skcipher_request subreq;
+};
+
struct crypto_aes_xts_ctx {
struct crypto_aes_ctx key1;
struct crypto_aes_ctx __aligned(8) key2;
@@ -142,7 +158,7 @@ static int ecb_encrypt(struct skcipher_request *req)
while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
kernel_neon_begin();
aes_ecb_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
- (u8 *)ctx->key_enc, rounds, blocks);
+ ctx->key_enc, rounds, blocks);
kernel_neon_end();
err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
}
@@ -162,7 +178,7 @@ static int ecb_decrypt(struct skcipher_request *req)
while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
kernel_neon_begin();
aes_ecb_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
- (u8 *)ctx->key_dec, rounds, blocks);
+ ctx->key_dec, rounds, blocks);
kernel_neon_end();
err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
}
@@ -182,7 +198,7 @@ static int cbc_encrypt(struct skcipher_request *req)
while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
kernel_neon_begin();
aes_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
- (u8 *)ctx->key_enc, rounds, blocks, walk.iv);
+ ctx->key_enc, rounds, blocks, walk.iv);
kernel_neon_end();
err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
}
@@ -202,13 +218,149 @@ static int cbc_decrypt(struct skcipher_request *req)
while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
kernel_neon_begin();
aes_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
- (u8 *)ctx->key_dec, rounds, blocks, walk.iv);
+ ctx->key_dec, rounds, blocks, walk.iv);
kernel_neon_end();
err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
}
return err;
}
+static int cts_cbc_init_tfm(struct crypto_skcipher *tfm)
+{
+ crypto_skcipher_set_reqsize(tfm, sizeof(struct cts_cbc_req_ctx));
+ return 0;
+}
+
+static int cts_cbc_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct cts_cbc_req_ctx *rctx = skcipher_request_ctx(req);
+ int err, rounds = 6 + ctx->key_length / 4;
+ int cbc_blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2;
+ struct scatterlist *src = req->src, *dst = req->dst;
+ struct skcipher_walk walk;
+
+ skcipher_request_set_tfm(&rctx->subreq, tfm);
+
+ if (req->cryptlen <= AES_BLOCK_SIZE) {
+ if (req->cryptlen < AES_BLOCK_SIZE)
+ return -EINVAL;
+ cbc_blocks = 1;
+ }
+
+ if (cbc_blocks > 0) {
+ unsigned int blocks;
+
+ skcipher_request_set_crypt(&rctx->subreq, req->src, req->dst,
+ cbc_blocks * AES_BLOCK_SIZE,
+ req->iv);
+
+ err = skcipher_walk_virt(&walk, &rctx->subreq, false);
+
+ while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
+ kernel_neon_begin();
+ aes_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
+ ctx->key_enc, rounds, blocks, walk.iv);
+ kernel_neon_end();
+ err = skcipher_walk_done(&walk,
+ walk.nbytes % AES_BLOCK_SIZE);
+ }
+ if (err)
+ return err;
+
+ if (req->cryptlen == AES_BLOCK_SIZE)
+ return 0;
+
+ dst = src = scatterwalk_ffwd(rctx->sg_src, req->src,
+ rctx->subreq.cryptlen);
+ if (req->dst != req->src)
+ dst = scatterwalk_ffwd(rctx->sg_dst, req->dst,
+ rctx->subreq.cryptlen);
+ }
+
+ /* handle ciphertext stealing */
+ skcipher_request_set_crypt(&rctx->subreq, src, dst,
+ req->cryptlen - cbc_blocks * AES_BLOCK_SIZE,
+ req->iv);
+
+ err = skcipher_walk_virt(&walk, &rctx->subreq, false);
+ if (err)
+ return err;
+
+ kernel_neon_begin();
+ aes_cbc_cts_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
+ ctx->key_enc, rounds, walk.nbytes, walk.iv);
+ kernel_neon_end();
+
+ return skcipher_walk_done(&walk, 0);
+}
+
+static int cts_cbc_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct cts_cbc_req_ctx *rctx = skcipher_request_ctx(req);
+ int err, rounds = 6 + ctx->key_length / 4;
+ int cbc_blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2;
+ struct scatterlist *src = req->src, *dst = req->dst;
+ struct skcipher_walk walk;
+
+ skcipher_request_set_tfm(&rctx->subreq, tfm);
+
+ if (req->cryptlen <= AES_BLOCK_SIZE) {
+ if (req->cryptlen < AES_BLOCK_SIZE)
+ return -EINVAL;
+ cbc_blocks = 1;
+ }
+
+ if (cbc_blocks > 0) {
+ unsigned int blocks;
+
+ skcipher_request_set_crypt(&rctx->subreq, req->src, req->dst,
+ cbc_blocks * AES_BLOCK_SIZE,
+ req->iv);
+
+ err = skcipher_walk_virt(&walk, &rctx->subreq, false);
+
+ while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
+ kernel_neon_begin();
+ aes_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
+ ctx->key_dec, rounds, blocks, walk.iv);
+ kernel_neon_end();
+ err = skcipher_walk_done(&walk,
+ walk.nbytes % AES_BLOCK_SIZE);
+ }
+ if (err)
+ return err;
+
+ if (req->cryptlen == AES_BLOCK_SIZE)
+ return 0;
+
+ dst = src = scatterwalk_ffwd(rctx->sg_src, req->src,
+ rctx->subreq.cryptlen);
+ if (req->dst != req->src)
+ dst = scatterwalk_ffwd(rctx->sg_dst, req->dst,
+ rctx->subreq.cryptlen);
+ }
+
+ /* handle ciphertext stealing */
+ skcipher_request_set_crypt(&rctx->subreq, src, dst,
+ req->cryptlen - cbc_blocks * AES_BLOCK_SIZE,
+ req->iv);
+
+ err = skcipher_walk_virt(&walk, &rctx->subreq, false);
+ if (err)
+ return err;
+
+ kernel_neon_begin();
+ aes_cbc_cts_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
+ ctx->key_dec, rounds, walk.nbytes, walk.iv);
+ kernel_neon_end();
+
+ return skcipher_walk_done(&walk, 0);
+}
+
static int ctr_encrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
@@ -222,7 +374,7 @@ static int ctr_encrypt(struct skcipher_request *req)
while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
kernel_neon_begin();
aes_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
- (u8 *)ctx->key_enc, rounds, blocks, walk.iv);
+ ctx->key_enc, rounds, blocks, walk.iv);
kernel_neon_end();
err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
}
@@ -238,7 +390,7 @@ static int ctr_encrypt(struct skcipher_request *req)
blocks = -1;
kernel_neon_begin();
- aes_ctr_encrypt(tail, NULL, (u8 *)ctx->key_enc, rounds,
+ aes_ctr_encrypt(tail, NULL, ctx->key_enc, rounds,
blocks, walk.iv);
kernel_neon_end();
crypto_xor_cpy(tdst, tsrc, tail, nbytes);
@@ -272,8 +424,8 @@ static int xts_encrypt(struct skcipher_request *req)
for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
kernel_neon_begin();
aes_xts_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
- (u8 *)ctx->key1.key_enc, rounds, blocks,
- (u8 *)ctx->key2.key_enc, walk.iv, first);
+ ctx->key1.key_enc, rounds, blocks,
+ ctx->key2.key_enc, walk.iv, first);
kernel_neon_end();
err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
}
@@ -294,8 +446,8 @@ static int xts_decrypt(struct skcipher_request *req)
for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
kernel_neon_begin();
aes_xts_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
- (u8 *)ctx->key1.key_dec, rounds, blocks,
- (u8 *)ctx->key2.key_enc, walk.iv, first);
+ ctx->key1.key_dec, rounds, blocks,
+ ctx->key2.key_enc, walk.iv, first);
kernel_neon_end();
err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
}
@@ -336,6 +488,24 @@ static struct skcipher_alg aes_algs[] = { {
.decrypt = cbc_decrypt,
}, {
.base = {
+ .cra_name = "__cts(cbc(aes))",
+ .cra_driver_name = "__cts-cbc-aes-" MODE,
+ .cra_priority = PRIO,
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct crypto_aes_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .walksize = 2 * AES_BLOCK_SIZE,
+ .setkey = skcipher_aes_setkey,
+ .encrypt = cts_cbc_encrypt,
+ .decrypt = cts_cbc_decrypt,
+ .init = cts_cbc_init_tfm,
+}, {
+ .base = {
.cra_name = "__ctr(aes)",
.cra_driver_name = "__ctr-aes-" MODE,
.cra_priority = PRIO,
@@ -412,7 +582,6 @@ static int cmac_setkey(struct crypto_shash *tfm, const u8 *in_key,
{
struct mac_tfm_ctx *ctx = crypto_shash_ctx(tfm);
be128 *consts = (be128 *)ctx->consts;
- u8 *rk = (u8 *)ctx->key.key_enc;
int rounds = 6 + key_len / 4;
int err;
@@ -422,7 +591,8 @@ static int cmac_setkey(struct crypto_shash *tfm, const u8 *in_key,
/* encrypt the zero vector */
kernel_neon_begin();
- aes_ecb_encrypt(ctx->consts, (u8[AES_BLOCK_SIZE]){}, rk, rounds, 1);
+ aes_ecb_encrypt(ctx->consts, (u8[AES_BLOCK_SIZE]){}, ctx->key.key_enc,
+ rounds, 1);
kernel_neon_end();
cmac_gf128_mul_by_x(consts, consts);
@@ -441,7 +611,6 @@ static int xcbc_setkey(struct crypto_shash *tfm, const u8 *in_key,
};
struct mac_tfm_ctx *ctx = crypto_shash_ctx(tfm);
- u8 *rk = (u8 *)ctx->key.key_enc;
int rounds = 6 + key_len / 4;
u8 key[AES_BLOCK_SIZE];
int err;
@@ -451,8 +620,8 @@ static int xcbc_setkey(struct crypto_shash *tfm, const u8 *in_key,
return err;
kernel_neon_begin();
- aes_ecb_encrypt(key, ks[0], rk, rounds, 1);
- aes_ecb_encrypt(ctx->consts, ks[1], rk, rounds, 2);
+ aes_ecb_encrypt(key, ks[0], ctx->key.key_enc, rounds, 1);
+ aes_ecb_encrypt(ctx->consts, ks[1], ctx->key.key_enc, rounds, 2);
kernel_neon_end();
return cbcmac_setkey(tfm, key, sizeof(key));
diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S
index 483a7130cf0e..67700045a0e0 100644
--- a/arch/arm64/crypto/aes-modes.S
+++ b/arch/arm64/crypto/aes-modes.S
@@ -14,12 +14,12 @@
.align 4
aes_encrypt_block4x:
- encrypt_block4x v0, v1, v2, v3, w22, x21, x8, w7
+ encrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
ret
ENDPROC(aes_encrypt_block4x)
aes_decrypt_block4x:
- decrypt_block4x v0, v1, v2, v3, w22, x21, x8, w7
+ decrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
ret
ENDPROC(aes_decrypt_block4x)
@@ -31,71 +31,57 @@ ENDPROC(aes_decrypt_block4x)
*/
AES_ENTRY(aes_ecb_encrypt)
- frame_push 5
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
- mov x19, x0
- mov x20, x1
- mov x21, x2
- mov x22, x3
- mov x23, x4
-
-.Lecbencrestart:
- enc_prepare w22, x21, x5
+ enc_prepare w3, x2, x5
.LecbencloopNx:
- subs w23, w23, #4
+ subs w4, w4, #4
bmi .Lecbenc1x
- ld1 {v0.16b-v3.16b}, [x20], #64 /* get 4 pt blocks */
+ ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
bl aes_encrypt_block4x
- st1 {v0.16b-v3.16b}, [x19], #64
- cond_yield_neon .Lecbencrestart
+ st1 {v0.16b-v3.16b}, [x0], #64
b .LecbencloopNx
.Lecbenc1x:
- adds w23, w23, #4
+ adds w4, w4, #4
beq .Lecbencout
.Lecbencloop:
- ld1 {v0.16b}, [x20], #16 /* get next pt block */
- encrypt_block v0, w22, x21, x5, w6
- st1 {v0.16b}, [x19], #16
- subs w23, w23, #1
+ ld1 {v0.16b}, [x1], #16 /* get next pt block */
+ encrypt_block v0, w3, x2, x5, w6
+ st1 {v0.16b}, [x0], #16
+ subs w4, w4, #1
bne .Lecbencloop
.Lecbencout:
- frame_pop
+ ldp x29, x30, [sp], #16
ret
AES_ENDPROC(aes_ecb_encrypt)
AES_ENTRY(aes_ecb_decrypt)
- frame_push 5
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
- mov x19, x0
- mov x20, x1
- mov x21, x2
- mov x22, x3
- mov x23, x4
-
-.Lecbdecrestart:
- dec_prepare w22, x21, x5
+ dec_prepare w3, x2, x5
.LecbdecloopNx:
- subs w23, w23, #4
+ subs w4, w4, #4
bmi .Lecbdec1x
- ld1 {v0.16b-v3.16b}, [x20], #64 /* get 4 ct blocks */
+ ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
bl aes_decrypt_block4x
- st1 {v0.16b-v3.16b}, [x19], #64
- cond_yield_neon .Lecbdecrestart
+ st1 {v0.16b-v3.16b}, [x0], #64
b .LecbdecloopNx
.Lecbdec1x:
- adds w23, w23, #4
+ adds w4, w4, #4
beq .Lecbdecout
.Lecbdecloop:
- ld1 {v0.16b}, [x20], #16 /* get next ct block */
- decrypt_block v0, w22, x21, x5, w6
- st1 {v0.16b}, [x19], #16
- subs w23, w23, #1
+ ld1 {v0.16b}, [x1], #16 /* get next ct block */
+ decrypt_block v0, w3, x2, x5, w6
+ st1 {v0.16b}, [x0], #16
+ subs w4, w4, #1
bne .Lecbdecloop
.Lecbdecout:
- frame_pop
+ ldp x29, x30, [sp], #16
ret
AES_ENDPROC(aes_ecb_decrypt)
@@ -108,162 +94,211 @@ AES_ENDPROC(aes_ecb_decrypt)
*/
AES_ENTRY(aes_cbc_encrypt)
- frame_push 6
-
- mov x19, x0
- mov x20, x1
- mov x21, x2
- mov x22, x3
- mov x23, x4
- mov x24, x5
-
-.Lcbcencrestart:
- ld1 {v4.16b}, [x24] /* get iv */
- enc_prepare w22, x21, x6
+ ld1 {v4.16b}, [x5] /* get iv */
+ enc_prepare w3, x2, x6
.Lcbcencloop4x:
- subs w23, w23, #4
+ subs w4, w4, #4
bmi .Lcbcenc1x
- ld1 {v0.16b-v3.16b}, [x20], #64 /* get 4 pt blocks */
+ ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
eor v0.16b, v0.16b, v4.16b /* ..and xor with iv */
- encrypt_block v0, w22, x21, x6, w7
+ encrypt_block v0, w3, x2, x6, w7
eor v1.16b, v1.16b, v0.16b
- encrypt_block v1, w22, x21, x6, w7
+ encrypt_block v1, w3, x2, x6, w7
eor v2.16b, v2.16b, v1.16b
- encrypt_block v2, w22, x21, x6, w7
+ encrypt_block v2, w3, x2, x6, w7
eor v3.16b, v3.16b, v2.16b
- encrypt_block v3, w22, x21, x6, w7
- st1 {v0.16b-v3.16b}, [x19], #64
+ encrypt_block v3, w3, x2, x6, w7
+ st1 {v0.16b-v3.16b}, [x0], #64
mov v4.16b, v3.16b
- st1 {v4.16b}, [x24] /* return iv */
- cond_yield_neon .Lcbcencrestart
b .Lcbcencloop4x
.Lcbcenc1x:
- adds w23, w23, #4
+ adds w4, w4, #4
beq .Lcbcencout
.Lcbcencloop:
- ld1 {v0.16b}, [x20], #16 /* get next pt block */
+ ld1 {v0.16b}, [x1], #16 /* get next pt block */
eor v4.16b, v4.16b, v0.16b /* ..and xor with iv */
- encrypt_block v4, w22, x21, x6, w7
- st1 {v4.16b}, [x19], #16
- subs w23, w23, #1
+ encrypt_block v4, w3, x2, x6, w7
+ st1 {v4.16b}, [x0], #16
+ subs w4, w4, #1
bne .Lcbcencloop
.Lcbcencout:
- st1 {v4.16b}, [x24] /* return iv */
- frame_pop
+ st1 {v4.16b}, [x5] /* return iv */
ret
AES_ENDPROC(aes_cbc_encrypt)
AES_ENTRY(aes_cbc_decrypt)
- frame_push 6
-
- mov x19, x0
- mov x20, x1
- mov x21, x2
- mov x22, x3
- mov x23, x4
- mov x24, x5
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
-.Lcbcdecrestart:
- ld1 {v7.16b}, [x24] /* get iv */
- dec_prepare w22, x21, x6
+ ld1 {v7.16b}, [x5] /* get iv */
+ dec_prepare w3, x2, x6
.LcbcdecloopNx:
- subs w23, w23, #4
+ subs w4, w4, #4
bmi .Lcbcdec1x
- ld1 {v0.16b-v3.16b}, [x20], #64 /* get 4 ct blocks */
+ ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
mov v4.16b, v0.16b
mov v5.16b, v1.16b
mov v6.16b, v2.16b
bl aes_decrypt_block4x
- sub x20, x20, #16
+ sub x1, x1, #16
eor v0.16b, v0.16b, v7.16b
eor v1.16b, v1.16b, v4.16b
- ld1 {v7.16b}, [x20], #16 /* reload 1 ct block */
+ ld1 {v7.16b}, [x1], #16 /* reload 1 ct block */
eor v2.16b, v2.16b, v5.16b
eor v3.16b, v3.16b, v6.16b
- st1 {v0.16b-v3.16b}, [x19], #64
- st1 {v7.16b}, [x24] /* return iv */
- cond_yield_neon .Lcbcdecrestart
+ st1 {v0.16b-v3.16b}, [x0], #64
b .LcbcdecloopNx
.Lcbcdec1x:
- adds w23, w23, #4
+ adds w4, w4, #4
beq .Lcbcdecout
.Lcbcdecloop:
- ld1 {v1.16b}, [x20], #16 /* get next ct block */
+ ld1 {v1.16b}, [x1], #16 /* get next ct block */
mov v0.16b, v1.16b /* ...and copy to v0 */
- decrypt_block v0, w22, x21, x6, w7
+ decrypt_block v0, w3, x2, x6, w7
eor v0.16b, v0.16b, v7.16b /* xor with iv => pt */
mov v7.16b, v1.16b /* ct is next iv */
- st1 {v0.16b}, [x19], #16
- subs w23, w23, #1
+ st1 {v0.16b}, [x0], #16
+ subs w4, w4, #1
bne .Lcbcdecloop
.Lcbcdecout:
- st1 {v7.16b}, [x24] /* return iv */
- frame_pop
+ st1 {v7.16b}, [x5] /* return iv */
+ ldp x29, x30, [sp], #16
ret
AES_ENDPROC(aes_cbc_decrypt)
/*
+ * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
+ * int rounds, int bytes, u8 const iv[])
+ * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
+ * int rounds, int bytes, u8 const iv[])
+ */
+
+AES_ENTRY(aes_cbc_cts_encrypt)
+ adr_l x8, .Lcts_permute_table
+ sub x4, x4, #16
+ add x9, x8, #32
+ add x8, x8, x4
+ sub x9, x9, x4
+ ld1 {v3.16b}, [x8]
+ ld1 {v4.16b}, [x9]
+
+ ld1 {v0.16b}, [x1], x4 /* overlapping loads */
+ ld1 {v1.16b}, [x1]
+
+ ld1 {v5.16b}, [x5] /* get iv */
+ enc_prepare w3, x2, x6
+
+ eor v0.16b, v0.16b, v5.16b /* xor with iv */
+ tbl v1.16b, {v1.16b}, v4.16b
+ encrypt_block v0, w3, x2, x6, w7
+
+ eor v1.16b, v1.16b, v0.16b
+ tbl v0.16b, {v0.16b}, v3.16b
+ encrypt_block v1, w3, x2, x6, w7
+
+ add x4, x0, x4
+ st1 {v0.16b}, [x4] /* overlapping stores */
+ st1 {v1.16b}, [x0]
+ ret
+AES_ENDPROC(aes_cbc_cts_encrypt)
+
+AES_ENTRY(aes_cbc_cts_decrypt)
+ adr_l x8, .Lcts_permute_table
+ sub x4, x4, #16
+ add x9, x8, #32
+ add x8, x8, x4
+ sub x9, x9, x4
+ ld1 {v3.16b}, [x8]
+ ld1 {v4.16b}, [x9]
+
+ ld1 {v0.16b}, [x1], x4 /* overlapping loads */
+ ld1 {v1.16b}, [x1]
+
+ ld1 {v5.16b}, [x5] /* get iv */
+ dec_prepare w3, x2, x6
+
+ tbl v2.16b, {v1.16b}, v4.16b
+ decrypt_block v0, w3, x2, x6, w7
+ eor v2.16b, v2.16b, v0.16b
+
+ tbx v0.16b, {v1.16b}, v4.16b
+ tbl v2.16b, {v2.16b}, v3.16b
+ decrypt_block v0, w3, x2, x6, w7
+ eor v0.16b, v0.16b, v5.16b /* xor with iv */
+
+ add x4, x0, x4
+ st1 {v2.16b}, [x4] /* overlapping stores */
+ st1 {v0.16b}, [x0]
+ ret
+AES_ENDPROC(aes_cbc_cts_decrypt)
+
+ .section ".rodata", "a"
+ .align 6
+.Lcts_permute_table:
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
+ .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .previous
+
+
+ /*
* aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
* int blocks, u8 ctr[])
*/
AES_ENTRY(aes_ctr_encrypt)
- frame_push 6
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
- mov x19, x0
- mov x20, x1
- mov x21, x2
- mov x22, x3
- mov x23, x4
- mov x24, x5
-
-.Lctrrestart:
- enc_prepare w22, x21, x6
- ld1 {v4.16b}, [x24]
+ enc_prepare w3, x2, x6
+ ld1 {v4.16b}, [x5]
umov x6, v4.d[1] /* keep swabbed ctr in reg */
rev x6, x6
+ cmn w6, w4 /* 32 bit overflow? */
+ bcs .Lctrloop
.LctrloopNx:
- subs w23, w23, #4
+ subs w4, w4, #4
bmi .Lctr1x
- cmn w6, #4 /* 32 bit overflow? */
- bcs .Lctr1x
- ldr q8, =0x30000000200000001 /* addends 1,2,3[,0] */
- dup v7.4s, w6
+ add w7, w6, #1
mov v0.16b, v4.16b
- add v7.4s, v7.4s, v8.4s
+ add w8, w6, #2
mov v1.16b, v4.16b
- rev32 v8.16b, v7.16b
+ add w9, w6, #3
mov v2.16b, v4.16b
+ rev w7, w7
mov v3.16b, v4.16b
- mov v1.s[3], v8.s[0]
- mov v2.s[3], v8.s[1]
- mov v3.s[3], v8.s[2]
- ld1 {v5.16b-v7.16b}, [x20], #48 /* get 3 input blocks */
+ rev w8, w8
+ mov v1.s[3], w7
+ rev w9, w9
+ mov v2.s[3], w8
+ mov v3.s[3], w9
+ ld1 {v5.16b-v7.16b}, [x1], #48 /* get 3 input blocks */
bl aes_encrypt_block4x
eor v0.16b, v5.16b, v0.16b
- ld1 {v5.16b}, [x20], #16 /* get 1 input block */
+ ld1 {v5.16b}, [x1], #16 /* get 1 input block */
eor v1.16b, v6.16b, v1.16b
eor v2.16b, v7.16b, v2.16b
eor v3.16b, v5.16b, v3.16b
- st1 {v0.16b-v3.16b}, [x19], #64
+ st1 {v0.16b-v3.16b}, [x0], #64
add x6, x6, #4
rev x7, x6
ins v4.d[1], x7
- cbz w23, .Lctrout
- st1 {v4.16b}, [x24] /* return next CTR value */
- cond_yield_neon .Lctrrestart
+ cbz w4, .Lctrout
b .LctrloopNx
.Lctr1x:
- adds w23, w23, #4
+ adds w4, w4, #4
beq .Lctrout
.Lctrloop:
mov v0.16b, v4.16b
- encrypt_block v0, w22, x21, x8, w7
+ encrypt_block v0, w3, x2, x8, w7
adds x6, x6, #1 /* increment BE ctr */
rev x7, x6
@@ -271,22 +306,22 @@ AES_ENTRY(aes_ctr_encrypt)
bcs .Lctrcarry /* overflow? */
.Lctrcarrydone:
- subs w23, w23, #1
+ subs w4, w4, #1
bmi .Lctrtailblock /* blocks <0 means tail block */
- ld1 {v3.16b}, [x20], #16
+ ld1 {v3.16b}, [x1], #16
eor v3.16b, v0.16b, v3.16b
- st1 {v3.16b}, [x19], #16
+ st1 {v3.16b}, [x0], #16
bne .Lctrloop
.Lctrout:
- st1 {v4.16b}, [x24] /* return next CTR value */
-.Lctrret:
- frame_pop
+ st1 {v4.16b}, [x5] /* return next CTR value */
+ ldp x29, x30, [sp], #16
ret
.Lctrtailblock:
- st1 {v0.16b}, [x19]
- b .Lctrret
+ st1 {v0.16b}, [x0]
+ ldp x29, x30, [sp], #16
+ ret
.Lctrcarry:
umov x7, v4.d[0] /* load upper word of ctr */
@@ -296,7 +331,6 @@ AES_ENTRY(aes_ctr_encrypt)
ins v4.d[0], x7
b .Lctrcarrydone
AES_ENDPROC(aes_ctr_encrypt)
- .ltorg
/*
@@ -306,150 +340,132 @@ AES_ENDPROC(aes_ctr_encrypt)
* int blocks, u8 const rk2[], u8 iv[], int first)
*/
- .macro next_tweak, out, in, const, tmp
+ .macro next_tweak, out, in, tmp
sshr \tmp\().2d, \in\().2d, #63
- and \tmp\().16b, \tmp\().16b, \const\().16b
+ and \tmp\().16b, \tmp\().16b, xtsmask.16b
add \out\().2d, \in\().2d, \in\().2d
ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
eor \out\().16b, \out\().16b, \tmp\().16b
.endm
-.Lxts_mul_x:
-CPU_LE( .quad 1, 0x87 )
-CPU_BE( .quad 0x87, 1 )
+ .macro xts_load_mask, tmp
+ movi xtsmask.2s, #0x1
+ movi \tmp\().2s, #0x87
+ uzp1 xtsmask.4s, xtsmask.4s, \tmp\().4s
+ .endm
AES_ENTRY(aes_xts_encrypt)
- frame_push 6
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
- mov x19, x0
- mov x20, x1
- mov x21, x2
- mov x22, x3
- mov x23, x4
- mov x24, x6
-
- ld1 {v4.16b}, [x24]
+ ld1 {v4.16b}, [x6]
+ xts_load_mask v8
cbz w7, .Lxtsencnotfirst
enc_prepare w3, x5, x8
encrypt_block v4, w3, x5, x8, w7 /* first tweak */
enc_switch_key w3, x2, x8
- ldr q7, .Lxts_mul_x
b .LxtsencNx
-.Lxtsencrestart:
- ld1 {v4.16b}, [x24]
.Lxtsencnotfirst:
- enc_prepare w22, x21, x8
+ enc_prepare w3, x2, x8
.LxtsencloopNx:
- ldr q7, .Lxts_mul_x
- next_tweak v4, v4, v7, v8
+ next_tweak v4, v4, v8
.LxtsencNx:
- subs w23, w23, #4
+ subs w4, w4, #4
bmi .Lxtsenc1x
- ld1 {v0.16b-v3.16b}, [x20], #64 /* get 4 pt blocks */
- next_tweak v5, v4, v7, v8
+ ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
+ next_tweak v5, v4, v8
eor v0.16b, v0.16b, v4.16b
- next_tweak v6, v5, v7, v8
+ next_tweak v6, v5, v8
eor v1.16b, v1.16b, v5.16b
eor v2.16b, v2.16b, v6.16b
- next_tweak v7, v6, v7, v8
+ next_tweak v7, v6, v8
eor v3.16b, v3.16b, v7.16b
bl aes_encrypt_block4x
eor v3.16b, v3.16b, v7.16b
eor v0.16b, v0.16b, v4.16b
eor v1.16b, v1.16b, v5.16b
eor v2.16b, v2.16b, v6.16b
- st1 {v0.16b-v3.16b}, [x19], #64
+ st1 {v0.16b-v3.16b}, [x0], #64
mov v4.16b, v7.16b
- cbz w23, .Lxtsencout
- st1 {v4.16b}, [x24]
- cond_yield_neon .Lxtsencrestart
+ cbz w4, .Lxtsencout
+ xts_reload_mask v8
b .LxtsencloopNx
.Lxtsenc1x:
- adds w23, w23, #4
+ adds w4, w4, #4
beq .Lxtsencout
.Lxtsencloop:
- ld1 {v1.16b}, [x20], #16
+ ld1 {v1.16b}, [x1], #16
eor v0.16b, v1.16b, v4.16b
- encrypt_block v0, w22, x21, x8, w7
+ encrypt_block v0, w3, x2, x8, w7
eor v0.16b, v0.16b, v4.16b
- st1 {v0.16b}, [x19], #16
- subs w23, w23, #1
+ st1 {v0.16b}, [x0], #16
+ subs w4, w4, #1
beq .Lxtsencout
- next_tweak v4, v4, v7, v8
+ next_tweak v4, v4, v8
b .Lxtsencloop
.Lxtsencout:
- st1 {v4.16b}, [x24]
- frame_pop
+ st1 {v4.16b}, [x6]
+ ldp x29, x30, [sp], #16
ret
AES_ENDPROC(aes_xts_encrypt)
AES_ENTRY(aes_xts_decrypt)
- frame_push 6
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
- mov x19, x0
- mov x20, x1
- mov x21, x2
- mov x22, x3
- mov x23, x4
- mov x24, x6
-
- ld1 {v4.16b}, [x24]
+ ld1 {v4.16b}, [x6]
+ xts_load_mask v8
cbz w7, .Lxtsdecnotfirst
enc_prepare w3, x5, x8
encrypt_block v4, w3, x5, x8, w7 /* first tweak */
dec_prepare w3, x2, x8
- ldr q7, .Lxts_mul_x
b .LxtsdecNx
-.Lxtsdecrestart:
- ld1 {v4.16b}, [x24]
.Lxtsdecnotfirst:
- dec_prepare w22, x21, x8
+ dec_prepare w3, x2, x8
.LxtsdecloopNx:
- ldr q7, .Lxts_mul_x
- next_tweak v4, v4, v7, v8
+ next_tweak v4, v4, v8
.LxtsdecNx:
- subs w23, w23, #4
+ subs w4, w4, #4
bmi .Lxtsdec1x
- ld1 {v0.16b-v3.16b}, [x20], #64 /* get 4 ct blocks */
- next_tweak v5, v4, v7, v8
+ ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
+ next_tweak v5, v4, v8
eor v0.16b, v0.16b, v4.16b
- next_tweak v6, v5, v7, v8
+ next_tweak v6, v5, v8
eor v1.16b, v1.16b, v5.16b
eor v2.16b, v2.16b, v6.16b
- next_tweak v7, v6, v7, v8
+ next_tweak v7, v6, v8
eor v3.16b, v3.16b, v7.16b
bl aes_decrypt_block4x
eor v3.16b, v3.16b, v7.16b
eor v0.16b, v0.16b, v4.16b
eor v1.16b, v1.16b, v5.16b
eor v2.16b, v2.16b, v6.16b
- st1 {v0.16b-v3.16b}, [x19], #64
+ st1 {v0.16b-v3.16b}, [x0], #64
mov v4.16b, v7.16b
- cbz w23, .Lxtsdecout
- st1 {v4.16b}, [x24]
- cond_yield_neon .Lxtsdecrestart
+ cbz w4, .Lxtsdecout
+ xts_reload_mask v8
b .LxtsdecloopNx
.Lxtsdec1x:
- adds w23, w23, #4
+ adds w4, w4, #4
beq .Lxtsdecout
.Lxtsdecloop:
- ld1 {v1.16b}, [x20], #16
+ ld1 {v1.16b}, [x1], #16
eor v0.16b, v1.16b, v4.16b
- decrypt_block v0, w22, x21, x8, w7
+ decrypt_block v0, w3, x2, x8, w7
eor v0.16b, v0.16b, v4.16b
- st1 {v0.16b}, [x19], #16
- subs w23, w23, #1
+ st1 {v0.16b}, [x0], #16
+ subs w4, w4, #1
beq .Lxtsdecout
- next_tweak v4, v4, v7, v8
+ next_tweak v4, v4, v8
b .Lxtsdecloop
.Lxtsdecout:
- st1 {v4.16b}, [x24]
- frame_pop
+ st1 {v4.16b}, [x6]
+ ldp x29, x30, [sp], #16
ret
AES_ENDPROC(aes_xts_decrypt)
diff --git a/arch/arm64/crypto/aes-neon.S b/arch/arm64/crypto/aes-neon.S
index 1c7b45b7268e..29100f692e8a 100644
--- a/arch/arm64/crypto/aes-neon.S
+++ b/arch/arm64/crypto/aes-neon.S
@@ -14,6 +14,12 @@
#define AES_ENTRY(func) ENTRY(neon_ ## func)
#define AES_ENDPROC(func) ENDPROC(neon_ ## func)
+ xtsmask .req v7
+
+ .macro xts_reload_mask, tmp
+ xts_load_mask \tmp
+ .endm
+
/* multiply by polynomial 'x' in GF(2^8) */
.macro mul_by_x, out, in, temp, const
sshr \temp, \in, #7
diff --git a/arch/arm64/crypto/crc32-ce-core.S b/arch/arm64/crypto/crc32-ce-core.S
deleted file mode 100644
index 8061bf0f9c66..000000000000
--- a/arch/arm64/crypto/crc32-ce-core.S
+++ /dev/null
@@ -1,287 +0,0 @@
-/*
- * Accelerated CRC32(C) using arm64 CRC, NEON and Crypto Extensions instructions
- *
- * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-/* GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see http://www.gnu.org/licenses
- *
- * Please visit http://www.xyratex.com/contact if you need additional
- * information or have any questions.
- *
- * GPL HEADER END
- */
-
-/*
- * Copyright 2012 Xyratex Technology Limited
- *
- * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
- * calculation.
- * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE)
- * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
- * at:
- * http://www.intel.com/products/processor/manuals/
- * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
- * Volume 2B: Instruction Set Reference, N-Z
- *
- * Authors: Gregory Prestas <Gregory_Prestas@us.xyratex.com>
- * Alexander Boyko <Alexander_Boyko@xyratex.com>
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
- .section ".rodata", "a"
- .align 6
- .cpu generic+crypto+crc
-
-.Lcrc32_constants:
- /*
- * [x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4
- * #define CONSTANT_R1 0x154442bd4LL
- *
- * [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596
- * #define CONSTANT_R2 0x1c6e41596LL
- */
- .octa 0x00000001c6e415960000000154442bd4
-
- /*
- * [(x128+32 mod P(x) << 32)]' << 1 = 0x1751997d0
- * #define CONSTANT_R3 0x1751997d0LL
- *
- * [(x128-32 mod P(x) << 32)]' << 1 = 0x0ccaa009e
- * #define CONSTANT_R4 0x0ccaa009eLL
- */
- .octa 0x00000000ccaa009e00000001751997d0
-
- /*
- * [(x64 mod P(x) << 32)]' << 1 = 0x163cd6124
- * #define CONSTANT_R5 0x163cd6124LL
- */
- .quad 0x0000000163cd6124
- .quad 0x00000000FFFFFFFF
-
- /*
- * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
- *
- * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))`
- * = 0x1F7011641LL
- * #define CONSTANT_RU 0x1F7011641LL
- */
- .octa 0x00000001F701164100000001DB710641
-
-.Lcrc32c_constants:
- .octa 0x000000009e4addf800000000740eef02
- .octa 0x000000014cd00bd600000000f20c0dfe
- .quad 0x00000000dd45aab8
- .quad 0x00000000FFFFFFFF
- .octa 0x00000000dea713f10000000105ec76f0
-
- vCONSTANT .req v0
- dCONSTANT .req d0
- qCONSTANT .req q0
-
- BUF .req x19
- LEN .req x20
- CRC .req x21
- CONST .req x22
-
- vzr .req v9
-
- /**
- * Calculate crc32
- * BUF - buffer
- * LEN - sizeof buffer (multiple of 16 bytes), LEN should be > 63
- * CRC - initial crc32
- * return %eax crc32
- * uint crc32_pmull_le(unsigned char const *buffer,
- * size_t len, uint crc32)
- */
- .text
-ENTRY(crc32_pmull_le)
- adr_l x3, .Lcrc32_constants
- b 0f
-
-ENTRY(crc32c_pmull_le)
- adr_l x3, .Lcrc32c_constants
-
-0: frame_push 4, 64
-
- mov BUF, x0
- mov LEN, x1
- mov CRC, x2
- mov CONST, x3
-
- bic LEN, LEN, #15
- ld1 {v1.16b-v4.16b}, [BUF], #0x40
- movi vzr.16b, #0
- fmov dCONSTANT, CRC
- eor v1.16b, v1.16b, vCONSTANT.16b
- sub LEN, LEN, #0x40
- cmp LEN, #0x40
- b.lt less_64
-
- ldr qCONSTANT, [CONST]
-
-loop_64: /* 64 bytes Full cache line folding */
- sub LEN, LEN, #0x40
-
- pmull2 v5.1q, v1.2d, vCONSTANT.2d
- pmull2 v6.1q, v2.2d, vCONSTANT.2d
- pmull2 v7.1q, v3.2d, vCONSTANT.2d
- pmull2 v8.1q, v4.2d, vCONSTANT.2d
-
- pmull v1.1q, v1.1d, vCONSTANT.1d
- pmull v2.1q, v2.1d, vCONSTANT.1d
- pmull v3.1q, v3.1d, vCONSTANT.1d
- pmull v4.1q, v4.1d, vCONSTANT.1d
-
- eor v1.16b, v1.16b, v5.16b
- ld1 {v5.16b}, [BUF], #0x10
- eor v2.16b, v2.16b, v6.16b
- ld1 {v6.16b}, [BUF], #0x10
- eor v3.16b, v3.16b, v7.16b
- ld1 {v7.16b}, [BUF], #0x10
- eor v4.16b, v4.16b, v8.16b
- ld1 {v8.16b}, [BUF], #0x10
-
- eor v1.16b, v1.16b, v5.16b
- eor v2.16b, v2.16b, v6.16b
- eor v3.16b, v3.16b, v7.16b
- eor v4.16b, v4.16b, v8.16b
-
- cmp LEN, #0x40
- b.lt less_64
-
- if_will_cond_yield_neon
- stp q1, q2, [sp, #.Lframe_local_offset]
- stp q3, q4, [sp, #.Lframe_local_offset + 32]
- do_cond_yield_neon
- ldp q1, q2, [sp, #.Lframe_local_offset]
- ldp q3, q4, [sp, #.Lframe_local_offset + 32]
- ldr qCONSTANT, [CONST]
- movi vzr.16b, #0
- endif_yield_neon
- b loop_64
-
-less_64: /* Folding cache line into 128bit */
- ldr qCONSTANT, [CONST, #16]
-
- pmull2 v5.1q, v1.2d, vCONSTANT.2d
- pmull v1.1q, v1.1d, vCONSTANT.1d
- eor v1.16b, v1.16b, v5.16b
- eor v1.16b, v1.16b, v2.16b
-
- pmull2 v5.1q, v1.2d, vCONSTANT.2d
- pmull v1.1q, v1.1d, vCONSTANT.1d
- eor v1.16b, v1.16b, v5.16b
- eor v1.16b, v1.16b, v3.16b
-
- pmull2 v5.1q, v1.2d, vCONSTANT.2d
- pmull v1.1q, v1.1d, vCONSTANT.1d
- eor v1.16b, v1.16b, v5.16b
- eor v1.16b, v1.16b, v4.16b
-
- cbz LEN, fold_64
-
-loop_16: /* Folding rest buffer into 128bit */
- subs LEN, LEN, #0x10
-
- ld1 {v2.16b}, [BUF], #0x10
- pmull2 v5.1q, v1.2d, vCONSTANT.2d
- pmull v1.1q, v1.1d, vCONSTANT.1d
- eor v1.16b, v1.16b, v5.16b
- eor v1.16b, v1.16b, v2.16b
-
- b.ne loop_16
-
-fold_64:
- /* perform the last 64 bit fold, also adds 32 zeroes
- * to the input stream */
- ext v2.16b, v1.16b, v1.16b, #8
- pmull2 v2.1q, v2.2d, vCONSTANT.2d
- ext v1.16b, v1.16b, vzr.16b, #8
- eor v1.16b, v1.16b, v2.16b
-
- /* final 32-bit fold */
- ldr dCONSTANT, [CONST, #32]
- ldr d3, [CONST, #40]
-
- ext v2.16b, v1.16b, vzr.16b, #4
- and v1.16b, v1.16b, v3.16b
- pmull v1.1q, v1.1d, vCONSTANT.1d
- eor v1.16b, v1.16b, v2.16b
-
- /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
- ldr qCONSTANT, [CONST, #48]
-
- and v2.16b, v1.16b, v3.16b
- ext v2.16b, vzr.16b, v2.16b, #8
- pmull2 v2.1q, v2.2d, vCONSTANT.2d
- and v2.16b, v2.16b, v3.16b
- pmull v2.1q, v2.1d, vCONSTANT.1d
- eor v1.16b, v1.16b, v2.16b
- mov w0, v1.s[1]
-
- frame_pop
- ret
-ENDPROC(crc32_pmull_le)
-ENDPROC(crc32c_pmull_le)
-
- .macro __crc32, c
-0: subs x2, x2, #16
- b.mi 8f
- ldp x3, x4, [x1], #16
-CPU_BE( rev x3, x3 )
-CPU_BE( rev x4, x4 )
- crc32\c\()x w0, w0, x3
- crc32\c\()x w0, w0, x4
- b.ne 0b
- ret
-
-8: tbz x2, #3, 4f
- ldr x3, [x1], #8
-CPU_BE( rev x3, x3 )
- crc32\c\()x w0, w0, x3
-4: tbz x2, #2, 2f
- ldr w3, [x1], #4
-CPU_BE( rev w3, w3 )
- crc32\c\()w w0, w0, w3
-2: tbz x2, #1, 1f
- ldrh w3, [x1], #2
-CPU_BE( rev16 w3, w3 )
- crc32\c\()h w0, w0, w3
-1: tbz x2, #0, 0f
- ldrb w3, [x1]
- crc32\c\()b w0, w0, w3
-0: ret
- .endm
-
- .align 5
-ENTRY(crc32_armv8_le)
- __crc32
-ENDPROC(crc32_armv8_le)
-
- .align 5
-ENTRY(crc32c_armv8_le)
- __crc32 c
-ENDPROC(crc32c_armv8_le)
diff --git a/arch/arm64/crypto/crc32-ce-glue.c b/arch/arm64/crypto/crc32-ce-glue.c
deleted file mode 100644
index 34b4e3d46aab..000000000000
--- a/arch/arm64/crypto/crc32-ce-glue.c
+++ /dev/null
@@ -1,244 +0,0 @@
-/*
- * Accelerated CRC32(C) using arm64 NEON and Crypto Extensions instructions
- *
- * Copyright (C) 2016 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/cpufeature.h>
-#include <linux/crc32.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-
-#include <crypto/internal/hash.h>
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-#include <asm/unaligned.h>
-
-#define PMULL_MIN_LEN 64L /* minimum size of buffer
- * for crc32_pmull_le_16 */
-#define SCALE_F 16L /* size of NEON register */
-
-asmlinkage u32 crc32_pmull_le(const u8 buf[], u64 len, u32 init_crc);
-asmlinkage u32 crc32_armv8_le(u32 init_crc, const u8 buf[], size_t len);
-
-asmlinkage u32 crc32c_pmull_le(const u8 buf[], u64 len, u32 init_crc);
-asmlinkage u32 crc32c_armv8_le(u32 init_crc, const u8 buf[], size_t len);
-
-static u32 (*fallback_crc32)(u32 init_crc, const u8 buf[], size_t len);
-static u32 (*fallback_crc32c)(u32 init_crc, const u8 buf[], size_t len);
-
-static int crc32_pmull_cra_init(struct crypto_tfm *tfm)
-{
- u32 *key = crypto_tfm_ctx(tfm);
-
- *key = 0;
- return 0;
-}
-
-static int crc32c_pmull_cra_init(struct crypto_tfm *tfm)
-{
- u32 *key = crypto_tfm_ctx(tfm);
-
- *key = ~0;
- return 0;
-}
-
-static int crc32_pmull_setkey(struct crypto_shash *hash, const u8 *key,
- unsigned int keylen)
-{
- u32 *mctx = crypto_shash_ctx(hash);
-
- if (keylen != sizeof(u32)) {
- crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
- return -EINVAL;
- }
- *mctx = le32_to_cpup((__le32 *)key);
- return 0;
-}
-
-static int crc32_pmull_init(struct shash_desc *desc)
-{
- u32 *mctx = crypto_shash_ctx(desc->tfm);
- u32 *crc = shash_desc_ctx(desc);
-
- *crc = *mctx;
- return 0;
-}
-
-static int crc32_update(struct shash_desc *desc, const u8 *data,
- unsigned int length)
-{
- u32 *crc = shash_desc_ctx(desc);
-
- *crc = crc32_armv8_le(*crc, data, length);
- return 0;
-}
-
-static int crc32c_update(struct shash_desc *desc, const u8 *data,
- unsigned int length)
-{
- u32 *crc = shash_desc_ctx(desc);
-
- *crc = crc32c_armv8_le(*crc, data, length);
- return 0;
-}
-
-static int crc32_pmull_update(struct shash_desc *desc, const u8 *data,
- unsigned int length)
-{
- u32 *crc = shash_desc_ctx(desc);
- unsigned int l;
-
- if ((u64)data % SCALE_F) {
- l = min_t(u32, length, SCALE_F - ((u64)data % SCALE_F));
-
- *crc = fallback_crc32(*crc, data, l);
-
- data += l;
- length -= l;
- }
-
- if (length >= PMULL_MIN_LEN && may_use_simd()) {
- l = round_down(length, SCALE_F);
-
- kernel_neon_begin();
- *crc = crc32_pmull_le(data, l, *crc);
- kernel_neon_end();
-
- data += l;
- length -= l;
- }
-
- if (length > 0)
- *crc = fallback_crc32(*crc, data, length);
-
- return 0;
-}
-
-static int crc32c_pmull_update(struct shash_desc *desc, const u8 *data,
- unsigned int length)
-{
- u32 *crc = shash_desc_ctx(desc);
- unsigned int l;
-
- if ((u64)data % SCALE_F) {
- l = min_t(u32, length, SCALE_F - ((u64)data % SCALE_F));
-
- *crc = fallback_crc32c(*crc, data, l);
-
- data += l;
- length -= l;
- }
-
- if (length >= PMULL_MIN_LEN && may_use_simd()) {
- l = round_down(length, SCALE_F);
-
- kernel_neon_begin();
- *crc = crc32c_pmull_le(data, l, *crc);
- kernel_neon_end();
-
- data += l;
- length -= l;
- }
-
- if (length > 0) {
- *crc = fallback_crc32c(*crc, data, length);
- }
-
- return 0;
-}
-
-static int crc32_pmull_final(struct shash_desc *desc, u8 *out)
-{
- u32 *crc = shash_desc_ctx(desc);
-
- put_unaligned_le32(*crc, out);
- return 0;
-}
-
-static int crc32c_pmull_final(struct shash_desc *desc, u8 *out)
-{
- u32 *crc = shash_desc_ctx(desc);
-
- put_unaligned_le32(~*crc, out);
- return 0;
-}
-
-static struct shash_alg crc32_pmull_algs[] = { {
- .setkey = crc32_pmull_setkey,
- .init = crc32_pmull_init,
- .update = crc32_update,
- .final = crc32_pmull_final,
- .descsize = sizeof(u32),
- .digestsize = sizeof(u32),
-
- .base.cra_ctxsize = sizeof(u32),
- .base.cra_init = crc32_pmull_cra_init,
- .base.cra_name = "crc32",
- .base.cra_driver_name = "crc32-arm64-ce",
- .base.cra_priority = 200,
- .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
- .base.cra_blocksize = 1,
- .base.cra_module = THIS_MODULE,
-}, {
- .setkey = crc32_pmull_setkey,
- .init = crc32_pmull_init,
- .update = crc32c_update,
- .final = crc32c_pmull_final,
- .descsize = sizeof(u32),
- .digestsize = sizeof(u32),
-
- .base.cra_ctxsize = sizeof(u32),
- .base.cra_init = crc32c_pmull_cra_init,
- .base.cra_name = "crc32c",
- .base.cra_driver_name = "crc32c-arm64-ce",
- .base.cra_priority = 200,
- .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
- .base.cra_blocksize = 1,
- .base.cra_module = THIS_MODULE,
-} };
-
-static int __init crc32_pmull_mod_init(void)
-{
- if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_PMULL)) {
- crc32_pmull_algs[0].update = crc32_pmull_update;
- crc32_pmull_algs[1].update = crc32c_pmull_update;
-
- if (elf_hwcap & HWCAP_CRC32) {
- fallback_crc32 = crc32_armv8_le;
- fallback_crc32c = crc32c_armv8_le;
- } else {
- fallback_crc32 = crc32_le;
- fallback_crc32c = __crc32c_le;
- }
- } else if (!(elf_hwcap & HWCAP_CRC32)) {
- return -ENODEV;
- }
- return crypto_register_shashes(crc32_pmull_algs,
- ARRAY_SIZE(crc32_pmull_algs));
-}
-
-static void __exit crc32_pmull_mod_exit(void)
-{
- crypto_unregister_shashes(crc32_pmull_algs,
- ARRAY_SIZE(crc32_pmull_algs));
-}
-
-static const struct cpu_feature crc32_cpu_feature[] = {
- { cpu_feature(CRC32) }, { cpu_feature(PMULL) }, { }
-};
-MODULE_DEVICE_TABLE(cpu, crc32_cpu_feature);
-
-module_init(crc32_pmull_mod_init);
-module_exit(crc32_pmull_mod_exit);
-
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
diff --git a/arch/arm64/crypto/crct10dif-ce-core.S b/arch/arm64/crypto/crct10dif-ce-core.S
index 663ea71cdb38..9e82e8e8ed05 100644
--- a/arch/arm64/crypto/crct10dif-ce-core.S
+++ b/arch/arm64/crypto/crct10dif-ce-core.S
@@ -80,7 +80,186 @@
vzr .req v13
-ENTRY(crc_t10dif_pmull)
+ ad .req v14
+ bd .req v10
+
+ k00_16 .req v15
+ k32_48 .req v16
+
+ t3 .req v17
+ t4 .req v18
+ t5 .req v19
+ t6 .req v20
+ t7 .req v21
+ t8 .req v22
+ t9 .req v23
+
+ perm1 .req v24
+ perm2 .req v25
+ perm3 .req v26
+ perm4 .req v27
+
+ bd1 .req v28
+ bd2 .req v29
+ bd3 .req v30
+ bd4 .req v31
+
+ .macro __pmull_init_p64
+ .endm
+
+ .macro __pmull_pre_p64, bd
+ .endm
+
+ .macro __pmull_init_p8
+ // k00_16 := 0x0000000000000000_000000000000ffff
+ // k32_48 := 0x00000000ffffffff_0000ffffffffffff
+ movi k32_48.2d, #0xffffffff
+ mov k32_48.h[2], k32_48.h[0]
+ ushr k00_16.2d, k32_48.2d, #32
+
+ // prepare the permutation vectors
+ mov_q x5, 0x080f0e0d0c0b0a09
+ movi perm4.8b, #8
+ dup perm1.2d, x5
+ eor perm1.16b, perm1.16b, perm4.16b
+ ushr perm2.2d, perm1.2d, #8
+ ushr perm3.2d, perm1.2d, #16
+ ushr perm4.2d, perm1.2d, #24
+ sli perm2.2d, perm1.2d, #56
+ sli perm3.2d, perm1.2d, #48
+ sli perm4.2d, perm1.2d, #40
+ .endm
+
+ .macro __pmull_pre_p8, bd
+ tbl bd1.16b, {\bd\().16b}, perm1.16b
+ tbl bd2.16b, {\bd\().16b}, perm2.16b
+ tbl bd3.16b, {\bd\().16b}, perm3.16b
+ tbl bd4.16b, {\bd\().16b}, perm4.16b
+ .endm
+
+__pmull_p8_core:
+.L__pmull_p8_core:
+ ext t4.8b, ad.8b, ad.8b, #1 // A1
+ ext t5.8b, ad.8b, ad.8b, #2 // A2
+ ext t6.8b, ad.8b, ad.8b, #3 // A3
+
+ pmull t4.8h, t4.8b, bd.8b // F = A1*B
+ pmull t8.8h, ad.8b, bd1.8b // E = A*B1
+ pmull t5.8h, t5.8b, bd.8b // H = A2*B
+ pmull t7.8h, ad.8b, bd2.8b // G = A*B2
+ pmull t6.8h, t6.8b, bd.8b // J = A3*B
+ pmull t9.8h, ad.8b, bd3.8b // I = A*B3
+ pmull t3.8h, ad.8b, bd4.8b // K = A*B4
+ b 0f
+
+.L__pmull_p8_core2:
+ tbl t4.16b, {ad.16b}, perm1.16b // A1
+ tbl t5.16b, {ad.16b}, perm2.16b // A2
+ tbl t6.16b, {ad.16b}, perm3.16b // A3
+
+ pmull2 t4.8h, t4.16b, bd.16b // F = A1*B
+ pmull2 t8.8h, ad.16b, bd1.16b // E = A*B1
+ pmull2 t5.8h, t5.16b, bd.16b // H = A2*B
+ pmull2 t7.8h, ad.16b, bd2.16b // G = A*B2
+ pmull2 t6.8h, t6.16b, bd.16b // J = A3*B
+ pmull2 t9.8h, ad.16b, bd3.16b // I = A*B3
+ pmull2 t3.8h, ad.16b, bd4.16b // K = A*B4
+
+0: eor t4.16b, t4.16b, t8.16b // L = E + F
+ eor t5.16b, t5.16b, t7.16b // M = G + H
+ eor t6.16b, t6.16b, t9.16b // N = I + J
+
+ uzp1 t8.2d, t4.2d, t5.2d
+ uzp2 t4.2d, t4.2d, t5.2d
+ uzp1 t7.2d, t6.2d, t3.2d
+ uzp2 t6.2d, t6.2d, t3.2d
+
+ // t4 = (L) (P0 + P1) << 8
+ // t5 = (M) (P2 + P3) << 16
+ eor t8.16b, t8.16b, t4.16b
+ and t4.16b, t4.16b, k32_48.16b
+
+ // t6 = (N) (P4 + P5) << 24
+ // t7 = (K) (P6 + P7) << 32
+ eor t7.16b, t7.16b, t6.16b
+ and t6.16b, t6.16b, k00_16.16b
+
+ eor t8.16b, t8.16b, t4.16b
+ eor t7.16b, t7.16b, t6.16b
+
+ zip2 t5.2d, t8.2d, t4.2d
+ zip1 t4.2d, t8.2d, t4.2d
+ zip2 t3.2d, t7.2d, t6.2d
+ zip1 t6.2d, t7.2d, t6.2d
+
+ ext t4.16b, t4.16b, t4.16b, #15
+ ext t5.16b, t5.16b, t5.16b, #14
+ ext t6.16b, t6.16b, t6.16b, #13
+ ext t3.16b, t3.16b, t3.16b, #12
+
+ eor t4.16b, t4.16b, t5.16b
+ eor t6.16b, t6.16b, t3.16b
+ ret
+ENDPROC(__pmull_p8_core)
+
+ .macro __pmull_p8, rq, ad, bd, i
+ .ifnc \bd, v10
+ .err
+ .endif
+ mov ad.16b, \ad\().16b
+ .ifb \i
+ pmull \rq\().8h, \ad\().8b, bd.8b // D = A*B
+ .else
+ pmull2 \rq\().8h, \ad\().16b, bd.16b // D = A*B
+ .endif
+
+ bl .L__pmull_p8_core\i
+
+ eor \rq\().16b, \rq\().16b, t4.16b
+ eor \rq\().16b, \rq\().16b, t6.16b
+ .endm
+
+ .macro fold64, p, reg1, reg2
+ ldp q11, q12, [arg2], #0x20
+
+ __pmull_\p v8, \reg1, v10, 2
+ __pmull_\p \reg1, \reg1, v10
+
+CPU_LE( rev64 v11.16b, v11.16b )
+CPU_LE( rev64 v12.16b, v12.16b )
+
+ __pmull_\p v9, \reg2, v10, 2
+ __pmull_\p \reg2, \reg2, v10
+
+CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 )
+CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
+
+ eor \reg1\().16b, \reg1\().16b, v8.16b
+ eor \reg2\().16b, \reg2\().16b, v9.16b
+ eor \reg1\().16b, \reg1\().16b, v11.16b
+ eor \reg2\().16b, \reg2\().16b, v12.16b
+ .endm
+
+ .macro fold16, p, reg, rk
+ __pmull_\p v8, \reg, v10
+ __pmull_\p \reg, \reg, v10, 2
+ .ifnb \rk
+ ldr_l q10, \rk, x8
+ __pmull_pre_\p v10
+ .endif
+ eor v7.16b, v7.16b, v8.16b
+ eor v7.16b, v7.16b, \reg\().16b
+ .endm
+
+ .macro __pmull_p64, rd, rn, rm, n
+ .ifb \n
+ pmull \rd\().1q, \rn\().1d, \rm\().1d
+ .else
+ pmull2 \rd\().1q, \rn\().2d, \rm\().2d
+ .endif
+ .endm
+
+ .macro crc_t10dif_pmull, p
frame_push 3, 128
mov arg1_low32, w0
@@ -89,6 +268,8 @@ ENTRY(crc_t10dif_pmull)
movi vzr.16b, #0 // init zero register
+ __pmull_init_\p
+
// adjust the 16-bit initial_crc value, scale it to 32 bits
lsl arg1_low32, arg1_low32, #16
@@ -96,7 +277,7 @@ ENTRY(crc_t10dif_pmull)
cmp arg3, #256
// for sizes less than 128, we can't fold 64B at a time...
- b.lt _less_than_128
+ b.lt .L_less_than_128_\@
// load the initial crc value
// crc value does not need to be byte-reflected, but it needs
@@ -137,6 +318,7 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
ldr_l q10, rk3, x8 // xmm10 has rk3 and rk4
// type of pmull instruction
// will determine which constant to use
+ __pmull_pre_\p v10
//
// we subtract 256 instead of 128 to save one instruction from the loop
@@ -147,41 +329,19 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
// buffer. The _fold_64_B_loop will fold 64B at a time
// until we have 64+y Bytes of buffer
-
// fold 64B at a time. This section of the code folds 4 vector
// registers in parallel
-_fold_64_B_loop:
+.L_fold_64_B_loop_\@:
- .macro fold64, reg1, reg2
- ldp q11, q12, [arg2], #0x20
-
- pmull2 v8.1q, \reg1\().2d, v10.2d
- pmull \reg1\().1q, \reg1\().1d, v10.1d
-
-CPU_LE( rev64 v11.16b, v11.16b )
-CPU_LE( rev64 v12.16b, v12.16b )
-
- pmull2 v9.1q, \reg2\().2d, v10.2d
- pmull \reg2\().1q, \reg2\().1d, v10.1d
-
-CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 )
-CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
-
- eor \reg1\().16b, \reg1\().16b, v8.16b
- eor \reg2\().16b, \reg2\().16b, v9.16b
- eor \reg1\().16b, \reg1\().16b, v11.16b
- eor \reg2\().16b, \reg2\().16b, v12.16b
- .endm
-
- fold64 v0, v1
- fold64 v2, v3
- fold64 v4, v5
- fold64 v6, v7
+ fold64 \p, v0, v1
+ fold64 \p, v2, v3
+ fold64 \p, v4, v5
+ fold64 \p, v6, v7
subs arg3, arg3, #128
// check if there is another 64B in the buffer to be able to fold
- b.lt _fold_64_B_end
+ b.lt .L_fold_64_B_end_\@
if_will_cond_yield_neon
stp q0, q1, [sp, #.Lframe_local_offset]
@@ -195,11 +355,13 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
ldp q6, q7, [sp, #.Lframe_local_offset + 96]
ldr_l q10, rk3, x8
movi vzr.16b, #0 // init zero register
+ __pmull_init_\p
+ __pmull_pre_\p v10
endif_yield_neon
- b _fold_64_B_loop
+ b .L_fold_64_B_loop_\@
-_fold_64_B_end:
+.L_fold_64_B_end_\@:
// at this point, the buffer pointer is pointing at the last y Bytes
// of the buffer the 64B of folded data is in 4 of the vector
// registers: v0, v1, v2, v3
@@ -208,38 +370,29 @@ _fold_64_B_end:
// constants
ldr_l q10, rk9, x8
+ __pmull_pre_\p v10
- .macro fold16, reg, rk
- pmull v8.1q, \reg\().1d, v10.1d
- pmull2 \reg\().1q, \reg\().2d, v10.2d
- .ifnb \rk
- ldr_l q10, \rk, x8
- .endif
- eor v7.16b, v7.16b, v8.16b
- eor v7.16b, v7.16b, \reg\().16b
- .endm
-
- fold16 v0, rk11
- fold16 v1, rk13
- fold16 v2, rk15
- fold16 v3, rk17
- fold16 v4, rk19
- fold16 v5, rk1
- fold16 v6
+ fold16 \p, v0, rk11
+ fold16 \p, v1, rk13
+ fold16 \p, v2, rk15
+ fold16 \p, v3, rk17
+ fold16 \p, v4, rk19
+ fold16 \p, v5, rk1
+ fold16 \p, v6
// instead of 64, we add 48 to the loop counter to save 1 instruction
// from the loop instead of a cmp instruction, we use the negative
// flag with the jl instruction
adds arg3, arg3, #(128-16)
- b.lt _final_reduction_for_128
+ b.lt .L_final_reduction_for_128_\@
// now we have 16+y bytes left to reduce. 16 Bytes is in register v7
// and the rest is in memory. We can fold 16 bytes at a time if y>=16
// continue folding 16B at a time
-_16B_reduction_loop:
- pmull v8.1q, v7.1d, v10.1d
- pmull2 v7.1q, v7.2d, v10.2d
+.L_16B_reduction_loop_\@:
+ __pmull_\p v8, v7, v10
+ __pmull_\p v7, v7, v10, 2
eor v7.16b, v7.16b, v8.16b
ldr q0, [arg2], #16
@@ -251,22 +404,22 @@ CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
// instead of a cmp instruction, we utilize the flags with the
// jge instruction equivalent of: cmp arg3, 16-16
// check if there is any more 16B in the buffer to be able to fold
- b.ge _16B_reduction_loop
+ b.ge .L_16B_reduction_loop_\@
// now we have 16+z bytes left to reduce, where 0<= z < 16.
// first, we reduce the data in the xmm7 register
-_final_reduction_for_128:
+.L_final_reduction_for_128_\@:
// check if any more data to fold. If not, compute the CRC of
// the final 128 bits
adds arg3, arg3, #16
- b.eq _128_done
+ b.eq .L_128_done_\@
// here we are getting data that is less than 16 bytes.
// since we know that there was data before the pointer, we can
// offset the input pointer before the actual point, to receive
// exactly 16 bytes. after that the registers need to be adjusted.
-_get_last_two_regs:
+.L_get_last_two_regs_\@:
add arg2, arg2, arg3
ldr q1, [arg2, #-16]
CPU_LE( rev64 v1.16b, v1.16b )
@@ -291,47 +444,48 @@ CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 )
bsl v0.16b, v2.16b, v1.16b
// fold 16 Bytes
- pmull v8.1q, v7.1d, v10.1d
- pmull2 v7.1q, v7.2d, v10.2d
+ __pmull_\p v8, v7, v10
+ __pmull_\p v7, v7, v10, 2
eor v7.16b, v7.16b, v8.16b
eor v7.16b, v7.16b, v0.16b
-_128_done:
+.L_128_done_\@:
// compute crc of a 128-bit value
ldr_l q10, rk5, x8 // rk5 and rk6 in xmm10
+ __pmull_pre_\p v10
// 64b fold
ext v0.16b, vzr.16b, v7.16b, #8
mov v7.d[0], v7.d[1]
- pmull v7.1q, v7.1d, v10.1d
+ __pmull_\p v7, v7, v10
eor v7.16b, v7.16b, v0.16b
// 32b fold
ext v0.16b, v7.16b, vzr.16b, #4
mov v7.s[3], vzr.s[0]
- pmull2 v0.1q, v0.2d, v10.2d
+ __pmull_\p v0, v0, v10, 2
eor v7.16b, v7.16b, v0.16b
// barrett reduction
-_barrett:
ldr_l q10, rk7, x8
+ __pmull_pre_\p v10
mov v0.d[0], v7.d[1]
- pmull v0.1q, v0.1d, v10.1d
+ __pmull_\p v0, v0, v10
ext v0.16b, vzr.16b, v0.16b, #12
- pmull2 v0.1q, v0.2d, v10.2d
+ __pmull_\p v0, v0, v10, 2
ext v0.16b, vzr.16b, v0.16b, #12
eor v7.16b, v7.16b, v0.16b
mov w0, v7.s[1]
-_cleanup:
+.L_cleanup_\@:
// scale the result back to 16 bits
lsr x0, x0, #16
frame_pop
ret
-_less_than_128:
- cbz arg3, _cleanup
+.L_less_than_128_\@:
+ cbz arg3, .L_cleanup_\@
movi v0.16b, #0
mov v0.s[3], arg1_low32 // get the initial crc value
@@ -342,20 +496,21 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
eor v7.16b, v7.16b, v0.16b // xor the initial crc value
cmp arg3, #16
- b.eq _128_done // exactly 16 left
- b.lt _less_than_16_left
+ b.eq .L_128_done_\@ // exactly 16 left
+ b.lt .L_less_than_16_left_\@
ldr_l q10, rk1, x8 // rk1 and rk2 in xmm10
+ __pmull_pre_\p v10
// update the counter. subtract 32 instead of 16 to save one
// instruction from the loop
subs arg3, arg3, #32
- b.ge _16B_reduction_loop
+ b.ge .L_16B_reduction_loop_\@
add arg3, arg3, #16
- b _get_last_two_regs
+ b .L_get_last_two_regs_\@
-_less_than_16_left:
+.L_less_than_16_left_\@:
// shl r9, 4
adr_l x0, tbl_shf_table + 16
sub x0, x0, arg3
@@ -363,8 +518,17 @@ _less_than_16_left:
movi v9.16b, #0x80
eor v0.16b, v0.16b, v9.16b
tbl v7.16b, {v7.16b}, v0.16b
- b _128_done
-ENDPROC(crc_t10dif_pmull)
+ b .L_128_done_\@
+ .endm
+
+ENTRY(crc_t10dif_pmull_p8)
+ crc_t10dif_pmull p8
+ENDPROC(crc_t10dif_pmull_p8)
+
+ .align 5
+ENTRY(crc_t10dif_pmull_p64)
+ crc_t10dif_pmull p64
+ENDPROC(crc_t10dif_pmull_p64)
// precomputed constants
// these constants are precomputed from the poly:
diff --git a/arch/arm64/crypto/crct10dif-ce-glue.c b/arch/arm64/crypto/crct10dif-ce-glue.c
index 96f0cae4a022..b461d62023f2 100644
--- a/arch/arm64/crypto/crct10dif-ce-glue.c
+++ b/arch/arm64/crypto/crct10dif-ce-glue.c
@@ -22,7 +22,10 @@
#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U
-asmlinkage u16 crc_t10dif_pmull(u16 init_crc, const u8 buf[], u64 len);
+asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 buf[], u64 len);
+asmlinkage u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 buf[], u64 len);
+
+static u16 (*crc_t10dif_pmull)(u16 init_crc, const u8 buf[], u64 len);
static int crct10dif_init(struct shash_desc *desc)
{
@@ -85,6 +88,11 @@ static struct shash_alg crc_t10dif_alg = {
static int __init crc_t10dif_mod_init(void)
{
+ if (elf_hwcap & HWCAP_PMULL)
+ crc_t10dif_pmull = crc_t10dif_pmull_p64;
+ else
+ crc_t10dif_pmull = crc_t10dif_pmull_p8;
+
return crypto_register_shash(&crc_t10dif_alg);
}
@@ -93,8 +101,10 @@ static void __exit crc_t10dif_mod_exit(void)
crypto_unregister_shash(&crc_t10dif_alg);
}
-module_cpu_feature_match(PMULL, crc_t10dif_mod_init);
+module_cpu_feature_match(ASIMD, crc_t10dif_mod_init);
module_exit(crc_t10dif_mod_exit);
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("crct10dif");
+MODULE_ALIAS_CRYPTO("crct10dif-arm64-ce");
diff --git a/arch/arm64/crypto/speck-neon-core.S b/arch/arm64/crypto/speck-neon-core.S
deleted file mode 100644
index b14463438b09..000000000000
--- a/arch/arm64/crypto/speck-neon-core.S
+++ /dev/null
@@ -1,352 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * ARM64 NEON-accelerated implementation of Speck128-XTS and Speck64-XTS
- *
- * Copyright (c) 2018 Google, Inc
- *
- * Author: Eric Biggers <ebiggers@google.com>
- */
-
-#include <linux/linkage.h>
-
- .text
-
- // arguments
- ROUND_KEYS .req x0 // const {u64,u32} *round_keys
- NROUNDS .req w1 // int nrounds
- NROUNDS_X .req x1
- DST .req x2 // void *dst
- SRC .req x3 // const void *src
- NBYTES .req w4 // unsigned int nbytes
- TWEAK .req x5 // void *tweak
-
- // registers which hold the data being encrypted/decrypted
- // (underscores avoid a naming collision with ARM64 registers x0-x3)
- X_0 .req v0
- Y_0 .req v1
- X_1 .req v2
- Y_1 .req v3
- X_2 .req v4
- Y_2 .req v5
- X_3 .req v6
- Y_3 .req v7
-
- // the round key, duplicated in all lanes
- ROUND_KEY .req v8
-
- // index vector for tbl-based 8-bit rotates
- ROTATE_TABLE .req v9
- ROTATE_TABLE_Q .req q9
-
- // temporary registers
- TMP0 .req v10
- TMP1 .req v11
- TMP2 .req v12
- TMP3 .req v13
-
- // multiplication table for updating XTS tweaks
- GFMUL_TABLE .req v14
- GFMUL_TABLE_Q .req q14
-
- // next XTS tweak value(s)
- TWEAKV_NEXT .req v15
-
- // XTS tweaks for the blocks currently being encrypted/decrypted
- TWEAKV0 .req v16
- TWEAKV1 .req v17
- TWEAKV2 .req v18
- TWEAKV3 .req v19
- TWEAKV4 .req v20
- TWEAKV5 .req v21
- TWEAKV6 .req v22
- TWEAKV7 .req v23
-
- .align 4
-.Lror64_8_table:
- .octa 0x080f0e0d0c0b0a090007060504030201
-.Lror32_8_table:
- .octa 0x0c0f0e0d080b0a090407060500030201
-.Lrol64_8_table:
- .octa 0x0e0d0c0b0a09080f0605040302010007
-.Lrol32_8_table:
- .octa 0x0e0d0c0f0a09080b0605040702010003
-.Lgf128mul_table:
- .octa 0x00000000000000870000000000000001
-.Lgf64mul_table:
- .octa 0x0000000000000000000000002d361b00
-
-/*
- * _speck_round_128bytes() - Speck encryption round on 128 bytes at a time
- *
- * Do one Speck encryption round on the 128 bytes (8 blocks for Speck128, 16 for
- * Speck64) stored in X0-X3 and Y0-Y3, using the round key stored in all lanes
- * of ROUND_KEY. 'n' is the lane size: 64 for Speck128, or 32 for Speck64.
- * 'lanes' is the lane specifier: "2d" for Speck128 or "4s" for Speck64.
- */
-.macro _speck_round_128bytes n, lanes
-
- // x = ror(x, 8)
- tbl X_0.16b, {X_0.16b}, ROTATE_TABLE.16b
- tbl X_1.16b, {X_1.16b}, ROTATE_TABLE.16b
- tbl X_2.16b, {X_2.16b}, ROTATE_TABLE.16b
- tbl X_3.16b, {X_3.16b}, ROTATE_TABLE.16b
-
- // x += y
- add X_0.\lanes, X_0.\lanes, Y_0.\lanes
- add X_1.\lanes, X_1.\lanes, Y_1.\lanes
- add X_2.\lanes, X_2.\lanes, Y_2.\lanes
- add X_3.\lanes, X_3.\lanes, Y_3.\lanes
-
- // x ^= k
- eor X_0.16b, X_0.16b, ROUND_KEY.16b
- eor X_1.16b, X_1.16b, ROUND_KEY.16b
- eor X_2.16b, X_2.16b, ROUND_KEY.16b
- eor X_3.16b, X_3.16b, ROUND_KEY.16b
-
- // y = rol(y, 3)
- shl TMP0.\lanes, Y_0.\lanes, #3
- shl TMP1.\lanes, Y_1.\lanes, #3
- shl TMP2.\lanes, Y_2.\lanes, #3
- shl TMP3.\lanes, Y_3.\lanes, #3
- sri TMP0.\lanes, Y_0.\lanes, #(\n - 3)
- sri TMP1.\lanes, Y_1.\lanes, #(\n - 3)
- sri TMP2.\lanes, Y_2.\lanes, #(\n - 3)
- sri TMP3.\lanes, Y_3.\lanes, #(\n - 3)
-
- // y ^= x
- eor Y_0.16b, TMP0.16b, X_0.16b
- eor Y_1.16b, TMP1.16b, X_1.16b
- eor Y_2.16b, TMP2.16b, X_2.16b
- eor Y_3.16b, TMP3.16b, X_3.16b
-.endm
-
-/*
- * _speck_unround_128bytes() - Speck decryption round on 128 bytes at a time
- *
- * This is the inverse of _speck_round_128bytes().
- */
-.macro _speck_unround_128bytes n, lanes
-
- // y ^= x
- eor TMP0.16b, Y_0.16b, X_0.16b
- eor TMP1.16b, Y_1.16b, X_1.16b
- eor TMP2.16b, Y_2.16b, X_2.16b
- eor TMP3.16b, Y_3.16b, X_3.16b
-
- // y = ror(y, 3)
- ushr Y_0.\lanes, TMP0.\lanes, #3
- ushr Y_1.\lanes, TMP1.\lanes, #3
- ushr Y_2.\lanes, TMP2.\lanes, #3
- ushr Y_3.\lanes, TMP3.\lanes, #3
- sli Y_0.\lanes, TMP0.\lanes, #(\n - 3)
- sli Y_1.\lanes, TMP1.\lanes, #(\n - 3)
- sli Y_2.\lanes, TMP2.\lanes, #(\n - 3)
- sli Y_3.\lanes, TMP3.\lanes, #(\n - 3)
-
- // x ^= k
- eor X_0.16b, X_0.16b, ROUND_KEY.16b
- eor X_1.16b, X_1.16b, ROUND_KEY.16b
- eor X_2.16b, X_2.16b, ROUND_KEY.16b
- eor X_3.16b, X_3.16b, ROUND_KEY.16b
-
- // x -= y
- sub X_0.\lanes, X_0.\lanes, Y_0.\lanes
- sub X_1.\lanes, X_1.\lanes, Y_1.\lanes
- sub X_2.\lanes, X_2.\lanes, Y_2.\lanes
- sub X_3.\lanes, X_3.\lanes, Y_3.\lanes
-
- // x = rol(x, 8)
- tbl X_0.16b, {X_0.16b}, ROTATE_TABLE.16b
- tbl X_1.16b, {X_1.16b}, ROTATE_TABLE.16b
- tbl X_2.16b, {X_2.16b}, ROTATE_TABLE.16b
- tbl X_3.16b, {X_3.16b}, ROTATE_TABLE.16b
-.endm
-
-.macro _next_xts_tweak next, cur, tmp, n
-.if \n == 64
- /*
- * Calculate the next tweak by multiplying the current one by x,
- * modulo p(x) = x^128 + x^7 + x^2 + x + 1.
- */
- sshr \tmp\().2d, \cur\().2d, #63
- and \tmp\().16b, \tmp\().16b, GFMUL_TABLE.16b
- shl \next\().2d, \cur\().2d, #1
- ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
- eor \next\().16b, \next\().16b, \tmp\().16b
-.else
- /*
- * Calculate the next two tweaks by multiplying the current ones by x^2,
- * modulo p(x) = x^64 + x^4 + x^3 + x + 1.
- */
- ushr \tmp\().2d, \cur\().2d, #62
- shl \next\().2d, \cur\().2d, #2
- tbl \tmp\().16b, {GFMUL_TABLE.16b}, \tmp\().16b
- eor \next\().16b, \next\().16b, \tmp\().16b
-.endif
-.endm
-
-/*
- * _speck_xts_crypt() - Speck-XTS encryption/decryption
- *
- * Encrypt or decrypt NBYTES bytes of data from the SRC buffer to the DST buffer
- * using Speck-XTS, specifically the variant with a block size of '2n' and round
- * count given by NROUNDS. The expanded round keys are given in ROUND_KEYS, and
- * the current XTS tweak value is given in TWEAK. It's assumed that NBYTES is a
- * nonzero multiple of 128.
- */
-.macro _speck_xts_crypt n, lanes, decrypting
-
- /*
- * If decrypting, modify the ROUND_KEYS parameter to point to the last
- * round key rather than the first, since for decryption the round keys
- * are used in reverse order.
- */
-.if \decrypting
- mov NROUNDS, NROUNDS /* zero the high 32 bits */
-.if \n == 64
- add ROUND_KEYS, ROUND_KEYS, NROUNDS_X, lsl #3
- sub ROUND_KEYS, ROUND_KEYS, #8
-.else
- add ROUND_KEYS, ROUND_KEYS, NROUNDS_X, lsl #2
- sub ROUND_KEYS, ROUND_KEYS, #4
-.endif
-.endif
-
- // Load the index vector for tbl-based 8-bit rotates
-.if \decrypting
- ldr ROTATE_TABLE_Q, .Lrol\n\()_8_table
-.else
- ldr ROTATE_TABLE_Q, .Lror\n\()_8_table
-.endif
-
- // One-time XTS preparation
-.if \n == 64
- // Load first tweak
- ld1 {TWEAKV0.16b}, [TWEAK]
-
- // Load GF(2^128) multiplication table
- ldr GFMUL_TABLE_Q, .Lgf128mul_table
-.else
- // Load first tweak
- ld1 {TWEAKV0.8b}, [TWEAK]
-
- // Load GF(2^64) multiplication table
- ldr GFMUL_TABLE_Q, .Lgf64mul_table
-
- // Calculate second tweak, packing it together with the first
- ushr TMP0.2d, TWEAKV0.2d, #63
- shl TMP1.2d, TWEAKV0.2d, #1
- tbl TMP0.8b, {GFMUL_TABLE.16b}, TMP0.8b
- eor TMP0.8b, TMP0.8b, TMP1.8b
- mov TWEAKV0.d[1], TMP0.d[0]
-.endif
-
-.Lnext_128bytes_\@:
-
- // Calculate XTS tweaks for next 128 bytes
- _next_xts_tweak TWEAKV1, TWEAKV0, TMP0, \n
- _next_xts_tweak TWEAKV2, TWEAKV1, TMP0, \n
- _next_xts_tweak TWEAKV3, TWEAKV2, TMP0, \n
- _next_xts_tweak TWEAKV4, TWEAKV3, TMP0, \n
- _next_xts_tweak TWEAKV5, TWEAKV4, TMP0, \n
- _next_xts_tweak TWEAKV6, TWEAKV5, TMP0, \n
- _next_xts_tweak TWEAKV7, TWEAKV6, TMP0, \n
- _next_xts_tweak TWEAKV_NEXT, TWEAKV7, TMP0, \n
-
- // Load the next source blocks into {X,Y}[0-3]
- ld1 {X_0.16b-Y_1.16b}, [SRC], #64
- ld1 {X_2.16b-Y_3.16b}, [SRC], #64
-
- // XOR the source blocks with their XTS tweaks
- eor TMP0.16b, X_0.16b, TWEAKV0.16b
- eor Y_0.16b, Y_0.16b, TWEAKV1.16b
- eor TMP1.16b, X_1.16b, TWEAKV2.16b
- eor Y_1.16b, Y_1.16b, TWEAKV3.16b
- eor TMP2.16b, X_2.16b, TWEAKV4.16b
- eor Y_2.16b, Y_2.16b, TWEAKV5.16b
- eor TMP3.16b, X_3.16b, TWEAKV6.16b
- eor Y_3.16b, Y_3.16b, TWEAKV7.16b
-
- /*
- * De-interleave the 'x' and 'y' elements of each block, i.e. make it so
- * that the X[0-3] registers contain only the second halves of blocks,
- * and the Y[0-3] registers contain only the first halves of blocks.
- * (Speck uses the order (y, x) rather than the more intuitive (x, y).)
- */
- uzp2 X_0.\lanes, TMP0.\lanes, Y_0.\lanes
- uzp1 Y_0.\lanes, TMP0.\lanes, Y_0.\lanes
- uzp2 X_1.\lanes, TMP1.\lanes, Y_1.\lanes
- uzp1 Y_1.\lanes, TMP1.\lanes, Y_1.\lanes
- uzp2 X_2.\lanes, TMP2.\lanes, Y_2.\lanes
- uzp1 Y_2.\lanes, TMP2.\lanes, Y_2.\lanes
- uzp2 X_3.\lanes, TMP3.\lanes, Y_3.\lanes
- uzp1 Y_3.\lanes, TMP3.\lanes, Y_3.\lanes
-
- // Do the cipher rounds
- mov x6, ROUND_KEYS
- mov w7, NROUNDS
-.Lnext_round_\@:
-.if \decrypting
- ld1r {ROUND_KEY.\lanes}, [x6]
- sub x6, x6, #( \n / 8 )
- _speck_unround_128bytes \n, \lanes
-.else
- ld1r {ROUND_KEY.\lanes}, [x6], #( \n / 8 )
- _speck_round_128bytes \n, \lanes
-.endif
- subs w7, w7, #1
- bne .Lnext_round_\@
-
- // Re-interleave the 'x' and 'y' elements of each block
- zip1 TMP0.\lanes, Y_0.\lanes, X_0.\lanes
- zip2 Y_0.\lanes, Y_0.\lanes, X_0.\lanes
- zip1 TMP1.\lanes, Y_1.\lanes, X_1.\lanes
- zip2 Y_1.\lanes, Y_1.\lanes, X_1.\lanes
- zip1 TMP2.\lanes, Y_2.\lanes, X_2.\lanes
- zip2 Y_2.\lanes, Y_2.\lanes, X_2.\lanes
- zip1 TMP3.\lanes, Y_3.\lanes, X_3.\lanes
- zip2 Y_3.\lanes, Y_3.\lanes, X_3.\lanes
-
- // XOR the encrypted/decrypted blocks with the tweaks calculated earlier
- eor X_0.16b, TMP0.16b, TWEAKV0.16b
- eor Y_0.16b, Y_0.16b, TWEAKV1.16b
- eor X_1.16b, TMP1.16b, TWEAKV2.16b
- eor Y_1.16b, Y_1.16b, TWEAKV3.16b
- eor X_2.16b, TMP2.16b, TWEAKV4.16b
- eor Y_2.16b, Y_2.16b, TWEAKV5.16b
- eor X_3.16b, TMP3.16b, TWEAKV6.16b
- eor Y_3.16b, Y_3.16b, TWEAKV7.16b
- mov TWEAKV0.16b, TWEAKV_NEXT.16b
-
- // Store the ciphertext in the destination buffer
- st1 {X_0.16b-Y_1.16b}, [DST], #64
- st1 {X_2.16b-Y_3.16b}, [DST], #64
-
- // Continue if there are more 128-byte chunks remaining
- subs NBYTES, NBYTES, #128
- bne .Lnext_128bytes_\@
-
- // Store the next tweak and return
-.if \n == 64
- st1 {TWEAKV_NEXT.16b}, [TWEAK]
-.else
- st1 {TWEAKV_NEXT.8b}, [TWEAK]
-.endif
- ret
-.endm
-
-ENTRY(speck128_xts_encrypt_neon)
- _speck_xts_crypt n=64, lanes=2d, decrypting=0
-ENDPROC(speck128_xts_encrypt_neon)
-
-ENTRY(speck128_xts_decrypt_neon)
- _speck_xts_crypt n=64, lanes=2d, decrypting=1
-ENDPROC(speck128_xts_decrypt_neon)
-
-ENTRY(speck64_xts_encrypt_neon)
- _speck_xts_crypt n=32, lanes=4s, decrypting=0
-ENDPROC(speck64_xts_encrypt_neon)
-
-ENTRY(speck64_xts_decrypt_neon)
- _speck_xts_crypt n=32, lanes=4s, decrypting=1
-ENDPROC(speck64_xts_decrypt_neon)
diff --git a/arch/arm64/crypto/speck-neon-glue.c b/arch/arm64/crypto/speck-neon-glue.c
deleted file mode 100644
index 6e233aeb4ff4..000000000000
--- a/arch/arm64/crypto/speck-neon-glue.c
+++ /dev/null
@@ -1,282 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * NEON-accelerated implementation of Speck128-XTS and Speck64-XTS
- * (64-bit version; based on the 32-bit version)
- *
- * Copyright (c) 2018 Google, Inc
- */
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-#include <crypto/algapi.h>
-#include <crypto/gf128mul.h>
-#include <crypto/internal/skcipher.h>
-#include <crypto/speck.h>
-#include <crypto/xts.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-/* The assembly functions only handle multiples of 128 bytes */
-#define SPECK_NEON_CHUNK_SIZE 128
-
-/* Speck128 */
-
-struct speck128_xts_tfm_ctx {
- struct speck128_tfm_ctx main_key;
- struct speck128_tfm_ctx tweak_key;
-};
-
-asmlinkage void speck128_xts_encrypt_neon(const u64 *round_keys, int nrounds,
- void *dst, const void *src,
- unsigned int nbytes, void *tweak);
-
-asmlinkage void speck128_xts_decrypt_neon(const u64 *round_keys, int nrounds,
- void *dst, const void *src,
- unsigned int nbytes, void *tweak);
-
-typedef void (*speck128_crypt_one_t)(const struct speck128_tfm_ctx *,
- u8 *, const u8 *);
-typedef void (*speck128_xts_crypt_many_t)(const u64 *, int, void *,
- const void *, unsigned int, void *);
-
-static __always_inline int
-__speck128_xts_crypt(struct skcipher_request *req,
- speck128_crypt_one_t crypt_one,
- speck128_xts_crypt_many_t crypt_many)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- const struct speck128_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
- struct skcipher_walk walk;
- le128 tweak;
- int err;
-
- err = skcipher_walk_virt(&walk, req, true);
-
- crypto_speck128_encrypt(&ctx->tweak_key, (u8 *)&tweak, walk.iv);
-
- while (walk.nbytes > 0) {
- unsigned int nbytes = walk.nbytes;
- u8 *dst = walk.dst.virt.addr;
- const u8 *src = walk.src.virt.addr;
-
- if (nbytes >= SPECK_NEON_CHUNK_SIZE && may_use_simd()) {
- unsigned int count;
-
- count = round_down(nbytes, SPECK_NEON_CHUNK_SIZE);
- kernel_neon_begin();
- (*crypt_many)(ctx->main_key.round_keys,
- ctx->main_key.nrounds,
- dst, src, count, &tweak);
- kernel_neon_end();
- dst += count;
- src += count;
- nbytes -= count;
- }
-
- /* Handle any remainder with generic code */
- while (nbytes >= sizeof(tweak)) {
- le128_xor((le128 *)dst, (const le128 *)src, &tweak);
- (*crypt_one)(&ctx->main_key, dst, dst);
- le128_xor((le128 *)dst, (const le128 *)dst, &tweak);
- gf128mul_x_ble(&tweak, &tweak);
-
- dst += sizeof(tweak);
- src += sizeof(tweak);
- nbytes -= sizeof(tweak);
- }
- err = skcipher_walk_done(&walk, nbytes);
- }
-
- return err;
-}
-
-static int speck128_xts_encrypt(struct skcipher_request *req)
-{
- return __speck128_xts_crypt(req, crypto_speck128_encrypt,
- speck128_xts_encrypt_neon);
-}
-
-static int speck128_xts_decrypt(struct skcipher_request *req)
-{
- return __speck128_xts_crypt(req, crypto_speck128_decrypt,
- speck128_xts_decrypt_neon);
-}
-
-static int speck128_xts_setkey(struct crypto_skcipher *tfm, const u8 *key,
- unsigned int keylen)
-{
- struct speck128_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
- int err;
-
- err = xts_verify_key(tfm, key, keylen);
- if (err)
- return err;
-
- keylen /= 2;
-
- err = crypto_speck128_setkey(&ctx->main_key, key, keylen);
- if (err)
- return err;
-
- return crypto_speck128_setkey(&ctx->tweak_key, key + keylen, keylen);
-}
-
-/* Speck64 */
-
-struct speck64_xts_tfm_ctx {
- struct speck64_tfm_ctx main_key;
- struct speck64_tfm_ctx tweak_key;
-};
-
-asmlinkage void speck64_xts_encrypt_neon(const u32 *round_keys, int nrounds,
- void *dst, const void *src,
- unsigned int nbytes, void *tweak);
-
-asmlinkage void speck64_xts_decrypt_neon(const u32 *round_keys, int nrounds,
- void *dst, const void *src,
- unsigned int nbytes, void *tweak);
-
-typedef void (*speck64_crypt_one_t)(const struct speck64_tfm_ctx *,
- u8 *, const u8 *);
-typedef void (*speck64_xts_crypt_many_t)(const u32 *, int, void *,
- const void *, unsigned int, void *);
-
-static __always_inline int
-__speck64_xts_crypt(struct skcipher_request *req, speck64_crypt_one_t crypt_one,
- speck64_xts_crypt_many_t crypt_many)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- const struct speck64_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
- struct skcipher_walk walk;
- __le64 tweak;
- int err;
-
- err = skcipher_walk_virt(&walk, req, true);
-
- crypto_speck64_encrypt(&ctx->tweak_key, (u8 *)&tweak, walk.iv);
-
- while (walk.nbytes > 0) {
- unsigned int nbytes = walk.nbytes;
- u8 *dst = walk.dst.virt.addr;
- const u8 *src = walk.src.virt.addr;
-
- if (nbytes >= SPECK_NEON_CHUNK_SIZE && may_use_simd()) {
- unsigned int count;
-
- count = round_down(nbytes, SPECK_NEON_CHUNK_SIZE);
- kernel_neon_begin();
- (*crypt_many)(ctx->main_key.round_keys,
- ctx->main_key.nrounds,
- dst, src, count, &tweak);
- kernel_neon_end();
- dst += count;
- src += count;
- nbytes -= count;
- }
-
- /* Handle any remainder with generic code */
- while (nbytes >= sizeof(tweak)) {
- *(__le64 *)dst = *(__le64 *)src ^ tweak;
- (*crypt_one)(&ctx->main_key, dst, dst);
- *(__le64 *)dst ^= tweak;
- tweak = cpu_to_le64((le64_to_cpu(tweak) << 1) ^
- ((tweak & cpu_to_le64(1ULL << 63)) ?
- 0x1B : 0));
- dst += sizeof(tweak);
- src += sizeof(tweak);
- nbytes -= sizeof(tweak);
- }
- err = skcipher_walk_done(&walk, nbytes);
- }
-
- return err;
-}
-
-static int speck64_xts_encrypt(struct skcipher_request *req)
-{
- return __speck64_xts_crypt(req, crypto_speck64_encrypt,
- speck64_xts_encrypt_neon);
-}
-
-static int speck64_xts_decrypt(struct skcipher_request *req)
-{
- return __speck64_xts_crypt(req, crypto_speck64_decrypt,
- speck64_xts_decrypt_neon);
-}
-
-static int speck64_xts_setkey(struct crypto_skcipher *tfm, const u8 *key,
- unsigned int keylen)
-{
- struct speck64_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
- int err;
-
- err = xts_verify_key(tfm, key, keylen);
- if (err)
- return err;
-
- keylen /= 2;
-
- err = crypto_speck64_setkey(&ctx->main_key, key, keylen);
- if (err)
- return err;
-
- return crypto_speck64_setkey(&ctx->tweak_key, key + keylen, keylen);
-}
-
-static struct skcipher_alg speck_algs[] = {
- {
- .base.cra_name = "xts(speck128)",
- .base.cra_driver_name = "xts-speck128-neon",
- .base.cra_priority = 300,
- .base.cra_blocksize = SPECK128_BLOCK_SIZE,
- .base.cra_ctxsize = sizeof(struct speck128_xts_tfm_ctx),
- .base.cra_alignmask = 7,
- .base.cra_module = THIS_MODULE,
- .min_keysize = 2 * SPECK128_128_KEY_SIZE,
- .max_keysize = 2 * SPECK128_256_KEY_SIZE,
- .ivsize = SPECK128_BLOCK_SIZE,
- .walksize = SPECK_NEON_CHUNK_SIZE,
- .setkey = speck128_xts_setkey,
- .encrypt = speck128_xts_encrypt,
- .decrypt = speck128_xts_decrypt,
- }, {
- .base.cra_name = "xts(speck64)",
- .base.cra_driver_name = "xts-speck64-neon",
- .base.cra_priority = 300,
- .base.cra_blocksize = SPECK64_BLOCK_SIZE,
- .base.cra_ctxsize = sizeof(struct speck64_xts_tfm_ctx),
- .base.cra_alignmask = 7,
- .base.cra_module = THIS_MODULE,
- .min_keysize = 2 * SPECK64_96_KEY_SIZE,
- .max_keysize = 2 * SPECK64_128_KEY_SIZE,
- .ivsize = SPECK64_BLOCK_SIZE,
- .walksize = SPECK_NEON_CHUNK_SIZE,
- .setkey = speck64_xts_setkey,
- .encrypt = speck64_xts_encrypt,
- .decrypt = speck64_xts_decrypt,
- }
-};
-
-static int __init speck_neon_module_init(void)
-{
- if (!(elf_hwcap & HWCAP_ASIMD))
- return -ENODEV;
- return crypto_register_skciphers(speck_algs, ARRAY_SIZE(speck_algs));
-}
-
-static void __exit speck_neon_module_exit(void)
-{
- crypto_unregister_skciphers(speck_algs, ARRAY_SIZE(speck_algs));
-}
-
-module_init(speck_neon_module_init);
-module_exit(speck_neon_module_exit);
-
-MODULE_DESCRIPTION("Speck block cipher (NEON-accelerated)");
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
-MODULE_ALIAS_CRYPTO("xts(speck128)");
-MODULE_ALIAS_CRYPTO("xts-speck128-neon");
-MODULE_ALIAS_CRYPTO("xts(speck64)");
-MODULE_ALIAS_CRYPTO("xts-speck64-neon");