summary | refs | log | tree | commit | diff
path: root/arch
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2020-08-03 20:40:14 +0300
committer Linus Torvalds <torvalds@linux-foundation.org> 2020-08-03 20:40:14 +0300
commit ab5c60b79ab6cc50b39bbb21b2f9fb55af900b84 (patch)
tree 71fa895fbf01e3b88f26cf257d9105f9d286b631 /arch
parent 5577416c39652d395a6045677f4f598564aba1cf (diff)
parent 3cbfe80737c18ac6e635421ab676716a393d3074 (diff)
download linux-ab5c60b79ab6cc50b39bbb21b2f9fb55af900b84.tar.xz
Merge branch 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
Pull crypto updates from Herbert Xu:
 "API:
   - Add support for allocating transforms on a specific NUMA Node
   - Introduce the flag CRYPTO_ALG_ALLOCATES_MEMORY for storage users

  Algorithms:
   - Drop PMULL based ghash on arm64
   - Fixes for building with clang on x86
   - Add sha256 helper that does the digest in one go
   - Add SP800-56A rev 3 validation checks to dh

  Drivers:
   - Permit users to specify NUMA node in hisilicon/zip
   - Add support for i.MX6 in imx-rngc
   - Add sa2ul crypto driver
   - Add BA431 hwrng driver
   - Add Ingenic JZ4780 and X1000 hwrng driver
   - Spread IRQ affinity in inside-secure and marvell/cesa"

* 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (157 commits)
  crypto: sa2ul - Fix inconsistent IS_ERR and PTR_ERR
  hwrng: core - remove redundant initialization of variable ret
  crypto: x86/curve25519 - Remove unused carry variables
  crypto: ingenic - Add hardware RNG for Ingenic JZ4780 and X1000
  dt-bindings: RNG: Add Ingenic RNG bindings.
  crypto: caam/qi2 - add module alias
  crypto: caam - add more RNG hw error codes
  crypto: caam/jr - remove incorrect reference to caam_jr_register()
  crypto: caam - silence .setkey in case of bad key length
  crypto: caam/qi2 - create ahash shared descriptors only once
  crypto: caam/qi2 - fix error reporting for caam_hash_alloc
  crypto: caam - remove deadcode on 32-bit platforms
  crypto: ccp - use generic power management
  crypto: xts - Replace memcpy() invocation with simple assignment
  crypto: marvell/cesa - irq balance
  crypto: inside-secure - irq balance
  crypto: ecc - SP800-56A rev 3 local public key validation
  crypto: dh - SP800-56A rev 3 local public key validation
  crypto: dh - check validity of Z before export
  lib/mpi: Add mpi_sub_ui()
  ...
Diffstat (limited to 'arch')
-rw-r--r-- arch/arm/crypto/crc32-ce-core.S 2
-rw-r--r-- arch/arm/crypto/ghash-ce-glue.c 51
-rw-r--r-- arch/arm/crypto/sha1-armv4-large.S 2
-rw-r--r-- arch/arm/crypto/sha256-armv4.pl 2
-rw-r--r-- arch/arm/crypto/sha256-core.S_shipped 2
-rw-r--r-- arch/arm/crypto/sha512-armv4.pl 4
-rw-r--r-- arch/arm/crypto/sha512-core.S_shipped 4
-rw-r--r-- arch/arm64/crypto/ghash-ce-glue.c 257
-rw-r--r-- arch/sparc/crypto/sha256_glue.c 14
-rw-r--r-- arch/x86/crypto/aes_ctrby8_avx-x86_64.S 15
-rw-r--r-- arch/x86/crypto/aesni-intel_asm.S 739
-rw-r--r-- arch/x86/crypto/aesni-intel_avx-x86_64.S 1
-rw-r--r-- arch/x86/crypto/chacha-ssse3-x86_64.S 16
-rw-r--r-- arch/x86/crypto/chacha_glue.c 17
-rw-r--r-- arch/x86/crypto/crc32-pclmul_asm.S 47
-rw-r--r-- arch/x86/crypto/crc32c-pcl-intel-asm_64.S 7
-rw-r--r-- arch/x86/crypto/curve25519-x86_64.c 6
-rw-r--r-- arch/x86/crypto/ghash-clmulni-intel_asm.S 17
-rw-r--r-- arch/x86/include/asm/inst.h 163
19 files changed, 551 insertions, 815 deletions
diff --git a/arch/arm/crypto/crc32-ce-core.S b/arch/arm/crypto/crc32-ce-core.S
index 5cbd4a6fedad..3f13a76b9066 100644
--- a/arch/arm/crypto/crc32-ce-core.S
+++ b/arch/arm/crypto/crc32-ce-core.S
@@ -39,7 +39,7 @@
* CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE)
* PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
* at:
- * http://www.intel.com/products/processor/manuals/
+ * https://www.intel.com/products/processor/manuals/
* Intel(R) 64 and IA-32 Architectures Software Developer's Manual
* Volume 2B: Instruction Set Reference, N-Z
*
diff --git a/arch/arm/crypto/ghash-ce-glue.c b/arch/arm/crypto/ghash-ce-glue.c
index a00fd329255f..f13401f3e669 100644
--- a/arch/arm/crypto/ghash-ce-glue.c
+++ b/arch/arm/crypto/ghash-ce-glue.c
@@ -16,6 +16,7 @@
#include <crypto/gf128mul.h>
#include <linux/cpufeature.h>
#include <linux/crypto.h>
+#include <linux/jump_label.h>
#include <linux/module.h>
MODULE_DESCRIPTION("GHASH hash function using ARMv8 Crypto Extensions");
@@ -27,12 +28,8 @@ MODULE_ALIAS_CRYPTO("ghash");
#define GHASH_DIGEST_SIZE 16
struct ghash_key {
- u64 h[2];
- u64 h2[2];
- u64 h3[2];
- u64 h4[2];
-
be128 k;
+ u64 h[][2];
};
struct ghash_desc_ctx {
@@ -46,16 +43,12 @@ struct ghash_async_ctx {
};
asmlinkage void pmull_ghash_update_p64(int blocks, u64 dg[], const char *src,
- struct ghash_key const *k,
- const char *head);
+ u64 const h[][2], const char *head);
asmlinkage void pmull_ghash_update_p8(int blocks, u64 dg[], const char *src,
- struct ghash_key const *k,
- const char *head);
+ u64 const h[][2], const char *head);
-static void (*pmull_ghash_update)(int blocks, u64 dg[], const char *src,
- struct ghash_key const *k,
- const char *head);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_p64);
static int ghash_init(struct shash_desc *desc)
{
@@ -70,7 +63,10 @@ static void ghash_do_update(int blocks, u64 dg[], const char *src,
{
if (likely(crypto_simd_usable())) {
kernel_neon_begin();
- pmull_ghash_update(blocks, dg, src, key, head);
+ if (static_branch_likely(&use_p64))
+ pmull_ghash_update_p64(blocks, dg, src, key->h, head);
+ else
+ pmull_ghash_update_p8(blocks, dg, src, key->h, head);
kernel_neon_end();
} else {
be128 dst = { cpu_to_be64(dg[1]), cpu_to_be64(dg[0]) };
@@ -161,25 +157,26 @@ static int ghash_setkey(struct crypto_shash *tfm,
const u8 *inkey, unsigned int keylen)
{
struct ghash_key *key = crypto_shash_ctx(tfm);
- be128 h;
if (keylen != GHASH_BLOCK_SIZE)
return -EINVAL;
/* needed for the fallback */
memcpy(&key->k, inkey, GHASH_BLOCK_SIZE);
- ghash_reflect(key->h, &key->k);
+ ghash_reflect(key->h[0], &key->k);
- h = key->k;
- gf128mul_lle(&h, &key->k);
- ghash_reflect(key->h2, &h);
+ if (static_branch_likely(&use_p64)) {
+ be128 h = key->k;
- gf128mul_lle(&h, &key->k);
- ghash_reflect(key->h3, &h);
+ gf128mul_lle(&h, &key->k);
+ ghash_reflect(key->h[1], &h);
- gf128mul_lle(&h, &key->k);
- ghash_reflect(key->h4, &h);
+ gf128mul_lle(&h, &key->k);
+ ghash_reflect(key->h[2], &h);
+ gf128mul_lle(&h, &key->k);
+ ghash_reflect(key->h[3], &h);
+ }
return 0;
}
@@ -195,7 +192,7 @@ static struct shash_alg ghash_alg = {
.base.cra_driver_name = "ghash-ce-sync",
.base.cra_priority = 300 - 1,
.base.cra_blocksize = GHASH_BLOCK_SIZE,
- .base.cra_ctxsize = sizeof(struct ghash_key),
+ .base.cra_ctxsize = sizeof(struct ghash_key) + sizeof(u64[2]),
.base.cra_module = THIS_MODULE,
};
@@ -354,10 +351,10 @@ static int __init ghash_ce_mod_init(void)
if (!(elf_hwcap & HWCAP_NEON))
return -ENODEV;
- if (elf_hwcap2 & HWCAP2_PMULL)
- pmull_ghash_update = pmull_ghash_update_p64;
- else
- pmull_ghash_update = pmull_ghash_update_p8;
+ if (elf_hwcap2 & HWCAP2_PMULL) {
+ ghash_alg.base.cra_ctxsize += 3 * sizeof(u64[2]);
+ static_branch_enable(&use_p64);
+ }
err = crypto_register_shash(&ghash_alg);
if (err)
diff --git a/arch/arm/crypto/sha1-armv4-large.S b/arch/arm/crypto/sha1-armv4-large.S
index f82cd8cf5a09..1c8b685149f2 100644
--- a/arch/arm/crypto/sha1-armv4-large.S
+++ b/arch/arm/crypto/sha1-armv4-large.S
@@ -13,7 +13,7 @@
@ Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
@ project. The module is, however, dual licensed under OpenSSL and
@ CRYPTOGAMS licenses depending on where you obtain it. For further
-@ details see http://www.openssl.org/~appro/cryptogams/.
+@ details see https://www.openssl.org/~appro/cryptogams/.
@ ====================================================================
@ sha1_block procedure for ARMv4.
diff --git a/arch/arm/crypto/sha256-armv4.pl b/arch/arm/crypto/sha256-armv4.pl
index a03cf4dfb781..9f96ff48e4a8 100644
--- a/arch/arm/crypto/sha256-armv4.pl
+++ b/arch/arm/crypto/sha256-armv4.pl
@@ -13,7 +13,7 @@
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
+# details see https://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# SHA256 block procedure for ARMv4. May 2007.
diff --git a/arch/arm/crypto/sha256-core.S_shipped b/arch/arm/crypto/sha256-core.S_shipped
index 054aae0edfce..ea04b2ab0c33 100644
--- a/arch/arm/crypto/sha256-core.S_shipped
+++ b/arch/arm/crypto/sha256-core.S_shipped
@@ -12,7 +12,7 @@
@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@ project. The module is, however, dual licensed under OpenSSL and
@ CRYPTOGAMS licenses depending on where you obtain it. For further
-@ details see http://www.openssl.org/~appro/cryptogams/.
+@ details see https://www.openssl.org/~appro/cryptogams/.
@ ====================================================================
@ SHA256 block procedure for ARMv4. May 2007.
diff --git a/arch/arm/crypto/sha512-armv4.pl b/arch/arm/crypto/sha512-armv4.pl
index 788c17b56ecc..69df68981acd 100644
--- a/arch/arm/crypto/sha512-armv4.pl
+++ b/arch/arm/crypto/sha512-armv4.pl
@@ -13,7 +13,7 @@
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
+# details see https://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# SHA512 block procedure for ARMv4. September 2007.
@@ -43,7 +43,7 @@
# terms it's 22.6 cycles per byte, which is disappointing result.
# Technical writers asserted that 3-way S4 pipeline can sustain
# multiple NEON instructions per cycle, but dual NEON issue could
-# not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
+# not be observed, see https://www.openssl.org/~appro/Snapdragon-S4.html
# for further details. On side note Cortex-A15 processes one byte in
# 16 cycles.
diff --git a/arch/arm/crypto/sha512-core.S_shipped b/arch/arm/crypto/sha512-core.S_shipped
index 710ea309769e..cb147db5cbfe 100644
--- a/arch/arm/crypto/sha512-core.S_shipped
+++ b/arch/arm/crypto/sha512-core.S_shipped
@@ -12,7 +12,7 @@
@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@ project. The module is, however, dual licensed under OpenSSL and
@ CRYPTOGAMS licenses depending on where you obtain it. For further
-@ details see http://www.openssl.org/~appro/cryptogams/.
+@ details see https://www.openssl.org/~appro/cryptogams/.
@ ====================================================================
@ SHA512 block procedure for ARMv4. September 2007.
@@ -42,7 +42,7 @@
@ terms it's 22.6 cycles per byte, which is disappointing result.
@ Technical writers asserted that 3-way S4 pipeline can sustain
@ multiple NEON instructions per cycle, but dual NEON issue could
-@ not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
+@ not be observed, see https://www.openssl.org/~appro/Snapdragon-S4.html
@ for further details. On side note Cortex-A15 processes one byte in
@ 16 cycles.
diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c
index 22831d3b7f62..da1034867aaa 100644
--- a/arch/arm64/crypto/ghash-ce-glue.c
+++ b/arch/arm64/crypto/ghash-ce-glue.c
@@ -31,12 +31,8 @@ MODULE_ALIAS_CRYPTO("ghash");
#define GCM_IV_SIZE 12
struct ghash_key {
- u64 h[2];
- u64 h2[2];
- u64 h3[2];
- u64 h4[2];
-
be128 k;
+ u64 h[][2];
};
struct ghash_desc_ctx {
@@ -51,22 +47,18 @@ struct gcm_aes_ctx {
};
asmlinkage void pmull_ghash_update_p64(int blocks, u64 dg[], const char *src,
- struct ghash_key const *k,
- const char *head);
+ u64 const h[][2], const char *head);
asmlinkage void pmull_ghash_update_p8(int blocks, u64 dg[], const char *src,
- struct ghash_key const *k,
- const char *head);
+ u64 const h[][2], const char *head);
asmlinkage void pmull_gcm_encrypt(int bytes, u8 dst[], const u8 src[],
- struct ghash_key const *k, u64 dg[],
- u8 ctr[], u32 const rk[], int rounds,
- u8 tag[]);
+ u64 const h[][2], u64 dg[], u8 ctr[],
+ u32 const rk[], int rounds, u8 tag[]);
asmlinkage void pmull_gcm_decrypt(int bytes, u8 dst[], const u8 src[],
- struct ghash_key const *k, u64 dg[],
- u8 ctr[], u32 const rk[], int rounds,
- u8 tag[]);
+ u64 const h[][2], u64 dg[], u8 ctr[],
+ u32 const rk[], int rounds, u8 tag[]);
static int ghash_init(struct shash_desc *desc)
{
@@ -77,48 +69,51 @@ static int ghash_init(struct shash_desc *desc)
}
static void ghash_do_update(int blocks, u64 dg[], const char *src,
- struct ghash_key *key, const char *head,
- void (*simd_update)(int blocks, u64 dg[],
- const char *src,
- struct ghash_key const *k,
- const char *head))
+ struct ghash_key *key, const char *head)
{
- if (likely(crypto_simd_usable() && simd_update)) {
- kernel_neon_begin();
- simd_update(blocks, dg, src, key, head);
- kernel_neon_end();
- } else {
- be128 dst = { cpu_to_be64(dg[1]), cpu_to_be64(dg[0]) };
+ be128 dst = { cpu_to_be64(dg[1]), cpu_to_be64(dg[0]) };
- do {
- const u8 *in = src;
-
- if (head) {
- in = head;
- blocks++;
- head = NULL;
- } else {
- src += GHASH_BLOCK_SIZE;
- }
+ do {
+ const u8 *in = src;
+
+ if (head) {
+ in = head;
+ blocks++;
+ head = NULL;
+ } else {
+ src += GHASH_BLOCK_SIZE;
+ }
- crypto_xor((u8 *)&dst, in, GHASH_BLOCK_SIZE);
- gf128mul_lle(&dst, &key->k);
- } while (--blocks);
+ crypto_xor((u8 *)&dst, in, GHASH_BLOCK_SIZE);
+ gf128mul_lle(&dst, &key->k);
+ } while (--blocks);
- dg[0] = be64_to_cpu(dst.b);
- dg[1] = be64_to_cpu(dst.a);
+ dg[0] = be64_to_cpu(dst.b);
+ dg[1] = be64_to_cpu(dst.a);
+}
+
+static __always_inline
+void ghash_do_simd_update(int blocks, u64 dg[], const char *src,
+ struct ghash_key *key, const char *head,
+ void (*simd_update)(int blocks, u64 dg[],
+ const char *src,
+ u64 const h[][2],
+ const char *head))
+{
+ if (likely(crypto_simd_usable())) {
+ kernel_neon_begin();
+ simd_update(blocks, dg, src, key->h, head);
+ kernel_neon_end();
+ } else {
+ ghash_do_update(blocks, dg, src, key, head);
}
}
/* avoid hogging the CPU for too long */
#define MAX_BLOCKS (SZ_64K / GHASH_BLOCK_SIZE)
-static int __ghash_update(struct shash_desc *desc, const u8 *src,
- unsigned int len,
- void (*simd_update)(int blocks, u64 dg[],
- const char *src,
- struct ghash_key const *k,
- const char *head))
+static int ghash_update(struct shash_desc *desc, const u8 *src,
+ unsigned int len)
{
struct ghash_desc_ctx *ctx = shash_desc_ctx(desc);
unsigned int partial = ctx->count % GHASH_BLOCK_SIZE;
@@ -143,9 +138,9 @@ static int __ghash_update(struct shash_desc *desc, const u8 *src,
do {
int chunk = min(blocks, MAX_BLOCKS);
- ghash_do_update(chunk, ctx->digest, src, key,
- partial ? ctx->buf : NULL,
- simd_update);
+ ghash_do_simd_update(chunk, ctx->digest, src, key,
+ partial ? ctx->buf : NULL,
+ pmull_ghash_update_p8);
blocks -= chunk;
src += chunk * GHASH_BLOCK_SIZE;
@@ -157,39 +152,7 @@ static int __ghash_update(struct shash_desc *desc, const u8 *src,
return 0;
}
-static int ghash_update_p8(struct shash_desc *desc, const u8 *src,
- unsigned int len)
-{
- return __ghash_update(desc, src, len, pmull_ghash_update_p8);
-}
-
-static int ghash_update_p64(struct shash_desc *desc, const u8 *src,
- unsigned int len)
-{
- return __ghash_update(desc, src, len, pmull_ghash_update_p64);
-}
-
-static int ghash_final_p8(struct shash_desc *desc, u8 *dst)
-{
- struct ghash_desc_ctx *ctx = shash_desc_ctx(desc);
- unsigned int partial = ctx->count % GHASH_BLOCK_SIZE;
-
- if (partial) {
- struct ghash_key *key = crypto_shash_ctx(desc->tfm);
-
- memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial);
-
- ghash_do_update(1, ctx->digest, ctx->buf, key, NULL,
- pmull_ghash_update_p8);
- }
- put_unaligned_be64(ctx->digest[1], dst);
- put_unaligned_be64(ctx->digest[0], dst + 8);
-
- *ctx = (struct ghash_desc_ctx){};
- return 0;
-}
-
-static int ghash_final_p64(struct shash_desc *desc, u8 *dst)
+static int ghash_final(struct shash_desc *desc, u8 *dst)
{
struct ghash_desc_ctx *ctx = shash_desc_ctx(desc);
unsigned int partial = ctx->count % GHASH_BLOCK_SIZE;
@@ -199,8 +162,8 @@ static int ghash_final_p64(struct shash_desc *desc, u8 *dst)
memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial);
- ghash_do_update(1, ctx->digest, ctx->buf, key, NULL,
- pmull_ghash_update_p64);
+ ghash_do_simd_update(1, ctx->digest, ctx->buf, key, NULL,
+ pmull_ghash_update_p8);
}
put_unaligned_be64(ctx->digest[1], dst);
put_unaligned_be64(ctx->digest[0], dst + 8);
@@ -220,29 +183,6 @@ static void ghash_reflect(u64 h[], const be128 *k)
h[1] ^= 0xc200000000000000UL;
}
-static int __ghash_setkey(struct ghash_key *key,
- const u8 *inkey, unsigned int keylen)
-{
- be128 h;
-
- /* needed for the fallback */
- memcpy(&key->k, inkey, GHASH_BLOCK_SIZE);
-
- ghash_reflect(key->h, &key->k);
-
- h = key->k;
- gf128mul_lle(&h, &key->k);
- ghash_reflect(key->h2, &h);
-
- gf128mul_lle(&h, &key->k);
- ghash_reflect(key->h3, &h);
-
- gf128mul_lle(&h, &key->k);
- ghash_reflect(key->h4, &h);
-
- return 0;
-}
-
static int ghash_setkey(struct crypto_shash *tfm,
const u8 *inkey, unsigned int keylen)
{
@@ -251,38 +191,28 @@ static int ghash_setkey(struct crypto_shash *tfm,
if (keylen != GHASH_BLOCK_SIZE)
return -EINVAL;
- return __ghash_setkey(key, inkey, keylen);
+ /* needed for the fallback */
+ memcpy(&key->k, inkey, GHASH_BLOCK_SIZE);
+
+ ghash_reflect(key->h[0], &key->k);
+ return 0;
}
-static struct shash_alg ghash_alg[] = {{
+static struct shash_alg ghash_alg = {
.base.cra_name = "ghash",
.base.cra_driver_name = "ghash-neon",
.base.cra_priority = 150,
.base.cra_blocksize = GHASH_BLOCK_SIZE,
- .base.cra_ctxsize = sizeof(struct ghash_key),
- .base.cra_module = THIS_MODULE,
-
- .digestsize = GHASH_DIGEST_SIZE,
- .init = ghash_init,
- .update = ghash_update_p8,
- .final = ghash_final_p8,
- .setkey = ghash_setkey,
- .descsize = sizeof(struct ghash_desc_ctx),
-}, {
- .base.cra_name = "ghash",
- .base.cra_driver_name = "ghash-ce",
- .base.cra_priority = 200,
- .base.cra_blocksize = GHASH_BLOCK_SIZE,
- .base.cra_ctxsize = sizeof(struct ghash_key),
+ .base.cra_ctxsize = sizeof(struct ghash_key) + sizeof(u64[2]),
.base.cra_module = THIS_MODULE,
.digestsize = GHASH_DIGEST_SIZE,
.init = ghash_init,
- .update = ghash_update_p64,
- .final = ghash_final_p64,
+ .update = ghash_update,
+ .final = ghash_final,
.setkey = ghash_setkey,
.descsize = sizeof(struct ghash_desc_ctx),
-}};
+};
static int num_rounds(struct crypto_aes_ctx *ctx)
{
@@ -301,6 +231,7 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *inkey,
{
struct gcm_aes_ctx *ctx = crypto_aead_ctx(tfm);
u8 key[GHASH_BLOCK_SIZE];
+ be128 h;
int ret;
ret = aes_expandkey(&ctx->aes_key, inkey, keylen);
@@ -309,7 +240,22 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *inkey,
aes_encrypt(&ctx->aes_key, key, (u8[AES_BLOCK_SIZE]){});
- return __ghash_setkey(&ctx->ghash_key, key, sizeof(be128));
+ /* needed for the fallback */
+ memcpy(&ctx->ghash_key.k, key, GHASH_BLOCK_SIZE);
+
+ ghash_reflect(ctx->ghash_key.h[0], &ctx->ghash_key.k);
+
+ h = ctx->ghash_key.k;
+ gf128mul_lle(&h, &ctx->ghash_key.k);
+ ghash_reflect(ctx->ghash_key.h[1], &h);
+
+ gf128mul_lle(&h, &ctx->ghash_key.k);
+ ghash_reflect(ctx->ghash_key.h[2], &h);
+
+ gf128mul_lle(&h, &ctx->ghash_key.k);
+ ghash_reflect(ctx->ghash_key.h[3], &h);
+
+ return 0;
}
static int gcm_setauthsize(struct crypto_aead *tfm, unsigned int authsize)
@@ -341,9 +287,9 @@ static void gcm_update_mac(u64 dg[], const u8 *src, int count, u8 buf[],
if (count >= GHASH_BLOCK_SIZE || *buf_count == GHASH_BLOCK_SIZE) {
int blocks = count / GHASH_BLOCK_SIZE;
- ghash_do_update(blocks, dg, src, &ctx->ghash_key,
- *buf_count ? buf : NULL,
- pmull_ghash_update_p64);
+ ghash_do_simd_update(blocks, dg, src, &ctx->ghash_key,
+ *buf_count ? buf : NULL,
+ pmull_ghash_update_p64);
src += blocks * GHASH_BLOCK_SIZE;
count %= GHASH_BLOCK_SIZE;
@@ -387,8 +333,8 @@ static void gcm_calculate_auth_mac(struct aead_request *req, u64 dg[])
if (buf_count) {
memset(&buf[buf_count], 0, GHASH_BLOCK_SIZE - buf_count);
- ghash_do_update(1, dg, buf, &ctx->ghash_key, NULL,
- pmull_ghash_update_p64);
+ ghash_do_simd_update(1, dg, buf, &ctx->ghash_key, NULL,
+ pmull_ghash_update_p64);
}
}
@@ -433,8 +379,8 @@ static int gcm_encrypt(struct aead_request *req)
}
kernel_neon_begin();
- pmull_gcm_encrypt(nbytes, dst, src, &ctx->ghash_key, dg,
- iv, ctx->aes_key.key_enc, nrounds,
+ pmull_gcm_encrypt(nbytes, dst, src, ctx->ghash_key.h,
+ dg, iv, ctx->aes_key.key_enc, nrounds,
tag);
kernel_neon_end();
@@ -464,7 +410,7 @@ static int gcm_encrypt(struct aead_request *req)
} while (--remaining > 0);
ghash_do_update(blocks, dg, walk.dst.virt.addr,
- &ctx->ghash_key, NULL, NULL);
+ &ctx->ghash_key, NULL);
err = skcipher_walk_done(&walk,
walk.nbytes % AES_BLOCK_SIZE);
@@ -483,7 +429,7 @@ static int gcm_encrypt(struct aead_request *req)
tag = (u8 *)&lengths;
ghash_do_update(1, dg, tag, &ctx->ghash_key,
- walk.nbytes ? buf : NULL, NULL);
+ walk.nbytes ? buf : NULL);
if (walk.nbytes)
err = skcipher_walk_done(&walk, 0);
@@ -547,8 +493,8 @@ static int gcm_decrypt(struct aead_request *req)
}
kernel_neon_begin();
- pmull_gcm_decrypt(nbytes, dst, src, &ctx->ghash_key, dg,
- iv, ctx->aes_key.key_enc, nrounds,
+ pmull_gcm_decrypt(nbytes, dst, src, ctx->ghash_key.h,
+ dg, iv, ctx->aes_key.key_enc, nrounds,
tag);
kernel_neon_end();
@@ -568,7 +514,7 @@ static int gcm_decrypt(struct aead_request *req)
u8 *dst = walk.dst.virt.addr;
ghash_do_update(blocks, dg, walk.src.virt.addr,
- &ctx->ghash_key, NULL, NULL);
+ &ctx->ghash_key, NULL);
do {
aes_encrypt(&ctx->aes_key, buf, iv);
@@ -591,7 +537,7 @@ static int gcm_decrypt(struct aead_request *req)
tag = (u8 *)&lengths;
ghash_do_update(1, dg, tag, &ctx->ghash_key,
- walk.nbytes ? buf : NULL, NULL);
+ walk.nbytes ? buf : NULL);
if (walk.nbytes) {
aes_encrypt(&ctx->aes_key, buf, iv);
@@ -635,43 +581,28 @@ static struct aead_alg gcm_aes_alg = {
.base.cra_driver_name = "gcm-aes-ce",
.base.cra_priority = 300,
.base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct gcm_aes_ctx),
+ .base.cra_ctxsize = sizeof(struct gcm_aes_ctx) +
+ 4 * sizeof(u64[2]),
.base.cra_module = THIS_MODULE,
};
static int __init ghash_ce_mod_init(void)
{
- int ret;
-
if (!cpu_have_named_feature(ASIMD))
return -ENODEV;
if (cpu_have_named_feature(PMULL))
- ret = crypto_register_shashes(ghash_alg,
- ARRAY_SIZE(ghash_alg));
- else
- /* only register the first array element */
- ret = crypto_register_shash(ghash_alg);
+ return crypto_register_aead(&gcm_aes_alg);
- if (ret)
- return ret;
-
- if (cpu_have_named_feature(PMULL)) {
- ret = crypto_register_aead(&gcm_aes_alg);
- if (ret)
- crypto_unregister_shashes(ghash_alg,
- ARRAY_SIZE(ghash_alg));
- }
- return ret;
+ return crypto_register_shash(&ghash_alg);
}
static void __exit ghash_ce_mod_exit(void)
{
if (cpu_have_named_feature(PMULL))
- crypto_unregister_shashes(ghash_alg, ARRAY_SIZE(ghash_alg));
+ crypto_unregister_aead(&gcm_aes_alg);
else
- crypto_unregister_shash(ghash_alg);
- crypto_unregister_aead(&gcm_aes_alg);
+ crypto_unregister_shash(&ghash_alg);
}
static const struct cpu_feature ghash_cpu_feature[] = {
diff --git a/arch/sparc/crypto/sha256_glue.c b/arch/sparc/crypto/sha256_glue.c
index 286bc8ecf15b..ca2547df9652 100644
--- a/arch/sparc/crypto/sha256_glue.c
+++ b/arch/sparc/crypto/sha256_glue.c
@@ -156,7 +156,7 @@ static int sha256_sparc64_import(struct shash_desc *desc, const void *in)
return 0;
}
-static struct shash_alg sha256 = {
+static struct shash_alg sha256_alg = {
.digestsize = SHA256_DIGEST_SIZE,
.init = sha256_sparc64_init,
.update = sha256_sparc64_update,
@@ -174,7 +174,7 @@ static struct shash_alg sha256 = {
}
};
-static struct shash_alg sha224 = {
+static struct shash_alg sha224_alg = {
.digestsize = SHA224_DIGEST_SIZE,
.init = sha224_sparc64_init,
.update = sha256_sparc64_update,
@@ -206,13 +206,13 @@ static bool __init sparc64_has_sha256_opcode(void)
static int __init sha256_sparc64_mod_init(void)
{
if (sparc64_has_sha256_opcode()) {
- int ret = crypto_register_shash(&sha224);
+ int ret = crypto_register_shash(&sha224_alg);
if (ret < 0)
return ret;
- ret = crypto_register_shash(&sha256);
+ ret = crypto_register_shash(&sha256_alg);
if (ret < 0) {
- crypto_unregister_shash(&sha224);
+ crypto_unregister_shash(&sha224_alg);
return ret;
}
@@ -225,8 +225,8 @@ static int __init sha256_sparc64_mod_init(void)
static void __exit sha256_sparc64_mod_fini(void)
{
- crypto_unregister_shash(&sha224);
- crypto_unregister_shash(&sha256);
+ crypto_unregister_shash(&sha224_alg);
+ crypto_unregister_shash(&sha256_alg);
}
module_init(sha256_sparc64_mod_init);
diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
index ec437db1fa54..3f0fc7dd87d7 100644
--- a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
+++ b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
@@ -63,7 +63,6 @@
*/
#include <linux/linkage.h>
-#include <asm/inst.h>
#define VMOVDQ vmovdqu
@@ -127,10 +126,6 @@ ddq_add_8:
/* generate a unique variable for ddq_add_x */
-.macro setddq n
- var_ddq_add = ddq_add_\n
-.endm
-
/* generate a unique variable for xmm register */
.macro setxdata n
var_xdata = %xmm\n
@@ -140,9 +135,7 @@ ddq_add_8:
.macro club name, id
.altmacro
- .if \name == DDQ_DATA
- setddq %\id
- .elseif \name == XDATA
+ .if \name == XDATA
setxdata %\id
.endif
.noaltmacro
@@ -165,9 +158,8 @@ ddq_add_8:
.set i, 1
.rept (by - 1)
- club DDQ_DATA, i
club XDATA, i
- vpaddq var_ddq_add(%rip), xcounter, var_xdata
+ vpaddq (ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
vptest ddq_low_msk(%rip), var_xdata
jnz 1f
vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata
@@ -180,8 +172,7 @@ ddq_add_8:
vmovdqa 1*16(p_keys), xkeyA
vpxor xkey0, xdata0, xdata0
- club DDQ_DATA, by
- vpaddq var_ddq_add(%rip), xcounter, xcounter
+ vpaddq (ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
vptest ddq_low_msk(%rip), xcounter
jnz 1f
vpaddq ddq_high_add_1(%rip), xcounter, xcounter
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 54e7d15dbd0d..1852b19a73a0 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -26,7 +26,6 @@
*/
#include <linux/linkage.h>
-#include <asm/inst.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>
@@ -201,7 +200,7 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff
mov \SUBKEY, %r12
movdqu (%r12), \TMP3
movdqa SHUF_MASK(%rip), \TMP2
- PSHUFB_XMM \TMP2, \TMP3
+ pshufb \TMP2, \TMP3
# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
@@ -263,10 +262,10 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff
movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
movdqa SHUF_MASK(%rip), %xmm2
- PSHUFB_XMM %xmm2, %xmm0
+ pshufb %xmm2, %xmm0
movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv
- PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
movdqu HashKey(%arg2), %xmm13
CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
@@ -347,7 +346,7 @@ _zero_cipher_left_\@:
paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
movdqu %xmm0, CurCount(%arg2)
movdqa SHUF_MASK(%rip), %xmm10
- PSHUFB_XMM %xmm10, %xmm0
+ pshufb %xmm10, %xmm0
ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
movdqu %xmm0, PBlockEncKey(%arg2)
@@ -377,7 +376,7 @@ _large_enough_update_\@:
# get the appropriate shuffle mask
movdqu (%r12), %xmm2
# shift right 16-r13 bytes
- PSHUFB_XMM %xmm2, %xmm1
+ pshufb %xmm2, %xmm1
_data_read_\@:
lea ALL_F+16(%rip), %r12
@@ -393,12 +392,12 @@ _data_read_\@:
.ifc \operation, dec
pand %xmm1, %xmm2
movdqa SHUF_MASK(%rip), %xmm10
- PSHUFB_XMM %xmm10 ,%xmm2
+ pshufb %xmm10 ,%xmm2
pxor %xmm2, %xmm8
.else
movdqa SHUF_MASK(%rip), %xmm10
- PSHUFB_XMM %xmm10,%xmm0
+ pshufb %xmm10,%xmm0
pxor %xmm0, %xmm8
.endif
@@ -408,17 +407,17 @@ _data_read_\@:
# GHASH computation for the last <16 byte block
movdqa SHUF_MASK(%rip), %xmm10
# shuffle xmm0 back to output as ciphertext
- PSHUFB_XMM %xmm10, %xmm0
+ pshufb %xmm10, %xmm0
.endif
# Output %r13 bytes
- MOVQ_R64_XMM %xmm0, %rax
+ movq %xmm0, %rax
cmp $8, %r13
jle _less_than_8_bytes_left_\@
mov %rax, (%arg3 , %r11, 1)
add $8, %r11
psrldq $8, %xmm0
- MOVQ_R64_XMM %xmm0, %rax
+ movq %xmm0, %rax
sub $8, %r13
_less_than_8_bytes_left_\@:
mov %al, (%arg3, %r11, 1)
@@ -449,7 +448,7 @@ _partial_done\@:
movd %r12d, %xmm15 # len(A) in %xmm15
mov InLen(%arg2), %r12
shl $3, %r12 # len(C) in bits (*128)
- MOVQ_R64_XMM %r12, %xmm1
+ movq %r12, %xmm1
pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
@@ -457,7 +456,7 @@ _partial_done\@:
GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
# final GHASH computation
movdqa SHUF_MASK(%rip), %xmm10
- PSHUFB_XMM %xmm10, %xmm8
+ pshufb %xmm10, %xmm8
movdqu OrigIV(%arg2), %xmm0 # %xmm0 = Y0
ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
@@ -470,7 +469,7 @@ _return_T_\@:
cmp $8, %r11
jl _T_4_\@
_T_8_\@:
- MOVQ_R64_XMM %xmm0, %rax
+ movq %xmm0, %rax
mov %rax, (%r10)
add $8, %r10
sub $8, %r11
@@ -518,9 +517,9 @@ _return_T_done_\@:
pshufd $78, \HK, \TMP3
pxor \GH, \TMP2 # TMP2 = a1+a0
pxor \HK, \TMP3 # TMP3 = b1+b0
- PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
- PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
- PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
+ pclmulqdq $0x11, \HK, \TMP1 # TMP1 = a1*b1
+ pclmulqdq $0x00, \HK, \GH # GH = a0*b0
+ pclmulqdq $0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
pxor \GH, \TMP2
pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0)
movdqa \TMP2, \TMP3
@@ -570,7 +569,7 @@ _return_T_done_\@:
cmp $8, \DLEN
jl _read_lt8_\@
mov (\DPTR), %rax
- MOVQ_R64_XMM %rax, \XMMDst
+ movq %rax, \XMMDst
sub $8, \DLEN
jz _done_read_partial_block_\@
xor %eax, %eax
@@ -579,7 +578,7 @@ _read_next_byte_\@:
mov 7(\DPTR, \DLEN, 1), %al
dec \DLEN
jnz _read_next_byte_\@
- MOVQ_R64_XMM %rax, \XMM1
+ movq %rax, \XMM1
pslldq $8, \XMM1
por \XMM1, \XMMDst
jmp _done_read_partial_block_\@
@@ -590,7 +589,7 @@ _read_next_byte_lt8_\@:
mov -1(\DPTR, \DLEN, 1), %al
dec \DLEN
jnz _read_next_byte_lt8_\@
- MOVQ_R64_XMM %rax, \XMMDst
+ movq %rax, \XMMDst
_done_read_partial_block_\@:
.endm
@@ -608,7 +607,7 @@ _done_read_partial_block_\@:
jl _get_AAD_rest\@
_get_AAD_blocks\@:
movdqu (%r10), \TMP7
- PSHUFB_XMM %xmm14, \TMP7 # byte-reflect the AAD data
+ pshufb %xmm14, \TMP7 # byte-reflect the AAD data
pxor \TMP7, \TMP6
GHASH_MUL \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
add $16, %r10
@@ -624,7 +623,7 @@ _get_AAD_rest\@:
je _get_AAD_done\@
READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
- PSHUFB_XMM %xmm14, \TMP7 # byte-reflect the AAD data
+ pshufb %xmm14, \TMP7 # byte-reflect the AAD data
pxor \TMP6, \TMP7
GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
movdqu \TMP7, \TMP6
@@ -667,7 +666,7 @@ _data_read_\@: # Finished reading in data
# r16-r13 is the number of bytes in plaintext mod 16)
add %r13, %r12
movdqu (%r12), %xmm2 # get the appropriate shuffle mask
- PSHUFB_XMM %xmm2, %xmm9 # shift right r13 bytes
+ pshufb %xmm2, %xmm9 # shift right r13 bytes
.ifc \operation, dec
movdqa %xmm1, %xmm3
@@ -689,8 +688,8 @@ _no_extra_mask_1_\@:
pand %xmm1, %xmm3
movdqa SHUF_MASK(%rip), %xmm10
- PSHUFB_XMM %xmm10, %xmm3
- PSHUFB_XMM %xmm2, %xmm3
+ pshufb %xmm10, %xmm3
+ pshufb %xmm2, %xmm3
pxor %xmm3, \AAD_HASH
cmp $0, %r10
@@ -724,8 +723,8 @@ _no_extra_mask_2_\@:
pand %xmm1, %xmm9
movdqa SHUF_MASK(%rip), %xmm1
- PSHUFB_XMM %xmm1, %xmm9
- PSHUFB_XMM %xmm2, %xmm9
+ pshufb %xmm1, %xmm9
+ pshufb %xmm2, %xmm9
pxor %xmm9, \AAD_HASH
cmp $0, %r10
@@ -744,8 +743,8 @@ _encode_done_\@:
movdqa SHUF_MASK(%rip), %xmm10
# shuffle xmm9 back to output as ciphertext
- PSHUFB_XMM %xmm10, %xmm9
- PSHUFB_XMM %xmm2, %xmm9
+ pshufb %xmm10, %xmm9
+ pshufb %xmm2, %xmm9
.endif
# output encrypted Bytes
cmp $0, %r10
@@ -759,14 +758,14 @@ _partial_fill_\@:
mov \PLAIN_CYPH_LEN, %r13
_count_set_\@:
movdqa %xmm9, %xmm0
- MOVQ_R64_XMM %xmm0, %rax
+ movq %xmm0, %rax
cmp $8, %r13
jle _less_than_8_bytes_left_\@
mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
add $8, \DATA_OFFSET
psrldq $8, %xmm0
- MOVQ_R64_XMM %xmm0, %rax
+ movq %xmm0, %rax
sub $8, %r13
_less_than_8_bytes_left_\@:
movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
@@ -810,7 +809,7 @@ _partial_block_done_\@:
.else
MOVADQ \XMM0, %xmm\index
.endif
- PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
+ pshufb %xmm14, %xmm\index # perform a 16 byte swap
pxor \TMP2, %xmm\index
.endr
lea 0x10(%arg1),%r10
@@ -821,7 +820,7 @@ _partial_block_done_\@:
aes_loop_initial_\@:
MOVADQ (%r10),\TMP1
.irpc index, \i_seq
- AESENC \TMP1, %xmm\index
+ aesenc \TMP1, %xmm\index
.endr
add $16,%r10
sub $1,%eax
@@ -829,7 +828,7 @@ aes_loop_initial_\@:
MOVADQ (%r10), \TMP1
.irpc index, \i_seq
- AESENCLAST \TMP1, %xmm\index # Last Round
+ aesenclast \TMP1, %xmm\index # Last Round
.endr
.irpc index, \i_seq
movdqu (%arg4 , %r11, 1), \TMP1
@@ -841,7 +840,7 @@ aes_loop_initial_\@:
.ifc \operation, dec
movdqa \TMP1, %xmm\index
.endif
- PSHUFB_XMM %xmm14, %xmm\index
+ pshufb %xmm14, %xmm\index
# prepare plaintext/ciphertext for GHASH computation
.endr
@@ -876,19 +875,19 @@ aes_loop_initial_\@:
MOVADQ ONE(%RIP),\TMP1
paddd \TMP1, \XMM0 # INCR Y0
MOVADQ \XMM0, \XMM1
- PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
+ pshufb %xmm14, \XMM1 # perform a 16 byte swap
paddd \TMP1, \XMM0 # INCR Y0
MOVADQ \XMM0, \XMM2
- PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
+ pshufb %xmm14, \XMM2 # perform a 16 byte swap
paddd \TMP1, \XMM0 # INCR Y0
MOVADQ \XMM0, \XMM3
- PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
+ pshufb %xmm14, \XMM3 # perform a 16 byte swap
paddd \TMP1, \XMM0 # INCR Y0
MOVADQ \XMM0, \XMM4
- PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
+ pshufb %xmm14, \XMM4 # perform a 16 byte swap
MOVADQ 0(%arg1),\TMP1
pxor \TMP1, \XMM1
@@ -897,17 +896,17 @@ aes_loop_initial_\@:
pxor \TMP1, \XMM4
.irpc index, 1234 # do 4 rounds
movaps 0x10*\index(%arg1), \TMP1
- AESENC \TMP1, \XMM1
- AESENC \TMP1, \XMM2
- AESENC \TMP1, \XMM3
- AESENC \TMP1, \XMM4
+ aesenc \TMP1, \XMM1
+ aesenc \TMP1, \XMM2
+ aesenc \TMP1, \XMM3
+ aesenc \TMP1, \XMM4
.endr
.irpc index, 56789 # do next 5 rounds
movaps 0x10*\index(%arg1), \TMP1
- AESENC \TMP1, \XMM1
- AESENC \TMP1, \XMM2
- AESENC \TMP1, \XMM3
- AESENC \TMP1, \XMM4
+ aesenc \TMP1, \XMM1
+ aesenc \TMP1, \XMM2
+ aesenc \TMP1, \XMM3
+ aesenc \TMP1, \XMM4
.endr
lea 0xa0(%arg1),%r10
mov keysize,%eax
@@ -918,7 +917,7 @@ aes_loop_initial_\@:
aes_loop_pre_\@:
MOVADQ (%r10),\TMP2
.irpc index, 1234
- AESENC \TMP2, %xmm\index
+ aesenc \TMP2, %xmm\index
.endr
add $16,%r10
sub $1,%eax
@@ -926,10 +925,10 @@ aes_loop_pre_\@:
aes_loop_pre_done\@:
MOVADQ (%r10), \TMP2
- AESENCLAST \TMP2, \XMM1
- AESENCLAST \TMP2, \XMM2
- AESENCLAST \TMP2, \XMM3
- AESENCLAST \TMP2, \XMM4
+ aesenclast \TMP2, \XMM1
+ aesenclast \TMP2, \XMM2
+ aesenclast \TMP2, \XMM3
+ aesenclast \TMP2, \XMM4
movdqu 16*0(%arg4 , %r11 , 1), \TMP1
pxor \TMP1, \XMM1
.ifc \operation, dec
@@ -961,12 +960,12 @@ aes_loop_pre_done\@:
.endif
add $64, %r11
- PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
+ pshufb %xmm14, \XMM1 # perform a 16 byte swap
pxor \XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
- PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
- PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
- PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
+ pshufb %xmm14, \XMM2 # perform a 16 byte swap
+ pshufb %xmm14, \XMM3 # perform a 16 byte swap
+ pshufb %xmm14, \XMM4 # perform a 16 byte swap
_initial_blocks_done\@:
@@ -978,7 +977,7 @@ _initial_blocks_done\@:
* arg1, %arg3, %arg4 are used as pointers only, not modified
* %r11 is the data offset value
*/
-.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
+.macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
movdqa \XMM1, \XMM5
@@ -994,7 +993,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
pxor \XMM5, \TMP6
paddd ONE(%rip), \XMM0 # INCR CNT
movdqu HashKey_4(%arg2), \TMP5
- PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
+ pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1
movdqa \XMM0, \XMM1
paddd ONE(%rip), \XMM0 # INCR CNT
movdqa \XMM0, \XMM2
@@ -1002,51 +1001,51 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
movdqa \XMM0, \XMM3
paddd ONE(%rip), \XMM0 # INCR CNT
movdqa \XMM0, \XMM4
- PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
- PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
- PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
- PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
- PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
+ pshufb %xmm15, \XMM1 # perform a 16 byte swap
+ pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0
+ pshufb %xmm15, \XMM2 # perform a 16 byte swap
+ pshufb %xmm15, \XMM3 # perform a 16 byte swap
+ pshufb %xmm15, \XMM4 # perform a 16 byte swap
pxor (%arg1), \XMM1
pxor (%arg1), \XMM2
pxor (%arg1), \XMM3
pxor (%arg1), \XMM4
movdqu HashKey_4_k(%arg2), \TMP5
- PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
+ pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
movaps 0x10(%arg1), \TMP1
- AESENC \TMP1, \XMM1 # Round 1
- AESENC \TMP1, \XMM2
- AESENC \TMP1, \XMM3
- AESENC \TMP1, \XMM4
+ aesenc \TMP1, \XMM1 # Round 1
+ aesenc \TMP1, \XMM2
+ aesenc \TMP1, \XMM3
+ aesenc \TMP1, \XMM4
movaps 0x20(%arg1), \TMP1
- AESENC \TMP1, \XMM1 # Round 2
- AESENC \TMP1, \XMM2
- AESENC \TMP1, \XMM3
- AESENC \TMP1, \XMM4
+ aesenc \TMP1, \XMM1 # Round 2
+ aesenc \TMP1, \XMM2
+ aesenc \TMP1, \XMM3
+ aesenc \TMP1, \XMM4
movdqa \XMM6, \TMP1
pshufd $78, \XMM6, \TMP2
pxor \XMM6, \TMP2
movdqu HashKey_3(%arg2), \TMP5
- PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
+ pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
movaps 0x30(%arg1), \TMP3
- AESENC \TMP3, \XMM1 # Round 3
- AESENC \TMP3, \XMM2
- AESENC \TMP3, \XMM3
- AESENC \TMP3, \XMM4
- PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
+ aesenc \TMP3, \XMM1 # Round 3
+ aesenc \TMP3, \XMM2
+ aesenc \TMP3, \XMM3
+ aesenc \TMP3, \XMM4
+ pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0
movaps 0x40(%arg1), \TMP3
- AESENC \TMP3, \XMM1 # Round 4
- AESENC \TMP3, \XMM2
- AESENC \TMP3, \XMM3
- AESENC \TMP3, \XMM4
+ aesenc \TMP3, \XMM1 # Round 4
+ aesenc \TMP3, \XMM2
+ aesenc \TMP3, \XMM3
+ aesenc \TMP3, \XMM4
movdqu HashKey_3_k(%arg2), \TMP5
- PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
movaps 0x50(%arg1), \TMP3
- AESENC \TMP3, \XMM1 # Round 5
- AESENC \TMP3, \XMM2
- AESENC \TMP3, \XMM3
- AESENC \TMP3, \XMM4
+ aesenc \TMP3, \XMM1 # Round 5
+ aesenc \TMP3, \XMM2
+ aesenc \TMP3, \XMM3
+ aesenc \TMP3, \XMM4
pxor \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
pxor \XMM6, \XMM5
@@ -1058,25 +1057,25 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
# Multiply TMP5 * HashKey using karatsuba
- PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
+ pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
movaps 0x60(%arg1), \TMP3
- AESENC \TMP3, \XMM1 # Round 6
- AESENC \TMP3, \XMM2
- AESENC \TMP3, \XMM3
- AESENC \TMP3, \XMM4
- PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
+ aesenc \TMP3, \XMM1 # Round 6
+ aesenc \TMP3, \XMM2
+ aesenc \TMP3, \XMM3
+ aesenc \TMP3, \XMM4
+ pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0
movaps 0x70(%arg1), \TMP3
- AESENC \TMP3, \XMM1 # Round 7
- AESENC \TMP3, \XMM2
- AESENC \TMP3, \XMM3
- AESENC \TMP3, \XMM4
+ aesenc \TMP3, \XMM1 # Round 7
+ aesenc \TMP3, \XMM2
+ aesenc \TMP3, \XMM3
+ aesenc \TMP3, \XMM4
movdqu HashKey_2_k(%arg2), \TMP5
- PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
movaps 0x80(%arg1), \TMP3
- AESENC \TMP3, \XMM1 # Round 8
- AESENC \TMP3, \XMM2
- AESENC \TMP3, \XMM3
- AESENC \TMP3, \XMM4
+ aesenc \TMP3, \XMM1 # Round 8
+ aesenc \TMP3, \XMM2
+ aesenc \TMP3, \XMM3
+ aesenc \TMP3, \XMM4
pxor \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
pxor \XMM7, \XMM5
@@ -1089,13 +1088,13 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
pshufd $78, \XMM8, \TMP2
pxor \XMM8, \TMP2
movdqu HashKey(%arg2), \TMP5
- PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
+ pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
movaps 0x90(%arg1), \TMP3
- AESENC \TMP3, \XMM1 # Round 9
- AESENC \TMP3, \XMM2
- AESENC \TMP3, \XMM3
- AESENC \TMP3, \XMM4
- PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
+ aesenc \TMP3, \XMM1 # Round 9
+ aesenc \TMP3, \XMM2
+ aesenc \TMP3, \XMM3
+ aesenc \TMP3, \XMM4
+ pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0
lea 0xa0(%arg1),%r10
mov keysize,%eax
shr $2,%eax # 128->4, 192->6, 256->8
@@ -1105,7 +1104,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
aes_loop_par_enc\@:
MOVADQ (%r10),\TMP3
.irpc index, 1234
- AESENC \TMP3, %xmm\index
+ aesenc \TMP3, %xmm\index
.endr
add $16,%r10
sub $1,%eax
@@ -1113,12 +1112,12 @@ aes_loop_par_enc\@:
aes_loop_par_enc_done\@:
MOVADQ (%r10), \TMP3
- AESENCLAST \TMP3, \XMM1 # Round 10
- AESENCLAST \TMP3, \XMM2
- AESENCLAST \TMP3, \XMM3
- AESENCLAST \TMP3, \XMM4
+ aesenclast \TMP3, \XMM1 # Round 10
+ aesenclast \TMP3, \XMM2
+ aesenclast \TMP3, \XMM3
+ aesenclast \TMP3, \XMM4
movdqu HashKey_k(%arg2), \TMP5
- PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
movdqu (%arg4,%r11,1), \TMP3
pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
movdqu 16(%arg4,%r11,1), \TMP3
@@ -1131,10 +1130,10 @@ aes_loop_par_enc_done\@:
movdqu \XMM2, 16(%arg3,%r11,1) # Write to the ciphertext buffer
movdqu \XMM3, 32(%arg3,%r11,1) # Write to the ciphertext buffer
movdqu \XMM4, 48(%arg3,%r11,1) # Write to the ciphertext buffer
- PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
- PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
- PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
- PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
+ pshufb %xmm15, \XMM1 # perform a 16 byte swap
+ pshufb %xmm15, \XMM2 # perform a 16 byte swap
+ pshufb %xmm15, \XMM3 # perform a 16 byte swap
+ pshufb %xmm15, \XMM4 # perform a 16 byte swap
pxor \TMP4, \TMP1
pxor \XMM8, \XMM5
@@ -1186,7 +1185,7 @@ aes_loop_par_enc_done\@:
* arg1, %arg3, %arg4 are used as pointers only, not modified
* %r11 is the data offset value
*/
-.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
+.macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
movdqa \XMM1, \XMM5
@@ -1202,7 +1201,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
pxor \XMM5, \TMP6
paddd ONE(%rip), \XMM0 # INCR CNT
movdqu HashKey_4(%arg2), \TMP5
- PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
+ pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1
movdqa \XMM0, \XMM1
paddd ONE(%rip), \XMM0 # INCR CNT
movdqa \XMM0, \XMM2
@@ -1210,51 +1209,51 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
movdqa \XMM0, \XMM3
paddd ONE(%rip), \XMM0 # INCR CNT
movdqa \XMM0, \XMM4
- PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
- PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
- PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
- PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
- PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
+ pshufb %xmm15, \XMM1 # perform a 16 byte swap
+ pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0
+ pshufb %xmm15, \XMM2 # perform a 16 byte swap
+ pshufb %xmm15, \XMM3 # perform a 16 byte swap
+ pshufb %xmm15, \XMM4 # perform a 16 byte swap
pxor (%arg1), \XMM1
pxor (%arg1), \XMM2
pxor (%arg1), \XMM3
pxor (%arg1), \XMM4
movdqu HashKey_4_k(%arg2), \TMP5
- PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
+ pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
movaps 0x10(%arg1), \TMP1
- AESENC \TMP1, \XMM1 # Round 1
- AESENC \TMP1, \XMM2
- AESENC \TMP1, \XMM3
- AESENC \TMP1, \XMM4
+ aesenc \TMP1, \XMM1 # Round 1
+ aesenc \TMP1, \XMM2
+ aesenc \TMP1, \XMM3
+ aesenc \TMP1, \XMM4
movaps 0x20(%arg1), \TMP1
- AESENC \TMP1, \XMM1 # Round 2
- AESENC \TMP1, \XMM2
- AESENC \TMP1, \XMM3
- AESENC \TMP1, \XMM4
+ aesenc \TMP1, \XMM1 # Round 2
+ aesenc \TMP1, \XMM2
+ aesenc \TMP1, \XMM3
+ aesenc \TMP1, \XMM4
movdqa \XMM6, \TMP1
pshufd $78, \XMM6, \TMP2
pxor \XMM6, \TMP2
movdqu HashKey_3(%arg2), \TMP5
- PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
+ pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
movaps 0x30(%arg1), \TMP3
- AESENC \TMP3, \XMM1 # Round 3
- AESENC \TMP3, \XMM2
- AESENC \TMP3, \XMM3
- AESENC \TMP3, \XMM4
- PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
+ aesenc \TMP3, \XMM1 # Round 3
+ aesenc \TMP3, \XMM2
+ aesenc \TMP3, \XMM3
+ aesenc \TMP3, \XMM4
+ pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0
movaps 0x40(%arg1), \TMP3
- AESENC \TMP3, \XMM1 # Round 4
- AESENC \TMP3, \XMM2
- AESENC \TMP3, \XMM3
- AESENC \TMP3, \XMM4
+ aesenc \TMP3, \XMM1 # Round 4
+ aesenc \TMP3, \XMM2
+ aesenc \TMP3, \XMM3
+ aesenc \TMP3, \XMM4
movdqu HashKey_3_k(%arg2), \TMP5
- PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
movaps 0x50(%arg1), \TMP3
- AESENC \TMP3, \XMM1 # Round 5
- AESENC \TMP3, \XMM2
- AESENC \TMP3, \XMM3
- AESENC \TMP3, \XMM4
+ aesenc \TMP3, \XMM1 # Round 5
+ aesenc \TMP3, \XMM2
+ aesenc \TMP3, \XMM3
+ aesenc \TMP3, \XMM4
pxor \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
pxor \XMM6, \XMM5
@@ -1266,25 +1265,25 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
# Multiply TMP5 * HashKey using karatsuba
- PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
+ pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
movaps 0x60(%arg1), \TMP3
- AESENC \TMP3, \XMM1 # Round 6
- AESENC \TMP3, \XMM2
- AESENC \TMP3, \XMM3
- AESENC \TMP3, \XMM4
- PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
+ aesenc \TMP3, \XMM1 # Round 6
+ aesenc \TMP3, \XMM2
+ aesenc \TMP3, \XMM3
+ aesenc \TMP3, \XMM4
+ pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0
movaps 0x70(%arg1), \TMP3
- AESENC \TMP3, \XMM1 # Round 7
- AESENC \TMP3, \XMM2
- AESENC \TMP3, \XMM3
- AESENC \TMP3, \XMM4
+ aesenc \TMP3, \XMM1 # Round 7
+ aesenc \TMP3, \XMM2
+ aesenc \TMP3, \XMM3
+ aesenc \TMP3, \XMM4
movdqu HashKey_2_k(%arg2), \TMP5
- PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
movaps 0x80(%arg1), \TMP3
- AESENC \TMP3, \XMM1 # Round 8
- AESENC \TMP3, \XMM2
- AESENC \TMP3, \XMM3
- AESENC \TMP3, \XMM4
+ aesenc \TMP3, \XMM1 # Round 8
+ aesenc \TMP3, \XMM2
+ aesenc \TMP3, \XMM3
+ aesenc \TMP3, \XMM4
pxor \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
pxor \XMM7, \XMM5
@@ -1297,13 +1296,13 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
pshufd $78, \XMM8, \TMP2
pxor \XMM8, \TMP2
movdqu HashKey(%arg2), \TMP5
- PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
+ pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
movaps 0x90(%arg1), \TMP3
- AESENC \TMP3, \XMM1 # Round 9
- AESENC \TMP3, \XMM2
- AESENC \TMP3, \XMM3
- AESENC \TMP3, \XMM4
- PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
+ aesenc \TMP3, \XMM1 # Round 9
+ aesenc \TMP3, \XMM2
+ aesenc \TMP3, \XMM3
+ aesenc \TMP3, \XMM4
+ pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0
lea 0xa0(%arg1),%r10
mov keysize,%eax
shr $2,%eax # 128->4, 192->6, 256->8
@@ -1313,7 +1312,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
aes_loop_par_dec\@:
MOVADQ (%r10),\TMP3
.irpc index, 1234
- AESENC \TMP3, %xmm\index
+ aesenc \TMP3, %xmm\index
.endr
add $16,%r10
sub $1,%eax
@@ -1321,12 +1320,12 @@ aes_loop_par_dec\@:
aes_loop_par_dec_done\@:
MOVADQ (%r10), \TMP3
- AESENCLAST \TMP3, \XMM1 # last round
- AESENCLAST \TMP3, \XMM2
- AESENCLAST \TMP3, \XMM3
- AESENCLAST \TMP3, \XMM4
+ aesenclast \TMP3, \XMM1 # last round
+ aesenclast \TMP3, \XMM2
+ aesenclast \TMP3, \XMM3
+ aesenclast \TMP3, \XMM4
movdqu HashKey_k(%arg2), \TMP5
- PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
movdqu (%arg4,%r11,1), \TMP3
pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
movdqu \XMM1, (%arg3,%r11,1) # Write to plaintext buffer
@@ -1343,10 +1342,10 @@ aes_loop_par_dec_done\@:
pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
movdqu \XMM4, 48(%arg3,%r11,1) # Write to plaintext buffer
movdqa \TMP3, \XMM4
- PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
- PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
- PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
- PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
+ pshufb %xmm15, \XMM1 # perform a 16 byte swap
+ pshufb %xmm15, \XMM2 # perform a 16 byte swap
+ pshufb %xmm15, \XMM3 # perform a 16 byte swap
+ pshufb %xmm15, \XMM4 # perform a 16 byte swap
pxor \TMP4, \TMP1
pxor \XMM8, \XMM5
@@ -1402,10 +1401,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
pshufd $78, \XMM1, \TMP2
pxor \XMM1, \TMP2
movdqu HashKey_4(%arg2), \TMP5
- PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
- PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
+ pclmulqdq $0x11, \TMP5, \TMP6 # TMP6 = a1*b1
+ pclmulqdq $0x00, \TMP5, \XMM1 # XMM1 = a0*b0
movdqu HashKey_4_k(%arg2), \TMP4
- PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
movdqa \XMM1, \XMMDst
movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
@@ -1415,10 +1414,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
pshufd $78, \XMM2, \TMP2
pxor \XMM2, \TMP2
movdqu HashKey_3(%arg2), \TMP5
- PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
- PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
+ pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
+ pclmulqdq $0x00, \TMP5, \XMM2 # XMM2 = a0*b0
movdqu HashKey_3_k(%arg2), \TMP4
- PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
pxor \TMP1, \TMP6
pxor \XMM2, \XMMDst
pxor \TMP2, \XMM1
@@ -1430,10 +1429,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
pshufd $78, \XMM3, \TMP2
pxor \XMM3, \TMP2
movdqu HashKey_2(%arg2), \TMP5
- PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
- PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
+ pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
+ pclmulqdq $0x00, \TMP5, \XMM3 # XMM3 = a0*b0
movdqu HashKey_2_k(%arg2), \TMP4
- PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
pxor \TMP1, \TMP6
pxor \XMM3, \XMMDst
pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
@@ -1443,10 +1442,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
pshufd $78, \XMM4, \TMP2
pxor \XMM4, \TMP2
movdqu HashKey(%arg2), \TMP5
- PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
- PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
+ pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
+ pclmulqdq $0x00, \TMP5, \XMM4 # XMM4 = a0*b0
movdqu HashKey_k(%arg2), \TMP4
- PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
pxor \TMP1, \TMP6
pxor \XMM4, \XMMDst
pxor \XMM1, \TMP2
@@ -1504,13 +1503,13 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
_esb_loop_\@:
MOVADQ (%r10),\TMP1
- AESENC \TMP1,\XMM0
+ aesenc \TMP1,\XMM0
add $16,%r10
sub $1,%eax
jnz _esb_loop_\@
MOVADQ (%r10),\TMP1
- AESENCLAST \TMP1,\XMM0
+ aesenclast \TMP1,\XMM0
.endm
/*****************************************************************************
* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
@@ -1849,72 +1848,72 @@ SYM_FUNC_START(aesni_set_key)
movups 0x10(UKEYP), %xmm2 # other user key
movaps %xmm2, (TKEYP)
add $0x10, TKEYP
- AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
+ aeskeygenassist $0x1, %xmm2, %xmm1 # round 1
call _key_expansion_256a
- AESKEYGENASSIST 0x1 %xmm0 %xmm1
+ aeskeygenassist $0x1, %xmm0, %xmm1
call _key_expansion_256b
- AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
+ aeskeygenassist $0x2, %xmm2, %xmm1 # round 2
call _key_expansion_256a
- AESKEYGENASSIST 0x2 %xmm0 %xmm1
+ aeskeygenassist $0x2, %xmm0, %xmm1
call _key_expansion_256b
- AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
+ aeskeygenassist $0x4, %xmm2, %xmm1 # round 3
call _key_expansion_256a
- AESKEYGENASSIST 0x4 %xmm0 %xmm1
+ aeskeygenassist $0x4, %xmm0, %xmm1
call _key_expansion_256b
- AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
+ aeskeygenassist $0x8, %xmm2, %xmm1 # round 4
call _key_expansion_256a
- AESKEYGENASSIST 0x8 %xmm0 %xmm1
+ aeskeygenassist $0x8, %xmm0, %xmm1
call _key_expansion_256b
- AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
+ aeskeygenassist $0x10, %xmm2, %xmm1 # round 5
call _key_expansion_256a
- AESKEYGENASSIST 0x10 %xmm0 %xmm1
+ aeskeygenassist $0x10, %xmm0, %xmm1
call _key_expansion_256b
- AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
+ aeskeygenassist $0x20, %xmm2, %xmm1 # round 6
call _key_expansion_256a
- AESKEYGENASSIST 0x20 %xmm0 %xmm1
+ aeskeygenassist $0x20, %xmm0, %xmm1
call _key_expansion_256b
- AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
+ aeskeygenassist $0x40, %xmm2, %xmm1 # round 7
call _key_expansion_256a
jmp .Ldec_key
.Lenc_key192:
movq 0x10(UKEYP), %xmm2 # other user key
- AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
+ aeskeygenassist $0x1, %xmm2, %xmm1 # round 1
call _key_expansion_192a
- AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
+ aeskeygenassist $0x2, %xmm2, %xmm1 # round 2
call _key_expansion_192b
- AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
+ aeskeygenassist $0x4, %xmm2, %xmm1 # round 3
call _key_expansion_192a
- AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
+ aeskeygenassist $0x8, %xmm2, %xmm1 # round 4
call _key_expansion_192b
- AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
+ aeskeygenassist $0x10, %xmm2, %xmm1 # round 5
call _key_expansion_192a
- AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
+ aeskeygenassist $0x20, %xmm2, %xmm1 # round 6
call _key_expansion_192b
- AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
+ aeskeygenassist $0x40, %xmm2, %xmm1 # round 7
call _key_expansion_192a
- AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
+ aeskeygenassist $0x80, %xmm2, %xmm1 # round 8
call _key_expansion_192b
jmp .Ldec_key
.Lenc_key128:
- AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
+ aeskeygenassist $0x1, %xmm0, %xmm1 # round 1
call _key_expansion_128
- AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
+ aeskeygenassist $0x2, %xmm0, %xmm1 # round 2
call _key_expansion_128
- AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
+ aeskeygenassist $0x4, %xmm0, %xmm1 # round 3
call _key_expansion_128
- AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
+ aeskeygenassist $0x8, %xmm0, %xmm1 # round 4
call _key_expansion_128
- AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
+ aeskeygenassist $0x10, %xmm0, %xmm1 # round 5
call _key_expansion_128
- AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
+ aeskeygenassist $0x20, %xmm0, %xmm1 # round 6
call _key_expansion_128
- AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
+ aeskeygenassist $0x40, %xmm0, %xmm1 # round 7
call _key_expansion_128
- AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
+ aeskeygenassist $0x80, %xmm0, %xmm1 # round 8
call _key_expansion_128
- AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
+ aeskeygenassist $0x1b, %xmm0, %xmm1 # round 9
call _key_expansion_128
- AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
+ aeskeygenassist $0x36, %xmm0, %xmm1 # round 10
call _key_expansion_128
.Ldec_key:
sub $0x10, TKEYP
@@ -1927,7 +1926,7 @@ SYM_FUNC_START(aesni_set_key)
.align 4
.Ldec_key_loop:
movaps (KEYP), %xmm0
- AESIMC %xmm0 %xmm1
+ aesimc %xmm0, %xmm1
movaps %xmm1, (UKEYP)
add $0x10, KEYP
sub $0x10, UKEYP
@@ -1988,37 +1987,37 @@ SYM_FUNC_START_LOCAL(_aesni_enc1)
je .Lenc192
add $0x20, TKEYP
movaps -0x60(TKEYP), KEY
- AESENC KEY STATE
+ aesenc KEY, STATE
movaps -0x50(TKEYP), KEY
- AESENC KEY STATE
+ aesenc KEY, STATE
.align 4
.Lenc192:
movaps -0x40(TKEYP), KEY
- AESENC KEY STATE
+ aesenc KEY, STATE
movaps -0x30(TKEYP), KEY
- AESENC KEY STATE
+ aesenc KEY, STATE
.align 4
.Lenc128:
movaps -0x20(TKEYP), KEY
- AESENC KEY STATE
+ aesenc KEY, STATE
movaps -0x10(TKEYP), KEY
- AESENC KEY STATE
+ aesenc KEY, STATE
movaps (TKEYP), KEY
- AESENC KEY STATE
+ aesenc KEY, STATE
movaps 0x10(TKEYP), KEY
- AESENC KEY STATE
+ aesenc KEY, STATE
movaps 0x20(TKEYP), KEY
- AESENC KEY STATE
+ aesenc KEY, STATE
movaps 0x30(TKEYP), KEY
- AESENC KEY STATE
+ aesenc KEY, STATE
movaps 0x40(TKEYP), KEY
- AESENC KEY STATE
+ aesenc KEY, STATE
movaps 0x50(TKEYP), KEY
- AESENC KEY STATE
+ aesenc KEY, STATE
movaps 0x60(TKEYP), KEY
- AESENC KEY STATE
+ aesenc KEY, STATE
movaps 0x70(TKEYP), KEY
- AESENCLAST KEY STATE
+ aesenclast KEY, STATE
ret
SYM_FUNC_END(_aesni_enc1)
@@ -2054,79 +2053,79 @@ SYM_FUNC_START_LOCAL(_aesni_enc4)
je .L4enc192
add $0x20, TKEYP
movaps -0x60(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
+ aesenc KEY, STATE1
+ aesenc KEY, STATE2
+ aesenc KEY, STATE3
+ aesenc KEY, STATE4
movaps -0x50(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
+ aesenc KEY, STATE1
+ aesenc KEY, STATE2
+ aesenc KEY, STATE3
+ aesenc KEY, STATE4
#.align 4
.L4enc192:
movaps -0x40(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
+ aesenc KEY, STATE1
+ aesenc KEY, STATE2
+ aesenc KEY, STATE3
+ aesenc KEY, STATE4
movaps -0x30(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
+ aesenc KEY, STATE1
+ aesenc KEY, STATE2
+ aesenc KEY, STATE3
+ aesenc KEY, STATE4
#.align 4
.L4enc128:
movaps -0x20(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
+ aesenc KEY, STATE1
+ aesenc KEY, STATE2
+ aesenc KEY, STATE3
+ aesenc KEY, STATE4
movaps -0x10(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
+ aesenc KEY, STATE1
+ aesenc KEY, STATE2
+ aesenc KEY, STATE3
+ aesenc KEY, STATE4
movaps (TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
+ aesenc KEY, STATE1
+ aesenc KEY, STATE2
+ aesenc KEY, STATE3
+ aesenc KEY, STATE4
movaps 0x10(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
+ aesenc KEY, STATE1
+ aesenc KEY, STATE2
+ aesenc KEY, STATE3
+ aesenc KEY, STATE4
movaps 0x20(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
+ aesenc KEY, STATE1
+ aesenc KEY, STATE2
+ aesenc KEY, STATE3
+ aesenc KEY, STATE4
movaps 0x30(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
+ aesenc KEY, STATE1
+ aesenc KEY, STATE2
+ aesenc KEY, STATE3
+ aesenc KEY, STATE4
movaps 0x40(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
+ aesenc KEY, STATE1
+ aesenc KEY, STATE2
+ aesenc KEY, STATE3
+ aesenc KEY, STATE4
movaps 0x50(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
+ aesenc KEY, STATE1
+ aesenc KEY, STATE2
+ aesenc KEY, STATE3
+ aesenc KEY, STATE4
movaps 0x60(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
+ aesenc KEY, STATE1
+ aesenc KEY, STATE2
+ aesenc KEY, STATE3
+ aesenc KEY, STATE4
movaps 0x70(TKEYP), KEY
- AESENCLAST KEY STATE1 # last round
- AESENCLAST KEY STATE2
- AESENCLAST KEY STATE3
- AESENCLAST KEY STATE4
+ aesenclast KEY, STATE1 # last round
+ aesenclast KEY, STATE2
+ aesenclast KEY, STATE3
+ aesenclast KEY, STATE4
ret
SYM_FUNC_END(_aesni_enc4)
@@ -2178,37 +2177,37 @@ SYM_FUNC_START_LOCAL(_aesni_dec1)
je .Ldec192
add $0x20, TKEYP
movaps -0x60(TKEYP), KEY
- AESDEC KEY STATE
+ aesdec KEY, STATE
movaps -0x50(TKEYP), KEY
- AESDEC KEY STATE
+ aesdec KEY, STATE
.align 4
.Ldec192:
movaps -0x40(TKEYP), KEY
- AESDEC KEY STATE
+ aesdec KEY, STATE
movaps -0x30(TKEYP), KEY
- AESDEC KEY STATE
+ aesdec KEY, STATE
.align 4
.Ldec128:
movaps -0x20(TKEYP), KEY
- AESDEC KEY STATE
+ aesdec KEY, STATE
movaps -0x10(TKEYP), KEY
- AESDEC KEY STATE
+ aesdec KEY, STATE
movaps (TKEYP), KEY
- AESDEC KEY STATE
+ aesdec KEY, STATE
movaps 0x10(TKEYP), KEY
- AESDEC KEY STATE
+ aesdec KEY, STATE
movaps 0x20(TKEYP), KEY
- AESDEC KEY STATE
+ aesdec KEY, STATE
movaps 0x30(TKEYP), KEY
- AESDEC KEY STATE
+ aesdec KEY, STATE
movaps 0x40(TKEYP), KEY
- AESDEC KEY STATE
+ aesdec KEY, STATE
movaps 0x50(TKEYP), KEY
- AESDEC KEY STATE
+ aesdec KEY, STATE
movaps 0x60(TKEYP), KEY
- AESDEC KEY STATE
+ aesdec KEY, STATE
movaps 0x70(TKEYP), KEY
- AESDECLAST KEY STATE
+ aesdeclast KEY, STATE
ret
SYM_FUNC_END(_aesni_dec1)
@@ -2244,79 +2243,79 @@ SYM_FUNC_START_LOCAL(_aesni_dec4)
je .L4dec192
add $0x20, TKEYP
movaps -0x60(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
+ aesdec KEY, STATE1
+ aesdec KEY, STATE2
+ aesdec KEY, STATE3
+ aesdec KEY, STATE4
movaps -0x50(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
+ aesdec KEY, STATE1
+ aesdec KEY, STATE2
+ aesdec KEY, STATE3
+ aesdec KEY, STATE4
.align 4
.L4dec192:
movaps -0x40(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
+ aesdec KEY, STATE1
+ aesdec KEY, STATE2
+ aesdec KEY, STATE3
+ aesdec KEY, STATE4
movaps -0x30(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
+ aesdec KEY, STATE1
+ aesdec KEY, STATE2
+ aesdec KEY, STATE3
+ aesdec KEY, STATE4
.align 4
.L4dec128:
movaps -0x20(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
+ aesdec KEY, STATE1
+ aesdec KEY, STATE2
+ aesdec KEY, STATE3
+ aesdec KEY, STATE4
movaps -0x10(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
+ aesdec KEY, STATE1
+ aesdec KEY, STATE2
+ aesdec KEY, STATE3
+ aesdec KEY, STATE4
movaps (TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
+ aesdec KEY, STATE1
+ aesdec KEY, STATE2
+ aesdec KEY, STATE3
+ aesdec KEY, STATE4
movaps 0x10(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
+ aesdec KEY, STATE1
+ aesdec KEY, STATE2
+ aesdec KEY, STATE3
+ aesdec KEY, STATE4
movaps 0x20(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
+ aesdec KEY, STATE1
+ aesdec KEY, STATE2
+ aesdec KEY, STATE3
+ aesdec KEY, STATE4
movaps 0x30(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
+ aesdec KEY, STATE1
+ aesdec KEY, STATE2
+ aesdec KEY, STATE3
+ aesdec KEY, STATE4
movaps 0x40(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
+ aesdec KEY, STATE1
+ aesdec KEY, STATE2
+ aesdec KEY, STATE3
+ aesdec KEY, STATE4
movaps 0x50(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
+ aesdec KEY, STATE1
+ aesdec KEY, STATE2
+ aesdec KEY, STATE3
+ aesdec KEY, STATE4
movaps 0x60(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
+ aesdec KEY, STATE1
+ aesdec KEY, STATE2
+ aesdec KEY, STATE3
+ aesdec KEY, STATE4
movaps 0x70(TKEYP), KEY
- AESDECLAST KEY STATE1 # last round
- AESDECLAST KEY STATE2
- AESDECLAST KEY STATE3
- AESDECLAST KEY STATE4
+ aesdeclast KEY, STATE1 # last round
+ aesdeclast KEY, STATE2
+ aesdeclast KEY, STATE3
+ aesdeclast KEY, STATE4
ret
SYM_FUNC_END(_aesni_dec4)
@@ -2599,10 +2598,10 @@ SYM_FUNC_END(aesni_cbc_dec)
SYM_FUNC_START_LOCAL(_aesni_inc_init)
movaps .Lbswap_mask, BSWAP_MASK
movaps IV, CTR
- PSHUFB_XMM BSWAP_MASK CTR
+ pshufb BSWAP_MASK, CTR
mov $1, TCTR_LOW
- MOVQ_R64_XMM TCTR_LOW INC
- MOVQ_R64_XMM CTR TCTR_LOW
+ movq TCTR_LOW, INC
+ movq CTR, TCTR_LOW
ret
SYM_FUNC_END(_aesni_inc_init)
@@ -2630,7 +2629,7 @@ SYM_FUNC_START_LOCAL(_aesni_inc)
psrldq $8, INC
.Linc_low:
movaps CTR, IV
- PSHUFB_XMM BSWAP_MASK IV
+ pshufb BSWAP_MASK, IV
ret
SYM_FUNC_END(_aesni_inc)
diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
index 0cea33295287..5fee47956f3b 100644
--- a/arch/x86/crypto/aesni-intel_avx-x86_64.S
+++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -120,7 +120,6 @@
##
#include <linux/linkage.h>
-#include <asm/inst.h>
# constants in mergeable sections, linker can reorder and merge
.section .rodata.cst16.POLY, "aM", @progbits, 16
diff --git a/arch/x86/crypto/chacha-ssse3-x86_64.S b/arch/x86/crypto/chacha-ssse3-x86_64.S
index a38ab2512a6f..ca1788bfee16 100644
--- a/arch/x86/crypto/chacha-ssse3-x86_64.S
+++ b/arch/x86/crypto/chacha-ssse3-x86_64.S
@@ -120,10 +120,10 @@ SYM_FUNC_START(chacha_block_xor_ssse3)
FRAME_BEGIN
# x0..3 = s0..3
- movdqa 0x00(%rdi),%xmm0
- movdqa 0x10(%rdi),%xmm1
- movdqa 0x20(%rdi),%xmm2
- movdqa 0x30(%rdi),%xmm3
+ movdqu 0x00(%rdi),%xmm0
+ movdqu 0x10(%rdi),%xmm1
+ movdqu 0x20(%rdi),%xmm2
+ movdqu 0x30(%rdi),%xmm3
movdqa %xmm0,%xmm8
movdqa %xmm1,%xmm9
movdqa %xmm2,%xmm10
@@ -205,10 +205,10 @@ SYM_FUNC_START(hchacha_block_ssse3)
# %edx: nrounds
FRAME_BEGIN
- movdqa 0x00(%rdi),%xmm0
- movdqa 0x10(%rdi),%xmm1
- movdqa 0x20(%rdi),%xmm2
- movdqa 0x30(%rdi),%xmm3
+ movdqu 0x00(%rdi),%xmm0
+ movdqu 0x10(%rdi),%xmm1
+ movdqu 0x20(%rdi),%xmm2
+ movdqu 0x30(%rdi),%xmm3
mov %edx,%r8d
call chacha_permute
diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c
index 22250091cdbe..e67a59130025 100644
--- a/arch/x86/crypto/chacha_glue.c
+++ b/arch/x86/crypto/chacha_glue.c
@@ -14,8 +14,6 @@
#include <linux/module.h>
#include <asm/simd.h>
-#define CHACHA_STATE_ALIGN 16
-
asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
@@ -124,8 +122,6 @@ static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src,
void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
{
- state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
-
if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable()) {
hchacha_block_generic(state, stream, nrounds);
} else {
@@ -138,8 +134,6 @@ EXPORT_SYMBOL(hchacha_block_arch);
void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
{
- state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
-
chacha_init_generic(state, key, iv);
}
EXPORT_SYMBOL(chacha_init_arch);
@@ -147,8 +141,6 @@ EXPORT_SYMBOL(chacha_init_arch);
void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
int nrounds)
{
- state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
-
if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable() ||
bytes <= CHACHA_BLOCK_SIZE)
return chacha_crypt_generic(state, dst, src, bytes, nrounds);
@@ -170,15 +162,12 @@ EXPORT_SYMBOL(chacha_crypt_arch);
static int chacha_simd_stream_xor(struct skcipher_request *req,
const struct chacha_ctx *ctx, const u8 *iv)
{
- u32 *state, state_buf[16 + 2] __aligned(8);
+ u32 state[CHACHA_STATE_WORDS] __aligned(8);
struct skcipher_walk walk;
int err;
err = skcipher_walk_virt(&walk, req, false);
- BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
- state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
-
chacha_init_generic(state, ctx->key, iv);
while (walk.nbytes > 0) {
@@ -217,12 +206,10 @@ static int xchacha_simd(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
- u32 *state, state_buf[16 + 2] __aligned(8);
+ u32 state[CHACHA_STATE_WORDS] __aligned(8);
struct chacha_ctx subctx;
u8 real_iv[16];
- BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
- state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
chacha_init_generic(state, ctx->key, req->iv);
if (req->cryptlen > CHACHA_BLOCK_SIZE && crypto_simd_usable()) {
diff --git a/arch/x86/crypto/crc32-pclmul_asm.S b/arch/x86/crypto/crc32-pclmul_asm.S
index 9fd28ff65bc2..6e7d4c4d3208 100644
--- a/arch/x86/crypto/crc32-pclmul_asm.S
+++ b/arch/x86/crypto/crc32-pclmul_asm.S
@@ -38,7 +38,6 @@
*/
#include <linux/linkage.h>
-#include <asm/inst.h>
.section .rodata
@@ -129,17 +128,17 @@ loop_64:/* 64 bytes Full cache line folding */
#ifdef __x86_64__
movdqa %xmm4, %xmm8
#endif
- PCLMULQDQ 00, CONSTANT, %xmm1
- PCLMULQDQ 00, CONSTANT, %xmm2
- PCLMULQDQ 00, CONSTANT, %xmm3
+ pclmulqdq $0x00, CONSTANT, %xmm1
+ pclmulqdq $0x00, CONSTANT, %xmm2
+ pclmulqdq $0x00, CONSTANT, %xmm3
#ifdef __x86_64__
- PCLMULQDQ 00, CONSTANT, %xmm4
+ pclmulqdq $0x00, CONSTANT, %xmm4
#endif
- PCLMULQDQ 0x11, CONSTANT, %xmm5
- PCLMULQDQ 0x11, CONSTANT, %xmm6
- PCLMULQDQ 0x11, CONSTANT, %xmm7
+ pclmulqdq $0x11, CONSTANT, %xmm5
+ pclmulqdq $0x11, CONSTANT, %xmm6
+ pclmulqdq $0x11, CONSTANT, %xmm7
#ifdef __x86_64__
- PCLMULQDQ 0x11, CONSTANT, %xmm8
+ pclmulqdq $0x11, CONSTANT, %xmm8
#endif
pxor %xmm5, %xmm1
pxor %xmm6, %xmm2
@@ -149,8 +148,8 @@ loop_64:/* 64 bytes Full cache line folding */
#else
/* xmm8 unsupported for x32 */
movdqa %xmm4, %xmm5
- PCLMULQDQ 00, CONSTANT, %xmm4
- PCLMULQDQ 0x11, CONSTANT, %xmm5
+ pclmulqdq $0x00, CONSTANT, %xmm4
+ pclmulqdq $0x11, CONSTANT, %xmm5
pxor %xmm5, %xmm4
#endif
@@ -172,20 +171,20 @@ less_64:/* Folding cache line into 128bit */
prefetchnta (BUF)
movdqa %xmm1, %xmm5
- PCLMULQDQ 0x00, CONSTANT, %xmm1
- PCLMULQDQ 0x11, CONSTANT, %xmm5
+ pclmulqdq $0x00, CONSTANT, %xmm1
+ pclmulqdq $0x11, CONSTANT, %xmm5
pxor %xmm5, %xmm1
pxor %xmm2, %xmm1
movdqa %xmm1, %xmm5
- PCLMULQDQ 0x00, CONSTANT, %xmm1
- PCLMULQDQ 0x11, CONSTANT, %xmm5
+ pclmulqdq $0x00, CONSTANT, %xmm1
+ pclmulqdq $0x11, CONSTANT, %xmm5
pxor %xmm5, %xmm1
pxor %xmm3, %xmm1
movdqa %xmm1, %xmm5
- PCLMULQDQ 0x00, CONSTANT, %xmm1
- PCLMULQDQ 0x11, CONSTANT, %xmm5
+ pclmulqdq $0x00, CONSTANT, %xmm1
+ pclmulqdq $0x11, CONSTANT, %xmm5
pxor %xmm5, %xmm1
pxor %xmm4, %xmm1
@@ -193,8 +192,8 @@ less_64:/* Folding cache line into 128bit */
jb fold_64
loop_16:/* Folding rest buffer into 128bit */
movdqa %xmm1, %xmm5
- PCLMULQDQ 0x00, CONSTANT, %xmm1
- PCLMULQDQ 0x11, CONSTANT, %xmm5
+ pclmulqdq $0x00, CONSTANT, %xmm1
+ pclmulqdq $0x11, CONSTANT, %xmm5
pxor %xmm5, %xmm1
pxor (BUF), %xmm1
sub $0x10, LEN
@@ -205,7 +204,7 @@ loop_16:/* Folding rest buffer into 128bit */
fold_64:
/* perform the last 64 bit fold, also adds 32 zeroes
* to the input stream */
- PCLMULQDQ 0x01, %xmm1, CONSTANT /* R4 * xmm1.low */
+ pclmulqdq $0x01, %xmm1, CONSTANT /* R4 * xmm1.low */
psrldq $0x08, %xmm1
pxor CONSTANT, %xmm1
@@ -220,7 +219,7 @@ fold_64:
#endif
psrldq $0x04, %xmm2
pand %xmm3, %xmm1
- PCLMULQDQ 0x00, CONSTANT, %xmm1
+ pclmulqdq $0x00, CONSTANT, %xmm1
pxor %xmm2, %xmm1
/* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
@@ -231,11 +230,11 @@ fold_64:
#endif
movdqa %xmm1, %xmm2
pand %xmm3, %xmm1
- PCLMULQDQ 0x10, CONSTANT, %xmm1
+ pclmulqdq $0x10, CONSTANT, %xmm1
pand %xmm3, %xmm1
- PCLMULQDQ 0x00, CONSTANT, %xmm1
+ pclmulqdq $0x00, CONSTANT, %xmm1
pxor %xmm2, %xmm1
- PEXTRD 0x01, %xmm1, %eax
+ pextrd $0x01, %xmm1, %eax
ret
SYM_FUNC_END(crc32_pclmul_le_16)
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index 8501ec4532f4..884dc767b051 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -43,7 +43,6 @@
* SOFTWARE.
*/
-#include <asm/inst.h>
#include <linux/linkage.h>
#include <asm/nospec-branch.h>
@@ -170,7 +169,7 @@ continue_block:
## branch into array
lea jump_table(%rip), %bufp
- movzxw (%bufp, %rax, 2), len
+ movzwq (%bufp, %rax, 2), len
lea crc_array(%rip), %bufp
lea (%bufp, len, 1), %bufp
JMP_NOSPEC bufp
@@ -225,10 +224,10 @@ LABEL crc_ %i
subq %rax, tmp # tmp -= rax*24
movq crc_init, %xmm1 # CRC for block 1
- PCLMULQDQ 0x00,%xmm0,%xmm1 # Multiply by K2
+ pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2
movq crc1, %xmm2 # CRC for block 2
- PCLMULQDQ 0x10, %xmm0, %xmm2 # Multiply by K1
+ pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1
pxor %xmm2,%xmm1
movq %xmm1, %rax
diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c
index 8a17621f7d3a..8acbb6584a37 100644
--- a/arch/x86/crypto/curve25519-x86_64.c
+++ b/arch/x86/crypto/curve25519-x86_64.c
@@ -948,10 +948,8 @@ static void store_felem(u64 *b, u64 *f)
{
u64 f30 = f[3U];
u64 top_bit0 = f30 >> (u32)63U;
- u64 carry0;
u64 f31;
u64 top_bit;
- u64 carry;
u64 f0;
u64 f1;
u64 f2;
@@ -970,11 +968,11 @@ static void store_felem(u64 *b, u64 *f)
u64 o2;
u64 o3;
f[3U] = f30 & (u64)0x7fffffffffffffffU;
- carry0 = add_scalar(f, f, (u64)19U * top_bit0);
+ add_scalar(f, f, (u64)19U * top_bit0);
f31 = f[3U];
top_bit = f31 >> (u32)63U;
f[3U] = f31 & (u64)0x7fffffffffffffffU;
- carry = add_scalar(f, f, (u64)19U * top_bit);
+ add_scalar(f, f, (u64)19U * top_bit);
f0 = f[0U];
f1 = f[1U];
f2 = f[2U];
diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S
index bb9735fbb865..99ac25e18e09 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_asm.S
+++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S
@@ -14,7 +14,6 @@
*/
#include <linux/linkage.h>
-#include <asm/inst.h>
#include <asm/frame.h>
.section .rodata.cst16.bswap_mask, "aM", @progbits, 16
@@ -51,9 +50,9 @@ SYM_FUNC_START_LOCAL(__clmul_gf128mul_ble)
pxor DATA, T2
pxor SHASH, T3
- PCLMULQDQ 0x00 SHASH DATA # DATA = a0 * b0
- PCLMULQDQ 0x11 SHASH T1 # T1 = a1 * b1
- PCLMULQDQ 0x00 T3 T2 # T2 = (a1 + a0) * (b1 + b0)
+ pclmulqdq $0x00, SHASH, DATA # DATA = a0 * b0
+ pclmulqdq $0x11, SHASH, T1 # T1 = a1 * b1
+ pclmulqdq $0x00, T3, T2 # T2 = (a1 + a0) * (b1 + b0)
pxor DATA, T2
pxor T1, T2 # T2 = a0 * b1 + a1 * b0
@@ -95,9 +94,9 @@ SYM_FUNC_START(clmul_ghash_mul)
movups (%rdi), DATA
movups (%rsi), SHASH
movaps .Lbswap_mask, BSWAP
- PSHUFB_XMM BSWAP DATA
+ pshufb BSWAP, DATA
call __clmul_gf128mul_ble
- PSHUFB_XMM BSWAP DATA
+ pshufb BSWAP, DATA
movups DATA, (%rdi)
FRAME_END
ret
@@ -114,18 +113,18 @@ SYM_FUNC_START(clmul_ghash_update)
movaps .Lbswap_mask, BSWAP
movups (%rdi), DATA
movups (%rcx), SHASH
- PSHUFB_XMM BSWAP DATA
+ pshufb BSWAP, DATA
.align 4
.Lupdate_loop:
movups (%rsi), IN1
- PSHUFB_XMM BSWAP IN1
+ pshufb BSWAP, IN1
pxor IN1, DATA
call __clmul_gf128mul_ble
sub $16, %rdx
add $16, %rsi
cmp $16, %rdx
jge .Lupdate_loop
- PSHUFB_XMM BSWAP DATA
+ pshufb BSWAP, DATA
movups DATA, (%rdi)
.Lupdate_just_ret:
FRAME_END
diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h
index f5a796da07f8..438ccd4f3cc4 100644
--- a/arch/x86/include/asm/inst.h
+++ b/arch/x86/include/asm/inst.h
@@ -12,7 +12,6 @@
#define REG_TYPE_R32 0
#define REG_TYPE_R64 1
-#define REG_TYPE_XMM 2
#define REG_TYPE_INVALID 100
.macro R32_NUM opd r32
@@ -123,77 +122,18 @@
#endif
.endm
- .macro XMM_NUM opd xmm
- \opd = REG_NUM_INVALID
- .ifc \xmm,%xmm0
- \opd = 0
- .endif
- .ifc \xmm,%xmm1
- \opd = 1
- .endif
- .ifc \xmm,%xmm2
- \opd = 2
- .endif
- .ifc \xmm,%xmm3
- \opd = 3
- .endif
- .ifc \xmm,%xmm4
- \opd = 4
- .endif
- .ifc \xmm,%xmm5
- \opd = 5
- .endif
- .ifc \xmm,%xmm6
- \opd = 6
- .endif
- .ifc \xmm,%xmm7
- \opd = 7
- .endif
- .ifc \xmm,%xmm8
- \opd = 8
- .endif
- .ifc \xmm,%xmm9
- \opd = 9
- .endif
- .ifc \xmm,%xmm10
- \opd = 10
- .endif
- .ifc \xmm,%xmm11
- \opd = 11
- .endif
- .ifc \xmm,%xmm12
- \opd = 12
- .endif
- .ifc \xmm,%xmm13
- \opd = 13
- .endif
- .ifc \xmm,%xmm14
- \opd = 14
- .endif
- .ifc \xmm,%xmm15
- \opd = 15
- .endif
- .endm
-
.macro REG_TYPE type reg
R32_NUM reg_type_r32 \reg
R64_NUM reg_type_r64 \reg
- XMM_NUM reg_type_xmm \reg
.if reg_type_r64 <> REG_NUM_INVALID
\type = REG_TYPE_R64
.elseif reg_type_r32 <> REG_NUM_INVALID
\type = REG_TYPE_R32
- .elseif reg_type_xmm <> REG_NUM_INVALID
- \type = REG_TYPE_XMM
.else
\type = REG_TYPE_INVALID
.endif
.endm
- .macro PFX_OPD_SIZE
- .byte 0x66
- .endm
-
.macro PFX_REX opd1 opd2 W=0
.if ((\opd1 | \opd2) & 8) || \W
.byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1) | (\W << 3)
@@ -203,109 +143,6 @@
.macro MODRM mod opd1 opd2
.byte \mod | (\opd1 & 7) | ((\opd2 & 7) << 3)
.endm
-
- .macro PSHUFB_XMM xmm1 xmm2
- XMM_NUM pshufb_opd1 \xmm1
- XMM_NUM pshufb_opd2 \xmm2
- PFX_OPD_SIZE
- PFX_REX pshufb_opd1 pshufb_opd2
- .byte 0x0f, 0x38, 0x00
- MODRM 0xc0 pshufb_opd1 pshufb_opd2
- .endm
-
- .macro PCLMULQDQ imm8 xmm1 xmm2
- XMM_NUM clmul_opd1 \xmm1
- XMM_NUM clmul_opd2 \xmm2
- PFX_OPD_SIZE
- PFX_REX clmul_opd1 clmul_opd2
- .byte 0x0f, 0x3a, 0x44
- MODRM 0xc0 clmul_opd1 clmul_opd2
- .byte \imm8
- .endm
-
- .macro PEXTRD imm8 xmm gpr
- R32_NUM extrd_opd1 \gpr
- XMM_NUM extrd_opd2 \xmm
- PFX_OPD_SIZE
- PFX_REX extrd_opd1 extrd_opd2
- .byte 0x0f, 0x3a, 0x16
- MODRM 0xc0 extrd_opd1 extrd_opd2
- .byte \imm8
- .endm
-
- .macro AESKEYGENASSIST rcon xmm1 xmm2
- XMM_NUM aeskeygen_opd1 \xmm1
- XMM_NUM aeskeygen_opd2 \xmm2
- PFX_OPD_SIZE
- PFX_REX aeskeygen_opd1 aeskeygen_opd2
- .byte 0x0f, 0x3a, 0xdf
- MODRM 0xc0 aeskeygen_opd1 aeskeygen_opd2
- .byte \rcon
- .endm
-
- .macro AESIMC xmm1 xmm2
- XMM_NUM aesimc_opd1 \xmm1
- XMM_NUM aesimc_opd2 \xmm2
- PFX_OPD_SIZE
- PFX_REX aesimc_opd1 aesimc_opd2
- .byte 0x0f, 0x38, 0xdb
- MODRM 0xc0 aesimc_opd1 aesimc_opd2
- .endm
-
- .macro AESENC xmm1 xmm2
- XMM_NUM aesenc_opd1 \xmm1
- XMM_NUM aesenc_opd2 \xmm2
- PFX_OPD_SIZE
- PFX_REX aesenc_opd1 aesenc_opd2
- .byte 0x0f, 0x38, 0xdc
- MODRM 0xc0 aesenc_opd1 aesenc_opd2
- .endm
-
- .macro AESENCLAST xmm1 xmm2
- XMM_NUM aesenclast_opd1 \xmm1
- XMM_NUM aesenclast_opd2 \xmm2
- PFX_OPD_SIZE
- PFX_REX aesenclast_opd1 aesenclast_opd2
- .byte 0x0f, 0x38, 0xdd
- MODRM 0xc0 aesenclast_opd1 aesenclast_opd2
- .endm
-
- .macro AESDEC xmm1 xmm2
- XMM_NUM aesdec_opd1 \xmm1
- XMM_NUM aesdec_opd2 \xmm2
- PFX_OPD_SIZE
- PFX_REX aesdec_opd1 aesdec_opd2
- .byte 0x0f, 0x38, 0xde
- MODRM 0xc0 aesdec_opd1 aesdec_opd2
- .endm
-
- .macro AESDECLAST xmm1 xmm2
- XMM_NUM aesdeclast_opd1 \xmm1
- XMM_NUM aesdeclast_opd2 \xmm2
- PFX_OPD_SIZE
- PFX_REX aesdeclast_opd1 aesdeclast_opd2
- .byte 0x0f, 0x38, 0xdf
- MODRM 0xc0 aesdeclast_opd1 aesdeclast_opd2
- .endm
-
- .macro MOVQ_R64_XMM opd1 opd2
- REG_TYPE movq_r64_xmm_opd1_type \opd1
- .if movq_r64_xmm_opd1_type == REG_TYPE_XMM
- XMM_NUM movq_r64_xmm_opd1 \opd1
- R64_NUM movq_r64_xmm_opd2 \opd2
- .else
- R64_NUM movq_r64_xmm_opd1 \opd1
- XMM_NUM movq_r64_xmm_opd2 \opd2
- .endif
- PFX_OPD_SIZE
- PFX_REX movq_r64_xmm_opd1 movq_r64_xmm_opd2 1
- .if movq_r64_xmm_opd1_type == REG_TYPE_XMM
- .byte 0x0f, 0x7e
- .else
- .byte 0x0f, 0x6e
- .endif
- MODRM 0xc0 movq_r64_xmm_opd1 movq_r64_xmm_opd2
- .endm
#endif
#endif