Diffstat (limited to 'arch/x86/crypto/aesni-intel_asm.S')
-rw-r--r--  arch/x86/crypto/aesni-intel_asm.S  1970
1 file changed, 85 insertions(+), 1885 deletions(-)
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 411d8c83e88a..eb153eff9331 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -10,16 +10,7 @@
* Vinodh Gopal <vinodh.gopal@intel.com>
* Kahraman Akdemir
*
- * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
- * interface for 64-bit kernels.
- * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
- * Aidan O'Mahony (aidan.o.mahony@intel.com)
- * Adrian Hoban <adrian.hoban@intel.com>
- * James Guilford (james.guilford@intel.com)
- * Gabriele Paoloni <gabriele.paoloni@intel.com>
- * Tadeusz Struk (tadeusz.struk@intel.com)
- * Wajdi Feghali (wajdi.k.feghali@intel.com)
- * Copyright (c) 2010, Intel Corporation.
+ * Copyright (c) 2010, Intel Corporation.
*
* Ported x86_64 version to x86:
* Author: Mathias Krause <minipli@googlemail.com>
@@ -27,103 +18,6 @@
#include <linux/linkage.h>
#include <asm/frame.h>
-#include <asm/nospec-branch.h>
-
-/*
- * The following macros are used to move an (un)aligned 16 byte value to/from
- * an XMM register. This can be done for either FP or integer values: for FP,
- * use movaps (move aligned packed single); for integer, use movdqa (move
- * double quad aligned). There has been no performance difference between the
- * two instructions since Nehalem (the original Core i7) was released.
- * However, movaps has a one-byte-shorter encoding, so that is the one we'll
- * use for now (same for the unaligned variants).
- */
-#define MOVADQ movaps
-#define MOVUDQ movups
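
For illustration, the same aligned load written with compiler intrinsics — a
minimal sketch (not from this file) showing that the FP and integer forms
fetch identical bytes and differ only in instruction encoding:

    #include <immintrin.h>

    /* movaps: aligned 16-byte load through the FP domain */
    static __m128i load16_fp(const float *p)
    {
            return _mm_castps_si128(_mm_load_ps(p)); /* cast emits no code */
    }

    /* movdqa: the same 16 bytes through the integer domain */
    static __m128i load16_int(const __m128i *p)
    {
            return _mm_load_si128(p);
    }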
-
-#ifdef __x86_64__
-
-# constants in mergeable sections, linker can reorder and merge
-.section .rodata.cst16.POLY, "aM", @progbits, 16
-.align 16
-POLY: .octa 0xC2000000000000000000000000000001
-.section .rodata.cst16.TWOONE, "aM", @progbits, 16
-.align 16
-TWOONE: .octa 0x00000001000000000000000000000001
-
-.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
-.align 16
-SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
-.section .rodata.cst16.MASK1, "aM", @progbits, 16
-.align 16
-MASK1: .octa 0x0000000000000000ffffffffffffffff
-.section .rodata.cst16.MASK2, "aM", @progbits, 16
-.align 16
-MASK2: .octa 0xffffffffffffffff0000000000000000
-.section .rodata.cst16.ONE, "aM", @progbits, 16
-.align 16
-ONE: .octa 0x00000000000000000000000000000001
-.section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
-.align 16
-F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
-.section .rodata.cst16.dec, "aM", @progbits, 16
-.align 16
-dec: .octa 0x1
-.section .rodata.cst16.enc, "aM", @progbits, 16
-.align 16
-enc: .octa 0x2
-
-# order of these constants should not change.
-# more specifically, ALL_F should follow SHIFT_MASK,
-# and zero should follow ALL_F
-.section .rodata, "a", @progbits
-.align 16
-SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
-ALL_F: .octa 0xffffffffffffffffffffffffffffffff
- .octa 0x00000000000000000000000000000000
-
-.text
-
-
-#define STACK_OFFSET 8*3
-
-#define AadHash 16*0
-#define AadLen 16*1
-#define InLen (16*1)+8
-#define PBlockEncKey 16*2
-#define OrigIV 16*3
-#define CurCount 16*4
-#define PBlockLen 16*5
-#define HashKey 16*6 // store HashKey <<1 mod poly here
-#define HashKey_2 16*7 // store HashKey^2 <<1 mod poly here
-#define HashKey_3 16*8 // store HashKey^3 <<1 mod poly here
-#define HashKey_4 16*9 // store HashKey^4 <<1 mod poly here
-#define HashKey_k 16*10 // store XOR of High 64 bits and Low 64
- // bits of HashKey <<1 mod poly here
- //(for Karatsuba purposes)
-#define HashKey_2_k 16*11 // store XOR of High 64 bits and Low 64
- // bits of HashKey^2 <<1 mod poly here
- // (for Karatsuba purposes)
-#define HashKey_3_k 16*12 // store XOR of High 64 bits and Low 64
- // bits of HashKey^3 <<1 mod poly here
- // (for Karatsuba purposes)
-#define HashKey_4_k 16*13 // store XOR of High 64 bits and Low 64
- // bits of HashKey^4 <<1 mod poly here
- // (for Karatsuba purposes)
-
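These offsets index the gcm_context_data buffer passed in %arg2. A C sketch of
the implied layout, assuming kernel u8/u64 types (field names here are
descriptive; the authoritative struct lives in the glue code):

    struct gcm_context_data /* sketch */ {
            u8  aad_hash[16];              /* AadHash,      16*0   */
            u64 aad_length;                /* AadLen,       16*1   */
            u64 in_length;                 /* InLen,        16*1+8 */
            u8  partial_block_enc_key[16]; /* PBlockEncKey, 16*2   */
            u8  orig_iv[16];               /* OrigIV,       16*3   */
            u8  current_counter[16];       /* CurCount,     16*4   */
            u64 partial_block_length;      /* PBlockLen,    16*5   */
            u64 pad;                       /* aligns hash_keys to 16*6 */
            u8  hash_keys[16 * 8];         /* HashKey .. HashKey_4_k */
    };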
-#define arg1 rdi
-#define arg2 rsi
-#define arg3 rdx
-#define arg4 rcx
-#define arg5 r8
-#define arg6 r9
-#define arg7 STACK_OFFSET+8(%rsp)
-#define arg8 STACK_OFFSET+16(%rsp)
-#define arg9 STACK_OFFSET+24(%rsp)
-#define arg10 STACK_OFFSET+32(%rsp)
-#define arg11 STACK_OFFSET+40(%rsp)
-#define keysize 2*15*16(%arg1)
-#endif
-
#define STATE1 %xmm0
#define STATE2 %xmm4
@@ -170,1587 +64,6 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff
#define TKEYP T1
#endif
-.macro FUNC_SAVE
- push %r12
- push %r13
- push %r14
-#
-# states of %xmm registers %xmm6:%xmm15 not saved
-# all %xmm registers are clobbered
-#
-.endm
-
-
-.macro FUNC_RESTORE
- pop %r14
- pop %r13
- pop %r12
-.endm
-
-# Precompute hashkeys.
-# Input: Hash subkey.
-# Output: HashKeys stored in gcm_context_data. Only needs to be called
-# once per key.
-# clobbers r12, and tmp xmm registers.
-.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
- mov \SUBKEY, %r12
- movdqu (%r12), \TMP3
- movdqa SHUF_MASK(%rip), \TMP2
- pshufb \TMP2, \TMP3
-
- # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
-
- movdqa \TMP3, \TMP2
- psllq $1, \TMP3
- psrlq $63, \TMP2
- movdqa \TMP2, \TMP1
- pslldq $8, \TMP2
- psrldq $8, \TMP1
- por \TMP2, \TMP3
-
- # reduce HashKey<<1
-
- pshufd $0x24, \TMP1, \TMP2
- pcmpeqd TWOONE(%rip), \TMP2
- pand POLY(%rip), \TMP2
- pxor \TMP2, \TMP3
- movdqu \TMP3, HashKey(%arg2)
-
- movdqa \TMP3, \TMP5
- pshufd $78, \TMP3, \TMP1
- pxor \TMP3, \TMP1
- movdqu \TMP1, HashKey_k(%arg2)
-
- GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
-# TMP5 = HashKey^2<<1 (mod poly)
- movdqu \TMP5, HashKey_2(%arg2)
-# HashKey_2 = HashKey^2<<1 (mod poly)
- pshufd $78, \TMP5, \TMP1
- pxor \TMP5, \TMP1
- movdqu \TMP1, HashKey_2_k(%arg2)
-
- GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
-# TMP5 = HashKey^3<<1 (mod poly)
- movdqu \TMP5, HashKey_3(%arg2)
- pshufd $78, \TMP5, \TMP1
- pxor \TMP5, \TMP1
- movdqu \TMP1, HashKey_3_k(%arg2)
-
- GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
-# TMP5 = HashKey^4<<1 (mod poly)
- movdqu \TMP5, HashKey_4(%arg2)
- pshufd $78, \TMP5, \TMP1
- pxor \TMP5, \TMP1
- movdqu \TMP1, HashKey_4_k(%arg2)
-.endm
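
The shift-and-conditionally-reduce step above (psllq/psrlq plus the TWOONE
compare and POLY mask) computes HashKey<<1 mod poly. A scalar C sketch of that
step, assuming the 128-bit value is held as two little-endian u64 halves:

    /* H <<= 1; if a bit fell off the top, fold in the GCM polynomial
     * (POLY above, 0xC2000000000000000000000000000001).
     */
    static void hashkey_shl1_mod_poly(u64 h[2])
    {
            u64 carry = h[1] >> 63;

            h[1] = (h[1] << 1) | (h[0] >> 63);
            h[0] <<= 1;
            if (carry) {
                    h[0] ^= 0x0000000000000001ULL;
                    h[1] ^= 0xC200000000000000ULL;
            }
    }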
-
-# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
-# Clobbers rax, r10-r13, xmm0-xmm6 and xmm13
-.macro GCM_INIT Iv SUBKEY AAD AADLEN
- mov \AADLEN, %r11
- mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
- xor %r11d, %r11d
- mov %r11, InLen(%arg2) # ctx_data.in_length = 0
- mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
- mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
- mov \Iv, %rax
- movdqu (%rax), %xmm0
- movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
-
- movdqa SHUF_MASK(%rip), %xmm2
- pshufb %xmm2, %xmm0
- movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv
-
- PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
- movdqu HashKey(%arg2), %xmm13
-
- CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
- %xmm4, %xmm5, %xmm6
-.endm
-
-# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
-# struct has been initialized by GCM_INIT.
-# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
-# Clobbers rax, r10-r13, and xmm0-xmm15
-.macro GCM_ENC_DEC operation
- movdqu AadHash(%arg2), %xmm8
- movdqu HashKey(%arg2), %xmm13
- add %arg5, InLen(%arg2)
-
- xor %r11d, %r11d # initialise the data pointer offset as zero
- PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
-
- sub %r11, %arg5 # sub partial block data used
- mov %arg5, %r13 # save the number of bytes
-
- and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
- mov %r13, %r12
- # Encrypt/Decrypt first few blocks
-
- and $(3<<4), %r12
- jz .L_initial_num_blocks_is_0_\@
- cmp $(2<<4), %r12
- jb .L_initial_num_blocks_is_1_\@
- je .L_initial_num_blocks_is_2_\@
-.L_initial_num_blocks_is_3_\@:
- INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
-%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
- sub $48, %r13
- jmp .L_initial_blocks_\@
-.L_initial_num_blocks_is_2_\@:
- INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
-%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
- sub $32, %r13
- jmp .L_initial_blocks_\@
-.L_initial_num_blocks_is_1_\@:
- INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
-%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
- sub $16, %r13
- jmp .L_initial_blocks_\@
-.L_initial_num_blocks_is_0_\@:
- INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
-%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
-.L_initial_blocks_\@:
-
- # Main loop - Encrypt/Decrypt remaining blocks
-
- test %r13, %r13
- je .L_zero_cipher_left_\@
- sub $64, %r13
- je .L_four_cipher_left_\@
-.L_crypt_by_4_\@:
- GHASH_4_ENCRYPT_4_PARALLEL_\operation %xmm9, %xmm10, %xmm11, %xmm12, \
- %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
- %xmm7, %xmm8, enc
- add $64, %r11
- sub $64, %r13
- jne .L_crypt_by_4_\@
-.L_four_cipher_left_\@:
- GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
-%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
-.L_zero_cipher_left_\@:
- movdqu %xmm8, AadHash(%arg2)
- movdqu %xmm0, CurCount(%arg2)
-
- mov %arg5, %r13
- and $15, %r13 # %r13 = arg5 (mod 16)
- je .L_multiple_of_16_bytes_\@
-
- mov %r13, PBlockLen(%arg2)
-
- # Handle the last <16 Byte block separately
- paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
- movdqu %xmm0, CurCount(%arg2)
- movdqa SHUF_MASK(%rip), %xmm10
- pshufb %xmm10, %xmm0
-
- ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
- movdqu %xmm0, PBlockEncKey(%arg2)
-
- cmp $16, %arg5
- jge .L_large_enough_update_\@
-
- lea (%arg4,%r11,1), %r10
- mov %r13, %r12
- READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
- jmp .L_data_read_\@
-
-.L_large_enough_update_\@:
- sub $16, %r11
- add %r13, %r11
-
- # receive the last <16 Byte block
- movdqu (%arg4, %r11, 1), %xmm1
-
- sub %r13, %r11
- add $16, %r11
-
- lea SHIFT_MASK+16(%rip), %r12
- # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
- # (r13 is the number of bytes in plaintext mod 16)
- sub %r13, %r12
- # get the appropriate shuffle mask
- movdqu (%r12), %xmm2
- # shift right 16-r13 bytes
- pshufb %xmm2, %xmm1
-
-.L_data_read_\@:
- lea ALL_F+16(%rip), %r12
- sub %r13, %r12
-
-.ifc \operation, dec
- movdqa %xmm1, %xmm2
-.endif
- pxor %xmm1, %xmm0 # XOR Encrypt(K, Yn)
- movdqu (%r12), %xmm1
- # get the appropriate mask to mask out top 16-r13 bytes of xmm0
- pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
-.ifc \operation, dec
- pand %xmm1, %xmm2
- movdqa SHUF_MASK(%rip), %xmm10
- pshufb %xmm10 ,%xmm2
-
- pxor %xmm2, %xmm8
-.else
- movdqa SHUF_MASK(%rip), %xmm10
- pshufb %xmm10,%xmm0
-
- pxor %xmm0, %xmm8
-.endif
-
- movdqu %xmm8, AadHash(%arg2)
-.ifc \operation, enc
- # GHASH computation for the last <16 byte block
- movdqa SHUF_MASK(%rip), %xmm10
- # shuffle xmm0 back to output as ciphertext
- pshufb %xmm10, %xmm0
-.endif
-
- # Output %r13 bytes
- movq %xmm0, %rax
- cmp $8, %r13
- jle .L_less_than_8_bytes_left_\@
- mov %rax, (%arg3 , %r11, 1)
- add $8, %r11
- psrldq $8, %xmm0
- movq %xmm0, %rax
- sub $8, %r13
-.L_less_than_8_bytes_left_\@:
- mov %al, (%arg3, %r11, 1)
- add $1, %r11
- shr $8, %rax
- sub $1, %r13
- jne .L_less_than_8_bytes_left_\@
-.L_multiple_of_16_bytes_\@:
-.endm
-
-# GCM_COMPLETE Finishes update of tag of last partial block
-# Output: Authorization Tag (AUTH_TAG)
-# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
-.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
- movdqu AadHash(%arg2), %xmm8
- movdqu HashKey(%arg2), %xmm13
-
- mov PBlockLen(%arg2), %r12
-
- test %r12, %r12
- je .L_partial_done\@
-
- GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
-
-.L_partial_done\@:
- mov AadLen(%arg2), %r12 # %r12 = aadLen (number of bytes)
- shl $3, %r12 # convert into number of bits
- movd %r12d, %xmm15 # len(A) in %xmm15
- mov InLen(%arg2), %r12
- shl $3, %r12 # len(C) in bits (*8)
- movq %r12, %xmm1
-
- pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
- pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
- pxor %xmm15, %xmm8
- GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
- # final GHASH computation
- movdqa SHUF_MASK(%rip), %xmm10
- pshufb %xmm10, %xmm8
-
- movdqu OrigIV(%arg2), %xmm0 # %xmm0 = Y0
- ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
- pxor %xmm8, %xmm0
-.L_return_T_\@:
- mov \AUTHTAG, %r10 # %r10 = authTag
- mov \AUTHTAGLEN, %r11 # %r11 = auth_tag_len
- cmp $16, %r11
- je .L_T_16_\@
- cmp $8, %r11
- jl .L_T_4_\@
-.L_T_8_\@:
- movq %xmm0, %rax
- mov %rax, (%r10)
- add $8, %r10
- sub $8, %r11
- psrldq $8, %xmm0
- test %r11, %r11
- je .L_return_T_done_\@
-.L_T_4_\@:
- movd %xmm0, %eax
- mov %eax, (%r10)
- add $4, %r10
- sub $4, %r11
- psrldq $4, %xmm0
- test %r11, %r11
- je .L_return_T_done_\@
-.L_T_123_\@:
- movd %xmm0, %eax
- cmp $2, %r11
- jl .L_T_1_\@
- mov %ax, (%r10)
- cmp $2, %r11
- je .L_return_T_done_\@
- add $2, %r10
- sar $16, %eax
-.L_T_1_\@:
- mov %al, (%r10)
- jmp .L_return_T_done_\@
-.L_T_16_\@:
- movdqu %xmm0, (%r10)
-.L_return_T_done_\@:
-.endm
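
At the spec level, the block folded into the hash above is len(A) || len(C),
each a 64-bit count of bits, and the tag is E(K, Y0) XOR GHASH. A sketch of
that length block in GCM's big-endian wire format (the assembly keeps
everything byte-reflected and only swaps through SHUF_MASK at the end):

    /* build len(A) || len(C), each a 64-bit big-endian bit count */
    static void build_length_block(u8 blk[16], u64 aad_bytes, u64 text_bytes)
    {
            u64 abits = aad_bytes * 8, cbits = text_bytes * 8;
            int i;

            for (i = 0; i < 8; i++) {
                    blk[i]     = (u8)(abits >> (56 - 8 * i));
                    blk[8 + i] = (u8)(cbits >> (56 - 8 * i));
            }
    }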
-
-#ifdef __x86_64__
-/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
-*
-*
-* Input: A and B (128-bits each, bit-reflected)
-* Output: C = A*B*x mod poly, (i.e. >>1 )
-* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
-* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
-*
-*/
-.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
- movdqa \GH, \TMP1
- pshufd $78, \GH, \TMP2
- pshufd $78, \HK, \TMP3
- pxor \GH, \TMP2 # TMP2 = a1+a0
- pxor \HK, \TMP3 # TMP3 = b1+b0
- pclmulqdq $0x11, \HK, \TMP1 # TMP1 = a1*b1
- pclmulqdq $0x00, \HK, \GH # GH = a0*b0
- pclmulqdq $0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
- pxor \GH, \TMP2
- pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0)
- movdqa \TMP2, \TMP3
- pslldq $8, \TMP3 # left shift TMP3 2 DWs
- psrldq $8, \TMP2 # right shift TMP2 2 DWs
- pxor \TMP3, \GH
- pxor \TMP2, \TMP1 # TMP1:GH holds the result of GH*HK
-
- # first phase of the reduction
-
- movdqa \GH, \TMP2
- movdqa \GH, \TMP3
- movdqa \GH, \TMP4 # copy GH into TMP2, TMP3 and TMP4
- # in order to perform
- # independent shifts
- pslld $31, \TMP2 # packed left shift <<31
- pslld $30, \TMP3 # packed left shift <<30
- pslld $25, \TMP4 # packed left shift <<25
- pxor \TMP3, \TMP2 # xor the shifted versions
- pxor \TMP4, \TMP2
- movdqa \TMP2, \TMP5
- psrldq $4, \TMP5 # right shift TMP5 1 DW
- pslldq $12, \TMP2 # left shift TMP2 3 DWs
- pxor \TMP2, \GH
-
- # second phase of the reduction
-
- movdqa \GH,\TMP2 # copy GH into TMP2, TMP3 and TMP4
- # in order to perform
- # independent shifts
- movdqa \GH,\TMP3
- movdqa \GH,\TMP4
- psrld $1,\TMP2 # packed right shift >>1
- psrld $2,\TMP3 # packed right shift >>2
- psrld $7,\TMP4 # packed right shift >>7
- pxor \TMP3,\TMP2 # xor the shifted versions
- pxor \TMP4,\TMP2
- pxor \TMP5, \TMP2
- pxor \TMP2, \GH
- pxor \TMP1, \GH # result is in GH
-.endm
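
GHASH_MUL's three pclmulqdq instructions implement Karatsuba: with a = a1:a0
and b = b1:b0, the 256-bit carry-less product needs only a1*b1, a0*b0 and
(a1^a0)*(b1^b0), since in GF(2) the middle term is the XOR of those three. A
scalar C reference of what the multiply phase computes, before the reduction
(a sketch for illustration, not kernel code):

    /* reference for one pclmulqdq: carry-less 64x64 -> 128 multiply */
    static void clmul64(u64 a, u64 b, u64 *lo, u64 *hi)
    {
            u64 l = 0, h = 0;
            int i;

            for (i = 0; i < 64; i++) {
                    if ((b >> i) & 1) {
                            l ^= a << i;
                            h ^= i ? a >> (64 - i) : 0;
                    }
            }
            *lo = l;
            *hi = h;
    }

    /* 128x128 carry-less multiply with three clmuls (Karatsuba) */
    static void clmul128(const u64 a[2], const u64 b[2], u64 r[4])
    {
            u64 lo0, lo1, hi0, hi1, mid0, mid1;

            clmul64(a[0], b[0], &lo0, &lo1);                 /* a0*b0 */
            clmul64(a[1], b[1], &hi0, &hi1);                 /* a1*b1 */
            clmul64(a[0] ^ a[1], b[0] ^ b[1], &mid0, &mid1); /* middle */
            mid0 ^= lo0 ^ hi0;        /* subtract (XOR) the end products */
            mid1 ^= lo1 ^ hi1;
            r[0] = lo0;
            r[1] = lo1 ^ mid0;
            r[2] = hi0 ^ mid1;
            r[3] = hi1;
    }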
-
-# Reads DLEN bytes starting at DPTR and stores in XMMDst
-# where 0 < DLEN < 16
-# Clobbers %rax, DLEN and XMM1
-.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
- cmp $8, \DLEN
- jl .L_read_lt8_\@
- mov (\DPTR), %rax
- movq %rax, \XMMDst
- sub $8, \DLEN
- jz .L_done_read_partial_block_\@
- xor %eax, %eax
-.L_read_next_byte_\@:
- shl $8, %rax
- mov 7(\DPTR, \DLEN, 1), %al
- dec \DLEN
- jnz .L_read_next_byte_\@
- movq %rax, \XMM1
- pslldq $8, \XMM1
- por \XMM1, \XMMDst
- jmp .L_done_read_partial_block_\@
-.L_read_lt8_\@:
- xor %eax, %eax
-.L_read_next_byte_lt8_\@:
- shl $8, %rax
- mov -1(\DPTR, \DLEN, 1), %al
- dec \DLEN
- jnz .L_read_next_byte_lt8_\@
- movq %rax, \XMMDst
-.L_done_read_partial_block_\@:
-.endm
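
Behaviourally, the macro performs a bounded read of 1-15 bytes into a 16-byte
value without touching memory past DPTR+DLEN. A C sketch of the equivalent,
using memset/memcpy for clarity (the assembly gathers trailing bytes through
%rax instead of a byte loop):

    /* read 1..15 bytes at src into dst without over-reading */
    static void read_partial_block(const u8 *src, unsigned int len, u8 dst[16])
    {
            unsigned int i = 0;

            memset(dst, 0, 16);
            if (len >= 8) {          /* the single 8-byte load */
                    memcpy(dst, src, 8);
                    i = 8;
            }
            for (; i < len; i++)     /* remaining bytes one at a time */
                    dst[i] = src[i];
    }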
-
-# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
-# clobbers r10-11, xmm14
-.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
- TMP6 TMP7
- MOVADQ SHUF_MASK(%rip), %xmm14
- mov \AAD, %r10 # %r10 = AAD
- mov \AADLEN, %r11 # %r11 = aadLen
- pxor \TMP7, \TMP7
- pxor \TMP6, \TMP6
-
- cmp $16, %r11
- jl .L_get_AAD_rest\@
-.L_get_AAD_blocks\@:
- movdqu (%r10), \TMP7
- pshufb %xmm14, \TMP7 # byte-reflect the AAD data
- pxor \TMP7, \TMP6
- GHASH_MUL \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
- add $16, %r10
- sub $16, %r11
- cmp $16, %r11
- jge .L_get_AAD_blocks\@
-
- movdqu \TMP6, \TMP7
-
- /* read the last <16B of AAD */
-.L_get_AAD_rest\@:
- test %r11, %r11
- je .L_get_AAD_done\@
-
- READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
- pshufb %xmm14, \TMP7 # byte-reflect the AAD data
- pxor \TMP6, \TMP7
- GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
- movdqu \TMP7, \TMP6
-
-.L_get_AAD_done\@:
- movdqu \TMP6, AadHash(%arg2)
-.endm
-
-# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
-# between update calls.
-# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
-# Outputs encrypted bytes, and updates hash and partial info in gcm_context_data
-# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
-.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
- AAD_HASH operation
- mov PBlockLen(%arg2), %r13
- test %r13, %r13
- je .L_partial_block_done_\@ # Leave Macro if no partial blocks
- # Read in input data without over reading
- cmp $16, \PLAIN_CYPH_LEN
- jl .L_fewer_than_16_bytes_\@
- movups (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
- jmp .L_data_read_\@
-
-.L_fewer_than_16_bytes_\@:
- lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
- mov \PLAIN_CYPH_LEN, %r12
- READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
-
- mov PBlockLen(%arg2), %r13
-
-.L_data_read_\@: # Finished reading in data
-
- movdqu PBlockEncKey(%arg2), %xmm9
- movdqu HashKey(%arg2), %xmm13
-
- lea SHIFT_MASK(%rip), %r12
-
- # adjust the shuffle mask pointer to be able to shift r13 bytes
- # (r13 is the number of bytes in plaintext mod 16)
- add %r13, %r12
- movdqu (%r12), %xmm2 # get the appropriate shuffle mask
- pshufb %xmm2, %xmm9 # shift right r13 bytes
-
-.ifc \operation, dec
- movdqa %xmm1, %xmm3
- pxor %xmm1, %xmm9 # Ciphertext XOR E(K, Yn)
-
- mov \PLAIN_CYPH_LEN, %r10
- add %r13, %r10
- # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
- sub $16, %r10
- # Determine if partial block is not being filled and
- # shift mask accordingly
- jge .L_no_extra_mask_1_\@
- sub %r10, %r12
-.L_no_extra_mask_1_\@:
-
- movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
- # get the appropriate mask to mask out bottom r13 bytes of xmm9
- pand %xmm1, %xmm9 # mask out bottom r13 bytes of xmm9
-
- pand %xmm1, %xmm3
- movdqa SHUF_MASK(%rip), %xmm10
- pshufb %xmm10, %xmm3
- pshufb %xmm2, %xmm3
- pxor %xmm3, \AAD_HASH
-
- test %r10, %r10
- jl .L_partial_incomplete_1_\@
-
- # GHASH computation for the last <16 Byte block
- GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
- xor %eax, %eax
-
- mov %rax, PBlockLen(%arg2)
- jmp .L_dec_done_\@
-.L_partial_incomplete_1_\@:
- add \PLAIN_CYPH_LEN, PBlockLen(%arg2)
-.L_dec_done_\@:
- movdqu \AAD_HASH, AadHash(%arg2)
-.else
- pxor %xmm1, %xmm9 # Plaintext XOR E(K, Yn)
-
- mov \PLAIN_CYPH_LEN, %r10
- add %r13, %r10
- # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
- sub $16, %r10
- # Determine if partial block is not being filled and
- # shift mask accordingly
- jge .L_no_extra_mask_2_\@
- sub %r10, %r12
-.L_no_extra_mask_2_\@:
-
- movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
- # get the appropriate mask to mask out bottom r13 bytes of xmm9
- pand %xmm1, %xmm9
-
- movdqa SHUF_MASK(%rip), %xmm1
- pshufb %xmm1, %xmm9
- pshufb %xmm2, %xmm9
- pxor %xmm9, \AAD_HASH
-
- test %r10, %r10
- jl .L_partial_incomplete_2_\@
-
- # GHASH computation for the last <16 Byte block
- GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
- xor %eax, %eax
-
- mov %rax, PBlockLen(%arg2)
- jmp .L_encode_done_\@
-.L_partial_incomplete_2_\@:
- add \PLAIN_CYPH_LEN, PBlockLen(%arg2)
-.L_encode_done_\@:
- movdqu \AAD_HASH, AadHash(%arg2)
-
- movdqa SHUF_MASK(%rip), %xmm10
- # shuffle xmm9 back to output as ciphertext
- pshufb %xmm10, %xmm9
- pshufb %xmm2, %xmm9
-.endif
- # output encrypted Bytes
- test %r10, %r10
- jl .L_partial_fill_\@
- mov %r13, %r12
- mov $16, %r13
- # Set r13 to be the number of bytes to write out
- sub %r12, %r13
- jmp .L_count_set_\@
-.L_partial_fill_\@:
- mov \PLAIN_CYPH_LEN, %r13
-.L_count_set_\@:
- movdqa %xmm9, %xmm0
- movq %xmm0, %rax
- cmp $8, %r13
- jle .L_less_than_8_bytes_left_\@
-
- mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
- add $8, \DATA_OFFSET
- psrldq $8, %xmm0
- movq %xmm0, %rax
- sub $8, %r13
-.L_less_than_8_bytes_left_\@:
- movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
- add $1, \DATA_OFFSET
- shr $8, %rax
- sub $1, %r13
- jne .L_less_than_8_bytes_left_\@
-.L_partial_block_done_\@:
-.endm # PARTIAL_BLOCK
-
-/*
-* if a = number of total plaintext bytes
-* b = floor(a/16)
-* num_initial_blocks = b mod 4
-* encrypt the initial num_initial_blocks blocks and apply ghash on
-* the ciphertext
-* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
-* are clobbered
-* arg1, %arg2, %arg3 are used as pointers only, not modified
-*/
-
-
-.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
- XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
- MOVADQ SHUF_MASK(%rip), %xmm14
-
- movdqu AadHash(%arg2), %xmm\i # load the current AAD hash state
-
- # start AES for num_initial_blocks blocks
-
- movdqu CurCount(%arg2), \XMM0 # XMM0 = Y0
-
-.if (\i == 5) || (\i == 6) || (\i == 7)
-
- MOVADQ ONE(%RIP),\TMP1
- MOVADQ 0(%arg1),\TMP2
-.irpc index, \i_seq
- paddd \TMP1, \XMM0 # INCR Y0
-.ifc \operation, dec
- movdqa \XMM0, %xmm\index
-.else
- MOVADQ \XMM0, %xmm\index
-.endif
- pshufb %xmm14, %xmm\index # perform a 16 byte swap
- pxor \TMP2, %xmm\index
-.endr
- lea 0x10(%arg1),%r10
- mov keysize,%eax
- shr $2,%eax # 128->4, 192->6, 256->8
- add $5,%eax # 128->9, 192->11, 256->13
-
-.Laes_loop_initial_\@:
- MOVADQ (%r10),\TMP1
-.irpc index, \i_seq
- aesenc \TMP1, %xmm\index
-.endr
- add $16,%r10
- sub $1,%eax
- jnz .Laes_loop_initial_\@
-
- MOVADQ (%r10), \TMP1
-.irpc index, \i_seq
- aesenclast \TMP1, %xmm\index # Last Round
-.endr
-.irpc index, \i_seq
- movdqu (%arg4 , %r11, 1), \TMP1
- pxor \TMP1, %xmm\index
- movdqu %xmm\index, (%arg3 , %r11, 1)
- # write back plaintext/ciphertext for num_initial_blocks
- add $16, %r11
-
-.ifc \operation, dec
- movdqa \TMP1, %xmm\index
-.endif
- pshufb %xmm14, %xmm\index
-
- # prepare plaintext/ciphertext for GHASH computation
-.endr
-.endif
-
- # apply GHASH on num_initial_blocks blocks
-
-.if \i == 5
- pxor %xmm5, %xmm6
- GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
- pxor %xmm6, %xmm7
- GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
- pxor %xmm7, %xmm8
- GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
-.elseif \i == 6
- pxor %xmm6, %xmm7
- GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
- pxor %xmm7, %xmm8
- GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
-.elseif \i == 7
- pxor %xmm7, %xmm8
- GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
-.endif
- cmp $64, %r13
- jl .L_initial_blocks_done\@
- # no need for precomputed values
-/*
-*
-* Precomputations for HashKey parallel with encryption of first 4 blocks.
-* HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
-*/
- MOVADQ ONE(%RIP),\TMP1
- paddd \TMP1, \XMM0 # INCR Y0
- MOVADQ \XMM0, \XMM1
- pshufb %xmm14, \XMM1 # perform a 16 byte swap
-
- paddd \TMP1, \XMM0 # INCR Y0
- MOVADQ \XMM0, \XMM2
- pshufb %xmm14, \XMM2 # perform a 16 byte swap
-
- paddd \TMP1, \XMM0 # INCR Y0
- MOVADQ \XMM0, \XMM3
- pshufb %xmm14, \XMM3 # perform a 16 byte swap
-
- paddd \TMP1, \XMM0 # INCR Y0
- MOVADQ \XMM0, \XMM4
- pshufb %xmm14, \XMM4 # perform a 16 byte swap
-
- MOVADQ 0(%arg1),\TMP1
- pxor \TMP1, \XMM1
- pxor \TMP1, \XMM2
- pxor \TMP1, \XMM3
- pxor \TMP1, \XMM4
-.irpc index, 1234 # do 4 rounds
- movaps 0x10*\index(%arg1), \TMP1
- aesenc \TMP1, \XMM1
- aesenc \TMP1, \XMM2
- aesenc \TMP1, \XMM3
- aesenc \TMP1, \XMM4
-.endr
-.irpc index, 56789 # do next 5 rounds
- movaps 0x10*\index(%arg1), \TMP1
- aesenc \TMP1, \XMM1
- aesenc \TMP1, \XMM2
- aesenc \TMP1, \XMM3
- aesenc \TMP1, \XMM4
-.endr
- lea 0xa0(%arg1),%r10
- mov keysize,%eax
- shr $2,%eax # 128->4, 192->6, 256->8
- sub $4,%eax # 128->0, 192->2, 256->4
- jz .Laes_loop_pre_done\@
-
-.Laes_loop_pre_\@:
- MOVADQ (%r10),\TMP2
-.irpc index, 1234
- aesenc \TMP2, %xmm\index
-.endr
- add $16,%r10
- sub $1,%eax
- jnz .Laes_loop_pre_\@
-
-.Laes_loop_pre_done\@:
- MOVADQ (%r10), \TMP2
- aesenclast \TMP2, \XMM1
- aesenclast \TMP2, \XMM2
- aesenclast \TMP2, \XMM3
- aesenclast \TMP2, \XMM4
- movdqu 16*0(%arg4 , %r11 , 1), \TMP1
- pxor \TMP1, \XMM1
-.ifc \operation, dec
- movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
- movdqa \TMP1, \XMM1
-.endif
- movdqu 16*1(%arg4 , %r11 , 1), \TMP1
- pxor \TMP1, \XMM2
-.ifc \operation, dec
- movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
- movdqa \TMP1, \XMM2
-.endif
- movdqu 16*2(%arg4 , %r11 , 1), \TMP1
- pxor \TMP1, \XMM3
-.ifc \operation, dec
- movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
- movdqa \TMP1, \XMM3
-.endif
- movdqu 16*3(%arg4 , %r11 , 1), \TMP1
- pxor \TMP1, \XMM4
-.ifc \operation, dec
- movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
- movdqa \TMP1, \XMM4
-.else
- movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
- movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
- movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
- movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
-.endif
-
- add $64, %r11
- pshufb %xmm14, \XMM1 # perform a 16 byte swap
- pxor \XMMDst, \XMM1
-# combine GHASHed value with the corresponding ciphertext
- pshufb %xmm14, \XMM2 # perform a 16 byte swap
- pshufb %xmm14, \XMM3 # perform a 16 byte swap
- pshufb %xmm14, \XMM4 # perform a 16 byte swap
-
-.L_initial_blocks_done\@:
-
-.endm
-
-/*
-* encrypt 4 blocks at a time
-* ghash the 4 previously encrypted ciphertext blocks
-* arg1, %arg3, %arg4 are used as pointers only, not modified
-* %r11 is the data offset value
-*/
-.macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \
-TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
-
- movdqa \XMM1, \XMM5
- movdqa \XMM2, \XMM6
- movdqa \XMM3, \XMM7
- movdqa \XMM4, \XMM8
-
- movdqa SHUF_MASK(%rip), %xmm15
- # multiply TMP5 * HashKey using karatsuba
-
- movdqa \XMM5, \TMP4
- pshufd $78, \XMM5, \TMP6
- pxor \XMM5, \TMP6
- paddd ONE(%rip), \XMM0 # INCR CNT
- movdqu HashKey_4(%arg2), \TMP5
- pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1
- movdqa \XMM0, \XMM1
- paddd ONE(%rip), \XMM0 # INCR CNT
- movdqa \XMM0, \XMM2
- paddd ONE(%rip), \XMM0 # INCR CNT
- movdqa \XMM0, \XMM3
- paddd ONE(%rip), \XMM0 # INCR CNT
- movdqa \XMM0, \XMM4
- pshufb %xmm15, \XMM1 # perform a 16 byte swap
- pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0
- pshufb %xmm15, \XMM2 # perform a 16 byte swap
- pshufb %xmm15, \XMM3 # perform a 16 byte swap
- pshufb %xmm15, \XMM4 # perform a 16 byte swap
-
- pxor (%arg1), \XMM1
- pxor (%arg1), \XMM2
- pxor (%arg1), \XMM3
- pxor (%arg1), \XMM4
- movdqu HashKey_4_k(%arg2), \TMP5
- pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
- movaps 0x10(%arg1), \TMP1
- aesenc \TMP1, \XMM1 # Round 1
- aesenc \TMP1, \XMM2
- aesenc \TMP1, \XMM3
- aesenc \TMP1, \XMM4
- movaps 0x20(%arg1), \TMP1
- aesenc \TMP1, \XMM1 # Round 2
- aesenc \TMP1, \XMM2
- aesenc \TMP1, \XMM3
- aesenc \TMP1, \XMM4
- movdqa \XMM6, \TMP1
- pshufd $78, \XMM6, \TMP2
- pxor \XMM6, \TMP2
- movdqu HashKey_3(%arg2), \TMP5
- pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
- movaps 0x30(%arg1), \TMP3
- aesenc \TMP3, \XMM1 # Round 3
- aesenc \TMP3, \XMM2
- aesenc \TMP3, \XMM3
- aesenc \TMP3, \XMM4
- pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0
- movaps 0x40(%arg1), \TMP3
- aesenc \TMP3, \XMM1 # Round 4
- aesenc \TMP3, \XMM2
- aesenc \TMP3, \XMM3
- aesenc \TMP3, \XMM4
- movdqu HashKey_3_k(%arg2), \TMP5
- pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
- movaps 0x50(%arg1), \TMP3
- aesenc \TMP3, \XMM1 # Round 5
- aesenc \TMP3, \XMM2
- aesenc \TMP3, \XMM3
- aesenc \TMP3, \XMM4
- pxor \TMP1, \TMP4
-# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
- pxor \XMM6, \XMM5
- pxor \TMP2, \TMP6
- movdqa \XMM7, \TMP1
- pshufd $78, \XMM7, \TMP2
- pxor \XMM7, \TMP2
- movdqu HashKey_2(%arg2), \TMP5
-
- # Multiply TMP5 * HashKey using karatsuba
-
- pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
- movaps 0x60(%arg1), \TMP3
- aesenc \TMP3, \XMM1 # Round 6
- aesenc \TMP3, \XMM2
- aesenc \TMP3, \XMM3
- aesenc \TMP3, \XMM4
- pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0
- movaps 0x70(%arg1), \TMP3
- aesenc \TMP3, \XMM1 # Round 7
- aesenc \TMP3, \XMM2
- aesenc \TMP3, \XMM3
- aesenc \TMP3, \XMM4
- movdqu HashKey_2_k(%arg2), \TMP5
- pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
- movaps 0x80(%arg1), \TMP3
- aesenc \TMP3, \XMM1 # Round 8
- aesenc \TMP3, \XMM2
- aesenc \TMP3, \XMM3
- aesenc \TMP3, \XMM4
- pxor \TMP1, \TMP4
-# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
- pxor \XMM7, \XMM5
- pxor \TMP2, \TMP6
-
- # Multiply XMM8 * HashKey
- # XMM8 and TMP5 hold the values for the two operands
-
- movdqa \XMM8, \TMP1
- pshufd $78, \XMM8, \TMP2
- pxor \XMM8, \TMP2
- movdqu HashKey(%arg2), \TMP5
- pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
- movaps 0x90(%arg1), \TMP3
- aesenc \TMP3, \XMM1 # Round 9
- aesenc \TMP3, \XMM2
- aesenc \TMP3, \XMM3
- aesenc \TMP3, \XMM4
- pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0
- lea 0xa0(%arg1),%r10
- mov keysize,%eax
- shr $2,%eax # 128->4, 192->6, 256->8
- sub $4,%eax # 128->0, 192->2, 256->4
- jz .Laes_loop_par_enc_done\@
-
-.Laes_loop_par_enc\@:
- MOVADQ (%r10),\TMP3
-.irpc index, 1234
- aesenc \TMP3, %xmm\index
-.endr
- add $16,%r10
- sub $1,%eax
- jnz .Laes_loop_par_enc\@
-
-.Laes_loop_par_enc_done\@:
- MOVADQ (%r10), \TMP3
- aesenclast \TMP3, \XMM1 # Round 10
- aesenclast \TMP3, \XMM2
- aesenclast \TMP3, \XMM3
- aesenclast \TMP3, \XMM4
- movdqu HashKey_k(%arg2), \TMP5
- pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
- movdqu (%arg4,%r11,1), \TMP3
- pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
- movdqu 16(%arg4,%r11,1), \TMP3
- pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
- movdqu 32(%arg4,%r11,1), \TMP3
- pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
- movdqu 48(%arg4,%r11,1), \TMP3
- pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
- movdqu \XMM1, (%arg3,%r11,1) # Write to the ciphertext buffer
- movdqu \XMM2, 16(%arg3,%r11,1) # Write to the ciphertext buffer
- movdqu \XMM3, 32(%arg3,%r11,1) # Write to the ciphertext buffer
- movdqu \XMM4, 48(%arg3,%r11,1) # Write to the ciphertext buffer
- pshufb %xmm15, \XMM1 # perform a 16 byte swap
- pshufb %xmm15, \XMM2 # perform a 16 byte swap
- pshufb %xmm15, \XMM3 # perform a 16 byte swap
- pshufb %xmm15, \XMM4 # perform a 16 byte swap
-
- pxor \TMP4, \TMP1
- pxor \XMM8, \XMM5
- pxor \TMP6, \TMP2
- pxor \TMP1, \TMP2
- pxor \XMM5, \TMP2
- movdqa \TMP2, \TMP3
- pslldq $8, \TMP3 # left shift TMP3 2 DWs
- psrldq $8, \TMP2 # right shift TMP2 2 DWs
- pxor \TMP3, \XMM5
- pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
-
- # first phase of reduction
-
- movdqa \XMM5, \TMP2
- movdqa \XMM5, \TMP3
- movdqa \XMM5, \TMP4
-# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
- pslld $31, \TMP2 # packed left shift << 31
- pslld $30, \TMP3 # packed left shift << 30
- pslld $25, \TMP4 # packed left shift << 25
- pxor \TMP3, \TMP2 # xor the shifted versions
- pxor \TMP4, \TMP2
- movdqa \TMP2, \TMP5
- psrldq $4, \TMP5 # right shift T5 1 DW
- pslldq $12, \TMP2 # left shift T2 3 DWs
- pxor \TMP2, \XMM5
-
- # second phase of reduction
-
- movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
- movdqa \XMM5,\TMP3
- movdqa \XMM5,\TMP4
- psrld $1, \TMP2 # packed right shift >>1
- psrld $2, \TMP3 # packed right shift >>2
- psrld $7, \TMP4 # packed right shift >>7
- pxor \TMP3,\TMP2 # xor the shifted versions
- pxor \TMP4,\TMP2
- pxor \TMP5, \TMP2
- pxor \TMP2, \XMM5
- pxor \TMP1, \XMM5 # result is in XMM5
-
- pxor \XMM5, \XMM1
-.endm
-
-/*
-* decrypt 4 blocks at a time
-* ghash the 4 previously decrypted ciphertext blocks
-* arg1, %arg3, %arg4 are used as pointers only, not modified
-* %r11 is the data offset value
-*/
-.macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \
-TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
-
- movdqa \XMM1, \XMM5
- movdqa \XMM2, \XMM6
- movdqa \XMM3, \XMM7
- movdqa \XMM4, \XMM8
-
- movdqa SHUF_MASK(%rip), %xmm15
- # multiply TMP5 * HashKey using karatsuba
-
- movdqa \XMM5, \TMP4
- pshufd $78, \XMM5, \TMP6
- pxor \XMM5, \TMP6
- paddd ONE(%rip), \XMM0 # INCR CNT
- movdqu HashKey_4(%arg2), \TMP5
- pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1
- movdqa \XMM0, \XMM1
- paddd ONE(%rip), \XMM0 # INCR CNT
- movdqa \XMM0, \XMM2
- paddd ONE(%rip), \XMM0 # INCR CNT
- movdqa \XMM0, \XMM3
- paddd ONE(%rip), \XMM0 # INCR CNT
- movdqa \XMM0, \XMM4
- pshufb %xmm15, \XMM1 # perform a 16 byte swap
- pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0
- pshufb %xmm15, \XMM2 # perform a 16 byte swap
- pshufb %xmm15, \XMM3 # perform a 16 byte swap
- pshufb %xmm15, \XMM4 # perform a 16 byte swap
-
- pxor (%arg1), \XMM1
- pxor (%arg1), \XMM2
- pxor (%arg1), \XMM3
- pxor (%arg1), \XMM4
- movdqu HashKey_4_k(%arg2), \TMP5
- pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
- movaps 0x10(%arg1), \TMP1
- aesenc \TMP1, \XMM1 # Round 1
- aesenc \TMP1, \XMM2
- aesenc \TMP1, \XMM3
- aesenc \TMP1, \XMM4
- movaps 0x20(%arg1), \TMP1
- aesenc \TMP1, \XMM1 # Round 2
- aesenc \TMP1, \XMM2
- aesenc \TMP1, \XMM3
- aesenc \TMP1, \XMM4
- movdqa \XMM6, \TMP1
- pshufd $78, \XMM6, \TMP2
- pxor \XMM6, \TMP2
- movdqu HashKey_3(%arg2), \TMP5
- pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
- movaps 0x30(%arg1), \TMP3
- aesenc \TMP3, \XMM1 # Round 3
- aesenc \TMP3, \XMM2
- aesenc \TMP3, \XMM3
- aesenc \TMP3, \XMM4
- pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0
- movaps 0x40(%arg1), \TMP3
- aesenc \TMP3, \XMM1 # Round 4
- aesenc \TMP3, \XMM2
- aesenc \TMP3, \XMM3
- aesenc \TMP3, \XMM4
- movdqu HashKey_3_k(%arg2), \TMP5
- pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
- movaps 0x50(%arg1), \TMP3
- aesenc \TMP3, \XMM1 # Round 5
- aesenc \TMP3, \XMM2
- aesenc \TMP3, \XMM3
- aesenc \TMP3, \XMM4
- pxor \TMP1, \TMP4
-# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
- pxor \XMM6, \XMM5
- pxor \TMP2, \TMP6
- movdqa \XMM7, \TMP1
- pshufd $78, \XMM7, \TMP2
- pxor \XMM7, \TMP2
- movdqu HashKey_2(%arg2), \TMP5
-
- # Multiply TMP5 * HashKey using karatsuba
-
- pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
- movaps 0x60(%arg1), \TMP3
- aesenc \TMP3, \XMM1 # Round 6
- aesenc \TMP3, \XMM2
- aesenc \TMP3, \XMM3
- aesenc \TMP3, \XMM4
- pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0
- movaps 0x70(%arg1), \TMP3
- aesenc \TMP3, \XMM1 # Round 7
- aesenc \TMP3, \XMM2
- aesenc \TMP3, \XMM3
- aesenc \TMP3, \XMM4
- movdqu HashKey_2_k(%arg2), \TMP5
- pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
- movaps 0x80(%arg1), \TMP3
- aesenc \TMP3, \XMM1 # Round 8
- aesenc \TMP3, \XMM2
- aesenc \TMP3, \XMM3
- aesenc \TMP3, \XMM4
- pxor \TMP1, \TMP4
-# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
- pxor \XMM7, \XMM5
- pxor \TMP2, \TMP6
-
- # Multiply XMM8 * HashKey
- # XMM8 and TMP5 hold the values for the two operands
-
- movdqa \XMM8, \TMP1
- pshufd $78, \XMM8, \TMP2
- pxor \XMM8, \TMP2
- movdqu HashKey(%arg2), \TMP5
- pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
- movaps 0x90(%arg1), \TMP3
- aesenc \TMP3, \XMM1 # Round 9
- aesenc \TMP3, \XMM2
- aesenc \TMP3, \XMM3
- aesenc \TMP3, \XMM4
- pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0
- lea 0xa0(%arg1),%r10
- mov keysize,%eax
- shr $2,%eax # 128->4, 192->6, 256->8
- sub $4,%eax # 128->0, 192->2, 256->4
- jz .Laes_loop_par_dec_done\@
-
-.Laes_loop_par_dec\@:
- MOVADQ (%r10),\TMP3
-.irpc index, 1234
- aesenc \TMP3, %xmm\index
-.endr
- add $16,%r10
- sub $1,%eax
- jnz .Laes_loop_par_dec\@
-
-.Laes_loop_par_dec_done\@:
- MOVADQ (%r10), \TMP3
- aesenclast \TMP3, \XMM1 # last round
- aesenclast \TMP3, \XMM2
- aesenclast \TMP3, \XMM3
- aesenclast \TMP3, \XMM4
- movdqu HashKey_k(%arg2), \TMP5
- pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
- movdqu (%arg4,%r11,1), \TMP3
- pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
- movdqu \XMM1, (%arg3,%r11,1) # Write to plaintext buffer
- movdqa \TMP3, \XMM1
- movdqu 16(%arg4,%r11,1), \TMP3
- pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
- movdqu \XMM2, 16(%arg3,%r11,1) # Write to plaintext buffer
- movdqa \TMP3, \XMM2
- movdqu 32(%arg4,%r11,1), \TMP3
- pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
- movdqu \XMM3, 32(%arg3,%r11,1) # Write to plaintext buffer
- movdqa \TMP3, \XMM3
- movdqu 48(%arg4,%r11,1), \TMP3
- pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
- movdqu \XMM4, 48(%arg3,%r11,1) # Write to plaintext buffer
- movdqa \TMP3, \XMM4
- pshufb %xmm15, \XMM1 # perform a 16 byte swap
- pshufb %xmm15, \XMM2 # perform a 16 byte swap
- pshufb %xmm15, \XMM3 # perform a 16 byte swap
- pshufb %xmm15, \XMM4 # perform a 16 byte swap
-
- pxor \TMP4, \TMP1
- pxor \XMM8, \XMM5
- pxor \TMP6, \TMP2
- pxor \TMP1, \TMP2
- pxor \XMM5, \TMP2
- movdqa \TMP2, \TMP3
- pslldq $8, \TMP3 # left shift TMP3 2 DWs
- psrldq $8, \TMP2 # right shift TMP2 2 DWs
- pxor \TMP3, \XMM5
- pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
-
- # first phase of reduction
-
- movdqa \XMM5, \TMP2
- movdqa \XMM5, \TMP3
- movdqa \XMM5, \TMP4
-# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
- pslld $31, \TMP2 # packed left shift << 31
- pslld $30, \TMP3 # packed left shift << 30
- pslld $25, \TMP4 # packed left shift << 25
- pxor \TMP3, \TMP2 # xor the shifted versions
- pxor \TMP4, \TMP2
- movdqa \TMP2, \TMP5
- psrldq $4, \TMP5 # right shift T5 1 DW
- pslldq $12, \TMP2 # left shift T2 3 DWs
- pxor \TMP2, \XMM5
-
- # second phase of reduction
-
- movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
- movdqa \XMM5,\TMP3
- movdqa \XMM5,\TMP4
- psrld $1, \TMP2 # packed right shift >>1
- psrld $2, \TMP3 # packed right shift >>2
- psrld $7, \TMP4 # packed right shift >>7
- pxor \TMP3,\TMP2 # xor the shifted versions
- pxor \TMP4,\TMP2
- pxor \TMP5, \TMP2
- pxor \TMP2, \XMM5
- pxor \TMP1, \XMM5 # result is in XMM5
-
- pxor \XMM5, \XMM1
-.endm
-
-/* GHASH the last 4 ciphertext blocks. */
-.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
-TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
-
- # Multiply TMP6 * HashKey (using Karatsuba)
-
- movdqa \XMM1, \TMP6
- pshufd $78, \XMM1, \TMP2
- pxor \XMM1, \TMP2
- movdqu HashKey_4(%arg2), \TMP5
- pclmulqdq $0x11, \TMP5, \TMP6 # TMP6 = a1*b1
- pclmulqdq $0x00, \TMP5, \XMM1 # XMM1 = a0*b0
- movdqu HashKey_4_k(%arg2), \TMP4
- pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
- movdqa \XMM1, \XMMDst
- movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
-
- # Multiply TMP1 * HashKey (using Karatsuba)
-
- movdqa \XMM2, \TMP1
- pshufd $78, \XMM2, \TMP2
- pxor \XMM2, \TMP2
- movdqu HashKey_3(%arg2), \TMP5
- pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
- pclmulqdq $0x00, \TMP5, \XMM2 # XMM2 = a0*b0
- movdqu HashKey_3_k(%arg2), \TMP4
- pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
- pxor \TMP1, \TMP6
- pxor \XMM2, \XMMDst
- pxor \TMP2, \XMM1
-# results accumulated in TMP6, XMMDst, XMM1
-
- # Multiply TMP1 * HashKey (using Karatsuba)
-
- movdqa \XMM3, \TMP1
- pshufd $78, \XMM3, \TMP2
- pxor \XMM3, \TMP2
- movdqu HashKey_2(%arg2), \TMP5
- pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
- pclmulqdq $0x00, \TMP5, \XMM3 # XMM3 = a0*b0
- movdqu HashKey_2_k(%arg2), \TMP4
- pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
- pxor \TMP1, \TMP6
- pxor \XMM3, \XMMDst
- pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
-
- # Multiply TMP1 * HashKey (using Karatsuba)
- movdqa \XMM4, \TMP1
- pshufd $78, \XMM4, \TMP2
- pxor \XMM4, \TMP2
- movdqu HashKey(%arg2), \TMP5
- pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
- pclmulqdq $0x00, \TMP5, \XMM4 # XMM4 = a0*b0
- movdqu HashKey_k(%arg2), \TMP4
- pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
- pxor \TMP1, \TMP6
- pxor \XMM4, \XMMDst
- pxor \XMM1, \TMP2
- pxor \TMP6, \TMP2
- pxor \XMMDst, \TMP2
- # middle section of the temp results combined as in karatsuba algorithm
- movdqa \TMP2, \TMP4
- pslldq $8, \TMP4 # left shift TMP4 2 DWs
- psrldq $8, \TMP2 # right shift TMP2 2 DWs
- pxor \TMP4, \XMMDst
- pxor \TMP2, \TMP6
-# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
- # first phase of the reduction
- movdqa \XMMDst, \TMP2
- movdqa \XMMDst, \TMP3
- movdqa \XMMDst, \TMP4
-# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
- pslld $31, \TMP2 # packed left shifting << 31
- pslld $30, \TMP3 # packed left shifting << 30
- pslld $25, \TMP4 # packed left shifting << 25
- pxor \TMP3, \TMP2 # xor the shifted versions
- pxor \TMP4, \TMP2
- movdqa \TMP2, \TMP7
- psrldq $4, \TMP7 # right shift TMP7 1 DW
- pslldq $12, \TMP2 # left shift TMP2 3 DWs
- pxor \TMP2, \XMMDst
-
- # second phase of the reduction
- movdqa \XMMDst, \TMP2
- # make 3 copies of XMMDst for doing 3 shift operations
- movdqa \XMMDst, \TMP3
- movdqa \XMMDst, \TMP4
- psrld $1, \TMP2 # packed right shift >> 1
- psrld $2, \TMP3 # packed right shift >> 2
- psrld $7, \TMP4 # packed right shift >> 7
- pxor \TMP3, \TMP2 # xor the shifted versions
- pxor \TMP4, \TMP2
- pxor \TMP7, \TMP2
- pxor \TMP2, \XMMDst
- pxor \TMP6, \XMMDst # reduced result is in XMMDst
-.endm
-
-
-/* Encryption of a single block
-* uses eax & r10
-*/
-
-.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
-
- pxor (%arg1), \XMM0
- mov keysize,%eax
- shr $2,%eax # 128->4, 192->6, 256->8
- add $5,%eax # 128->9, 192->11, 256->13
- lea 16(%arg1), %r10 # get first expanded key address
-
-_esb_loop_\@:
- MOVADQ (%r10),\TMP1
- aesenc \TMP1,\XMM0
- add $16,%r10
- sub $1,%eax
- jnz _esb_loop_\@
-
- MOVADQ (%r10),\TMP1
- aesenclast \TMP1,\XMM0
-.endm
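
The keysize arithmetic above derives the round count from the key length (in
bytes) stored at offset 480 of the key schedule, per the keysize define
earlier in this file. The same computation as a C sketch:

    /* rounds before aesenclast: 16/4+5 = 9, 24/4+5 = 11, 32/4+5 = 13 */
    static unsigned int aes_inner_rounds(unsigned int key_length_bytes)
    {
            return key_length_bytes / 4 + 5;
    }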
-/*****************************************************************************
-* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
-* struct gcm_context_data *data
-* // Context data
-* u8 *out, // Plaintext output. Encrypt in-place is allowed.
-* const u8 *in, // Ciphertext input
-* u64 plaintext_len, // Length of data in bytes for decryption.
-* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
-* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
-* // concatenated with 0x00000001. 16-byte aligned pointer.
-* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
-* const u8 *aad, // Additional Authentication Data (AAD)
-* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
-* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
-* // given authentication tag and only return the plaintext if they match.
-* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
-* // (most likely), 12 or 8.
-*
-* Assumptions:
-*
-* keys:
-* keys are pre-expanded and aligned to 16 bytes. we are using the first
-* set of 11 keys in the data structure void *aes_ctx
-*
-* iv:
-* 0 1 2 3
-* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | Salt (From the SA) |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | Initialization Vector |
-* | (This is the sequence number from IPSec header) |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | 0x1 |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-*
-*
-*
-* AAD:
-* AAD padded to 128 bits with 0
-* for example, assume AAD is a u32 vector
-*
-* if AAD is 8 bytes:
-* AAD[3] = {A0, A1};
-* padded AAD in xmm register = {A1 A0 0 0}
-*
-* 0 1 2 3
-* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | SPI (A1) |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | 32-bit Sequence Number (A0) |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | 0x0 |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-*
-* AAD Format with 32-bit Sequence Number
-*
-* if AAD is 12 bytes:
-* AAD[3] = {A0, A1, A2};
-* padded AAD in xmm register = {A2 A1 A0 0}
-*
-* 0 1 2 3
-* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | SPI (A2) |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | 64-bit Extended Sequence Number {A1,A0} |
-* | |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | 0x0 |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-*
-* AAD Format with 64-bit Extended Sequence Number
-*
-* poly = x^128 + x^127 + x^126 + x^121 + 1
-*
-*****************************************************************************/
-SYM_FUNC_START(aesni_gcm_dec)
- FUNC_SAVE
-
- GCM_INIT %arg6, arg7, arg8, arg9
- GCM_ENC_DEC dec
- GCM_COMPLETE arg10, arg11
- FUNC_RESTORE
- RET
-SYM_FUNC_END(aesni_gcm_dec)
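
For the iv layout documented above, a sketch of how a caller would assemble
the pre-counter block J0 from the RFC 4106 salt and explicit IV (function and
parameter names here are illustrative):

    /* J0 = salt (4 bytes) || explicit IV (8 bytes) || 0x00000001 */
    static void build_j0(u8 j0[16], const u8 salt[4], const u8 explicit_iv[8])
    {
            memcpy(j0, salt, 4);
            memcpy(j0 + 4, explicit_iv, 8);
            j0[12] = 0;
            j0[13] = 0;
            j0[14] = 0;
            j0[15] = 1;               /* big-endian 1, per the diagram */
    }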
-
-
-/*****************************************************************************
-* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
-* struct gcm_context_data *data
-* // Context data
-* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
-* const u8 *in, // Plaintext input
-* u64 plaintext_len, // Length of data in bytes for encryption.
-* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
-* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
-* // concatenated with 0x00000001. 16-byte aligned pointer.
-* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
-* const u8 *aad, // Additional Authentication Data (AAD)
-* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
-* u8 *auth_tag, // Authenticated Tag output.
-* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
-* // 12 or 8.
-*
-* Assumptions:
-*
-* keys:
-* keys are pre-expanded and aligned to 16 bytes. we are using the
-* first set of 11 keys in the data structure void *aes_ctx
-*
-*
-* iv:
-* 0 1 2 3
-* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | Salt (From the SA) |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | Initialization Vector |
-* | (This is the sequence number from IPSec header) |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | 0x1 |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-*
-*
-*
-* AAD:
-* AAD padded to 128 bits with 0
-* for example, assume AAD is a u32 vector
-*
-* if AAD is 8 bytes:
-* AAD[3] = {A0, A1};
-* padded AAD in xmm register = {A1 A0 0 0}
-*
-* 0 1 2 3
-* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | SPI (A1) |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | 32-bit Sequence Number (A0) |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | 0x0 |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-*
-* AAD Format with 32-bit Sequence Number
-*
-* if AAD is 12 bytes:
-* AAD[3] = {A0, A1, A2};
-* padded AAD in xmm register = {A2 A1 A0 0}
-*
-* 0 1 2 3
-* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | SPI (A2) |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | 64-bit Extended Sequence Number {A1,A0} |
-* | |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | 0x0 |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-*
-* AAD Format with 64-bit Extended Sequence Number
-*
-* poly = x^128 + x^127 + x^126 + x^121 + 1
-***************************************************************************/
-SYM_FUNC_START(aesni_gcm_enc)
- FUNC_SAVE
-
- GCM_INIT %arg6, arg7, arg8, arg9
- GCM_ENC_DEC enc
-
- GCM_COMPLETE arg10, arg11
- FUNC_RESTORE
- RET
-SYM_FUNC_END(aesni_gcm_enc)
-
-/*****************************************************************************
-* void aesni_gcm_init(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
-* struct gcm_context_data *data,
-* // context data
-* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
-* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
-* // concatenated with 0x00000001. 16-byte aligned pointer.
-* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
-* const u8 *aad, // Additional Authentication Data (AAD)
-* u64 aad_len) // Length of AAD in bytes.
-*/
-SYM_FUNC_START(aesni_gcm_init)
- FUNC_SAVE
- GCM_INIT %arg3, %arg4,%arg5, %arg6
- FUNC_RESTORE
- RET
-SYM_FUNC_END(aesni_gcm_init)
-
-/*****************************************************************************
-* void aesni_gcm_enc_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
-* struct gcm_context_data *data,
-* // context data
-* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
-* const u8 *in, // Plaintext input
-* u64 plaintext_len); // Length of data in bytes for encryption.
-*/
-SYM_FUNC_START(aesni_gcm_enc_update)
- FUNC_SAVE
- GCM_ENC_DEC enc
- FUNC_RESTORE
- RET
-SYM_FUNC_END(aesni_gcm_enc_update)
-
-/*****************************************************************************
-* void aesni_gcm_dec_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
-* struct gcm_context_data *data,
-* // context data
-* u8 *out, // Plaintext output. Decrypt in-place is allowed.
-* const u8 *in, // Ciphertext input
-* u64 plaintext_len); // Length of data in bytes for decryption.
-*/
-SYM_FUNC_START(aesni_gcm_dec_update)
- FUNC_SAVE
- GCM_ENC_DEC dec
- FUNC_RESTORE
- RET
-SYM_FUNC_END(aesni_gcm_dec_update)
-
-/*****************************************************************************
-* void aesni_gcm_finalize(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
-* struct gcm_context_data *data,
-* // context data
-* u8 *auth_tag, // Authenticated Tag output.
-* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
-* // 12 or 8.
-*/
-SYM_FUNC_START(aesni_gcm_finalize)
- FUNC_SAVE
- GCM_COMPLETE %arg3 %arg4
- FUNC_RESTORE
- RET
-SYM_FUNC_END(aesni_gcm_finalize)
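
Taken together, init/update/finalize let a caller process scattered buffers.
A hypothetical calling sequence, with prototypes as documented in the comments
above and buffers assumed to be set up as those comments require (the real
callers live in the glue code):

    struct gcm_context_data data;
    u8 tag[16];

    aesni_gcm_init(aes_ctx, &data, iv, hash_subkey, aad, aad_len);
    aesni_gcm_enc_update(aes_ctx, &data, out, in, len1);
    aesni_gcm_enc_update(aes_ctx, &data, out + len1, in + len1,
                         len2);              /* any number of updates */
    aesni_gcm_finalize(aes_ctx, &data, tag, sizeof(tag));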
-
-#endif
-
SYM_FUNC_START_LOCAL(_key_expansion_256a)
pshufd $0b11111111, %xmm1, %xmm1
shufps $0b00010000, %xmm0, %xmm4
@@ -1820,8 +133,8 @@ SYM_FUNC_START_LOCAL(_key_expansion_256b)
SYM_FUNC_END(_key_expansion_256b)
/*
- * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
- * unsigned int key_len)
+ * void aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
+ * unsigned int key_len)
*/
SYM_FUNC_START(aesni_set_key)
FRAME_BEGIN
@@ -1926,7 +239,6 @@ SYM_FUNC_START(aesni_set_key)
sub $0x10, UKEYP
cmp TKEYP, KEYP
jb .Ldec_key_loop
- xor AREG, AREG
#ifndef __x86_64__
popl KEYP
#endif
@@ -2826,28 +1138,24 @@ SYM_FUNC_END(aesni_ctr_enc)
.previous
/*
- * _aesni_gf128mul_x_ble: internal ABI
- * Multiply in GF(2^128) for XTS IVs
+ * _aesni_gf128mul_x_ble: Multiply in GF(2^128) for XTS IVs
* input:
* IV: current IV
* GF128MUL_MASK == mask with 0x87 and 0x01
* output:
* IV: next IV
* changed:
- * CTR: == temporary value
+ * KEY: == temporary value
*/
-#define _aesni_gf128mul_x_ble() \
- pshufd $0x13, IV, KEY; \
- paddq IV, IV; \
- psrad $31, KEY; \
- pand GF128MUL_MASK, KEY; \
- pxor KEY, IV;
+.macro _aesni_gf128mul_x_ble
+ pshufd $0x13, IV, KEY
+ paddq IV, IV
+ psrad $31, KEY
+ pand GF128MUL_MASK, KEY
+ pxor KEY, IV
+.endm
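
The macro multiplies the tweak by x in GF(2^128) using the XTS "ble"
convention. A scalar C sketch, assuming the tweak is held as two little-endian
u64 halves (GF128MUL_MASK supplies the 0x87/0x01 reduction constants):

    /* tweak = tweak * x mod x^128 + x^7 + x^2 + x + 1 */
    static void gf128mul_x_ble_sketch(u64 t[2])   /* t[0] low, t[1] high */
    {
            u64 carry = t[1] >> 63;               /* bit shifted off the top */

            t[1] = (t[1] << 1) | (t[0] >> 63);
            t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
    }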
-/*
- * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
- * const u8 *src, unsigned int len, le128 *iv)
- */
-SYM_FUNC_START(aesni_xts_encrypt)
+.macro _aesni_xts_crypt enc
FRAME_BEGIN
#ifndef __x86_64__
pushl IVP
@@ -2866,35 +1174,46 @@ SYM_FUNC_START(aesni_xts_encrypt)
movups (IVP), IV
mov 480(KEYP), KLEN
+.if !\enc
+ add $240, KEYP
+
+ test $15, LEN
+ jz .Lxts_loop4\@
+ sub $16, LEN
+.endif
-.Lxts_enc_loop4:
+.Lxts_loop4\@:
sub $64, LEN
- jl .Lxts_enc_1x
+ jl .Lxts_1x\@
movdqa IV, STATE1
movdqu 0x00(INP), IN
pxor IN, STATE1
movdqu IV, 0x00(OUTP)
- _aesni_gf128mul_x_ble()
+ _aesni_gf128mul_x_ble
movdqa IV, STATE2
movdqu 0x10(INP), IN
pxor IN, STATE2
movdqu IV, 0x10(OUTP)
- _aesni_gf128mul_x_ble()
+ _aesni_gf128mul_x_ble
movdqa IV, STATE3
movdqu 0x20(INP), IN
pxor IN, STATE3
movdqu IV, 0x20(OUTP)
- _aesni_gf128mul_x_ble()
+ _aesni_gf128mul_x_ble
movdqa IV, STATE4
movdqu 0x30(INP), IN
pxor IN, STATE4
movdqu IV, 0x30(OUTP)
+.if \enc
call _aesni_enc4
+.else
+ call _aesni_dec4
+.endif
movdqu 0x00(OUTP), IN
pxor IN, STATE1
@@ -2912,17 +1231,17 @@ SYM_FUNC_START(aesni_xts_encrypt)
pxor IN, STATE4
movdqu STATE4, 0x30(OUTP)
- _aesni_gf128mul_x_ble()
+ _aesni_gf128mul_x_ble
add $64, INP
add $64, OUTP
test LEN, LEN
- jnz .Lxts_enc_loop4
+ jnz .Lxts_loop4\@
-.Lxts_enc_ret_iv:
+.Lxts_ret_iv\@:
movups IV, (IVP)
-.Lxts_enc_ret:
+.Lxts_ret\@:
#ifndef __x86_64__
popl KLEN
popl KEYP
@@ -2932,201 +1251,60 @@ SYM_FUNC_START(aesni_xts_encrypt)
FRAME_END
RET
-.Lxts_enc_1x:
+.Lxts_1x\@:
add $64, LEN
- jz .Lxts_enc_ret_iv
+ jz .Lxts_ret_iv\@
+.if \enc
sub $16, LEN
- jl .Lxts_enc_cts4
+ jl .Lxts_cts4\@
+.endif
-.Lxts_enc_loop1:
+.Lxts_loop1\@:
movdqu (INP), STATE
+.if \enc
pxor IV, STATE
call _aesni_enc1
- pxor IV, STATE
- _aesni_gf128mul_x_ble()
-
- test LEN, LEN
- jz .Lxts_enc_out
-
+.else
add $16, INP
sub $16, LEN
- jl .Lxts_enc_cts1
-
- movdqu STATE, (OUTP)
- add $16, OUTP
- jmp .Lxts_enc_loop1
-
-.Lxts_enc_out:
- movdqu STATE, (OUTP)
- jmp .Lxts_enc_ret_iv
-
-.Lxts_enc_cts4:
- movdqa STATE4, STATE
- sub $16, OUTP
-
-.Lxts_enc_cts1:
-#ifndef __x86_64__
- lea .Lcts_permute_table, T1
-#else
- lea .Lcts_permute_table(%rip), T1
-#endif
- add LEN, INP /* rewind input pointer */
- add $16, LEN /* # bytes in final block */
- movups (INP), IN1
-
- mov T1, IVP
- add $32, IVP
- add LEN, T1
- sub LEN, IVP
- add OUTP, LEN
-
- movups (T1), %xmm4
- movaps STATE, IN2
- pshufb %xmm4, STATE
- movups STATE, (LEN)
-
- movups (IVP), %xmm0
- pshufb %xmm0, IN1
- pblendvb IN2, IN1
- movaps IN1, STATE
-
+ jl .Lxts_cts1\@
pxor IV, STATE
- call _aesni_enc1
+ call _aesni_dec1
+.endif
pxor IV, STATE
+ _aesni_gf128mul_x_ble
- movups STATE, (OUTP)
- jmp .Lxts_enc_ret
-SYM_FUNC_END(aesni_xts_encrypt)
-
-/*
- * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
- * const u8 *src, unsigned int len, le128 *iv)
- */
-SYM_FUNC_START(aesni_xts_decrypt)
- FRAME_BEGIN
-#ifndef __x86_64__
- pushl IVP
- pushl LEN
- pushl KEYP
- pushl KLEN
- movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
- movl (FRAME_OFFSET+24)(%esp), OUTP # dst
- movl (FRAME_OFFSET+28)(%esp), INP # src
- movl (FRAME_OFFSET+32)(%esp), LEN # len
- movl (FRAME_OFFSET+36)(%esp), IVP # iv
- movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
-#else
- movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
-#endif
- movups (IVP), IV
-
- mov 480(KEYP), KLEN
- add $240, KEYP
-
- test $15, LEN
- jz .Lxts_dec_loop4
- sub $16, LEN
-
-.Lxts_dec_loop4:
- sub $64, LEN
- jl .Lxts_dec_1x
-
- movdqa IV, STATE1
- movdqu 0x00(INP), IN
- pxor IN, STATE1
- movdqu IV, 0x00(OUTP)
-
- _aesni_gf128mul_x_ble()
- movdqa IV, STATE2
- movdqu 0x10(INP), IN
- pxor IN, STATE2
- movdqu IV, 0x10(OUTP)
-
- _aesni_gf128mul_x_ble()
- movdqa IV, STATE3
- movdqu 0x20(INP), IN
- pxor IN, STATE3
- movdqu IV, 0x20(OUTP)
-
- _aesni_gf128mul_x_ble()
- movdqa IV, STATE4
- movdqu 0x30(INP), IN
- pxor IN, STATE4
- movdqu IV, 0x30(OUTP)
-
- call _aesni_dec4
-
- movdqu 0x00(OUTP), IN
- pxor IN, STATE1
- movdqu STATE1, 0x00(OUTP)
-
- movdqu 0x10(OUTP), IN
- pxor IN, STATE2
- movdqu STATE2, 0x10(OUTP)
-
- movdqu 0x20(OUTP), IN
- pxor IN, STATE3
- movdqu STATE3, 0x20(OUTP)
-
- movdqu 0x30(OUTP), IN
- pxor IN, STATE4
- movdqu STATE4, 0x30(OUTP)
-
- _aesni_gf128mul_x_ble()
-
- add $64, INP
- add $64, OUTP
test LEN, LEN
- jnz .Lxts_dec_loop4
-
-.Lxts_dec_ret_iv:
- movups IV, (IVP)
-
-.Lxts_dec_ret:
-#ifndef __x86_64__
- popl KLEN
- popl KEYP
- popl LEN
- popl IVP
-#endif
- FRAME_END
- RET
-
-.Lxts_dec_1x:
- add $64, LEN
- jz .Lxts_dec_ret_iv
-
-.Lxts_dec_loop1:
- movdqu (INP), STATE
+ jz .Lxts_out\@
+.if \enc
add $16, INP
sub $16, LEN
- jl .Lxts_dec_cts1
-
- pxor IV, STATE
- call _aesni_dec1
- pxor IV, STATE
- _aesni_gf128mul_x_ble()
-
- test LEN, LEN
- jz .Lxts_dec_out
+ jl .Lxts_cts1\@
+.endif
movdqu STATE, (OUTP)
add $16, OUTP
- jmp .Lxts_dec_loop1
+ jmp .Lxts_loop1\@
-.Lxts_dec_out:
+.Lxts_out\@:
movdqu STATE, (OUTP)
- jmp .Lxts_dec_ret_iv
+ jmp .Lxts_ret_iv\@
-.Lxts_dec_cts1:
+.if \enc
+.Lxts_cts4\@:
+ movdqa STATE4, STATE
+ sub $16, OUTP
+.Lxts_cts1\@:
+.else
+.Lxts_cts1\@:
movdqa IV, STATE4
- _aesni_gf128mul_x_ble()
+ _aesni_gf128mul_x_ble
pxor IV, STATE
call _aesni_dec1
pxor IV, STATE
-
+.endif
#ifndef __x86_64__
lea .Lcts_permute_table, T1
#else
@@ -3152,10 +1330,32 @@ SYM_FUNC_START(aesni_xts_decrypt)
pblendvb IN2, IN1
movaps IN1, STATE
+.if \enc
+ pxor IV, STATE
+ call _aesni_enc1
+ pxor IV, STATE
+.else
pxor STATE4, STATE
call _aesni_dec1
pxor STATE4, STATE
+.endif
movups STATE, (OUTP)
- jmp .Lxts_dec_ret
-SYM_FUNC_END(aesni_xts_decrypt)
+ jmp .Lxts_ret\@
+.endm
+
+/*
+ * void aesni_xts_enc(const struct crypto_aes_ctx *ctx, u8 *dst,
+ * const u8 *src, unsigned int len, le128 *iv)
+ */
+SYM_FUNC_START(aesni_xts_enc)
+ _aesni_xts_crypt 1
+SYM_FUNC_END(aesni_xts_enc)
+
+/*
+ * void aesni_xts_dec(const struct crypto_aes_ctx *ctx, u8 *dst,
+ * const u8 *src, unsigned int len, le128 *iv)
+ */
+SYM_FUNC_START(aesni_xts_dec)
+ _aesni_xts_crypt 0
+SYM_FUNC_END(aesni_xts_dec)