diff options
Diffstat (limited to 'arch/x86/crypto/aesni-intel_asm.S')
-rw-r--r-- | arch/x86/crypto/aesni-intel_asm.S | 1970 |
1 files changed, 85 insertions, 1885 deletions
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 411d8c83e88a..eb153eff9331 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -10,16 +10,7 @@ * Vinodh Gopal <vinodh.gopal@intel.com> * Kahraman Akdemir * - * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD - * interface for 64-bit kernels. - * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com) - * Aidan O'Mahony (aidan.o.mahony@intel.com) - * Adrian Hoban <adrian.hoban@intel.com> - * James Guilford (james.guilford@intel.com) - * Gabriele Paoloni <gabriele.paoloni@intel.com> - * Tadeusz Struk (tadeusz.struk@intel.com) - * Wajdi Feghali (wajdi.k.feghali@intel.com) - * Copyright (c) 2010, Intel Corporation. + * Copyright (c) 2010, Intel Corporation. * * Ported x86_64 version to x86: * Author: Mathias Krause <minipli@googlemail.com> @@ -27,103 +18,6 @@ #include <linux/linkage.h> #include <asm/frame.h> -#include <asm/nospec-branch.h> - -/* - * The following macros are used to move an (un)aligned 16 byte value to/from - * an XMM register. This can done for either FP or integer values, for FP use - * movaps (move aligned packed single) or integer use movdqa (move double quad - * aligned). It doesn't make a performance difference which instruction is used - * since Nehalem (original Core i7) was released. However, the movaps is a byte - * shorter, so that is the one we'll use for now. (same for unaligned). - */ -#define MOVADQ movaps -#define MOVUDQ movups - -#ifdef __x86_64__ - -# constants in mergeable sections, linker can reorder and merge -.section .rodata.cst16.POLY, "aM", @progbits, 16 -.align 16 -POLY: .octa 0xC2000000000000000000000000000001 -.section .rodata.cst16.TWOONE, "aM", @progbits, 16 -.align 16 -TWOONE: .octa 0x00000001000000000000000000000001 - -.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16 -.align 16 -SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F -.section .rodata.cst16.MASK1, "aM", @progbits, 16 -.align 16 -MASK1: .octa 0x0000000000000000ffffffffffffffff -.section .rodata.cst16.MASK2, "aM", @progbits, 16 -.align 16 -MASK2: .octa 0xffffffffffffffff0000000000000000 -.section .rodata.cst16.ONE, "aM", @progbits, 16 -.align 16 -ONE: .octa 0x00000000000000000000000000000001 -.section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16 -.align 16 -F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0 -.section .rodata.cst16.dec, "aM", @progbits, 16 -.align 16 -dec: .octa 0x1 -.section .rodata.cst16.enc, "aM", @progbits, 16 -.align 16 -enc: .octa 0x2 - -# order of these constants should not change. -# more specifically, ALL_F should follow SHIFT_MASK, -# and zero should follow ALL_F -.section .rodata, "a", @progbits -.align 16 -SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 -ALL_F: .octa 0xffffffffffffffffffffffffffffffff - .octa 0x00000000000000000000000000000000 - -.text - - -#define STACK_OFFSET 8*3 - -#define AadHash 16*0 -#define AadLen 16*1 -#define InLen (16*1)+8 -#define PBlockEncKey 16*2 -#define OrigIV 16*3 -#define CurCount 16*4 -#define PBlockLen 16*5 -#define HashKey 16*6 // store HashKey <<1 mod poly here -#define HashKey_2 16*7 // store HashKey^2 <<1 mod poly here -#define HashKey_3 16*8 // store HashKey^3 <<1 mod poly here -#define HashKey_4 16*9 // store HashKey^4 <<1 mod poly here -#define HashKey_k 16*10 // store XOR of High 64 bits and Low 64 - // bits of HashKey <<1 mod poly here - //(for Karatsuba purposes) -#define HashKey_2_k 16*11 // store XOR of High 64 bits and Low 64 - // bits of HashKey^2 <<1 mod poly here - // (for Karatsuba purposes) -#define HashKey_3_k 16*12 // store XOR of High 64 bits and Low 64 - // bits of HashKey^3 <<1 mod poly here - // (for Karatsuba purposes) -#define HashKey_4_k 16*13 // store XOR of High 64 bits and Low 64 - // bits of HashKey^4 <<1 mod poly here - // (for Karatsuba purposes) - -#define arg1 rdi -#define arg2 rsi -#define arg3 rdx -#define arg4 rcx -#define arg5 r8 -#define arg6 r9 -#define arg7 STACK_OFFSET+8(%rsp) -#define arg8 STACK_OFFSET+16(%rsp) -#define arg9 STACK_OFFSET+24(%rsp) -#define arg10 STACK_OFFSET+32(%rsp) -#define arg11 STACK_OFFSET+40(%rsp) -#define keysize 2*15*16(%arg1) -#endif - #define STATE1 %xmm0 #define STATE2 %xmm4 @@ -170,1587 +64,6 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff #define TKEYP T1 #endif -.macro FUNC_SAVE - push %r12 - push %r13 - push %r14 -# -# states of %xmm registers %xmm6:%xmm15 not saved -# all %xmm registers are clobbered -# -.endm - - -.macro FUNC_RESTORE - pop %r14 - pop %r13 - pop %r12 -.endm - -# Precompute hashkeys. -# Input: Hash subkey. -# Output: HashKeys stored in gcm_context_data. Only needs to be called -# once per key. -# clobbers r12, and tmp xmm registers. -.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7 - mov \SUBKEY, %r12 - movdqu (%r12), \TMP3 - movdqa SHUF_MASK(%rip), \TMP2 - pshufb \TMP2, \TMP3 - - # precompute HashKey<<1 mod poly from the HashKey (required for GHASH) - - movdqa \TMP3, \TMP2 - psllq $1, \TMP3 - psrlq $63, \TMP2 - movdqa \TMP2, \TMP1 - pslldq $8, \TMP2 - psrldq $8, \TMP1 - por \TMP2, \TMP3 - - # reduce HashKey<<1 - - pshufd $0x24, \TMP1, \TMP2 - pcmpeqd TWOONE(%rip), \TMP2 - pand POLY(%rip), \TMP2 - pxor \TMP2, \TMP3 - movdqu \TMP3, HashKey(%arg2) - - movdqa \TMP3, \TMP5 - pshufd $78, \TMP3, \TMP1 - pxor \TMP3, \TMP1 - movdqu \TMP1, HashKey_k(%arg2) - - GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 -# TMP5 = HashKey^2<<1 (mod poly) - movdqu \TMP5, HashKey_2(%arg2) -# HashKey_2 = HashKey^2<<1 (mod poly) - pshufd $78, \TMP5, \TMP1 - pxor \TMP5, \TMP1 - movdqu \TMP1, HashKey_2_k(%arg2) - - GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 -# TMP5 = HashKey^3<<1 (mod poly) - movdqu \TMP5, HashKey_3(%arg2) - pshufd $78, \TMP5, \TMP1 - pxor \TMP5, \TMP1 - movdqu \TMP1, HashKey_3_k(%arg2) - - GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 -# TMP5 = HashKey^3<<1 (mod poly) - movdqu \TMP5, HashKey_4(%arg2) - pshufd $78, \TMP5, \TMP1 - pxor \TMP5, \TMP1 - movdqu \TMP1, HashKey_4_k(%arg2) -.endm - -# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding. -# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13 -.macro GCM_INIT Iv SUBKEY AAD AADLEN - mov \AADLEN, %r11 - mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length - xor %r11d, %r11d - mov %r11, InLen(%arg2) # ctx_data.in_length = 0 - mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0 - mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0 - mov \Iv, %rax - movdqu (%rax), %xmm0 - movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv - - movdqa SHUF_MASK(%rip), %xmm2 - pshufb %xmm2, %xmm0 - movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv - - PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7 - movdqu HashKey(%arg2), %xmm13 - - CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \ - %xmm4, %xmm5, %xmm6 -.endm - -# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context -# struct has been initialized by GCM_INIT. -# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK -# Clobbers rax, r10-r13, and xmm0-xmm15 -.macro GCM_ENC_DEC operation - movdqu AadHash(%arg2), %xmm8 - movdqu HashKey(%arg2), %xmm13 - add %arg5, InLen(%arg2) - - xor %r11d, %r11d # initialise the data pointer offset as zero - PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation - - sub %r11, %arg5 # sub partial block data used - mov %arg5, %r13 # save the number of bytes - - and $-16, %r13 # %r13 = %r13 - (%r13 mod 16) - mov %r13, %r12 - # Encrypt/Decrypt first few blocks - - and $(3<<4), %r12 - jz .L_initial_num_blocks_is_0_\@ - cmp $(2<<4), %r12 - jb .L_initial_num_blocks_is_1_\@ - je .L_initial_num_blocks_is_2_\@ -.L_initial_num_blocks_is_3_\@: - INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ -%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation - sub $48, %r13 - jmp .L_initial_blocks_\@ -.L_initial_num_blocks_is_2_\@: - INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ -%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation - sub $32, %r13 - jmp .L_initial_blocks_\@ -.L_initial_num_blocks_is_1_\@: - INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ -%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation - sub $16, %r13 - jmp .L_initial_blocks_\@ -.L_initial_num_blocks_is_0_\@: - INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ -%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation -.L_initial_blocks_\@: - - # Main loop - Encrypt/Decrypt remaining blocks - - test %r13, %r13 - je .L_zero_cipher_left_\@ - sub $64, %r13 - je .L_four_cipher_left_\@ -.L_crypt_by_4_\@: - GHASH_4_ENCRYPT_4_PARALLEL_\operation %xmm9, %xmm10, %xmm11, %xmm12, \ - %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \ - %xmm7, %xmm8, enc - add $64, %r11 - sub $64, %r13 - jne .L_crypt_by_4_\@ -.L_four_cipher_left_\@: - GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ -%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 -.L_zero_cipher_left_\@: - movdqu %xmm8, AadHash(%arg2) - movdqu %xmm0, CurCount(%arg2) - - mov %arg5, %r13 - and $15, %r13 # %r13 = arg5 (mod 16) - je .L_multiple_of_16_bytes_\@ - - mov %r13, PBlockLen(%arg2) - - # Handle the last <16 Byte block separately - paddd ONE(%rip), %xmm0 # INCR CNT to get Yn - movdqu %xmm0, CurCount(%arg2) - movdqa SHUF_MASK(%rip), %xmm10 - pshufb %xmm10, %xmm0 - - ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn) - movdqu %xmm0, PBlockEncKey(%arg2) - - cmp $16, %arg5 - jge .L_large_enough_update_\@ - - lea (%arg4,%r11,1), %r10 - mov %r13, %r12 - READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1 - jmp .L_data_read_\@ - -.L_large_enough_update_\@: - sub $16, %r11 - add %r13, %r11 - - # receive the last <16 Byte block - movdqu (%arg4, %r11, 1), %xmm1 - - sub %r13, %r11 - add $16, %r11 - - lea SHIFT_MASK+16(%rip), %r12 - # adjust the shuffle mask pointer to be able to shift 16-r13 bytes - # (r13 is the number of bytes in plaintext mod 16) - sub %r13, %r12 - # get the appropriate shuffle mask - movdqu (%r12), %xmm2 - # shift right 16-r13 bytes - pshufb %xmm2, %xmm1 - -.L_data_read_\@: - lea ALL_F+16(%rip), %r12 - sub %r13, %r12 - -.ifc \operation, dec - movdqa %xmm1, %xmm2 -.endif - pxor %xmm1, %xmm0 # XOR Encrypt(K, Yn) - movdqu (%r12), %xmm1 - # get the appropriate mask to mask out top 16-r13 bytes of xmm0 - pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0 -.ifc \operation, dec - pand %xmm1, %xmm2 - movdqa SHUF_MASK(%rip), %xmm10 - pshufb %xmm10 ,%xmm2 - - pxor %xmm2, %xmm8 -.else - movdqa SHUF_MASK(%rip), %xmm10 - pshufb %xmm10,%xmm0 - - pxor %xmm0, %xmm8 -.endif - - movdqu %xmm8, AadHash(%arg2) -.ifc \operation, enc - # GHASH computation for the last <16 byte block - movdqa SHUF_MASK(%rip), %xmm10 - # shuffle xmm0 back to output as ciphertext - pshufb %xmm10, %xmm0 -.endif - - # Output %r13 bytes - movq %xmm0, %rax - cmp $8, %r13 - jle .L_less_than_8_bytes_left_\@ - mov %rax, (%arg3 , %r11, 1) - add $8, %r11 - psrldq $8, %xmm0 - movq %xmm0, %rax - sub $8, %r13 -.L_less_than_8_bytes_left_\@: - mov %al, (%arg3, %r11, 1) - add $1, %r11 - shr $8, %rax - sub $1, %r13 - jne .L_less_than_8_bytes_left_\@ -.L_multiple_of_16_bytes_\@: -.endm - -# GCM_COMPLETE Finishes update of tag of last partial block -# Output: Authorization Tag (AUTH_TAG) -# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15 -.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN - movdqu AadHash(%arg2), %xmm8 - movdqu HashKey(%arg2), %xmm13 - - mov PBlockLen(%arg2), %r12 - - test %r12, %r12 - je .L_partial_done\@ - - GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 - -.L_partial_done\@: - mov AadLen(%arg2), %r12 # %r13 = aadLen (number of bytes) - shl $3, %r12 # convert into number of bits - movd %r12d, %xmm15 # len(A) in %xmm15 - mov InLen(%arg2), %r12 - shl $3, %r12 # len(C) in bits (*128) - movq %r12, %xmm1 - - pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 - pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) - pxor %xmm15, %xmm8 - GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 - # final GHASH computation - movdqa SHUF_MASK(%rip), %xmm10 - pshufb %xmm10, %xmm8 - - movdqu OrigIV(%arg2), %xmm0 # %xmm0 = Y0 - ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0) - pxor %xmm8, %xmm0 -.L_return_T_\@: - mov \AUTHTAG, %r10 # %r10 = authTag - mov \AUTHTAGLEN, %r11 # %r11 = auth_tag_len - cmp $16, %r11 - je .L_T_16_\@ - cmp $8, %r11 - jl .L_T_4_\@ -.L_T_8_\@: - movq %xmm0, %rax - mov %rax, (%r10) - add $8, %r10 - sub $8, %r11 - psrldq $8, %xmm0 - test %r11, %r11 - je .L_return_T_done_\@ -.L_T_4_\@: - movd %xmm0, %eax - mov %eax, (%r10) - add $4, %r10 - sub $4, %r11 - psrldq $4, %xmm0 - test %r11, %r11 - je .L_return_T_done_\@ -.L_T_123_\@: - movd %xmm0, %eax - cmp $2, %r11 - jl .L_T_1_\@ - mov %ax, (%r10) - cmp $2, %r11 - je .L_return_T_done_\@ - add $2, %r10 - sar $16, %eax -.L_T_1_\@: - mov %al, (%r10) - jmp .L_return_T_done_\@ -.L_T_16_\@: - movdqu %xmm0, (%r10) -.L_return_T_done_\@: -.endm - -#ifdef __x86_64__ -/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) -* -* -* Input: A and B (128-bits each, bit-reflected) -* Output: C = A*B*x mod poly, (i.e. >>1 ) -* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input -* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. -* -*/ -.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5 - movdqa \GH, \TMP1 - pshufd $78, \GH, \TMP2 - pshufd $78, \HK, \TMP3 - pxor \GH, \TMP2 # TMP2 = a1+a0 - pxor \HK, \TMP3 # TMP3 = b1+b0 - pclmulqdq $0x11, \HK, \TMP1 # TMP1 = a1*b1 - pclmulqdq $0x00, \HK, \GH # GH = a0*b0 - pclmulqdq $0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0) - pxor \GH, \TMP2 - pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0) - movdqa \TMP2, \TMP3 - pslldq $8, \TMP3 # left shift TMP3 2 DWs - psrldq $8, \TMP2 # right shift TMP2 2 DWs - pxor \TMP3, \GH - pxor \TMP2, \TMP1 # TMP2:GH holds the result of GH*HK - - # first phase of the reduction - - movdqa \GH, \TMP2 - movdqa \GH, \TMP3 - movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4 - # in in order to perform - # independent shifts - pslld $31, \TMP2 # packed right shift <<31 - pslld $30, \TMP3 # packed right shift <<30 - pslld $25, \TMP4 # packed right shift <<25 - pxor \TMP3, \TMP2 # xor the shifted versions - pxor \TMP4, \TMP2 - movdqa \TMP2, \TMP5 - psrldq $4, \TMP5 # right shift TMP5 1 DW - pslldq $12, \TMP2 # left shift TMP2 3 DWs - pxor \TMP2, \GH - - # second phase of the reduction - - movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4 - # in in order to perform - # independent shifts - movdqa \GH,\TMP3 - movdqa \GH,\TMP4 - psrld $1,\TMP2 # packed left shift >>1 - psrld $2,\TMP3 # packed left shift >>2 - psrld $7,\TMP4 # packed left shift >>7 - pxor \TMP3,\TMP2 # xor the shifted versions - pxor \TMP4,\TMP2 - pxor \TMP5, \TMP2 - pxor \TMP2, \GH - pxor \TMP1, \GH # result is in TMP1 -.endm - -# Reads DLEN bytes starting at DPTR and stores in XMMDst -# where 0 < DLEN < 16 -# Clobbers %rax, DLEN and XMM1 -.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst - cmp $8, \DLEN - jl .L_read_lt8_\@ - mov (\DPTR), %rax - movq %rax, \XMMDst - sub $8, \DLEN - jz .L_done_read_partial_block_\@ - xor %eax, %eax -.L_read_next_byte_\@: - shl $8, %rax - mov 7(\DPTR, \DLEN, 1), %al - dec \DLEN - jnz .L_read_next_byte_\@ - movq %rax, \XMM1 - pslldq $8, \XMM1 - por \XMM1, \XMMDst - jmp .L_done_read_partial_block_\@ -.L_read_lt8_\@: - xor %eax, %eax -.L_read_next_byte_lt8_\@: - shl $8, %rax - mov -1(\DPTR, \DLEN, 1), %al - dec \DLEN - jnz .L_read_next_byte_lt8_\@ - movq %rax, \XMMDst -.L_done_read_partial_block_\@: -.endm - -# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted. -# clobbers r10-11, xmm14 -.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \ - TMP6 TMP7 - MOVADQ SHUF_MASK(%rip), %xmm14 - mov \AAD, %r10 # %r10 = AAD - mov \AADLEN, %r11 # %r11 = aadLen - pxor \TMP7, \TMP7 - pxor \TMP6, \TMP6 - - cmp $16, %r11 - jl .L_get_AAD_rest\@ -.L_get_AAD_blocks\@: - movdqu (%r10), \TMP7 - pshufb %xmm14, \TMP7 # byte-reflect the AAD data - pxor \TMP7, \TMP6 - GHASH_MUL \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5 - add $16, %r10 - sub $16, %r11 - cmp $16, %r11 - jge .L_get_AAD_blocks\@ - - movdqu \TMP6, \TMP7 - - /* read the last <16B of AAD */ -.L_get_AAD_rest\@: - test %r11, %r11 - je .L_get_AAD_done\@ - - READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7 - pshufb %xmm14, \TMP7 # byte-reflect the AAD data - pxor \TMP6, \TMP7 - GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5 - movdqu \TMP7, \TMP6 - -.L_get_AAD_done\@: - movdqu \TMP6, AadHash(%arg2) -.endm - -# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks -# between update calls. -# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK -# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context -# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13 -.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \ - AAD_HASH operation - mov PBlockLen(%arg2), %r13 - test %r13, %r13 - je .L_partial_block_done_\@ # Leave Macro if no partial blocks - # Read in input data without over reading - cmp $16, \PLAIN_CYPH_LEN - jl .L_fewer_than_16_bytes_\@ - movups (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm - jmp .L_data_read_\@ - -.L_fewer_than_16_bytes_\@: - lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10 - mov \PLAIN_CYPH_LEN, %r12 - READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1 - - mov PBlockLen(%arg2), %r13 - -.L_data_read_\@: # Finished reading in data - - movdqu PBlockEncKey(%arg2), %xmm9 - movdqu HashKey(%arg2), %xmm13 - - lea SHIFT_MASK(%rip), %r12 - - # adjust the shuffle mask pointer to be able to shift r13 bytes - # r16-r13 is the number of bytes in plaintext mod 16) - add %r13, %r12 - movdqu (%r12), %xmm2 # get the appropriate shuffle mask - pshufb %xmm2, %xmm9 # shift right r13 bytes - -.ifc \operation, dec - movdqa %xmm1, %xmm3 - pxor %xmm1, %xmm9 # Ciphertext XOR E(K, Yn) - - mov \PLAIN_CYPH_LEN, %r10 - add %r13, %r10 - # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling - sub $16, %r10 - # Determine if partial block is not being filled and - # shift mask accordingly - jge .L_no_extra_mask_1_\@ - sub %r10, %r12 -.L_no_extra_mask_1_\@: - - movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 - # get the appropriate mask to mask out bottom r13 bytes of xmm9 - pand %xmm1, %xmm9 # mask out bottom r13 bytes of xmm9 - - pand %xmm1, %xmm3 - movdqa SHUF_MASK(%rip), %xmm10 - pshufb %xmm10, %xmm3 - pshufb %xmm2, %xmm3 - pxor %xmm3, \AAD_HASH - - test %r10, %r10 - jl .L_partial_incomplete_1_\@ - - # GHASH computation for the last <16 Byte block - GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 - xor %eax, %eax - - mov %rax, PBlockLen(%arg2) - jmp .L_dec_done_\@ -.L_partial_incomplete_1_\@: - add \PLAIN_CYPH_LEN, PBlockLen(%arg2) -.L_dec_done_\@: - movdqu \AAD_HASH, AadHash(%arg2) -.else - pxor %xmm1, %xmm9 # Plaintext XOR E(K, Yn) - - mov \PLAIN_CYPH_LEN, %r10 - add %r13, %r10 - # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling - sub $16, %r10 - # Determine if partial block is not being filled and - # shift mask accordingly - jge .L_no_extra_mask_2_\@ - sub %r10, %r12 -.L_no_extra_mask_2_\@: - - movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 - # get the appropriate mask to mask out bottom r13 bytes of xmm9 - pand %xmm1, %xmm9 - - movdqa SHUF_MASK(%rip), %xmm1 - pshufb %xmm1, %xmm9 - pshufb %xmm2, %xmm9 - pxor %xmm9, \AAD_HASH - - test %r10, %r10 - jl .L_partial_incomplete_2_\@ - - # GHASH computation for the last <16 Byte block - GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 - xor %eax, %eax - - mov %rax, PBlockLen(%arg2) - jmp .L_encode_done_\@ -.L_partial_incomplete_2_\@: - add \PLAIN_CYPH_LEN, PBlockLen(%arg2) -.L_encode_done_\@: - movdqu \AAD_HASH, AadHash(%arg2) - - movdqa SHUF_MASK(%rip), %xmm10 - # shuffle xmm9 back to output as ciphertext - pshufb %xmm10, %xmm9 - pshufb %xmm2, %xmm9 -.endif - # output encrypted Bytes - test %r10, %r10 - jl .L_partial_fill_\@ - mov %r13, %r12 - mov $16, %r13 - # Set r13 to be the number of bytes to write out - sub %r12, %r13 - jmp .L_count_set_\@ -.L_partial_fill_\@: - mov \PLAIN_CYPH_LEN, %r13 -.L_count_set_\@: - movdqa %xmm9, %xmm0 - movq %xmm0, %rax - cmp $8, %r13 - jle .L_less_than_8_bytes_left_\@ - - mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) - add $8, \DATA_OFFSET - psrldq $8, %xmm0 - movq %xmm0, %rax - sub $8, %r13 -.L_less_than_8_bytes_left_\@: - movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) - add $1, \DATA_OFFSET - shr $8, %rax - sub $1, %r13 - jne .L_less_than_8_bytes_left_\@ -.L_partial_block_done_\@: -.endm # PARTIAL_BLOCK - -/* -* if a = number of total plaintext bytes -* b = floor(a/16) -* num_initial_blocks = b mod 4 -* encrypt the initial num_initial_blocks blocks and apply ghash on -* the ciphertext -* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers -* are clobbered -* arg1, %arg2, %arg3 are used as a pointer only, not modified -*/ - - -.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ - XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation - MOVADQ SHUF_MASK(%rip), %xmm14 - - movdqu AadHash(%arg2), %xmm\i # XMM0 = Y0 - - # start AES for num_initial_blocks blocks - - movdqu CurCount(%arg2), \XMM0 # XMM0 = Y0 - -.if (\i == 5) || (\i == 6) || (\i == 7) - - MOVADQ ONE(%RIP),\TMP1 - MOVADQ 0(%arg1),\TMP2 -.irpc index, \i_seq - paddd \TMP1, \XMM0 # INCR Y0 -.ifc \operation, dec - movdqa \XMM0, %xmm\index -.else - MOVADQ \XMM0, %xmm\index -.endif - pshufb %xmm14, %xmm\index # perform a 16 byte swap - pxor \TMP2, %xmm\index -.endr - lea 0x10(%arg1),%r10 - mov keysize,%eax - shr $2,%eax # 128->4, 192->6, 256->8 - add $5,%eax # 128->9, 192->11, 256->13 - -.Laes_loop_initial_\@: - MOVADQ (%r10),\TMP1 -.irpc index, \i_seq - aesenc \TMP1, %xmm\index -.endr - add $16,%r10 - sub $1,%eax - jnz .Laes_loop_initial_\@ - - MOVADQ (%r10), \TMP1 -.irpc index, \i_seq - aesenclast \TMP1, %xmm\index # Last Round -.endr -.irpc index, \i_seq - movdqu (%arg4 , %r11, 1), \TMP1 - pxor \TMP1, %xmm\index - movdqu %xmm\index, (%arg3 , %r11, 1) - # write back plaintext/ciphertext for num_initial_blocks - add $16, %r11 - -.ifc \operation, dec - movdqa \TMP1, %xmm\index -.endif - pshufb %xmm14, %xmm\index - - # prepare plaintext/ciphertext for GHASH computation -.endr -.endif - - # apply GHASH on num_initial_blocks blocks - -.if \i == 5 - pxor %xmm5, %xmm6 - GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 - pxor %xmm6, %xmm7 - GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 - pxor %xmm7, %xmm8 - GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 -.elseif \i == 6 - pxor %xmm6, %xmm7 - GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 - pxor %xmm7, %xmm8 - GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 -.elseif \i == 7 - pxor %xmm7, %xmm8 - GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 -.endif - cmp $64, %r13 - jl .L_initial_blocks_done\@ - # no need for precomputed values -/* -* -* Precomputations for HashKey parallel with encryption of first 4 blocks. -* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i -*/ - MOVADQ ONE(%RIP),\TMP1 - paddd \TMP1, \XMM0 # INCR Y0 - MOVADQ \XMM0, \XMM1 - pshufb %xmm14, \XMM1 # perform a 16 byte swap - - paddd \TMP1, \XMM0 # INCR Y0 - MOVADQ \XMM0, \XMM2 - pshufb %xmm14, \XMM2 # perform a 16 byte swap - - paddd \TMP1, \XMM0 # INCR Y0 - MOVADQ \XMM0, \XMM3 - pshufb %xmm14, \XMM3 # perform a 16 byte swap - - paddd \TMP1, \XMM0 # INCR Y0 - MOVADQ \XMM0, \XMM4 - pshufb %xmm14, \XMM4 # perform a 16 byte swap - - MOVADQ 0(%arg1),\TMP1 - pxor \TMP1, \XMM1 - pxor \TMP1, \XMM2 - pxor \TMP1, \XMM3 - pxor \TMP1, \XMM4 -.irpc index, 1234 # do 4 rounds - movaps 0x10*\index(%arg1), \TMP1 - aesenc \TMP1, \XMM1 - aesenc \TMP1, \XMM2 - aesenc \TMP1, \XMM3 - aesenc \TMP1, \XMM4 -.endr -.irpc index, 56789 # do next 5 rounds - movaps 0x10*\index(%arg1), \TMP1 - aesenc \TMP1, \XMM1 - aesenc \TMP1, \XMM2 - aesenc \TMP1, \XMM3 - aesenc \TMP1, \XMM4 -.endr - lea 0xa0(%arg1),%r10 - mov keysize,%eax - shr $2,%eax # 128->4, 192->6, 256->8 - sub $4,%eax # 128->0, 192->2, 256->4 - jz .Laes_loop_pre_done\@ - -.Laes_loop_pre_\@: - MOVADQ (%r10),\TMP2 -.irpc index, 1234 - aesenc \TMP2, %xmm\index -.endr - add $16,%r10 - sub $1,%eax - jnz .Laes_loop_pre_\@ - -.Laes_loop_pre_done\@: - MOVADQ (%r10), \TMP2 - aesenclast \TMP2, \XMM1 - aesenclast \TMP2, \XMM2 - aesenclast \TMP2, \XMM3 - aesenclast \TMP2, \XMM4 - movdqu 16*0(%arg4 , %r11 , 1), \TMP1 - pxor \TMP1, \XMM1 -.ifc \operation, dec - movdqu \XMM1, 16*0(%arg3 , %r11 , 1) - movdqa \TMP1, \XMM1 -.endif - movdqu 16*1(%arg4 , %r11 , 1), \TMP1 - pxor \TMP1, \XMM2 -.ifc \operation, dec - movdqu \XMM2, 16*1(%arg3 , %r11 , 1) - movdqa \TMP1, \XMM2 -.endif - movdqu 16*2(%arg4 , %r11 , 1), \TMP1 - pxor \TMP1, \XMM3 -.ifc \operation, dec - movdqu \XMM3, 16*2(%arg3 , %r11 , 1) - movdqa \TMP1, \XMM3 -.endif - movdqu 16*3(%arg4 , %r11 , 1), \TMP1 - pxor \TMP1, \XMM4 -.ifc \operation, dec - movdqu \XMM4, 16*3(%arg3 , %r11 , 1) - movdqa \TMP1, \XMM4 -.else - movdqu \XMM1, 16*0(%arg3 , %r11 , 1) - movdqu \XMM2, 16*1(%arg3 , %r11 , 1) - movdqu \XMM3, 16*2(%arg3 , %r11 , 1) - movdqu \XMM4, 16*3(%arg3 , %r11 , 1) -.endif - - add $64, %r11 - pshufb %xmm14, \XMM1 # perform a 16 byte swap - pxor \XMMDst, \XMM1 -# combine GHASHed value with the corresponding ciphertext - pshufb %xmm14, \XMM2 # perform a 16 byte swap - pshufb %xmm14, \XMM3 # perform a 16 byte swap - pshufb %xmm14, \XMM4 # perform a 16 byte swap - -.L_initial_blocks_done\@: - -.endm - -/* -* encrypt 4 blocks at a time -* ghash the 4 previously encrypted ciphertext blocks -* arg1, %arg3, %arg4 are used as pointers only, not modified -* %r11 is the data offset value -*/ -.macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \ -TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation - - movdqa \XMM1, \XMM5 - movdqa \XMM2, \XMM6 - movdqa \XMM3, \XMM7 - movdqa \XMM4, \XMM8 - - movdqa SHUF_MASK(%rip), %xmm15 - # multiply TMP5 * HashKey using karatsuba - - movdqa \XMM5, \TMP4 - pshufd $78, \XMM5, \TMP6 - pxor \XMM5, \TMP6 - paddd ONE(%rip), \XMM0 # INCR CNT - movdqu HashKey_4(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1 - movdqa \XMM0, \XMM1 - paddd ONE(%rip), \XMM0 # INCR CNT - movdqa \XMM0, \XMM2 - paddd ONE(%rip), \XMM0 # INCR CNT - movdqa \XMM0, \XMM3 - paddd ONE(%rip), \XMM0 # INCR CNT - movdqa \XMM0, \XMM4 - pshufb %xmm15, \XMM1 # perform a 16 byte swap - pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0 - pshufb %xmm15, \XMM2 # perform a 16 byte swap - pshufb %xmm15, \XMM3 # perform a 16 byte swap - pshufb %xmm15, \XMM4 # perform a 16 byte swap - - pxor (%arg1), \XMM1 - pxor (%arg1), \XMM2 - pxor (%arg1), \XMM3 - pxor (%arg1), \XMM4 - movdqu HashKey_4_k(%arg2), \TMP5 - pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) - movaps 0x10(%arg1), \TMP1 - aesenc \TMP1, \XMM1 # Round 1 - aesenc \TMP1, \XMM2 - aesenc \TMP1, \XMM3 - aesenc \TMP1, \XMM4 - movaps 0x20(%arg1), \TMP1 - aesenc \TMP1, \XMM1 # Round 2 - aesenc \TMP1, \XMM2 - aesenc \TMP1, \XMM3 - aesenc \TMP1, \XMM4 - movdqa \XMM6, \TMP1 - pshufd $78, \XMM6, \TMP2 - pxor \XMM6, \TMP2 - movdqu HashKey_3(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 - movaps 0x30(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 3 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0 - movaps 0x40(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 4 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - movdqu HashKey_3_k(%arg2), \TMP5 - pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - movaps 0x50(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 5 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pxor \TMP1, \TMP4 -# accumulate the results in TMP4:XMM5, TMP6 holds the middle part - pxor \XMM6, \XMM5 - pxor \TMP2, \TMP6 - movdqa \XMM7, \TMP1 - pshufd $78, \XMM7, \TMP2 - pxor \XMM7, \TMP2 - movdqu HashKey_2(%arg2), \TMP5 - - # Multiply TMP5 * HashKey using karatsuba - - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - movaps 0x60(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 6 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0 - movaps 0x70(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 7 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - movdqu HashKey_2_k(%arg2), \TMP5 - pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - movaps 0x80(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 8 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pxor \TMP1, \TMP4 -# accumulate the results in TMP4:XMM5, TMP6 holds the middle part - pxor \XMM7, \XMM5 - pxor \TMP2, \TMP6 - - # Multiply XMM8 * HashKey - # XMM8 and TMP5 hold the values for the two operands - - movdqa \XMM8, \TMP1 - pshufd $78, \XMM8, \TMP2 - pxor \XMM8, \TMP2 - movdqu HashKey(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - movaps 0x90(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 9 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0 - lea 0xa0(%arg1),%r10 - mov keysize,%eax - shr $2,%eax # 128->4, 192->6, 256->8 - sub $4,%eax # 128->0, 192->2, 256->4 - jz .Laes_loop_par_enc_done\@ - -.Laes_loop_par_enc\@: - MOVADQ (%r10),\TMP3 -.irpc index, 1234 - aesenc \TMP3, %xmm\index -.endr - add $16,%r10 - sub $1,%eax - jnz .Laes_loop_par_enc\@ - -.Laes_loop_par_enc_done\@: - MOVADQ (%r10), \TMP3 - aesenclast \TMP3, \XMM1 # Round 10 - aesenclast \TMP3, \XMM2 - aesenclast \TMP3, \XMM3 - aesenclast \TMP3, \XMM4 - movdqu HashKey_k(%arg2), \TMP5 - pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - movdqu (%arg4,%r11,1), \TMP3 - pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK - movdqu 16(%arg4,%r11,1), \TMP3 - pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK - movdqu 32(%arg4,%r11,1), \TMP3 - pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK - movdqu 48(%arg4,%r11,1), \TMP3 - pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK - movdqu \XMM1, (%arg3,%r11,1) # Write to the ciphertext buffer - movdqu \XMM2, 16(%arg3,%r11,1) # Write to the ciphertext buffer - movdqu \XMM3, 32(%arg3,%r11,1) # Write to the ciphertext buffer - movdqu \XMM4, 48(%arg3,%r11,1) # Write to the ciphertext buffer - pshufb %xmm15, \XMM1 # perform a 16 byte swap - pshufb %xmm15, \XMM2 # perform a 16 byte swap - pshufb %xmm15, \XMM3 # perform a 16 byte swap - pshufb %xmm15, \XMM4 # perform a 16 byte swap - - pxor \TMP4, \TMP1 - pxor \XMM8, \XMM5 - pxor \TMP6, \TMP2 - pxor \TMP1, \TMP2 - pxor \XMM5, \TMP2 - movdqa \TMP2, \TMP3 - pslldq $8, \TMP3 # left shift TMP3 2 DWs - psrldq $8, \TMP2 # right shift TMP2 2 DWs - pxor \TMP3, \XMM5 - pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 - - # first phase of reduction - - movdqa \XMM5, \TMP2 - movdqa \XMM5, \TMP3 - movdqa \XMM5, \TMP4 -# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently - pslld $31, \TMP2 # packed right shift << 31 - pslld $30, \TMP3 # packed right shift << 30 - pslld $25, \TMP4 # packed right shift << 25 - pxor \TMP3, \TMP2 # xor the shifted versions - pxor \TMP4, \TMP2 - movdqa \TMP2, \TMP5 - psrldq $4, \TMP5 # right shift T5 1 DW - pslldq $12, \TMP2 # left shift T2 3 DWs - pxor \TMP2, \XMM5 - - # second phase of reduction - - movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 - movdqa \XMM5,\TMP3 - movdqa \XMM5,\TMP4 - psrld $1, \TMP2 # packed left shift >>1 - psrld $2, \TMP3 # packed left shift >>2 - psrld $7, \TMP4 # packed left shift >>7 - pxor \TMP3,\TMP2 # xor the shifted versions - pxor \TMP4,\TMP2 - pxor \TMP5, \TMP2 - pxor \TMP2, \XMM5 - pxor \TMP1, \XMM5 # result is in TMP1 - - pxor \XMM5, \XMM1 -.endm - -/* -* decrypt 4 blocks at a time -* ghash the 4 previously decrypted ciphertext blocks -* arg1, %arg3, %arg4 are used as pointers only, not modified -* %r11 is the data offset value -*/ -.macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \ -TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation - - movdqa \XMM1, \XMM5 - movdqa \XMM2, \XMM6 - movdqa \XMM3, \XMM7 - movdqa \XMM4, \XMM8 - - movdqa SHUF_MASK(%rip), %xmm15 - # multiply TMP5 * HashKey using karatsuba - - movdqa \XMM5, \TMP4 - pshufd $78, \XMM5, \TMP6 - pxor \XMM5, \TMP6 - paddd ONE(%rip), \XMM0 # INCR CNT - movdqu HashKey_4(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1 - movdqa \XMM0, \XMM1 - paddd ONE(%rip), \XMM0 # INCR CNT - movdqa \XMM0, \XMM2 - paddd ONE(%rip), \XMM0 # INCR CNT - movdqa \XMM0, \XMM3 - paddd ONE(%rip), \XMM0 # INCR CNT - movdqa \XMM0, \XMM4 - pshufb %xmm15, \XMM1 # perform a 16 byte swap - pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0 - pshufb %xmm15, \XMM2 # perform a 16 byte swap - pshufb %xmm15, \XMM3 # perform a 16 byte swap - pshufb %xmm15, \XMM4 # perform a 16 byte swap - - pxor (%arg1), \XMM1 - pxor (%arg1), \XMM2 - pxor (%arg1), \XMM3 - pxor (%arg1), \XMM4 - movdqu HashKey_4_k(%arg2), \TMP5 - pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) - movaps 0x10(%arg1), \TMP1 - aesenc \TMP1, \XMM1 # Round 1 - aesenc \TMP1, \XMM2 - aesenc \TMP1, \XMM3 - aesenc \TMP1, \XMM4 - movaps 0x20(%arg1), \TMP1 - aesenc \TMP1, \XMM1 # Round 2 - aesenc \TMP1, \XMM2 - aesenc \TMP1, \XMM3 - aesenc \TMP1, \XMM4 - movdqa \XMM6, \TMP1 - pshufd $78, \XMM6, \TMP2 - pxor \XMM6, \TMP2 - movdqu HashKey_3(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 - movaps 0x30(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 3 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0 - movaps 0x40(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 4 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - movdqu HashKey_3_k(%arg2), \TMP5 - pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - movaps 0x50(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 5 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pxor \TMP1, \TMP4 -# accumulate the results in TMP4:XMM5, TMP6 holds the middle part - pxor \XMM6, \XMM5 - pxor \TMP2, \TMP6 - movdqa \XMM7, \TMP1 - pshufd $78, \XMM7, \TMP2 - pxor \XMM7, \TMP2 - movdqu HashKey_2(%arg2), \TMP5 - - # Multiply TMP5 * HashKey using karatsuba - - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - movaps 0x60(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 6 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0 - movaps 0x70(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 7 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - movdqu HashKey_2_k(%arg2), \TMP5 - pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - movaps 0x80(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 8 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pxor \TMP1, \TMP4 -# accumulate the results in TMP4:XMM5, TMP6 holds the middle part - pxor \XMM7, \XMM5 - pxor \TMP2, \TMP6 - - # Multiply XMM8 * HashKey - # XMM8 and TMP5 hold the values for the two operands - - movdqa \XMM8, \TMP1 - pshufd $78, \XMM8, \TMP2 - pxor \XMM8, \TMP2 - movdqu HashKey(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - movaps 0x90(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 9 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0 - lea 0xa0(%arg1),%r10 - mov keysize,%eax - shr $2,%eax # 128->4, 192->6, 256->8 - sub $4,%eax # 128->0, 192->2, 256->4 - jz .Laes_loop_par_dec_done\@ - -.Laes_loop_par_dec\@: - MOVADQ (%r10),\TMP3 -.irpc index, 1234 - aesenc \TMP3, %xmm\index -.endr - add $16,%r10 - sub $1,%eax - jnz .Laes_loop_par_dec\@ - -.Laes_loop_par_dec_done\@: - MOVADQ (%r10), \TMP3 - aesenclast \TMP3, \XMM1 # last round - aesenclast \TMP3, \XMM2 - aesenclast \TMP3, \XMM3 - aesenclast \TMP3, \XMM4 - movdqu HashKey_k(%arg2), \TMP5 - pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - movdqu (%arg4,%r11,1), \TMP3 - pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK - movdqu \XMM1, (%arg3,%r11,1) # Write to plaintext buffer - movdqa \TMP3, \XMM1 - movdqu 16(%arg4,%r11,1), \TMP3 - pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK - movdqu \XMM2, 16(%arg3,%r11,1) # Write to plaintext buffer - movdqa \TMP3, \XMM2 - movdqu 32(%arg4,%r11,1), \TMP3 - pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK - movdqu \XMM3, 32(%arg3,%r11,1) # Write to plaintext buffer - movdqa \TMP3, \XMM3 - movdqu 48(%arg4,%r11,1), \TMP3 - pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK - movdqu \XMM4, 48(%arg3,%r11,1) # Write to plaintext buffer - movdqa \TMP3, \XMM4 - pshufb %xmm15, \XMM1 # perform a 16 byte swap - pshufb %xmm15, \XMM2 # perform a 16 byte swap - pshufb %xmm15, \XMM3 # perform a 16 byte swap - pshufb %xmm15, \XMM4 # perform a 16 byte swap - - pxor \TMP4, \TMP1 - pxor \XMM8, \XMM5 - pxor \TMP6, \TMP2 - pxor \TMP1, \TMP2 - pxor \XMM5, \TMP2 - movdqa \TMP2, \TMP3 - pslldq $8, \TMP3 # left shift TMP3 2 DWs - psrldq $8, \TMP2 # right shift TMP2 2 DWs - pxor \TMP3, \XMM5 - pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 - - # first phase of reduction - - movdqa \XMM5, \TMP2 - movdqa \XMM5, \TMP3 - movdqa \XMM5, \TMP4 -# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently - pslld $31, \TMP2 # packed right shift << 31 - pslld $30, \TMP3 # packed right shift << 30 - pslld $25, \TMP4 # packed right shift << 25 - pxor \TMP3, \TMP2 # xor the shifted versions - pxor \TMP4, \TMP2 - movdqa \TMP2, \TMP5 - psrldq $4, \TMP5 # right shift T5 1 DW - pslldq $12, \TMP2 # left shift T2 3 DWs - pxor \TMP2, \XMM5 - - # second phase of reduction - - movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 - movdqa \XMM5,\TMP3 - movdqa \XMM5,\TMP4 - psrld $1, \TMP2 # packed left shift >>1 - psrld $2, \TMP3 # packed left shift >>2 - psrld $7, \TMP4 # packed left shift >>7 - pxor \TMP3,\TMP2 # xor the shifted versions - pxor \TMP4,\TMP2 - pxor \TMP5, \TMP2 - pxor \TMP2, \XMM5 - pxor \TMP1, \XMM5 # result is in TMP1 - - pxor \XMM5, \XMM1 -.endm - -/* GHASH the last 4 ciphertext blocks. */ -.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \ -TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst - - # Multiply TMP6 * HashKey (using Karatsuba) - - movdqa \XMM1, \TMP6 - pshufd $78, \XMM1, \TMP2 - pxor \XMM1, \TMP2 - movdqu HashKey_4(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP6 # TMP6 = a1*b1 - pclmulqdq $0x00, \TMP5, \XMM1 # XMM1 = a0*b0 - movdqu HashKey_4_k(%arg2), \TMP4 - pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - movdqa \XMM1, \XMMDst - movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1 - - # Multiply TMP1 * HashKey (using Karatsuba) - - movdqa \XMM2, \TMP1 - pshufd $78, \XMM2, \TMP2 - pxor \XMM2, \TMP2 - movdqu HashKey_3(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - pclmulqdq $0x00, \TMP5, \XMM2 # XMM2 = a0*b0 - movdqu HashKey_3_k(%arg2), \TMP4 - pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - pxor \TMP1, \TMP6 - pxor \XMM2, \XMMDst - pxor \TMP2, \XMM1 -# results accumulated in TMP6, XMMDst, XMM1 - - # Multiply TMP1 * HashKey (using Karatsuba) - - movdqa \XMM3, \TMP1 - pshufd $78, \XMM3, \TMP2 - pxor \XMM3, \TMP2 - movdqu HashKey_2(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - pclmulqdq $0x00, \TMP5, \XMM3 # XMM3 = a0*b0 - movdqu HashKey_2_k(%arg2), \TMP4 - pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - pxor \TMP1, \TMP6 - pxor \XMM3, \XMMDst - pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1 - - # Multiply TMP1 * HashKey (using Karatsuba) - movdqa \XMM4, \TMP1 - pshufd $78, \XMM4, \TMP2 - pxor \XMM4, \TMP2 - movdqu HashKey(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - pclmulqdq $0x00, \TMP5, \XMM4 # XMM4 = a0*b0 - movdqu HashKey_k(%arg2), \TMP4 - pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - pxor \TMP1, \TMP6 - pxor \XMM4, \XMMDst - pxor \XMM1, \TMP2 - pxor \TMP6, \TMP2 - pxor \XMMDst, \TMP2 - # middle section of the temp results combined as in karatsuba algorithm - movdqa \TMP2, \TMP4 - pslldq $8, \TMP4 # left shift TMP4 2 DWs - psrldq $8, \TMP2 # right shift TMP2 2 DWs - pxor \TMP4, \XMMDst - pxor \TMP2, \TMP6 -# TMP6:XMMDst holds the result of the accumulated carry-less multiplications - # first phase of the reduction - movdqa \XMMDst, \TMP2 - movdqa \XMMDst, \TMP3 - movdqa \XMMDst, \TMP4 -# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently - pslld $31, \TMP2 # packed right shifting << 31 - pslld $30, \TMP3 # packed right shifting << 30 - pslld $25, \TMP4 # packed right shifting << 25 - pxor \TMP3, \TMP2 # xor the shifted versions - pxor \TMP4, \TMP2 - movdqa \TMP2, \TMP7 - psrldq $4, \TMP7 # right shift TMP7 1 DW - pslldq $12, \TMP2 # left shift TMP2 3 DWs - pxor \TMP2, \XMMDst - - # second phase of the reduction - movdqa \XMMDst, \TMP2 - # make 3 copies of XMMDst for doing 3 shift operations - movdqa \XMMDst, \TMP3 - movdqa \XMMDst, \TMP4 - psrld $1, \TMP2 # packed left shift >> 1 - psrld $2, \TMP3 # packed left shift >> 2 - psrld $7, \TMP4 # packed left shift >> 7 - pxor \TMP3, \TMP2 # xor the shifted versions - pxor \TMP4, \TMP2 - pxor \TMP7, \TMP2 - pxor \TMP2, \XMMDst - pxor \TMP6, \XMMDst # reduced result is in XMMDst -.endm - - -/* Encryption of a single block -* uses eax & r10 -*/ - -.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1 - - pxor (%arg1), \XMM0 - mov keysize,%eax - shr $2,%eax # 128->4, 192->6, 256->8 - add $5,%eax # 128->9, 192->11, 256->13 - lea 16(%arg1), %r10 # get first expanded key address - -_esb_loop_\@: - MOVADQ (%r10),\TMP1 - aesenc \TMP1,\XMM0 - add $16,%r10 - sub $1,%eax - jnz _esb_loop_\@ - - MOVADQ (%r10),\TMP1 - aesenclast \TMP1,\XMM0 -.endm -/***************************************************************************** -* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. -* struct gcm_context_data *data -* // Context data -* u8 *out, // Plaintext output. Encrypt in-place is allowed. -* const u8 *in, // Ciphertext input -* u64 plaintext_len, // Length of data in bytes for decryption. -* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) -* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) -* // concatenated with 0x00000001. 16-byte aligned pointer. -* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. -* const u8 *aad, // Additional Authentication Data (AAD) -* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes -* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the -* // given authentication tag and only return the plaintext if they match. -* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 -* // (most likely), 12 or 8. -* -* Assumptions: -* -* keys: -* keys are pre-expanded and aligned to 16 bytes. we are using the first -* set of 11 keys in the data structure void *aes_ctx -* -* iv: -* 0 1 2 3 -* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | Salt (From the SA) | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | Initialization Vector | -* | (This is the sequence number from IPSec header) | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | 0x1 | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* -* -* -* AAD: -* AAD padded to 128 bits with 0 -* for example, assume AAD is a u32 vector -* -* if AAD is 8 bytes: -* AAD[3] = {A0, A1}; -* padded AAD in xmm register = {A1 A0 0 0} -* -* 0 1 2 3 -* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | SPI (A1) | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | 32-bit Sequence Number (A0) | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | 0x0 | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* -* AAD Format with 32-bit Sequence Number -* -* if AAD is 12 bytes: -* AAD[3] = {A0, A1, A2}; -* padded AAD in xmm register = {A2 A1 A0 0} -* -* 0 1 2 3 -* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | SPI (A2) | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | 64-bit Extended Sequence Number {A1,A0} | -* | | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | 0x0 | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* -* AAD Format with 64-bit Extended Sequence Number -* -* poly = x^128 + x^127 + x^126 + x^121 + 1 -* -*****************************************************************************/ -SYM_FUNC_START(aesni_gcm_dec) - FUNC_SAVE - - GCM_INIT %arg6, arg7, arg8, arg9 - GCM_ENC_DEC dec - GCM_COMPLETE arg10, arg11 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_dec) - - -/***************************************************************************** -* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. -* struct gcm_context_data *data -* // Context data -* u8 *out, // Ciphertext output. Encrypt in-place is allowed. -* const u8 *in, // Plaintext input -* u64 plaintext_len, // Length of data in bytes for encryption. -* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) -* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) -* // concatenated with 0x00000001. 16-byte aligned pointer. -* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. -* const u8 *aad, // Additional Authentication Data (AAD) -* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes -* u8 *auth_tag, // Authenticated Tag output. -* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely), -* // 12 or 8. -* -* Assumptions: -* -* keys: -* keys are pre-expanded and aligned to 16 bytes. we are using the -* first set of 11 keys in the data structure void *aes_ctx -* -* -* iv: -* 0 1 2 3 -* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | Salt (From the SA) | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | Initialization Vector | -* | (This is the sequence number from IPSec header) | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | 0x1 | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* -* -* -* AAD: -* AAD padded to 128 bits with 0 -* for example, assume AAD is a u32 vector -* -* if AAD is 8 bytes: -* AAD[3] = {A0, A1}; -* padded AAD in xmm register = {A1 A0 0 0} -* -* 0 1 2 3 -* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | SPI (A1) | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | 32-bit Sequence Number (A0) | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | 0x0 | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* -* AAD Format with 32-bit Sequence Number -* -* if AAD is 12 bytes: -* AAD[3] = {A0, A1, A2}; -* padded AAD in xmm register = {A2 A1 A0 0} -* -* 0 1 2 3 -* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | SPI (A2) | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | 64-bit Extended Sequence Number {A1,A0} | -* | | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | 0x0 | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* -* AAD Format with 64-bit Extended Sequence Number -* -* poly = x^128 + x^127 + x^126 + x^121 + 1 -***************************************************************************/ -SYM_FUNC_START(aesni_gcm_enc) - FUNC_SAVE - - GCM_INIT %arg6, arg7, arg8, arg9 - GCM_ENC_DEC enc - - GCM_COMPLETE arg10, arg11 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_enc) - -/***************************************************************************** -* void aesni_gcm_init(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. -* struct gcm_context_data *data, -* // context data -* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) -* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) -* // concatenated with 0x00000001. 16-byte aligned pointer. -* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. -* const u8 *aad, // Additional Authentication Data (AAD) -* u64 aad_len) // Length of AAD in bytes. -*/ -SYM_FUNC_START(aesni_gcm_init) - FUNC_SAVE - GCM_INIT %arg3, %arg4,%arg5, %arg6 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_init) - -/***************************************************************************** -* void aesni_gcm_enc_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. -* struct gcm_context_data *data, -* // context data -* u8 *out, // Ciphertext output. Encrypt in-place is allowed. -* const u8 *in, // Plaintext input -* u64 plaintext_len, // Length of data in bytes for encryption. -*/ -SYM_FUNC_START(aesni_gcm_enc_update) - FUNC_SAVE - GCM_ENC_DEC enc - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_enc_update) - -/***************************************************************************** -* void aesni_gcm_dec_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. -* struct gcm_context_data *data, -* // context data -* u8 *out, // Ciphertext output. Encrypt in-place is allowed. -* const u8 *in, // Plaintext input -* u64 plaintext_len, // Length of data in bytes for encryption. -*/ -SYM_FUNC_START(aesni_gcm_dec_update) - FUNC_SAVE - GCM_ENC_DEC dec - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_dec_update) - -/***************************************************************************** -* void aesni_gcm_finalize(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. -* struct gcm_context_data *data, -* // context data -* u8 *auth_tag, // Authenticated Tag output. -* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely), -* // 12 or 8. -*/ -SYM_FUNC_START(aesni_gcm_finalize) - FUNC_SAVE - GCM_COMPLETE %arg3 %arg4 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_finalize) - -#endif - SYM_FUNC_START_LOCAL(_key_expansion_256a) pshufd $0b11111111, %xmm1, %xmm1 shufps $0b00010000, %xmm0, %xmm4 @@ -1820,8 +133,8 @@ SYM_FUNC_START_LOCAL(_key_expansion_256b) SYM_FUNC_END(_key_expansion_256b) /* - * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, - * unsigned int key_len) + * void aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, + * unsigned int key_len) */ SYM_FUNC_START(aesni_set_key) FRAME_BEGIN @@ -1926,7 +239,6 @@ SYM_FUNC_START(aesni_set_key) sub $0x10, UKEYP cmp TKEYP, KEYP jb .Ldec_key_loop - xor AREG, AREG #ifndef __x86_64__ popl KEYP #endif @@ -2826,28 +1138,24 @@ SYM_FUNC_END(aesni_ctr_enc) .previous /* - * _aesni_gf128mul_x_ble: internal ABI - * Multiply in GF(2^128) for XTS IVs + * _aesni_gf128mul_x_ble: Multiply in GF(2^128) for XTS IVs * input: * IV: current IV * GF128MUL_MASK == mask with 0x87 and 0x01 * output: * IV: next IV * changed: - * CTR: == temporary value + * KEY: == temporary value */ -#define _aesni_gf128mul_x_ble() \ - pshufd $0x13, IV, KEY; \ - paddq IV, IV; \ - psrad $31, KEY; \ - pand GF128MUL_MASK, KEY; \ - pxor KEY, IV; +.macro _aesni_gf128mul_x_ble + pshufd $0x13, IV, KEY + paddq IV, IV + psrad $31, KEY + pand GF128MUL_MASK, KEY + pxor KEY, IV +.endm -/* - * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst, - * const u8 *src, unsigned int len, le128 *iv) - */ -SYM_FUNC_START(aesni_xts_encrypt) +.macro _aesni_xts_crypt enc FRAME_BEGIN #ifndef __x86_64__ pushl IVP @@ -2866,35 +1174,46 @@ SYM_FUNC_START(aesni_xts_encrypt) movups (IVP), IV mov 480(KEYP), KLEN +.if !\enc + add $240, KEYP + + test $15, LEN + jz .Lxts_loop4\@ + sub $16, LEN +.endif -.Lxts_enc_loop4: +.Lxts_loop4\@: sub $64, LEN - jl .Lxts_enc_1x + jl .Lxts_1x\@ movdqa IV, STATE1 movdqu 0x00(INP), IN pxor IN, STATE1 movdqu IV, 0x00(OUTP) - _aesni_gf128mul_x_ble() + _aesni_gf128mul_x_ble movdqa IV, STATE2 movdqu 0x10(INP), IN pxor IN, STATE2 movdqu IV, 0x10(OUTP) - _aesni_gf128mul_x_ble() + _aesni_gf128mul_x_ble movdqa IV, STATE3 movdqu 0x20(INP), IN pxor IN, STATE3 movdqu IV, 0x20(OUTP) - _aesni_gf128mul_x_ble() + _aesni_gf128mul_x_ble movdqa IV, STATE4 movdqu 0x30(INP), IN pxor IN, STATE4 movdqu IV, 0x30(OUTP) +.if \enc call _aesni_enc4 +.else + call _aesni_dec4 +.endif movdqu 0x00(OUTP), IN pxor IN, STATE1 @@ -2912,17 +1231,17 @@ SYM_FUNC_START(aesni_xts_encrypt) pxor IN, STATE4 movdqu STATE4, 0x30(OUTP) - _aesni_gf128mul_x_ble() + _aesni_gf128mul_x_ble add $64, INP add $64, OUTP test LEN, LEN - jnz .Lxts_enc_loop4 + jnz .Lxts_loop4\@ -.Lxts_enc_ret_iv: +.Lxts_ret_iv\@: movups IV, (IVP) -.Lxts_enc_ret: +.Lxts_ret\@: #ifndef __x86_64__ popl KLEN popl KEYP @@ -2932,201 +1251,60 @@ SYM_FUNC_START(aesni_xts_encrypt) FRAME_END RET -.Lxts_enc_1x: +.Lxts_1x\@: add $64, LEN - jz .Lxts_enc_ret_iv + jz .Lxts_ret_iv\@ +.if \enc sub $16, LEN - jl .Lxts_enc_cts4 + jl .Lxts_cts4\@ +.endif -.Lxts_enc_loop1: +.Lxts_loop1\@: movdqu (INP), STATE +.if \enc pxor IV, STATE call _aesni_enc1 - pxor IV, STATE - _aesni_gf128mul_x_ble() - - test LEN, LEN - jz .Lxts_enc_out - +.else add $16, INP sub $16, LEN - jl .Lxts_enc_cts1 - - movdqu STATE, (OUTP) - add $16, OUTP - jmp .Lxts_enc_loop1 - -.Lxts_enc_out: - movdqu STATE, (OUTP) - jmp .Lxts_enc_ret_iv - -.Lxts_enc_cts4: - movdqa STATE4, STATE - sub $16, OUTP - -.Lxts_enc_cts1: -#ifndef __x86_64__ - lea .Lcts_permute_table, T1 -#else - lea .Lcts_permute_table(%rip), T1 -#endif - add LEN, INP /* rewind input pointer */ - add $16, LEN /* # bytes in final block */ - movups (INP), IN1 - - mov T1, IVP - add $32, IVP - add LEN, T1 - sub LEN, IVP - add OUTP, LEN - - movups (T1), %xmm4 - movaps STATE, IN2 - pshufb %xmm4, STATE - movups STATE, (LEN) - - movups (IVP), %xmm0 - pshufb %xmm0, IN1 - pblendvb IN2, IN1 - movaps IN1, STATE - + jl .Lxts_cts1\@ pxor IV, STATE - call _aesni_enc1 + call _aesni_dec1 +.endif pxor IV, STATE + _aesni_gf128mul_x_ble - movups STATE, (OUTP) - jmp .Lxts_enc_ret -SYM_FUNC_END(aesni_xts_encrypt) - -/* - * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst, - * const u8 *src, unsigned int len, le128 *iv) - */ -SYM_FUNC_START(aesni_xts_decrypt) - FRAME_BEGIN -#ifndef __x86_64__ - pushl IVP - pushl LEN - pushl KEYP - pushl KLEN - movl (FRAME_OFFSET+20)(%esp), KEYP # ctx - movl (FRAME_OFFSET+24)(%esp), OUTP # dst - movl (FRAME_OFFSET+28)(%esp), INP # src - movl (FRAME_OFFSET+32)(%esp), LEN # len - movl (FRAME_OFFSET+36)(%esp), IVP # iv - movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK -#else - movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK -#endif - movups (IVP), IV - - mov 480(KEYP), KLEN - add $240, KEYP - - test $15, LEN - jz .Lxts_dec_loop4 - sub $16, LEN - -.Lxts_dec_loop4: - sub $64, LEN - jl .Lxts_dec_1x - - movdqa IV, STATE1 - movdqu 0x00(INP), IN - pxor IN, STATE1 - movdqu IV, 0x00(OUTP) - - _aesni_gf128mul_x_ble() - movdqa IV, STATE2 - movdqu 0x10(INP), IN - pxor IN, STATE2 - movdqu IV, 0x10(OUTP) - - _aesni_gf128mul_x_ble() - movdqa IV, STATE3 - movdqu 0x20(INP), IN - pxor IN, STATE3 - movdqu IV, 0x20(OUTP) - - _aesni_gf128mul_x_ble() - movdqa IV, STATE4 - movdqu 0x30(INP), IN - pxor IN, STATE4 - movdqu IV, 0x30(OUTP) - - call _aesni_dec4 - - movdqu 0x00(OUTP), IN - pxor IN, STATE1 - movdqu STATE1, 0x00(OUTP) - - movdqu 0x10(OUTP), IN - pxor IN, STATE2 - movdqu STATE2, 0x10(OUTP) - - movdqu 0x20(OUTP), IN - pxor IN, STATE3 - movdqu STATE3, 0x20(OUTP) - - movdqu 0x30(OUTP), IN - pxor IN, STATE4 - movdqu STATE4, 0x30(OUTP) - - _aesni_gf128mul_x_ble() - - add $64, INP - add $64, OUTP test LEN, LEN - jnz .Lxts_dec_loop4 - -.Lxts_dec_ret_iv: - movups IV, (IVP) - -.Lxts_dec_ret: -#ifndef __x86_64__ - popl KLEN - popl KEYP - popl LEN - popl IVP -#endif - FRAME_END - RET - -.Lxts_dec_1x: - add $64, LEN - jz .Lxts_dec_ret_iv - -.Lxts_dec_loop1: - movdqu (INP), STATE + jz .Lxts_out\@ +.if \enc add $16, INP sub $16, LEN - jl .Lxts_dec_cts1 - - pxor IV, STATE - call _aesni_dec1 - pxor IV, STATE - _aesni_gf128mul_x_ble() - - test LEN, LEN - jz .Lxts_dec_out + jl .Lxts_cts1\@ +.endif movdqu STATE, (OUTP) add $16, OUTP - jmp .Lxts_dec_loop1 + jmp .Lxts_loop1\@ -.Lxts_dec_out: +.Lxts_out\@: movdqu STATE, (OUTP) - jmp .Lxts_dec_ret_iv + jmp .Lxts_ret_iv\@ -.Lxts_dec_cts1: +.if \enc +.Lxts_cts4\@: + movdqa STATE4, STATE + sub $16, OUTP +.Lxts_cts1\@: +.else +.Lxts_cts1\@: movdqa IV, STATE4 - _aesni_gf128mul_x_ble() + _aesni_gf128mul_x_ble pxor IV, STATE call _aesni_dec1 pxor IV, STATE - +.endif #ifndef __x86_64__ lea .Lcts_permute_table, T1 #else @@ -3152,10 +1330,32 @@ SYM_FUNC_START(aesni_xts_decrypt) pblendvb IN2, IN1 movaps IN1, STATE +.if \enc + pxor IV, STATE + call _aesni_enc1 + pxor IV, STATE +.else pxor STATE4, STATE call _aesni_dec1 pxor STATE4, STATE +.endif movups STATE, (OUTP) - jmp .Lxts_dec_ret -SYM_FUNC_END(aesni_xts_decrypt) + jmp .Lxts_ret\@ +.endm + +/* + * void aesni_xts_enc(const struct crypto_aes_ctx *ctx, u8 *dst, + * const u8 *src, unsigned int len, le128 *iv) + */ +SYM_FUNC_START(aesni_xts_enc) + _aesni_xts_crypt 1 +SYM_FUNC_END(aesni_xts_enc) + +/* + * void aesni_xts_dec(const struct crypto_aes_ctx *ctx, u8 *dst, + * const u8 *src, unsigned int len, le128 *iv) + */ +SYM_FUNC_START(aesni_xts_dec) + _aesni_xts_crypt 0 +SYM_FUNC_END(aesni_xts_dec) |