/*
 * Source path: zfs-builds-mm/zfs-0.8.1/module/icp/asm-x86_64/sha2/sha256_impl.S
 * (repository viewer metadata: 2064 lines, 37 KiB, listed as "ArmAsm",
 * last changed 2019-07-06 23:40:11 +02:00)
 */
/*
* ====================================================================
* Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
* project. Rights for redistribution and usage in source and binary
* forms are granted according to the OpenSSL license.
* ====================================================================
*
* sha256/512_block procedure for x86_64.
*
* 40% improvement over compiler-generated code on Opteron. On EM64T
* sha256 was observed to run >80% faster and sha512 - >40%. No magical
* tricks, just straight implementation... I really wonder why gcc
* [being armed with inline assembler] fails to generate as fast code.
* The only thing which is cool about this module is that it's very
* same instruction sequence used for both SHA-256 and SHA-512. In
* former case the instructions operate on 32-bit operands, while in
* latter - on 64-bit ones. All I had to do is to get one flavor right,
* the other one passed the test right away:-)
*
* sha256_block runs in ~1005 cycles on Opteron, which gives you
* asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
* frequency in GHz. sha512_block runs in ~1275 cycles, which results
* in 128*1000/1275=100MBps per GHz. Is there room for improvement?
* Well, if you compare it to IA-64 implementation, which maintains
* X[16] in register bank[!], tends to 4 instructions per CPU clock
* cycle and runs in 1003 cycles, 1275 is very good result for 3-way
* issue Opteron pipeline and X[16] maintained in memory. So that *if*
* there is a way to improve it, *then* the only way would be to try to
* offload X[16] updates to SSE unit, but that would require "deeper"
* loop unroll, which in turn would naturally cause size blow-up, not
* to mention increased complexity! And once again, only *if* it's
* actually possible to noticeably improve overall ILP, instruction
* level parallelism, on a given CPU implementation in this case.
*
* Special note on Intel EM64T. While Opteron CPU exhibits perfect
* performance ratio of 1.5 between 64- and 32-bit flavors [see above],
* [currently available] EM64T CPUs apparently are far from it. On the
* contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
* sha256_block:-( This is presumably because 64-bit shifts/rotates
* apparently are not atomic instructions, but implemented in microcode.
*/
/*
* OpenSolaris OS modifications
*
* Sun elects to use this software under the BSD license.
*
* This source originates from OpenSSL file sha512-x86_64.pl at
* ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz
* (presumably for future OpenSSL release 0.9.8h), with these changes:
*
* 1. Added perl "use strict" and declared variables.
*
* 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
* /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards.
*
* 3. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1)
* assemblers). Replaced the .picmeup macro with assembler code.
*
* 4. Added 8 to $ctx, as OpenSolaris OS has an extra 4-byte field, "algotype",
* at the beginning of SHA2_CTX (the next field is 8-byte aligned).
*/
/*
* This file was generated by a perl script (sha512-x86_64.pl) that was
* used to generate the sha256 and sha512 variants from the same code base.
* The comments from the original file have been pasted above.
*/
#if defined(lint) || defined(__lint)
#include <sys/stdint.h>
#include <sha2/sha2.h>
/*
 * Lint-only stub. When lint or __lint is defined, this empty C definition
 * stands in for the assembly routine of the same name in the #else branch,
 * so that lint sees a prototype (ctx, in, num) without parsing assembler.
 * It performs no work.
 */
/* ARGSUSED */
void
SHA256TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num)
{
}
#else
#define _ASM
#include <sys/asm_linkage.h>
/*
 * void SHA256TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num)
 *
 * Syntax: AT&T (GNU as).  ABI: System V AMD64.
 *
 * In:	%rdi = ctx	the eight 32-bit state words live at ctx+8 (the
 *			leading "algotype" field is skipped)
 *	%rsi = in	input data, consumed 64 bytes per block
 *	%rdx = num	number of 64-byte blocks; 0 returns immediately
 * Out:	the eight state words at ctx+8 are updated in place.
 *
 * Register roles inside the block loop:
 *	%eax %ebx %ecx %edx %r8d %r9d %r10d %r11d
 *			working variables a..h; the roles rotate by one
 *			register every round and are back in this order
 *			after every 8 rounds
 *	%r12d		T1
 *	%r13d-%r15d	scratch
 *	%rbp		address of K256
 *	%rdi		round counter, 0..63 (ctx is reloaded from the frame)
 *	%rsi		current input block
 *
 * Stack frame (%rsp is rounded down to a 64-byte boundary):
 *	 0(%rsp) .. 63(%rsp)	X[16], the rolling message schedule
 *	16*4+0*8(%rsp)		ctx + 8
 *	16*4+1*8(%rsp)		in
 *	16*4+2*8(%rsp)		end of input (in + num*64)
 *	16*4+3*8(%rsp)		%rsp as it was after the register pushes
 *
 * Callee-saved %rbx, %rbp, %r12-%r15 are pushed on entry and popped on
 * exit.  The rounds are emitted through the assembler macros below, so
 * each round is written once instead of being unrolled by hand.
 */

/*
 * SHA256_ROUND_CORE slot, a, b, c, d, e, f, g, h
 *
 * One compression round.  On entry %r12d holds the message word for this
 * round; it is stored to X[slot] and then accumulated into T1.  On exit
 * d has T1 added to it and the register passed as h holds the next a
 * (T1 + Sigma0(a) + Maj(a,b,c)).  Reads K256 through (%rbp,%rdi,4) and
 * increments the round counter in %rdi.  Clobbers %r12d-%r15d and flags.
 */
.macro SHA256_ROUND_CORE slot, ra, rb, rc, rd, re, rf, rg, rh
	mov	\re,%r13d
	mov	\re,%r14d
	mov	\rf,%r15d
	ror	$6,%r13d
	ror	$11,%r14d
	xor	\rg,%r15d		# f^g
	xor	%r14d,%r13d
	ror	$14,%r14d
	and	\re,%r15d		# (f^g)&e
	mov	%r12d,4*\slot(%rsp)	# X[slot] = message word
	xor	%r14d,%r13d		# Sigma1(e)
	xor	\rg,%r15d		# Ch(e,f,g)=((f^g)&e)^g
	add	\rh,%r12d		# T1+=h
	mov	\ra,\rh
	add	%r13d,%r12d		# T1+=Sigma1(e)
	add	%r15d,%r12d		# T1+=Ch(e,f,g)
	mov	\ra,%r13d
	mov	\ra,%r14d
	ror	$2,\rh
	ror	$13,%r13d
	mov	\ra,%r15d
	add	(%rbp,%rdi,4),%r12d	# T1+=K[round]
	xor	%r13d,\rh
	ror	$9,%r13d
	or	\rc,%r14d		# a|c
	xor	%r13d,\rh		# h=Sigma0(a)
	and	\rc,%r15d		# a&c
	add	%r12d,\rd		# d+=T1
	and	\rb,%r14d		# (a|c)&b
	add	%r12d,\rh		# h+=T1
	or	%r15d,%r14d		# Maj(a,b,c)=((a|c)&b)|(a&c)
	lea	1(%rdi),%rdi		# round++
	add	%r14d,\rh		# h+=Maj(a,b,c)
.endm

/*
 * SHA256_ROUND_00_15 i, a, b, c, d, e, f, g, h
 *
 * Round i (0..15): the message word is input word i, loaded from
 * 4*i(%rsi) and byte-swapped with bswap.
 */
.macro SHA256_ROUND_00_15 i, ra, rb, rc, rd, re, rf, rg, rh
	mov	4*\i(%rsi),%r12d
	bswap	%r12d
	SHA256_ROUND_CORE \i,\ra,\rb,\rc,\rd,\re,\rf,\rg,\rh
.endm

/*
 * SHA256_ROUND_16_XX i, a, b, c, d, e, f, g, h
 *
 * Round 16 and later; i (0..15) is the position inside the current group
 * of 16 rounds.  The message word is computed from the rolling schedule:
 *	X[i] += sigma0(X[(i+1)&15]) + sigma1(X[(i+14)&15]) + X[(i+9)&15]
 */
.macro SHA256_ROUND_16_XX i, ra, rb, rc, rd, re, rf, rg, rh
	mov	(((\i + 1) & 15) * 4)(%rsp),%r13d
	mov	(((\i + 14) & 15) * 4)(%rsp),%r12d
	mov	%r13d,%r15d
	shr	$3,%r13d
	ror	$7,%r15d
	xor	%r15d,%r13d
	ror	$11,%r15d
	xor	%r15d,%r13d		# sigma0(X[(i+1)&0xf])
	mov	%r12d,%r14d
	shr	$10,%r12d
	ror	$17,%r14d
	xor	%r14d,%r12d
	ror	$2,%r14d
	xor	%r14d,%r12d		# sigma1(X[(i+14)&0xf])
	add	%r13d,%r12d
	add	(((\i + 9) & 15) * 4)(%rsp),%r12d
	add	4*\i(%rsp),%r12d
	SHA256_ROUND_CORE \i,\ra,\rb,\rc,\rd,\re,\rf,\rg,\rh
.endm

ENTRY_NP(SHA256TransformBlocks)
	test	%rdx,%rdx		# num == 0: no block to process
	jz	.Lsha256_return		# nothing pushed yet, plain ret is safe
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	mov	%rsp,%rbp		# copy %rsp
	shl	$4,%rdx			# num*16
	sub	$16*4+4*8,%rsp
	lea	(%rsi,%rdx,4),%rdx	# inp+num*16*4
	and	$-64,%rsp		# align stack frame
	add	$8,%rdi			# Skip OpenSolaris field, "algotype"
	mov	%rdi,16*4+0*8(%rsp)	# save ctx, 1st arg
	mov	%rsi,16*4+1*8(%rsp)	# save inp, 2nd arg
	mov	%rdx,16*4+2*8(%rsp)	# save end pointer, "3rd" arg
	mov	%rbp,16*4+3*8(%rsp)	# save copy of %rsp
	/*
	 * %rbp = &K256.  This single RIP-relative lea replaces the two
	 * instructions that the perlasm .picmeup pseudo-directive used to
	 * produce; the resulting address is the same.
	 */
	lea	K256(%rip),%rbp
	mov	4*0(%rdi),%eax		# a..h = current hash state
	mov	4*1(%rdi),%ebx
	mov	4*2(%rdi),%ecx
	mov	4*3(%rdi),%edx
	mov	4*4(%rdi),%r8d
	mov	4*5(%rdi),%r9d
	mov	4*6(%rdi),%r10d
	mov	4*7(%rdi),%r11d
	jmp	.Lloop			# jump over the alignment padding
	.align	16
.Lloop:					/* once per 64-byte block */
	xor	%edi,%edi		# round = 0 (32-bit write clears %rdi)
	SHA256_ROUND_00_15 0,%eax,%ebx,%ecx,%edx,%r8d,%r9d,%r10d,%r11d
	SHA256_ROUND_00_15 1,%r11d,%eax,%ebx,%ecx,%edx,%r8d,%r9d,%r10d
	SHA256_ROUND_00_15 2,%r10d,%r11d,%eax,%ebx,%ecx,%edx,%r8d,%r9d
	SHA256_ROUND_00_15 3,%r9d,%r10d,%r11d,%eax,%ebx,%ecx,%edx,%r8d
	SHA256_ROUND_00_15 4,%r8d,%r9d,%r10d,%r11d,%eax,%ebx,%ecx,%edx
	SHA256_ROUND_00_15 5,%edx,%r8d,%r9d,%r10d,%r11d,%eax,%ebx,%ecx
	SHA256_ROUND_00_15 6,%ecx,%edx,%r8d,%r9d,%r10d,%r11d,%eax,%ebx
	SHA256_ROUND_00_15 7,%ebx,%ecx,%edx,%r8d,%r9d,%r10d,%r11d,%eax
	SHA256_ROUND_00_15 8,%eax,%ebx,%ecx,%edx,%r8d,%r9d,%r10d,%r11d
	SHA256_ROUND_00_15 9,%r11d,%eax,%ebx,%ecx,%edx,%r8d,%r9d,%r10d
	SHA256_ROUND_00_15 10,%r10d,%r11d,%eax,%ebx,%ecx,%edx,%r8d,%r9d
	SHA256_ROUND_00_15 11,%r9d,%r10d,%r11d,%eax,%ebx,%ecx,%edx,%r8d
	SHA256_ROUND_00_15 12,%r8d,%r9d,%r10d,%r11d,%eax,%ebx,%ecx,%edx
	SHA256_ROUND_00_15 13,%edx,%r8d,%r9d,%r10d,%r11d,%eax,%ebx,%ecx
	SHA256_ROUND_00_15 14,%ecx,%edx,%r8d,%r9d,%r10d,%r11d,%eax,%ebx
	SHA256_ROUND_00_15 15,%ebx,%ecx,%edx,%r8d,%r9d,%r10d,%r11d,%eax
	jmp	.Lrounds_16_xx		# jump over the alignment padding
	.align	16
.Lrounds_16_xx:				/* 16 rounds per pass, until round 64 */
	SHA256_ROUND_16_XX 0,%eax,%ebx,%ecx,%edx,%r8d,%r9d,%r10d,%r11d
	SHA256_ROUND_16_XX 1,%r11d,%eax,%ebx,%ecx,%edx,%r8d,%r9d,%r10d
	SHA256_ROUND_16_XX 2,%r10d,%r11d,%eax,%ebx,%ecx,%edx,%r8d,%r9d
	SHA256_ROUND_16_XX 3,%r9d,%r10d,%r11d,%eax,%ebx,%ecx,%edx,%r8d
	SHA256_ROUND_16_XX 4,%r8d,%r9d,%r10d,%r11d,%eax,%ebx,%ecx,%edx
	SHA256_ROUND_16_XX 5,%edx,%r8d,%r9d,%r10d,%r11d,%eax,%ebx,%ecx
	SHA256_ROUND_16_XX 6,%ecx,%edx,%r8d,%r9d,%r10d,%r11d,%eax,%ebx
	SHA256_ROUND_16_XX 7,%ebx,%ecx,%edx,%r8d,%r9d,%r10d,%r11d,%eax
	SHA256_ROUND_16_XX 8,%eax,%ebx,%ecx,%edx,%r8d,%r9d,%r10d,%r11d
	SHA256_ROUND_16_XX 9,%r11d,%eax,%ebx,%ecx,%edx,%r8d,%r9d,%r10d
	SHA256_ROUND_16_XX 10,%r10d,%r11d,%eax,%ebx,%ecx,%edx,%r8d,%r9d
	SHA256_ROUND_16_XX 11,%r9d,%r10d,%r11d,%eax,%ebx,%ecx,%edx,%r8d
	SHA256_ROUND_16_XX 12,%r8d,%r9d,%r10d,%r11d,%eax,%ebx,%ecx,%edx
	SHA256_ROUND_16_XX 13,%edx,%r8d,%r9d,%r10d,%r11d,%eax,%ebx,%ecx
	SHA256_ROUND_16_XX 14,%ecx,%edx,%r8d,%r9d,%r10d,%r11d,%eax,%ebx
	SHA256_ROUND_16_XX 15,%ebx,%ecx,%edx,%r8d,%r9d,%r10d,%r11d,%eax
	cmp	$64,%rdi
	jb	.Lrounds_16_xx
	mov	16*4+0*8(%rsp),%rdi	# reload ctx (the +8 is already applied)
	lea	16*4(%rsi),%rsi		# advance input by one block
	add	4*0(%rdi),%eax		# fold a..h into the previous state
	add	4*1(%rdi),%ebx
	add	4*2(%rdi),%ecx
	add	4*3(%rdi),%edx
	add	4*4(%rdi),%r8d
	add	4*5(%rdi),%r9d
	add	4*6(%rdi),%r10d
	add	4*7(%rdi),%r11d
	cmp	16*4+2*8(%rsp),%rsi	# input pointer vs. end pointer
	mov	%eax,4*0(%rdi)		# mov leaves the flags intact for jb
	mov	%ebx,4*1(%rdi)
	mov	%ecx,4*2(%rdi)
	mov	%edx,4*3(%rdi)
	mov	%r8d,4*4(%rdi)
	mov	%r9d,4*5(%rdi)
	mov	%r10d,4*6(%rdi)
	mov	%r11d,4*7(%rdi)
	jb	.Lloop			# unsigned compare: more blocks left
	mov	16*4+3*8(%rsp),%rsp	# restore the pre-alignment %rsp
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
.Lsha256_return:
	ret
SET_SIZE(SHA256TransformBlocks)
/*
 * K256: the 64 32-bit SHA-256 round constants K[0..63] (see FIPS 180-4).
 * SHA256TransformBlocks only reads this table, indexing it by round
 * number through (%rbp,%rdi,4), so it is placed in read-only data rather
 * than in the writable .data section.  The reference from the code is
 * PC-relative and does not depend on which section holds the table.
 */
.section .rodata
.align 64
.type K256,@object
K256:
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size K256,.-K256
#endif /* !lint && !__lint */
#ifdef __ELF__
.section .note.GNU-stack,"",%progbits
#endif