diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index ff812e7..e2eaf44 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -35,6 +35,8 @@ namespace RandomX { static const char* regE[4] = { "xmm4", "xmm5", "xmm6", "xmm7" }; static const char* regA[4] = { "xmm8", "xmm9", "xmm10", "xmm11" }; + static const char* fsumInstr[4] = { "paddb", "paddw", "paddd", "paddq" }; + static const char* regA4 = "xmm12"; static const char* dblMin = "xmm13"; static const char* absMask = "xmm14"; @@ -365,6 +367,7 @@ namespace RandomX { instr.dst %= 4; instr.src %= 4; asmCode << "\taddpd " << regF[instr.dst] << ", " << regA[instr.src] << std::endl; + //asmCode << "\t" << fsumInstr[instr.mod % 4] << " " << signMask << ", " << regF[instr.dst] << std::endl; } //5 uOPs @@ -380,6 +383,7 @@ namespace RandomX { instr.dst %= 4; instr.src %= 4; asmCode << "\tsubpd " << regF[instr.dst] << ", " << regA[instr.src] << std::endl; + //asmCode << "\t" << fsumInstr[instr.mod % 4] << " " << signMask << ", " << regF[instr.dst] << std::endl; } //5 uOPs @@ -391,9 +395,9 @@ namespace RandomX { } //1 uOP - void AssemblyGeneratorX86::h_FPNEG_R(Instruction& instr, int i) { + void AssemblyGeneratorX86::h_CFSUM_R(Instruction& instr, int i) { instr.dst %= 4; - asmCode << "\txorps " << regF[instr.dst] << ", " << signMask << std::endl; + asmCode << "\t" << fsumInstr[instr.mod % 4] << " " << signMask << ", " << regF[instr.dst] << std::endl; } //1 uOPs @@ -538,7 +542,7 @@ namespace RandomX { INST_HANDLE(FADD_M) INST_HANDLE(FSUB_R) INST_HANDLE(FSUB_M) - INST_HANDLE(FPNEG_R) + INST_HANDLE(CFSUM_R) //Floating point group E INST_HANDLE(FMUL_R) diff --git a/src/AssemblyGeneratorX86.hpp b/src/AssemblyGeneratorX86.hpp index 5abebc1..5abf707 100644 --- a/src/AssemblyGeneratorX86.hpp +++ b/src/AssemblyGeneratorX86.hpp @@ -69,7 +69,7 @@ namespace RandomX { void h_FADD_M(Instruction&, int); void h_FSUB_R(Instruction&, int); void h_FSUB_M(Instruction&, int); - void h_FPNEG_R(Instruction&, int); + void h_CFSUM_R(Instruction&, int); void h_FMUL_R(Instruction&, int); void h_FMUL_M(Instruction&, int); void h_FDIV_R(Instruction&, int); diff --git a/src/CompiledVirtualMachine.cpp b/src/CompiledVirtualMachine.cpp index ebacf42..3bf3371 100644 --- a/src/CompiledVirtualMachine.cpp +++ b/src/CompiledVirtualMachine.cpp @@ -44,7 +44,7 @@ namespace RandomX { } static uint64_t getSmallPositiveFloatBits(uint64_t entropy) { - auto exponent = entropy >> 60; //0..15 + auto exponent = entropy >> 59; //0..31 auto mantissa = entropy & mantissaMask; exponent += exponentBias; exponent &= exponentMask; diff --git a/src/Instruction.cpp b/src/Instruction.cpp index 18017e7..5784c99 100644 --- a/src/Instruction.cpp +++ b/src/Instruction.cpp @@ -247,9 +247,9 @@ namespace RandomX { os << std::endl; } - void Instruction::h_FPNEG_R(std::ostream& os) const { + void Instruction::h_CFSUM_R(std::ostream& os) const { auto dstIndex = dst % 4; - os << "f" << dstIndex << std::endl; + os << "f" << dstIndex << ", " << (1 << ((mod % 4) + 3)) << std::endl; } void Instruction::h_FMUL_R(std::ostream& os) const { @@ -370,7 +370,7 @@ namespace RandomX { INST_NAME(FADD_M) INST_NAME(FSUB_R) INST_NAME(FSUB_M) - INST_NAME(FPNEG_R) + INST_NAME(CFSUM_R) //Floating point group E INST_NAME(FMUL_R) @@ -421,7 +421,7 @@ namespace RandomX { INST_HANDLE(FADD_M) INST_HANDLE(FSUB_R) INST_HANDLE(FSUB_M) - INST_HANDLE(FPNEG_R) + INST_HANDLE(CFSUM_R) //Floating point group E INST_HANDLE(FMUL_R) diff --git a/src/Instruction.hpp b/src/Instruction.hpp index f530bbc..4f9e178 100644 --- a/src/Instruction.hpp +++ b/src/Instruction.hpp @@ -54,7 +54,7 @@ namespace RandomX { constexpr int FADD_M = 22; constexpr int FSUB_R = 23; constexpr int FSUB_M = 24; - constexpr int FPNEG_R = 25; + constexpr int CFSUM_R = 25; constexpr int FMUL_R = 26; constexpr int FMUL_M = 27; constexpr int FDIV_R = 28; @@ -116,7 +116,7 @@ namespace RandomX { void h_FADD_M(std::ostream&) const; void h_FSUB_R(std::ostream&) const; void h_FSUB_M(std::ostream&) const; - void h_FPNEG_R(std::ostream&) const; + void h_CFSUM_R(std::ostream&) const; void h_FMUL_R(std::ostream&) const; void h_FMUL_M(std::ostream&) const; void h_FDIV_R(std::ostream&) const; diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index e891a27..de803be 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -87,7 +87,7 @@ namespace RandomX { ; xmm12 -> temporary ; xmm13 -> DBL_MIN ; xmm14 -> absolute value mask 0x7fffffffffffffff7fffffffffffffff - ; xmm15 -> sign mask 0x80000000000000008000000000000000 + ; xmm15 -> unused */ @@ -178,6 +178,8 @@ namespace RandomX { static const uint8_t REX_XOR_RAX_R64[] = { 0x49, 0x33 }; static const uint8_t REX_XCHG[] = { 0x4d, 0x87 }; static const uint8_t REX_ANDPS_XMM12[] = { 0x41, 0x0f, 0x54, 0xe6 }; + static const uint8_t REX_PADD[] = { 0x66, 0x44, 0x0f }; + static const uint8_t PADD_OPCODES[] = { 0xfc, 0xfd, 0xfe, 0xd4 }; size_t JitCompilerX86::getCodeSize() { return codePos - prologueSize; @@ -615,6 +617,9 @@ namespace RandomX { instr.src %= 4; emit(REX_ADDPD); emitByte(0xc0 + instr.src + 8 * instr.dst); + //emit(REX_PADD); + //emitByte(PADD_OPCODES[instr.mod % 4]); + //emitByte(0xf8 + instr.dst); } void JitCompilerX86::h_FADD_M(Instruction& instr) { @@ -630,6 +635,9 @@ namespace RandomX { instr.src %= 4; emit(REX_SUBPD); emitByte(0xc0 + instr.src + 8 * instr.dst); + //emit(REX_PADD); + //emitByte(PADD_OPCODES[instr.mod % 4]); + //emitByte(0xf8 + instr.dst); } void JitCompilerX86::h_FSUB_M(Instruction& instr) { @@ -640,7 +648,7 @@ namespace RandomX { emitByte(0xc4 + 8 * instr.dst); } - void JitCompilerX86::h_FPNEG_R(Instruction& instr) { + void JitCompilerX86::h_CFSUM_R(Instruction& instr) { instr.dst %= 4; emit(REX_XORPS); emitByte(0xc7 + 8 * instr.dst); @@ -794,7 +802,7 @@ namespace RandomX { INST_HANDLE(FADD_M) INST_HANDLE(FSUB_R) INST_HANDLE(FSUB_M) - INST_HANDLE(FPNEG_R) + INST_HANDLE(CFSUM_R) INST_HANDLE(FMUL_R) INST_HANDLE(FMUL_M) INST_HANDLE(FDIV_R) diff --git a/src/JitCompilerX86.hpp b/src/JitCompilerX86.hpp index 4303cfd..feba888 100644 --- a/src/JitCompilerX86.hpp +++ b/src/JitCompilerX86.hpp @@ -115,7 +115,7 @@ namespace RandomX { void h_FADD_M(Instruction&); void h_FSUB_R(Instruction&); void h_FSUB_M(Instruction&); - void h_FPNEG_R(Instruction&); + void h_CFSUM_R(Instruction&); void h_FMUL_R(Instruction&); void h_FMUL_M(Instruction&); void h_FDIV_R(Instruction&); diff --git a/src/asm/program_loop_store.inc b/src/asm/program_loop_store.inc index a0acebc..bd2bbdd 100644 --- a/src/asm/program_loop_store.inc +++ b/src/asm/program_loop_store.inc @@ -12,6 +12,10 @@ mulpd xmm1, xmm5 mulpd xmm2, xmm6 mulpd xmm3, xmm7 + ;# xorpd xmm0, xmm15 + ;# xorpd xmm1, xmm15 + ;# xorpd xmm2, xmm15 + ;# xorpd xmm3, xmm15 movapd xmmword ptr [rcx+0], xmm0 movapd xmmword ptr [rcx+16], xmm1 movapd xmmword ptr [rcx+32], xmm2 diff --git a/src/asm/program_prologue_load.inc b/src/asm/program_prologue_load.inc index 3a994ab..74c2a08 100644 --- a/src/asm/program_prologue_load.inc +++ b/src/asm/program_prologue_load.inc @@ -18,5 +18,5 @@ movapd xmm11, xmmword ptr [rcx+120] movapd xmm13, xmmword ptr [minDbl] movapd xmm14, xmmword ptr [absMask] - movapd xmm15, xmmword ptr [signMask] + ;# xorpd xmm15, xmm15 diff --git a/src/executeProgram-win64.asm b/src/executeProgram-win64.asm index ac49e50..ff43578 100644 --- a/src/executeProgram-win64.asm +++ b/src/executeProgram-win64.asm @@ -54,7 +54,7 @@ executeProgram PROC ; xmm12 -> temporary ; xmm13 -> DBL_MIN ; xmm14 -> absolute value mask - ; xmm15 -> sign mask + ; xmm15 -> unused ; store callee-saved registers push rbx @@ -104,7 +104,7 @@ executeProgram PROC movapd xmm11, xmmword ptr [rcx+120] movapd xmm13, xmmword ptr [minDbl] movapd xmm14, xmmword ptr [absMask] - movapd xmm15, xmmword ptr [signMask] + ;# xorps xmm15, xmm15 jmp program_begin diff --git a/src/instructionWeights.hpp b/src/instructionWeights.hpp index 32225e7..3998d07 100644 --- a/src/instructionWeights.hpp +++ b/src/instructionWeights.hpp @@ -21,10 +21,10 @@ along with RandomX. If not, see. //Integer #define WT_IADD_R 12 -#define WT_IADD_M 3 -#define WT_IADD_RC 12 +#define WT_IADD_M 7 +#define WT_IADD_RC 16 #define WT_ISUB_R 12 -#define WT_ISUB_M 3 +#define WT_ISUB_M 7 #define WT_IMUL_9C 9 #define WT_IMUL_R 16 #define WT_IMUL_M 4 @@ -35,10 +35,10 @@ along with RandomX. If not, see. #define WT_IDIV_C 4 #define WT_ISDIV_C 4 #define WT_INEG_R 2 -#define WT_IXOR_R 12 +#define WT_IXOR_R 16 #define WT_IXOR_M 4 -#define WT_IROR_R 10 -#define WT_IROL_R 10 +#define WT_IROR_R 8 +#define WT_IROL_R 8 #define WT_ISWAP_R 4 //Common floating point @@ -49,22 +49,22 @@ along with RandomX. If not, see. #define WT_FADD_M 5 #define WT_FSUB_R 20 #define WT_FSUB_M 5 -#define WT_FPNEG_R 6 //Floating point group E -#define WT_FMUL_R 16 -#define WT_FMUL_M 4 -#define WT_FDIV_R 7 -#define WT_FDIV_M 1 +#define WT_FMUL_R 20 +#define WT_FMUL_M 0 +#define WT_FDIV_R 0 +#define WT_FDIV_M 4 #define WT_FSQRT_R 6 //Control #define WT_COND_R 7 #define WT_COND_M 1 #define WT_CFROUND 1 +#define WT_CFSUM_R 0 //Store -#define WT_ISTORE 18 +#define WT_ISTORE 16 #define WT_FSTORE 0 #define WT_NOP 0 @@ -74,7 +74,7 @@ WT_ISUB_M + WT_IMUL_9C + WT_IMUL_R + WT_IMUL_M + WT_IMULH_R + \ WT_IMULH_M + WT_ISMULH_R + WT_ISMULH_M + WT_IDIV_C + WT_ISDIV_C + \ WT_INEG_R + WT_IXOR_R + WT_IXOR_M + WT_IROR_R + WT_IROL_R + \ WT_ISWAP_R + WT_FSWAP_R + WT_FADD_R + WT_FADD_M + WT_FSUB_R + WT_FSUB_M + \ -WT_FPNEG_R + WT_FMUL_R + WT_FMUL_M + WT_FDIV_R + WT_FDIV_M + \ +WT_CFSUM_R + WT_FMUL_R + WT_FMUL_M + WT_FDIV_R + WT_FDIV_M + \ WT_FSQRT_R + WT_COND_R + WT_COND_M + WT_CFROUND + WT_ISTORE + WT_FSTORE + WT_NOP; static_assert(wtSum == 256, diff --git a/src/program.inc b/src/program.inc index e4de06f..ba4b937 100644 --- a/src/program.inc +++ b/src/program.inc @@ -1,768 +1,731 @@ - ; IMUL_R r0, r7 - imul r8, r15 - ; ISMULH_R r2, r1 - mov rax, r10 - imul r9 - mov r10, rdx - ; IMUL_R r2, r4 - imul r10, r12 - ; IADD_R r7, r0 - add r15, r8 - ; FPSQRT_R e0 - sqrtpd xmm4, xmm4 - ; IMUL_R r3, r6 - imul r11, r14 - ; FPMUL_R e3, a1 - mulpd xmm7, xmm9 - ; IMULH_M r6, L1[r3] - mov ecx, r11d - and ecx, 16376 - mov rax, r14 - mul qword ptr [rsi+rcx] - mov r14, rdx - ; IMUL_R r5, r1 - imul r13, r9 - ; FPADD_M f0, L2[r6] - mov eax, r14d + ; FMUL_R e0, a2 + mulpd xmm4, xmm10 + ; IADD_RC r2, r5, -1621224194 + lea r10, [r10+r13-1621224194] + ; ISTORE L2[r2], r7 + mov eax, r10d and eax, 262136 - cvtdq2pd xmm12, qword ptr [rsi+rax] - addpd xmm0, xmm12 - ; IROR_R r4, r3 - mov ecx, r11d - ror r12, cl - ; IXOR_M r4, L3[984888] - xor r12, qword ptr [rsi+984888] - ; IROR_R r0, r3 - mov ecx, r11d - ror r8, cl - ; IROR_R r0, r4 - mov ecx, r12d - ror r8, cl - ; FPMUL_R e0, a1 - mulpd xmm4, xmm9 - ; IMUL_R r0, r2 - imul r8, r10 - ; ISUB_M r0, L1[r3] - mov eax, r11d + mov qword ptr [rsi+rax], r15 + ; FSUB_M f2, L1[r2] + mov eax, r10d and eax, 16376 - sub r8, qword ptr [rsi+rax] - ; FPSUB_R f3, a1 - subpd xmm3, xmm9 - ; ISWAP_R r7, r4 - xchg r15, r12 - ; IDIV_C r1, 3690475308 - mov rax, r9 - shr rax, 2 - mov rcx, 5367070356934653253 - mul rcx - shr rdx, 28 - add r9, rdx - ; IROL_R r4, r2 - mov ecx, r10d - rol r12, cl - ; IMUL_M r5, L1[r4] + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm2, xmm12 + ; IMUL_9C r6, -1003503212 + lea r14, [r14+r14*8-1003503212] + ; FSUB_R f1, a0 + subpd xmm1, xmm8 + ; IXOR_M r5, L2[r3] + mov eax, r11d + and eax, 262136 + xor r13, qword ptr [rsi+rax] + ; FSUB_M f2, L1[r4] mov eax, r12d and eax, 16376 - imul r13, qword ptr [rsi+rax] - ; IROL_R r4, r7 - mov ecx, r15d - rol r12, cl - ; ISUB_R r3, r1 - sub r11, r9 - ; IADD_R r7, r0 - add r15, r8 - ; IADD_M r1, L1[r3] - mov eax, r11d + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm2, xmm12 + ; FSUB_R f3, a0 + subpd xmm3, xmm8 + ; ISDIV_C r0, 1400272688 + mov rax, 7072565507528518045 + imul r8 + xor eax, eax + sar rdx, 29 + sets al + add rdx, rax + add r8, rdx + ; IMUL_M r3, L1[r7] + mov eax, r15d and eax, 16376 - add r9, qword ptr [rsi+rax] - ; FPMUL_R e2, a2 - mulpd xmm6, xmm10 - ; IADD_R r6, -1115286770 - add r14, -1115286770 - ; FPDIV_R e2, a3 - divpd xmm6, xmm11 - maxpd xmm6, xmm13 - ; FPADD_R f1, a2 - addpd xmm1, xmm10 - ; IXOR_R r3, r7 - xor r11, r15 + imul r11, qword ptr [rsi+rax] + ; IROL_R r2, r3 + mov ecx, r11d + rol r10, cl + ; IMULH_R r6, r0 + mov rax, r14 + mul r8 + mov r14, rdx + ; FMUL_R e0, a2 + mulpd xmm4, xmm10 + ; IADD_RC r3, r4, -52260428 + lea r11, [r11+r12-52260428] + ; IADD_R r7, -1138617760 + add r15, -1138617760 + ; IXOR_M r2, L1[r6] + mov eax, r14d + and eax, 16376 + xor r10, qword ptr [rsi+rax] + ; FSUB_R f2, a1 + subpd xmm2, xmm9 + ; IXOR_R r7, r1 + xor r15, r9 + ; COND_R r2, lt(r7, -41618808) + xor ecx, ecx + cmp r15d, -41618808 + setl cl + add r10, rcx + ; FMUL_R e3, a0 + mulpd xmm7, xmm8 + ; COND_R r4, sg(r1, -961190365) + xor ecx, ecx + cmp r9d, -961190365 + sets cl + add r12, rcx + ; FADD_R f2, a1 + addpd xmm2, xmm9 + ; FSUB_R f0, a3 + subpd xmm0, xmm11 + ; ISTORE L1[r6], r2 + mov eax, r14d + and eax, 16376 + mov qword ptr [rsi+rax], r10 + ; ISUB_R r6, r5 + sub r14, r13 + ; IADD_M r0, L1[r4] + mov eax, r12d + and eax, 16376 + add r8, qword ptr [rsi+rax] ; ISTORE L1[r4], r3 mov eax, r12d and eax, 16376 mov qword ptr [rsi+rax], r11 - ; IROR_R r3, r6 - mov ecx, r14d - ror r11, cl - ; ISMULH_R r0, r6 - mov rax, r8 - imul r14 - mov r8, rdx - ; IROR_R r6, r5 - mov ecx, r13d - ror r14, cl - ; IMULH_M r6, L2[r0] - mov ecx, r8d - and ecx, 262136 - mov rax, r14 - mul qword ptr [rsi+rcx] - mov r14, rdx - ; ISUB_R r2, 1512125960 - sub r10, 1512125960 + ; COND_M r6, sg(L1[r6], 1048782623) + xor ecx, ecx + mov eax, r14d + and eax, 16376 + cmp dword ptr [rsi+rax], 1048782623 + sets cl + add r14, rcx + ; FSQRT_R e0 + sqrtpd xmm4, xmm4 + ; INEG_R r2 + neg r10 + ; FSQRT_R e1 + sqrtpd xmm5, xmm5 + ; FMUL_R e1, a3 + mulpd xmm5, xmm11 ; IMUL_R r7, r6 imul r15, r14 - ; IMULH_R r6, r7 - mov rax, r14 - mul r15 - mov r14, rdx - ; ISUB_R r4, r1 - sub r12, r9 - ; FPMUL_R e3, a2 - mulpd xmm7, xmm10 - ; FPSQRT_R e1 - sqrtpd xmm5, xmm5 - ; IXOR_R r5, r2 - xor r13, r10 - ; FPADD_M f2, L1[r0] - mov eax, r8d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - addpd xmm2, xmm12 - ; IMULH_R r6, r1 - mov rax, r14 - mul r9 - mov r14, rdx - ; ISUB_M r5, L1[r0] - mov eax, r8d - and eax, 16376 - sub r13, qword ptr [rsi+rax] - ; FPMUL_R e2, a3 - mulpd xmm6, xmm11 - ; IMUL_R r4, r6 - imul r12, r14 - ; FPMUL_R e3, a2 - mulpd xmm7, xmm10 - ; ISUB_R r3, r2 - sub r11, r10 - ; FPMUL_R e3, a2 - mulpd xmm7, xmm10 - ; IROL_R r7, r0 - mov ecx, r8d - rol r15, cl - ; FPSUB_R f3, a2 - subpd xmm3, xmm10 - ; IROL_R r3, r7 - mov ecx, r15d - rol r11, cl - ; ISWAP_R r5, r7 - xchg r13, r15 - ; IDIV_C r5, 749951529 - mov rax, 13205547200481862341 - mul r13 - shr rdx, 29 - add r13, rdx - ; FPADD_R f3, a0 - addpd xmm3, xmm8 - ; IMUL_M r0, L1[r4] - mov eax, r12d - and eax, 16376 - imul r8, qword ptr [rsi+rax] - ; FPADD_R f1, a1 - addpd xmm1, xmm9 - ; IROR_R r2, 60 - ror r10, 60 - ; IROR_R r5, r4 - mov ecx, r12d - ror r13, cl - ; FPADD_R f2, a0 - addpd xmm2, xmm8 - ; IXOR_M r4, L1[r6] - mov eax, r14d - and eax, 16376 - xor r12, qword ptr [rsi+rax] - ; IXOR_R r2, r6 - xor r10, r14 - ; FPADD_M f3, L1[r0] - mov eax, r8d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - addpd xmm3, xmm12 - ; ISUB_R r7, r6 - sub r15, r14 - ; IMUL_9C r2, -962375579 - lea r10, [r10+r10*8-962375579] - ; FPSUB_R f3, a2 - subpd xmm3, xmm10 - ; FPSUB_R f3, a0 - subpd xmm3, xmm8 - ; IMUL_R r1, r5 - imul r9, r13 - ; IMUL_R r6, r4 - imul r14, r12 - ; ISWAP_R r0, r2 - xchg r8, r10 - ; ISUB_R r6, r5 - sub r14, r13 - ; FPSUB_R f2, a1 - subpd xmm2, xmm9 - ; ISDIV_C r6, 652931802 - mov rax, -3278972671018643631 - imul r14 - xor eax, eax - add rdx, r14 - sar rdx, 29 - sets al - add rdx, rax - add r14, rdx - ; IMUL_9C r5, -1142924545 - lea r13, [r13+r13*8-1142924545] - ; ISUB_R r7, 1085161834 - sub r15, 1085161834 - ; IMUL_R r4, r6 - imul r12, r14 - ; FPMUL_M e1, L1[r4] - mov eax, r12d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - mulpd xmm5, xmm12 - maxpd xmm5, xmm13 - ; FPMUL_M e3, L2[r1] - mov eax, r9d - and eax, 262136 - cvtdq2pd xmm12, qword ptr [rsi+rax] - mulpd xmm7, xmm12 - maxpd xmm7, xmm13 - ; COND_R r2, lt(r5, 1635027096) - xor ecx, ecx - cmp r13d, 1635027096 - setl cl - add r10, rcx - ; IMUL_R r5, -1219696062 - imul r13, -1219696062 - ; IXOR_R r5, r0 - xor r13, r8 - ; FPNEG_R f2 - xorps xmm2, xmm15 - ; FPADD_R f3, a2 - addpd xmm3, xmm10 - ; FPSUB_R f1, a3 - subpd xmm1, xmm11 - ; FPADD_R f1, a2 - addpd xmm1, xmm10 - ; FPDIV_R e1, a3 - divpd xmm5, xmm11 - maxpd xmm5, xmm13 - ; IXOR_M r6, L1[r0] - mov eax, r8d - and eax, 16376 - xor r14, qword ptr [rsi+rax] - ; ISUB_R r7, r4 - sub r15, r12 - ; ISUB_M r6, L1[r1] - mov eax, r9d - and eax, 16376 - sub r14, qword ptr [rsi+rax] - ; ISTORE L1[r5], r3 - mov eax, r13d - and eax, 16376 - mov qword ptr [rsi+rax], r11 - ; IMUL_R r5, r1 - imul r13, r9 - ; IROR_R r3, r2 - mov ecx, r10d - ror r11, cl - ; IMUL_R r4, r7 - imul r12, r15 - ; ISDIV_C r6, -54134756 - mov rax, 7012869325244995177 - imul r14 - xor eax, eax - sub rdx, r14 - sar rdx, 25 - sets al - add rdx, rax - add r14, rdx - ; FPMUL_R e1, a2 - mulpd xmm5, xmm10 - ; FPSUB_M f2, L2[r4] - mov eax, r12d - and eax, 262136 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm2, xmm12 - ; IMUL_R r0, r5 - imul r8, r13 - ; FPMUL_R e3, a0 - mulpd xmm7, xmm8 - ; COND_R r5, be(r4, 1545677311) - xor ecx, ecx - cmp r12d, 1545677311 - setbe cl - add r13, rcx - ; IMUL_R r6, r3 - imul r14, r11 - ; IROL_R r6, r2 - mov ecx, r10d - rol r14, cl - ; FPDIV_R e3, a1 - divpd xmm7, xmm9 - maxpd xmm7, xmm13 - ; IXOR_M r5, L1[r1] - mov eax, r9d - and eax, 16376 - xor r13, qword ptr [rsi+rax] - ; COND_R r3, ab(r2, 1734636060) - xor ecx, ecx - cmp r10d, 1734636060 - seta cl - add r11, rcx - ; ISTORE L1[r2], r7 - mov eax, r10d - and eax, 16376 - mov qword ptr [rsi+rax], r15 - ; IADD_R r5, r6 - add r13, r14 - ; FPSUB_R f1, a2 - subpd xmm1, xmm10 - ; FPADD_R f2, a1 - addpd xmm2, xmm9 - ; FPSWAP_R f1 - shufpd xmm1, xmm1, 1 - ; IROL_R r2, r6 - mov ecx, r14d - rol r10, cl - ; IMUL_R r0, r4 - imul r8, r12 - ; FPSUB_R f0, a2 - subpd xmm0, xmm10 - ; ISUB_R r6, r7 - sub r14, r15 - ; IROL_R r4, r7 - mov ecx, r15d - rol r12, cl - ; FPMUL_R e2, a0 - mulpd xmm6, xmm8 - ; ISUB_R r1, r3 - sub r9, r11 - ; FPDIV_R e0, a1 - divpd xmm4, xmm9 - maxpd xmm4, xmm13 - ; FPADD_R f0, a1 - addpd xmm0, xmm9 - ; FPMUL_R e0, a2 - mulpd xmm4, xmm10 - ; FPSUB_R f2, a2 - subpd xmm2, xmm10 - ; FPSUB_M f2, L1[r6] - mov eax, r14d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm2, xmm12 - ; FPMUL_R e0, a0 - mulpd xmm4, xmm8 - ; IXOR_M r4, L2[r7] - mov eax, r15d - and eax, 262136 - xor r12, qword ptr [rsi+rax] - ; FPSUB_R f3, a3 - subpd xmm3, xmm11 - ; ISMULH_R r1, r6 - mov rax, r9 - imul r14 - mov r9, rdx - ; COND_R r4, be(r7, 224524971) - xor ecx, ecx - cmp r15d, 224524971 - setbe cl - add r12, rcx - ; FPADD_M f2, L1[r1] - mov eax, r9d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - addpd xmm2, xmm12 - ; IMUL_R r5, r4 - imul r13, r12 - ; IADD_RC r1, r5, 370966979 - lea r9, [r9+r13+370966979] - ; IADD_RC r7, r3, -1762209698 - lea r15, [r15+r11-1762209698] - ; FPMUL_M e3, L2[r2] - mov eax, r10d - and eax, 262136 - cvtdq2pd xmm12, qword ptr [rsi+rax] - mulpd xmm7, xmm12 - maxpd xmm7, xmm13 - ; ISUB_R r2, r7 - sub r10, r15 - ; IMUL_9C r3, 171157280 - lea r11, [r11+r11*8+171157280] - ; ISUB_R r3, r5 - sub r11, r13 - ; FPNEG_R f3 - xorps xmm3, xmm15 - ; FPNEG_R f2 - xorps xmm2, xmm15 - ; ISTORE L1[r4], r1 - mov eax, r12d - and eax, 16376 - mov qword ptr [rsi+rax], r9 - ; IADD_R r0, r2 - add r8, r10 - ; IXOR_R r7, r6 - xor r15, r14 - ; IROR_R r0, r4 - mov ecx, r12d - ror r8, cl - ; FPMUL_R e3, a2 - mulpd xmm7, xmm10 - ; IXOR_M r4, L1[r7] - mov eax, r15d - and eax, 16376 - xor r12, qword ptr [rsi+rax] - ; ISTORE L1[r5], r7 - mov eax, r13d - and eax, 16376 - mov qword ptr [rsi+rax], r15 - ; IMUL_9C r7, -1206742834 - lea r15, [r15+r15*8-1206742834] - ; ISMULH_R r0, r4 + ; IMULH_R r0, r4 mov rax, r8 - imul r12 + mul r12 mov r8, rdx - ; FPADD_R f2, a0 - addpd xmm2, xmm8 - ; FPSUB_R f1, a0 - subpd xmm1, xmm8 - ; INEG_R r7 - neg r15 - ; COND_M r0, of(L1[r5], -2056260506) - xor ecx, ecx - mov eax, r13d - and eax, 16376 - cmp dword ptr [rsi+rax], -2056260506 - seto cl - add r8, rcx - ; FPSQRT_R e2 + ; IMUL_R r5, r3 + imul r13, r11 + ; FSQRT_R e2 sqrtpd xmm6, xmm6 - ; IMUL_R r3, r4 - imul r11, r12 - ; FPNEG_R f1 - xorps xmm1, xmm15 - ; FPADD_M f2, L1[r5] - mov eax, r13d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - addpd xmm2, xmm12 - ; FPSUB_R f3, a0 - subpd xmm3, xmm8 - ; FPNEG_R f3 - xorps xmm3, xmm15 - ; FPMUL_M e3, L2[r5] - mov eax, r13d - and eax, 262136 - cvtdq2pd xmm12, qword ptr [rsi+rax] - mulpd xmm7, xmm12 - maxpd xmm7, xmm13 - ; ISTORE L1[r2], r2 - mov eax, r10d - and eax, 16376 - mov qword ptr [rsi+rax], r10 - ; IMUL_M r3, L2[r4] - mov eax, r12d - and eax, 262136 - imul r11, qword ptr [rsi+rax] - ; IROL_R r5, r6 - mov ecx, r14d - rol r13, cl - ; IADD_RC r4, r3, -904431293 - lea r12, [r12+r11-904431293] - ; FPSUB_R f1, a1 - subpd xmm1, xmm9 - ; IROL_R r7, r0 - mov ecx, r8d - rol r15, cl - ; ISTORE L2[r1], r7 - mov eax, r9d - and eax, 262136 - mov qword ptr [rsi+rax], r15 - ; IROL_R r4, r3 - mov ecx, r11d - rol r12, cl - ; IADD_R r5, r2 - add r13, r10 - ; COND_R r3, ge(r6, -444806705) - xor ecx, ecx - cmp r14d, -444806705 - setge cl - add r11, rcx - ; FPADD_R f0, a1 - addpd xmm0, xmm9 - ; IROL_R r0, 57 - rol r8, 57 - ; IADD_R r0, r2 - add r8, r10 - ; IADD_R r7, r4 - add r15, r12 - ; IROL_R r1, r7 - mov ecx, r15d - rol r9, cl - ; IXOR_M r7, L2[r5] - mov eax, r13d - and eax, 262136 - xor r15, qword ptr [rsi+rax] - ; ISTORE L1[r2], r0 - mov eax, r10d - and eax, 16376 - mov qword ptr [rsi+rax], r8 - ; FPADD_R f1, a2 - addpd xmm1, xmm10 - ; ISUB_R r1, r4 - sub r9, r12 - ; IXOR_R r5, r0 - xor r13, r8 - ; IXOR_M r7, L2[r1] - mov eax, r9d - and eax, 262136 - xor r15, qword ptr [rsi+rax] - ; FPSUB_R f0, a0 - subpd xmm0, xmm8 - ; IXOR_M r1, L1[r4] - mov eax, r12d - and eax, 16376 - xor r9, qword ptr [rsi+rax] - ; FPMUL_R e3, a0 - mulpd xmm7, xmm8 - ; ISDIV_C r1, 1473744194 - mov rax, -5006799265644655925 - imul r9 - xor eax, eax - add rdx, r9 - sar rdx, 30 - sets al - add rdx, rax - add r9, rdx - ; IMUL_9C r1, 1626151459 - lea r9, [r9+r9*8+1626151459] - ; IXOR_M r6, L1[r4] - mov eax, r12d - and eax, 16376 - xor r14, qword ptr [rsi+rax] - ; FPADD_R f0, a0 - addpd xmm0, xmm8 - ; FPADD_R f3, a2 - addpd xmm3, xmm10 - ; ISUB_R r6, r7 - sub r14, r15 - ; IADD_RC r1, r5, 2075955307 - lea r9, [r9+r13+2075955307] - ; IROL_R r6, r3 - mov ecx, r11d - rol r14, cl - ; IMULH_R r2, -1135671124 - mov eax, -1135671124 - mul r10 - add r10, rdx - ; ISUB_R r5, r2 - sub r13, r10 - ; IMULH_R r3, r5 - mov rax, r11 - mul r13 - mov r11, rdx - ; IADD_M r4, L3[386040] - add r12, qword ptr [rsi+386040] - ; COND_R r6, ge(r4, 1518758207) - xor ecx, ecx - cmp r12d, 1518758207 - setge cl - add r14, rcx - ; FPDIV_R e3, a1 - divpd xmm7, xmm9 - maxpd xmm7, xmm13 - ; FPNEG_R f2 - xorps xmm2, xmm15 - ; FPADD_M f1, L1[r4] - mov eax, r12d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - addpd xmm1, xmm12 - ; FPMUL_M e0, L1[r4] - mov eax, r12d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - mulpd xmm4, xmm12 - maxpd xmm4, xmm13 - ; FPSQRT_R e2 - sqrtpd xmm6, xmm6 - ; IROL_R r5, r1 - mov ecx, r9d - rol r13, cl - ; FPADD_R f3, a0 + ; FADD_R f3, a0 addpd xmm3, xmm8 - ; IROL_R r3, r0 - mov ecx, r8d - rol r11, cl - ; FPMUL_R e3, a1 - mulpd xmm7, xmm9 - ; IROR_R r0, r7 - mov ecx, r15d - ror r8, cl - ; FPADD_R f2, a2 - addpd xmm2, xmm10 - ; IXOR_R r7, r0 - xor r15, r8 - ; ISTORE L1[r4], r1 + ; IADD_R r3, r2 + add r11, r10 + ; FADD_R f1, a0 + addpd xmm1, xmm8 + ; FMUL_R e3, a2 + mulpd xmm7, xmm10 + ; FADD_R f0, a1 + addpd xmm0, xmm9 + ; IMUL_R r5, r6 + imul r13, r14 + ; IADD_RC r1, r2, -1263285243 + lea r9, [r9+r10-1263285243] + ; ISUB_M r4, L1[r6] + mov eax, r14d + and eax, 16376 + sub r12, qword ptr [rsi+rax] + ; IROL_R r7, r2 + mov ecx, r10d + rol r15, cl + ; IMUL_R r0, r7 + imul r8, r15 + ; IXOR_R r1, r6 + xor r9, r14 + ; IXOR_M r2, L1[r4] mov eax, r12d and eax, 16376 - mov qword ptr [rsi+rax], r9 - ; ISTORE L2[r0], r4 + xor r10, qword ptr [rsi+rax] + ; FSUB_R f3, a1 + subpd xmm3, xmm9 + ; ISTORE L1[r0], r5 mov eax, r8d - and eax, 262136 - mov qword ptr [rsi+rax], r12 - ; FPDIV_R e3, a3 - divpd xmm7, xmm11 - maxpd xmm7, xmm13 - ; ISTORE L2[r4], r6 - mov eax, r12d - and eax, 262136 - mov qword ptr [rsi+rax], r14 - ; IMUL_R r3, r1 - imul r11, r9 - ; IXOR_R r2, r4 - xor r10, r12 - ; ISTORE L2[r3], r5 + and eax, 16376 + mov qword ptr [rsi+rax], r13 + ; FDIV_M e2, L2[r3] mov eax, r11d and eax, 262136 - mov qword ptr [rsi+rax], r13 - ; FPMUL_M e2, L2[r4] - mov eax, r12d - and eax, 262136 cvtdq2pd xmm12, qword ptr [rsi+rax] - mulpd xmm6, xmm12 + andps xmm12, xmm14 + divpd xmm6, xmm12 maxpd xmm6, xmm13 - ; FPSUB_R f3, a0 - subpd xmm3, xmm8 - ; COND_R r1, ab(r7, -229570354) - xor ecx, ecx - cmp r15d, -229570354 - seta cl - add r9, rcx - ; IROR_R r7, r3 - mov ecx, r11d - ror r15, cl - ; FPDIV_R e2, a0 - divpd xmm6, xmm8 - maxpd xmm6, xmm13 - ; IADD_R r2, r5 - add r10, r13 - ; FPDIV_R e1, a3 - divpd xmm5, xmm11 - maxpd xmm5, xmm13 - ; FPSQRT_R e2 - sqrtpd xmm6, xmm6 - ; ISUB_R r3, r7 - sub r11, r15 - ; FPADD_R f0, a0 - addpd xmm0, xmm8 - ; IMUL_M r0, L3[98136] - imul r8, qword ptr [rsi+98136] - ; IMUL_9C r5, -895487055 - lea r13, [r13+r13*8-895487055] - ; IMULH_R r2, r7 - mov rax, r10 - mul r15 - mov r10, rdx - ; IADD_R r4, r1 - add r12, r9 - ; ISDIV_C r0, 494395999 - mov rax, 5007888582388710937 - imul r8 + ; IROL_R r2, r0 + mov ecx, r8d + rol r10, cl + ; IADD_R r7, r5 + add r15, r13 + ; FDIV_M e0, L1[r4] + mov eax, r12d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + andps xmm12, xmm14 + divpd xmm4, xmm12 + maxpd xmm4, xmm13 + ; FADD_R f3, a1 + addpd xmm3, xmm9 + ; FADD_R f0, a3 + addpd xmm0, xmm11 + ; IADD_R r2, r0 + add r10, r8 + ; ISTORE L1[r3], r6 + mov eax, r11d + and eax, 16376 + mov qword ptr [rsi+rax], r14 + ; IXOR_R r1, r7 + xor r9, r15 + ; ISUB_M r5, L2[r7] + mov eax, r15d + and eax, 262136 + sub r13, qword ptr [rsi+rax] + ; ISDIV_C r7, 266992378 + mov rax, -9173520256920442565 + imul r15 xor eax, eax + add rdx, r15 sar rdx, 27 sets al add rdx, rax - add r8, rdx - ; FPSWAP_R e0 - shufpd xmm4, xmm4, 1 - ; IXOR_R r1, r5 - xor r9, r13 - ; COND_R r2, ab(r3, 1932234501) + add r15, rdx + ; FDIV_M e3, L1[r4] + mov eax, r12d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + andps xmm12, xmm14 + divpd xmm7, xmm12 + maxpd xmm7, xmm13 + ; IMUL_R r2, r0 + imul r10, r8 + ; FMUL_R e3, a2 + mulpd xmm7, xmm10 + ; IMUL_R r0, r6 + imul r8, r14 + ; ISTORE L1[r0], r7 + mov eax, r8d + and eax, 16376 + mov qword ptr [rsi+rax], r15 + ; FSUB_R f0, a1 + subpd xmm0, xmm9 + ; FADD_R f3, a1 + addpd xmm3, xmm9 + ; IXOR_R r5, r4 + xor r13, r12 + ; ISTORE L2[r7], r2 + mov eax, r15d + and eax, 262136 + mov qword ptr [rsi+rax], r10 + ; ISWAP_R r6, r7 + xchg r14, r15 + ; FADD_R f3, a2 + addpd xmm3, xmm10 + ; ISMULH_R r5, r0 + mov rax, r13 + imul r8 + mov r13, rdx + ; IADD_M r0, L1[r4] + mov eax, r12d + and eax, 16376 + add r8, qword ptr [rsi+rax] + ; COND_R r7, ge(r6, -1972898485) xor ecx, ecx - cmp r11d, 1932234501 - seta cl + cmp r14d, -1972898485 + setge cl + add r15, rcx + ; FADD_R f2, a2 + addpd xmm2, xmm10 + ; IROR_R r7, r6 + mov ecx, r14d + ror r15, cl + ; IADD_RC r2, r4, -117457973 + lea r10, [r10+r12-117457973] + ; IMUL_R r0, -1500893068 + imul r8, -1500893068 + ; IADD_R r2, r3 + add r10, r11 + ; FSQRT_R e2 + sqrtpd xmm6, xmm6 + ; IROR_R r7, r4 + mov ecx, r12d + ror r15, cl + ; IMUL_9C r4, 381194890 + lea r12, [r12+r12*8+381194890] + ; IADD_RC r3, r7, 1050899263 + lea r11, [r11+r15+1050899263] + ; IADD_R r2, r7 + add r10, r15 + ; FMUL_R e3, a0 + mulpd xmm7, xmm8 + ; IADD_RC r6, r6, 540663146 + lea r14, [r14+r14+540663146] + ; IROR_R r5, 58 + ror r13, 58 + ; FSWAP_R f2 + shufpd xmm2, xmm2, 1 + ; FSWAP_R f2 + shufpd xmm2, xmm2, 1 + ; FMUL_R e1, a2 + mulpd xmm5, xmm10 + ; ISWAP_R r5, r6 + xchg r13, r14 + ; IADD_R r5, r3 + add r13, r11 + ; IADD_R r7, -1780268176 + add r15, -1780268176 + ; IADD_RC r7, r0, -1497756854 + lea r15, [r15+r8-1497756854] + ; ISTORE L2[r0], r7 + mov eax, r8d + and eax, 262136 + mov qword ptr [rsi+rax], r15 + ; ISMULH_R r2, r4 + mov rax, r10 + imul r12 + mov r10, rdx + ; FSUB_R f0, a2 + subpd xmm0, xmm10 + ; ISMULH_R r2, r3 + mov rax, r10 + imul r11 + mov r10, rdx + ; IADD_R r0, r3 + add r8, r11 + ; ISUB_R r7, r2 + sub r15, r10 + ; FADD_R f2, a0 + addpd xmm2, xmm8 + ; FMUL_R e0, a2 + mulpd xmm4, xmm10 + ; FADD_R f2, a3 + addpd xmm2, xmm11 + ; IMUL_R r1, r2 + imul r9, r10 + ; IMUL_M r7, L1[r5] + mov eax, r13d + and eax, 16376 + imul r15, qword ptr [rsi+rax] + ; IMUL_R r3, r2 + imul r11, r10 + ; IXOR_R r1, r0 + xor r9, r8 + ; FSUB_M f0, L1[r5] + mov eax, r13d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm0, xmm12 + ; IADD_RC r4, r4, 1456841848 + lea r12, [r12+r12+1456841848] + ; IXOR_R r3, r2 + xor r11, r10 + ; COND_R r0, of(r4, 1678513610) + xor ecx, ecx + cmp r12d, 1678513610 + seto cl + add r8, rcx + ; ISMULH_R r4, -1620573087 + mov rax, -1620573087 + imul r12 + add r12, rdx + ; IMUL_R r4, r1 + imul r12, r9 + ; FSWAP_R e1 + shufpd xmm5, xmm5, 1 + ; FADD_M f2, L1[r0] + mov eax, r8d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + addpd xmm2, xmm12 + ; FMUL_R e1, a2 + mulpd xmm5, xmm10 + ; FSUB_R f0, a3 + subpd xmm0, xmm11 + ; IXOR_R r0, r7 + xor r8, r15 + ; ISTORE L2[r1], r4 + mov eax, r9d + and eax, 262136 + mov qword ptr [rsi+rax], r12 + ; IXOR_M r7, L1[r6] + mov eax, r14d + and eax, 16376 + xor r15, qword ptr [rsi+rax] + ; ISUB_R r2, r4 + sub r10, r12 + ; ISUB_M r4, L1[r6] + mov eax, r14d + and eax, 16376 + sub r12, qword ptr [rsi+rax] + ; FADD_R f2, a2 + addpd xmm2, xmm10 + ; FSUB_R f3, a0 + subpd xmm3, xmm8 + ; IXOR_R r7, r2 + xor r15, r10 + ; IXOR_R r0, r5 + xor r8, r13 + ; FSWAP_R f1 + shufpd xmm1, xmm1, 1 + ; FMUL_R e3, a2 + mulpd xmm7, xmm10 + ; ISWAP_R r7, r1 + xchg r15, r9 + ; ISWAP_R r1, r4 + xchg r9, r12 + ; COND_R r2, ge(r2, -226330940) + xor ecx, ecx + cmp r10d, -226330940 + setge cl add r10, rcx - ; FPMUL_R e1, a0 - mulpd xmm5, xmm8 - ; FPSUB_M f1, L1[r1] + ; FMUL_R e2, a3 + mulpd xmm6, xmm11 + ; FSUB_R f2, a1 + subpd xmm2, xmm9 + ; FADD_R f1, a0 + addpd xmm1, xmm8 + ; ISUB_R r7, r5 + sub r15, r13 + ; ISUB_M r0, L1[r1] + mov eax, r9d + and eax, 16376 + sub r8, qword ptr [rsi+rax] + ; FSUB_R f3, a1 + subpd xmm3, xmm9 + ; IROL_R r3, r5 + mov ecx, r13d + rol r11, cl + ; IADD_RC r5, r2, 795784298 + lea r13, [r13+r10+795784298] + ; IADD_RC r0, r4, -2050178553 + lea r8, [r8+r12-2050178553] + ; IMUL_9C r5, 1062534001 + lea r13, [r13+r13*8+1062534001] + ; FADD_R f0, a2 + addpd xmm0, xmm10 + ; FMUL_R e3, a1 + mulpd xmm7, xmm9 + ; IDIV_C r3, 1662492575 + mov rax, 11914062610815620875 + mul r11 + shr rdx, 30 + add r11, rdx + ; IMUL_M r5, L1[r0] + mov eax, r8d + and eax, 16376 + imul r13, qword ptr [rsi+rax] + ; IDIV_C r4, 1963597892 + mov rax, r12 + shr rax, 2 + mov rcx, 1260889558222626443 + mul rcx + shr rdx, 25 + add r12, rdx + ; IMUL_9C r7, 1820045218 + lea r15, [r15+r15*8+1820045218] + ; IMUL_M r0, L1[r3] + mov eax, r11d + and eax, 16376 + imul r8, qword ptr [rsi+rax] + ; IXOR_R r3, r7 + xor r11, r15 + ; ISMULH_R r4, r2 + mov rax, r12 + imul r10 + mov r12, rdx + ; IROL_R r3, r0 + mov ecx, r8d + rol r11, cl + ; IXOR_R r2, r0 + xor r10, r8 + ; IXOR_M r0, L2[r1] + mov eax, r9d + and eax, 262136 + xor r8, qword ptr [rsi+rax] + ; ISDIV_C r7, -935446980 + mov rax, 7859804860668271393 + imul r15 + xor eax, eax + sub rdx, r15 + sar rdx, 29 + sets al + add rdx, rax + add r15, rdx + ; IMUL_M r6, L1[r2] + mov eax, r10d + and eax, 16376 + imul r14, qword ptr [rsi+rax] + ; FSUB_M f3, L1[r6] + mov eax, r14d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm3, xmm12 + ; IADD_RC r4, r2, 1704868083 + lea r12, [r12+r10+1704868083] + ; FADD_R f2, a0 + addpd xmm2, xmm8 + ; ISTORE L1[r0], r0 + mov eax, r8d + and eax, 16376 + mov qword ptr [rsi+rax], r8 + ; FADD_R f0, a3 + addpd xmm0, xmm11 + ; FMUL_R e0, a3 + mulpd xmm4, xmm11 + ; FSUB_R f3, a2 + subpd xmm3, xmm10 + ; IADD_RC r7, r7, 1302457878 + lea r15, [r15+r15+1302457878] + ; ISUB_R r1, 1330165941 + sub r9, 1330165941 + ; FSUB_R f1, a3 + subpd xmm1, xmm11 + ; IROR_R r0, r4 + mov ecx, r12d + ror r8, cl + ; FSUB_R f1, a0 + subpd xmm1, xmm8 + ; IROR_R r5, r6 + mov ecx, r14d + ror r13, cl + ; COND_R r0, ab(r1, -310933871) + xor ecx, ecx + cmp r9d, -310933871 + seta cl + add r8, rcx + ; COND_R r4, ab(r7, 757929676) + xor ecx, ecx + cmp r15d, 757929676 + seta cl + add r12, rcx + ; FMUL_R e0, a1 + mulpd xmm4, xmm9 + ; IMUL_R r1, r3 + imul r9, r11 + ; ISUB_R r3, r2 + sub r11, r10 + ; FSUB_R f3, a2 + subpd xmm3, xmm10 + ; FDIV_M e1, L1[r4] + mov eax, r12d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + andps xmm12, xmm14 + divpd xmm5, xmm12 + maxpd xmm5, xmm13 + ; IROL_R r1, 5 + rol r9, 5 + ; IADD_R r7, -1421188024 + add r15, -1421188024 + ; FSUB_R f3, a2 + subpd xmm3, xmm10 + ; FSUB_R f2, a3 + subpd xmm2, xmm11 + ; FADD_M f3, L1[r1] mov eax, r9d and eax, 16376 cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm1, xmm12 - ; FPSUB_R f0, a0 - subpd xmm0, xmm8 - ; IROL_R r1, r7 - mov ecx, r15d - rol r9, cl - ; IADD_RC r0, r5, -2051588680 - lea r8, [r8+r13-2051588680] - ; COND_R r6, of(r5, -795593984) + addpd xmm3, xmm12 + ; FMUL_R e1, a3 + mulpd xmm5, xmm11 + ; IADD_RC r2, r4, -317832028 + lea r10, [r10+r12-317832028] + ; IMUL_M r4, L1[r5] + mov eax, r13d + and eax, 16376 + imul r12, qword ptr [rsi+rax] + ; FDIV_M e1, L1[r7] + mov eax, r15d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + andps xmm12, xmm14 + divpd xmm5, xmm12 + maxpd xmm5, xmm13 + ; IADD_R r5, r2 + add r13, r10 + ; ISUB_R r4, 401020510 + sub r12, 401020510 + ; IROR_R r3, r0 + mov ecx, r8d + ror r11, cl + ; ISTORE L1[r7], r0 + mov eax, r15d + and eax, 16376 + mov qword ptr [rsi+rax], r8 + ; FSUB_R f2, a1 + subpd xmm2, xmm9 + ; FMUL_R e3, a1 + mulpd xmm7, xmm9 + ; IMUL_9C r3, 720965215 + lea r11, [r11+r11*8+720965215] + ; IMUL_9C r6, 74948046 + lea r14, [r14+r14*8+74948046] + ; ISTORE L1[r7], r3 + mov eax, r15d + and eax, 16376 + mov qword ptr [rsi+rax], r11 + ; IXOR_R r2, r6 + xor r10, r14 + ; FMUL_R e3, a1 + mulpd xmm7, xmm9 + ; ISUB_R r4, r1 + sub r12, r9 + ; ISUB_R r3, r0 + sub r11, r8 + ; IROL_R r7, r5 + mov ecx, r13d + rol r15, cl + ; IMUL_R r2, r6 + imul r10, r14 + ; COND_R r2, ge(r2, -1892157506) xor ecx, ecx - cmp r13d, -795593984 - seto cl - add r14, rcx - ; FPADD_R f1, a0 - addpd xmm1, xmm8 - ; IMULH_R r7, r3 - mov rax, r15 - mul r11 - mov r15, rdx - ; ISUB_R r7, r4 - sub r15, r12 - ; IROL_R r0, r6 - mov ecx, r14d - rol r8, cl - ; ISDIV_C r1, -675825513 - mov rax, -7326980207007250257 - imul r9 + cmp r10d, -1892157506 + setge cl + add r10, rcx + ; FADD_R f1, a3 + addpd xmm1, xmm11 + ; IADD_R r7, r0 + add r15, r8 + ; IDIV_C r1, 624867857 + mov rax, 15848983434401622933 + mul r9 + shr rdx, 29 + add r9, rdx + ; FADD_R f0, a1 + addpd xmm0, xmm9 + ; IADD_RC r5, r7, -477591118 + lea r13, [r13+r15-477591118] + ; FSUB_R f0, a3 + subpd xmm0, xmm11 + ; ISUB_M r6, L1[r2] + mov eax, r10d + and eax, 16376 + sub r14, qword ptr [rsi+rax] + ; FMUL_R e3, a1 + mulpd xmm7, xmm9 + ; IADD_R r0, r4 + add r8, r12 + ; FSUB_R f3, a1 + subpd xmm3, xmm9 + ; FSUB_R f2, a0 + subpd xmm2, xmm8 + ; ISDIV_C r2, -396711688 + mov rax, 5964731804029407733 + imul r10 xor eax, eax + sub rdx, r10 sar rdx, 28 sets al add rdx, rax - add r9, rdx - ; ISTORE L1[r6], r3 - mov eax, r14d - and eax, 16376 - mov qword ptr [rsi+rax], r11 - ; IROR_R r4, r3 - mov ecx, r11d - ror r12, cl - ; IDIV_C r4, 3919226376 - mov rax, r12 - shr rax, 3 - mov rcx, 2526906936258851663 - mul rcx - shr rdx, 26 - add r12, rdx - ; FPSUB_R f1, a1 - subpd xmm1, xmm9 - ; FPSUB_R f0, a0 + add r10, rdx + ; FSUB_R f2, a2 + subpd xmm2, xmm10 + ; FSUB_R f3, a2 + subpd xmm3, xmm10 + ; FADD_R f1, a3 + addpd xmm1, xmm11 + ; IMUL_R r3, r2 + imul r11, r10 + ; FADD_R f0, a3 + addpd xmm0, xmm11 + ; ISMULH_R r5, r2 + mov rax, r13 + imul r10 + mov r13, rdx + ; IMULH_R r6, r2 + mov rax, r14 + mul r10 + mov r14, rdx + ; FADD_R f3, a3 + addpd xmm3, xmm11 + ; IMUL_R r6, r7 + imul r14, r15 + ; FSUB_R f0, a0 subpd xmm0, xmm8 - ; IADD_R r0, r2 - add r8, r10 - ; IADD_M r4, L1[r2] - mov eax, r10d + ; FSUB_R f2, a0 + subpd xmm2, xmm8 + ; ISUB_R r6, r4 + sub r14, r12 + ; FSWAP_R f1 + shufpd xmm1, xmm1, 1 + ; IXOR_R r0, r5 + xor r8, r13 + ; FADD_R f2, a1 + addpd xmm2, xmm9 + ; IROL_R r7, r5 + mov ecx, r13d + rol r15, cl + ; FMUL_R e3, a2 + mulpd xmm7, xmm10 + ; IADD_RC r3, r6, -1317630728 + lea r11, [r11+r14-1317630728] + ; IMUL_R r2, r3 + imul r10, r11 + ; IADD_RC r1, r4, 894105694 + lea r9, [r9+r12+894105694] + ; IMUL_9C r7, 504293473 + lea r15, [r15+r15*8+504293473] + ; FADD_M f1, L2[r0] + mov eax, r8d + and eax, 262136 + cvtdq2pd xmm12, qword ptr [rsi+rax] + addpd xmm1, xmm12 + ; IMUL_R r7, r1 + imul r15, r9 + ; IXOR_R r2, r4 + xor r10, r12 + ; IADD_RC r0, r1, 392362094 + lea r8, [r8+r9+392362094] + ; IDIV_C r4, 1645771433 + mov rax, 376097195048767223 + mul r12 + shr rdx, 25 + add r12, rdx + ; ISUB_R r4, r3 + sub r12, r11 + ; ISUB_M r7, L1[r4] + mov eax, r12d and eax, 16376 - add r12, qword ptr [rsi+rax] - ; ISTORE L1[r7], r2 + sub r15, qword ptr [rsi+rax] + ; IMUL_M r5, L1[r7] mov eax, r15d and eax, 16376 - mov qword ptr [rsi+rax], r10 - ; FPSQRT_R e1 - sqrtpd xmm5, xmm5 - ; IADD_R r5, r4 - add r13, r12 - ; IXOR_R r6, r7 - xor r14, r15 - ; ISMULH_R r4, r7 - mov rax, r12 - imul r15 - mov r12, rdx - ; FPSQRT_R e1 - sqrtpd xmm5, xmm5 + imul r13, qword ptr [rsi+rax] + ; IROR_R r1, r7 + mov ecx, r15d + ror r9, cl + ; INEG_R r4 + neg r12 + ; IMUL_R r3, 1863959234 + imul r11, 1863959234 + ; IROR_R r4, 59 + ror r12, 59 + ; IMUL_M r1, L3[363256] + imul r9, qword ptr [rsi+363256] + ; ISTORE L2[r6], r7 + mov eax, r14d + and eax, 262136 + mov qword ptr [rsi+rax], r15 + ; ISTORE L1[r1], r5 + mov eax, r9d + and eax, 16376 + mov qword ptr [rsi+rax], r13 + ; FSUB_M f0, L1[r1] + mov eax, r9d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm0, xmm12 + ; FSQRT_R e2 + sqrtpd xmm6, xmm6 + ; FMUL_R e0, a3 + mulpd xmm4, xmm11 + ; FMUL_R e3, a2 + mulpd xmm7, xmm10 + ; IROL_R r5, r2 + mov ecx, r10d + rol r13, cl + ; IADD_R r0, r4 + add r8, r12