diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp
index ff812e7..e2eaf44 100644
--- a/src/AssemblyGeneratorX86.cpp
+++ b/src/AssemblyGeneratorX86.cpp
@@ -35,6 +35,8 @@ namespace RandomX {
static const char* regE[4] = { "xmm4", "xmm5", "xmm6", "xmm7" };
static const char* regA[4] = { "xmm8", "xmm9", "xmm10", "xmm11" };
+ static const char* fsumInstr[4] = { "paddb", "paddw", "paddd", "paddq" };
+
static const char* regA4 = "xmm12";
static const char* dblMin = "xmm13";
static const char* absMask = "xmm14";
@@ -365,6 +367,7 @@ namespace RandomX {
instr.dst %= 4;
instr.src %= 4;
asmCode << "\taddpd " << regF[instr.dst] << ", " << regA[instr.src] << std::endl;
+ //asmCode << "\t" << fsumInstr[instr.mod % 4] << " " << signMask << ", " << regF[instr.dst] << std::endl;
}
//5 uOPs
@@ -380,6 +383,7 @@ namespace RandomX {
instr.dst %= 4;
instr.src %= 4;
asmCode << "\tsubpd " << regF[instr.dst] << ", " << regA[instr.src] << std::endl;
+ //asmCode << "\t" << fsumInstr[instr.mod % 4] << " " << signMask << ", " << regF[instr.dst] << std::endl;
}
//5 uOPs
@@ -391,9 +395,9 @@ namespace RandomX {
}
//1 uOP
- void AssemblyGeneratorX86::h_FPNEG_R(Instruction& instr, int i) {
+ void AssemblyGeneratorX86::h_CFSUM_R(Instruction& instr, int i) {
instr.dst %= 4;
- asmCode << "\txorps " << regF[instr.dst] << ", " << signMask << std::endl;
+ asmCode << "\t" << fsumInstr[instr.mod % 4] << " " << signMask << ", " << regF[instr.dst] << std::endl;
}
//1 uOPs
@@ -538,7 +542,7 @@ namespace RandomX {
INST_HANDLE(FADD_M)
INST_HANDLE(FSUB_R)
INST_HANDLE(FSUB_M)
- INST_HANDLE(FPNEG_R)
+ INST_HANDLE(CFSUM_R)
//Floating point group E
INST_HANDLE(FMUL_R)
diff --git a/src/AssemblyGeneratorX86.hpp b/src/AssemblyGeneratorX86.hpp
index 5abebc1..5abf707 100644
--- a/src/AssemblyGeneratorX86.hpp
+++ b/src/AssemblyGeneratorX86.hpp
@@ -69,7 +69,7 @@ namespace RandomX {
void h_FADD_M(Instruction&, int);
void h_FSUB_R(Instruction&, int);
void h_FSUB_M(Instruction&, int);
- void h_FPNEG_R(Instruction&, int);
+ void h_CFSUM_R(Instruction&, int);
void h_FMUL_R(Instruction&, int);
void h_FMUL_M(Instruction&, int);
void h_FDIV_R(Instruction&, int);
diff --git a/src/CompiledVirtualMachine.cpp b/src/CompiledVirtualMachine.cpp
index ebacf42..3bf3371 100644
--- a/src/CompiledVirtualMachine.cpp
+++ b/src/CompiledVirtualMachine.cpp
@@ -44,7 +44,7 @@ namespace RandomX {
}
static uint64_t getSmallPositiveFloatBits(uint64_t entropy) {
- auto exponent = entropy >> 60; //0..15
+ auto exponent = entropy >> 59; //0..31
auto mantissa = entropy & mantissaMask;
exponent += exponentBias;
exponent &= exponentMask;
diff --git a/src/Instruction.cpp b/src/Instruction.cpp
index 18017e7..5784c99 100644
--- a/src/Instruction.cpp
+++ b/src/Instruction.cpp
@@ -247,9 +247,9 @@ namespace RandomX {
os << std::endl;
}
- void Instruction::h_FPNEG_R(std::ostream& os) const {
+ void Instruction::h_CFSUM_R(std::ostream& os) const {
auto dstIndex = dst % 4;
- os << "f" << dstIndex << std::endl;
+ os << "f" << dstIndex << ", " << (1 << ((mod % 4) + 3)) << std::endl;
}
void Instruction::h_FMUL_R(std::ostream& os) const {
@@ -370,7 +370,7 @@ namespace RandomX {
INST_NAME(FADD_M)
INST_NAME(FSUB_R)
INST_NAME(FSUB_M)
- INST_NAME(FPNEG_R)
+ INST_NAME(CFSUM_R)
//Floating point group E
INST_NAME(FMUL_R)
@@ -421,7 +421,7 @@ namespace RandomX {
INST_HANDLE(FADD_M)
INST_HANDLE(FSUB_R)
INST_HANDLE(FSUB_M)
- INST_HANDLE(FPNEG_R)
+ INST_HANDLE(CFSUM_R)
//Floating point group E
INST_HANDLE(FMUL_R)
diff --git a/src/Instruction.hpp b/src/Instruction.hpp
index f530bbc..4f9e178 100644
--- a/src/Instruction.hpp
+++ b/src/Instruction.hpp
@@ -54,7 +54,7 @@ namespace RandomX {
constexpr int FADD_M = 22;
constexpr int FSUB_R = 23;
constexpr int FSUB_M = 24;
- constexpr int FPNEG_R = 25;
+ constexpr int CFSUM_R = 25;
constexpr int FMUL_R = 26;
constexpr int FMUL_M = 27;
constexpr int FDIV_R = 28;
@@ -116,7 +116,7 @@ namespace RandomX {
void h_FADD_M(std::ostream&) const;
void h_FSUB_R(std::ostream&) const;
void h_FSUB_M(std::ostream&) const;
- void h_FPNEG_R(std::ostream&) const;
+ void h_CFSUM_R(std::ostream&) const;
void h_FMUL_R(std::ostream&) const;
void h_FMUL_M(std::ostream&) const;
void h_FDIV_R(std::ostream&) const;
diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp
index e891a27..de803be 100644
--- a/src/JitCompilerX86.cpp
+++ b/src/JitCompilerX86.cpp
@@ -87,7 +87,7 @@ namespace RandomX {
; xmm12 -> temporary
; xmm13 -> DBL_MIN
; xmm14 -> absolute value mask 0x7fffffffffffffff7fffffffffffffff
- ; xmm15 -> sign mask 0x80000000000000008000000000000000
+ ; xmm15 -> unused
*/
@@ -178,6 +178,8 @@ namespace RandomX {
static const uint8_t REX_XOR_RAX_R64[] = { 0x49, 0x33 };
static const uint8_t REX_XCHG[] = { 0x4d, 0x87 };
static const uint8_t REX_ANDPS_XMM12[] = { 0x41, 0x0f, 0x54, 0xe6 };
+ static const uint8_t REX_PADD[] = { 0x66, 0x44, 0x0f };
+ static const uint8_t PADD_OPCODES[] = { 0xfc, 0xfd, 0xfe, 0xd4 };
size_t JitCompilerX86::getCodeSize() {
return codePos - prologueSize;
@@ -615,6 +617,9 @@ namespace RandomX {
instr.src %= 4;
emit(REX_ADDPD);
emitByte(0xc0 + instr.src + 8 * instr.dst);
+ //emit(REX_PADD);
+ //emitByte(PADD_OPCODES[instr.mod % 4]);
+ //emitByte(0xf8 + instr.dst);
}
void JitCompilerX86::h_FADD_M(Instruction& instr) {
@@ -630,6 +635,9 @@ namespace RandomX {
instr.src %= 4;
emit(REX_SUBPD);
emitByte(0xc0 + instr.src + 8 * instr.dst);
+ //emit(REX_PADD);
+ //emitByte(PADD_OPCODES[instr.mod % 4]);
+ //emitByte(0xf8 + instr.dst);
}
void JitCompilerX86::h_FSUB_M(Instruction& instr) {
@@ -640,7 +648,9 @@ namespace RandomX {
emitByte(0xc4 + 8 * instr.dst);
}
- void JitCompilerX86::h_FPNEG_R(Instruction& instr) {
+ void JitCompilerX86::h_CFSUM_R(Instruction& instr) {
instr.dst %= 4;
- emit(REX_XORPS);
- emitByte(0xc7 + 8 * instr.dst);
+ //padd{b,w,d,q} xmm15, f[dst] - same instruction AssemblyGeneratorX86::h_CFSUM_R prints; instr.mod selects the lane width
+ emit(REX_PADD);
+ emitByte(PADD_OPCODES[instr.mod % 4]);
+ emitByte(0xf8 + instr.dst);
@@ -794,7 +804,7 @@ namespace RandomX {
INST_HANDLE(FADD_M)
INST_HANDLE(FSUB_R)
INST_HANDLE(FSUB_M)
- INST_HANDLE(FPNEG_R)
+ INST_HANDLE(CFSUM_R)
INST_HANDLE(FMUL_R)
INST_HANDLE(FMUL_M)
INST_HANDLE(FDIV_R)
diff --git a/src/JitCompilerX86.hpp b/src/JitCompilerX86.hpp
index 4303cfd..feba888 100644
--- a/src/JitCompilerX86.hpp
+++ b/src/JitCompilerX86.hpp
@@ -115,7 +115,7 @@ namespace RandomX {
void h_FADD_M(Instruction&);
void h_FSUB_R(Instruction&);
void h_FSUB_M(Instruction&);
- void h_FPNEG_R(Instruction&);
+ void h_CFSUM_R(Instruction&);
void h_FMUL_R(Instruction&);
void h_FMUL_M(Instruction&);
void h_FDIV_R(Instruction&);
diff --git a/src/asm/program_loop_store.inc b/src/asm/program_loop_store.inc
index a0acebc..bd2bbdd 100644
--- a/src/asm/program_loop_store.inc
+++ b/src/asm/program_loop_store.inc
@@ -12,6 +12,10 @@
mulpd xmm1, xmm5
mulpd xmm2, xmm6
mulpd xmm3, xmm7
+ ;# xorpd xmm0, xmm15
+ ;# xorpd xmm1, xmm15
+ ;# xorpd xmm2, xmm15
+ ;# xorpd xmm3, xmm15
movapd xmmword ptr [rcx+0], xmm0
movapd xmmword ptr [rcx+16], xmm1
movapd xmmword ptr [rcx+32], xmm2
diff --git a/src/asm/program_prologue_load.inc b/src/asm/program_prologue_load.inc
index 3a994ab..74c2a08 100644
--- a/src/asm/program_prologue_load.inc
+++ b/src/asm/program_prologue_load.inc
@@ -18,5 +18,5 @@
movapd xmm11, xmmword ptr [rcx+120]
movapd xmm13, xmmword ptr [minDbl]
movapd xmm14, xmmword ptr [absMask]
- movapd xmm15, xmmword ptr [signMask]
+ ;# xorpd xmm15, xmm15
diff --git a/src/executeProgram-win64.asm b/src/executeProgram-win64.asm
index ac49e50..ff43578 100644
--- a/src/executeProgram-win64.asm
+++ b/src/executeProgram-win64.asm
@@ -54,7 +54,7 @@ executeProgram PROC
; xmm12 -> temporary
; xmm13 -> DBL_MIN
; xmm14 -> absolute value mask
- ; xmm15 -> sign mask
+ ; xmm15 -> unused
; store callee-saved registers
push rbx
@@ -104,7 +104,7 @@ executeProgram PROC
movapd xmm11, xmmword ptr [rcx+120]
movapd xmm13, xmmword ptr [minDbl]
movapd xmm14, xmmword ptr [absMask]
- movapd xmm15, xmmword ptr [signMask]
+ ;# xorps xmm15, xmm15
jmp program_begin
diff --git a/src/instructionWeights.hpp b/src/instructionWeights.hpp
index 32225e7..3998d07 100644
--- a/src/instructionWeights.hpp
+++ b/src/instructionWeights.hpp
@@ -21,10 +21,10 @@ along with RandomX. If not, see.
//Integer
#define WT_IADD_R 12
-#define WT_IADD_M 3
-#define WT_IADD_RC 12
+#define WT_IADD_M 7
+#define WT_IADD_RC 16
#define WT_ISUB_R 12
-#define WT_ISUB_M 3
+#define WT_ISUB_M 7
#define WT_IMUL_9C 9
#define WT_IMUL_R 16
#define WT_IMUL_M 4
@@ -35,10 +35,10 @@ along with RandomX. If not, see.
#define WT_IDIV_C 4
#define WT_ISDIV_C 4
#define WT_INEG_R 2
-#define WT_IXOR_R 12
+#define WT_IXOR_R 16
#define WT_IXOR_M 4
-#define WT_IROR_R 10
-#define WT_IROL_R 10
+#define WT_IROR_R 8
+#define WT_IROL_R 8
#define WT_ISWAP_R 4
//Common floating point
@@ -49,22 +49,22 @@ along with RandomX. If not, see.
#define WT_FADD_M 5
#define WT_FSUB_R 20
#define WT_FSUB_M 5
-#define WT_FPNEG_R 6
//Floating point group E
-#define WT_FMUL_R 16
-#define WT_FMUL_M 4
-#define WT_FDIV_R 7
-#define WT_FDIV_M 1
+#define WT_FMUL_R 20
+#define WT_FMUL_M 0
+#define WT_FDIV_R 0
+#define WT_FDIV_M 4
#define WT_FSQRT_R 6
//Control
#define WT_COND_R 7
#define WT_COND_M 1
#define WT_CFROUND 1
+#define WT_CFSUM_R 0
//Store
-#define WT_ISTORE 18
+#define WT_ISTORE 16
#define WT_FSTORE 0
#define WT_NOP 0
@@ -74,7 +74,7 @@ WT_ISUB_M + WT_IMUL_9C + WT_IMUL_R + WT_IMUL_M + WT_IMULH_R + \
WT_IMULH_M + WT_ISMULH_R + WT_ISMULH_M + WT_IDIV_C + WT_ISDIV_C + \
WT_INEG_R + WT_IXOR_R + WT_IXOR_M + WT_IROR_R + WT_IROL_R + \
WT_ISWAP_R + WT_FSWAP_R + WT_FADD_R + WT_FADD_M + WT_FSUB_R + WT_FSUB_M + \
-WT_FPNEG_R + WT_FMUL_R + WT_FMUL_M + WT_FDIV_R + WT_FDIV_M + \
+WT_CFSUM_R + WT_FMUL_R + WT_FMUL_M + WT_FDIV_R + WT_FDIV_M + \
WT_FSQRT_R + WT_COND_R + WT_COND_M + WT_CFROUND + WT_ISTORE + WT_FSTORE + WT_NOP;
static_assert(wtSum == 256,
diff --git a/src/program.inc b/src/program.inc
index e4de06f..ba4b937 100644
--- a/src/program.inc
+++ b/src/program.inc
@@ -1,768 +1,731 @@
- ; IMUL_R r0, r7
- imul r8, r15
- ; ISMULH_R r2, r1
- mov rax, r10
- imul r9
- mov r10, rdx
- ; IMUL_R r2, r4
- imul r10, r12
- ; IADD_R r7, r0
- add r15, r8
- ; FPSQRT_R e0
- sqrtpd xmm4, xmm4
- ; IMUL_R r3, r6
- imul r11, r14
- ; FPMUL_R e3, a1
- mulpd xmm7, xmm9
- ; IMULH_M r6, L1[r3]
- mov ecx, r11d
- and ecx, 16376
- mov rax, r14
- mul qword ptr [rsi+rcx]
- mov r14, rdx
- ; IMUL_R r5, r1
- imul r13, r9
- ; FPADD_M f0, L2[r6]
- mov eax, r14d
+ ; FMUL_R e0, a2
+ mulpd xmm4, xmm10
+ ; IADD_RC r2, r5, -1621224194
+ lea r10, [r10+r13-1621224194]
+ ; ISTORE L2[r2], r7
+ mov eax, r10d
and eax, 262136
- cvtdq2pd xmm12, qword ptr [rsi+rax]
- addpd xmm0, xmm12
- ; IROR_R r4, r3
- mov ecx, r11d
- ror r12, cl
- ; IXOR_M r4, L3[984888]
- xor r12, qword ptr [rsi+984888]
- ; IROR_R r0, r3
- mov ecx, r11d
- ror r8, cl
- ; IROR_R r0, r4
- mov ecx, r12d
- ror r8, cl
- ; FPMUL_R e0, a1
- mulpd xmm4, xmm9
- ; IMUL_R r0, r2
- imul r8, r10
- ; ISUB_M r0, L1[r3]
- mov eax, r11d
+ mov qword ptr [rsi+rax], r15
+ ; FSUB_M f2, L1[r2]
+ mov eax, r10d
and eax, 16376
- sub r8, qword ptr [rsi+rax]
- ; FPSUB_R f3, a1
- subpd xmm3, xmm9
- ; ISWAP_R r7, r4
- xchg r15, r12
- ; IDIV_C r1, 3690475308
- mov rax, r9
- shr rax, 2
- mov rcx, 5367070356934653253
- mul rcx
- shr rdx, 28
- add r9, rdx
- ; IROL_R r4, r2
- mov ecx, r10d
- rol r12, cl
- ; IMUL_M r5, L1[r4]
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ subpd xmm2, xmm12
+ ; IMUL_9C r6, -1003503212
+ lea r14, [r14+r14*8-1003503212]
+ ; FSUB_R f1, a0
+ subpd xmm1, xmm8
+ ; IXOR_M r5, L2[r3]
+ mov eax, r11d
+ and eax, 262136
+ xor r13, qword ptr [rsi+rax]
+ ; FSUB_M f2, L1[r4]
mov eax, r12d
and eax, 16376
- imul r13, qword ptr [rsi+rax]
- ; IROL_R r4, r7
- mov ecx, r15d
- rol r12, cl
- ; ISUB_R r3, r1
- sub r11, r9
- ; IADD_R r7, r0
- add r15, r8
- ; IADD_M r1, L1[r3]
- mov eax, r11d
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ subpd xmm2, xmm12
+ ; FSUB_R f3, a0
+ subpd xmm3, xmm8
+ ; ISDIV_C r0, 1400272688
+ mov rax, 7072565507528518045
+ imul r8
+ xor eax, eax
+ sar rdx, 29
+ sets al
+ add rdx, rax
+ add r8, rdx
+ ; IMUL_M r3, L1[r7]
+ mov eax, r15d
and eax, 16376
- add r9, qword ptr [rsi+rax]
- ; FPMUL_R e2, a2
- mulpd xmm6, xmm10
- ; IADD_R r6, -1115286770
- add r14, -1115286770
- ; FPDIV_R e2, a3
- divpd xmm6, xmm11
- maxpd xmm6, xmm13
- ; FPADD_R f1, a2
- addpd xmm1, xmm10
- ; IXOR_R r3, r7
- xor r11, r15
+ imul r11, qword ptr [rsi+rax]
+ ; IROL_R r2, r3
+ mov ecx, r11d
+ rol r10, cl
+ ; IMULH_R r6, r0
+ mov rax, r14
+ mul r8
+ mov r14, rdx
+ ; FMUL_R e0, a2
+ mulpd xmm4, xmm10
+ ; IADD_RC r3, r4, -52260428
+ lea r11, [r11+r12-52260428]
+ ; IADD_R r7, -1138617760
+ add r15, -1138617760
+ ; IXOR_M r2, L1[r6]
+ mov eax, r14d
+ and eax, 16376
+ xor r10, qword ptr [rsi+rax]
+ ; FSUB_R f2, a1
+ subpd xmm2, xmm9
+ ; IXOR_R r7, r1
+ xor r15, r9
+ ; COND_R r2, lt(r7, -41618808)
+ xor ecx, ecx
+ cmp r15d, -41618808
+ setl cl
+ add r10, rcx
+ ; FMUL_R e3, a0
+ mulpd xmm7, xmm8
+ ; COND_R r4, sg(r1, -961190365)
+ xor ecx, ecx
+ cmp r9d, -961190365
+ sets cl
+ add r12, rcx
+ ; FADD_R f2, a1
+ addpd xmm2, xmm9
+ ; FSUB_R f0, a3
+ subpd xmm0, xmm11
+ ; ISTORE L1[r6], r2
+ mov eax, r14d
+ and eax, 16376
+ mov qword ptr [rsi+rax], r10
+ ; ISUB_R r6, r5
+ sub r14, r13
+ ; IADD_M r0, L1[r4]
+ mov eax, r12d
+ and eax, 16376
+ add r8, qword ptr [rsi+rax]
; ISTORE L1[r4], r3
mov eax, r12d
and eax, 16376
mov qword ptr [rsi+rax], r11
- ; IROR_R r3, r6
- mov ecx, r14d
- ror r11, cl
- ; ISMULH_R r0, r6
- mov rax, r8
- imul r14
- mov r8, rdx
- ; IROR_R r6, r5
- mov ecx, r13d
- ror r14, cl
- ; IMULH_M r6, L2[r0]
- mov ecx, r8d
- and ecx, 262136
- mov rax, r14
- mul qword ptr [rsi+rcx]
- mov r14, rdx
- ; ISUB_R r2, 1512125960
- sub r10, 1512125960
+ ; COND_M r6, sg(L1[r6], 1048782623)
+ xor ecx, ecx
+ mov eax, r14d
+ and eax, 16376
+ cmp dword ptr [rsi+rax], 1048782623
+ sets cl
+ add r14, rcx
+ ; FSQRT_R e0
+ sqrtpd xmm4, xmm4
+ ; INEG_R r2
+ neg r10
+ ; FSQRT_R e1
+ sqrtpd xmm5, xmm5
+ ; FMUL_R e1, a3
+ mulpd xmm5, xmm11
; IMUL_R r7, r6
imul r15, r14
- ; IMULH_R r6, r7
- mov rax, r14
- mul r15
- mov r14, rdx
- ; ISUB_R r4, r1
- sub r12, r9
- ; FPMUL_R e3, a2
- mulpd xmm7, xmm10
- ; FPSQRT_R e1
- sqrtpd xmm5, xmm5
- ; IXOR_R r5, r2
- xor r13, r10
- ; FPADD_M f2, L1[r0]
- mov eax, r8d
- and eax, 16376
- cvtdq2pd xmm12, qword ptr [rsi+rax]
- addpd xmm2, xmm12
- ; IMULH_R r6, r1
- mov rax, r14
- mul r9
- mov r14, rdx
- ; ISUB_M r5, L1[r0]
- mov eax, r8d
- and eax, 16376
- sub r13, qword ptr [rsi+rax]
- ; FPMUL_R e2, a3
- mulpd xmm6, xmm11
- ; IMUL_R r4, r6
- imul r12, r14
- ; FPMUL_R e3, a2
- mulpd xmm7, xmm10
- ; ISUB_R r3, r2
- sub r11, r10
- ; FPMUL_R e3, a2
- mulpd xmm7, xmm10
- ; IROL_R r7, r0
- mov ecx, r8d
- rol r15, cl
- ; FPSUB_R f3, a2
- subpd xmm3, xmm10
- ; IROL_R r3, r7
- mov ecx, r15d
- rol r11, cl
- ; ISWAP_R r5, r7
- xchg r13, r15
- ; IDIV_C r5, 749951529
- mov rax, 13205547200481862341
- mul r13
- shr rdx, 29
- add r13, rdx
- ; FPADD_R f3, a0
- addpd xmm3, xmm8
- ; IMUL_M r0, L1[r4]
- mov eax, r12d
- and eax, 16376
- imul r8, qword ptr [rsi+rax]
- ; FPADD_R f1, a1
- addpd xmm1, xmm9
- ; IROR_R r2, 60
- ror r10, 60
- ; IROR_R r5, r4
- mov ecx, r12d
- ror r13, cl
- ; FPADD_R f2, a0
- addpd xmm2, xmm8
- ; IXOR_M r4, L1[r6]
- mov eax, r14d
- and eax, 16376
- xor r12, qword ptr [rsi+rax]
- ; IXOR_R r2, r6
- xor r10, r14
- ; FPADD_M f3, L1[r0]
- mov eax, r8d
- and eax, 16376
- cvtdq2pd xmm12, qword ptr [rsi+rax]
- addpd xmm3, xmm12
- ; ISUB_R r7, r6
- sub r15, r14
- ; IMUL_9C r2, -962375579
- lea r10, [r10+r10*8-962375579]
- ; FPSUB_R f3, a2
- subpd xmm3, xmm10
- ; FPSUB_R f3, a0
- subpd xmm3, xmm8
- ; IMUL_R r1, r5
- imul r9, r13
- ; IMUL_R r6, r4
- imul r14, r12
- ; ISWAP_R r0, r2
- xchg r8, r10
- ; ISUB_R r6, r5
- sub r14, r13
- ; FPSUB_R f2, a1
- subpd xmm2, xmm9
- ; ISDIV_C r6, 652931802
- mov rax, -3278972671018643631
- imul r14
- xor eax, eax
- add rdx, r14
- sar rdx, 29
- sets al
- add rdx, rax
- add r14, rdx
- ; IMUL_9C r5, -1142924545
- lea r13, [r13+r13*8-1142924545]
- ; ISUB_R r7, 1085161834
- sub r15, 1085161834
- ; IMUL_R r4, r6
- imul r12, r14
- ; FPMUL_M e1, L1[r4]
- mov eax, r12d
- and eax, 16376
- cvtdq2pd xmm12, qword ptr [rsi+rax]
- mulpd xmm5, xmm12
- maxpd xmm5, xmm13
- ; FPMUL_M e3, L2[r1]
- mov eax, r9d
- and eax, 262136
- cvtdq2pd xmm12, qword ptr [rsi+rax]
- mulpd xmm7, xmm12
- maxpd xmm7, xmm13
- ; COND_R r2, lt(r5, 1635027096)
- xor ecx, ecx
- cmp r13d, 1635027096
- setl cl
- add r10, rcx
- ; IMUL_R r5, -1219696062
- imul r13, -1219696062
- ; IXOR_R r5, r0
- xor r13, r8
- ; FPNEG_R f2
- xorps xmm2, xmm15
- ; FPADD_R f3, a2
- addpd xmm3, xmm10
- ; FPSUB_R f1, a3
- subpd xmm1, xmm11
- ; FPADD_R f1, a2
- addpd xmm1, xmm10
- ; FPDIV_R e1, a3
- divpd xmm5, xmm11
- maxpd xmm5, xmm13
- ; IXOR_M r6, L1[r0]
- mov eax, r8d
- and eax, 16376
- xor r14, qword ptr [rsi+rax]
- ; ISUB_R r7, r4
- sub r15, r12
- ; ISUB_M r6, L1[r1]
- mov eax, r9d
- and eax, 16376
- sub r14, qword ptr [rsi+rax]
- ; ISTORE L1[r5], r3
- mov eax, r13d
- and eax, 16376
- mov qword ptr [rsi+rax], r11
- ; IMUL_R r5, r1
- imul r13, r9
- ; IROR_R r3, r2
- mov ecx, r10d
- ror r11, cl
- ; IMUL_R r4, r7
- imul r12, r15
- ; ISDIV_C r6, -54134756
- mov rax, 7012869325244995177
- imul r14
- xor eax, eax
- sub rdx, r14
- sar rdx, 25
- sets al
- add rdx, rax
- add r14, rdx
- ; FPMUL_R e1, a2
- mulpd xmm5, xmm10
- ; FPSUB_M f2, L2[r4]
- mov eax, r12d
- and eax, 262136
- cvtdq2pd xmm12, qword ptr [rsi+rax]
- subpd xmm2, xmm12
- ; IMUL_R r0, r5
- imul r8, r13
- ; FPMUL_R e3, a0
- mulpd xmm7, xmm8
- ; COND_R r5, be(r4, 1545677311)
- xor ecx, ecx
- cmp r12d, 1545677311
- setbe cl
- add r13, rcx
- ; IMUL_R r6, r3
- imul r14, r11
- ; IROL_R r6, r2
- mov ecx, r10d
- rol r14, cl
- ; FPDIV_R e3, a1
- divpd xmm7, xmm9
- maxpd xmm7, xmm13
- ; IXOR_M r5, L1[r1]
- mov eax, r9d
- and eax, 16376
- xor r13, qword ptr [rsi+rax]
- ; COND_R r3, ab(r2, 1734636060)
- xor ecx, ecx
- cmp r10d, 1734636060
- seta cl
- add r11, rcx
- ; ISTORE L1[r2], r7
- mov eax, r10d
- and eax, 16376
- mov qword ptr [rsi+rax], r15
- ; IADD_R r5, r6
- add r13, r14
- ; FPSUB_R f1, a2
- subpd xmm1, xmm10
- ; FPADD_R f2, a1
- addpd xmm2, xmm9
- ; FPSWAP_R f1
- shufpd xmm1, xmm1, 1
- ; IROL_R r2, r6
- mov ecx, r14d
- rol r10, cl
- ; IMUL_R r0, r4
- imul r8, r12
- ; FPSUB_R f0, a2
- subpd xmm0, xmm10
- ; ISUB_R r6, r7
- sub r14, r15
- ; IROL_R r4, r7
- mov ecx, r15d
- rol r12, cl
- ; FPMUL_R e2, a0
- mulpd xmm6, xmm8
- ; ISUB_R r1, r3
- sub r9, r11
- ; FPDIV_R e0, a1
- divpd xmm4, xmm9
- maxpd xmm4, xmm13
- ; FPADD_R f0, a1
- addpd xmm0, xmm9
- ; FPMUL_R e0, a2
- mulpd xmm4, xmm10
- ; FPSUB_R f2, a2
- subpd xmm2, xmm10
- ; FPSUB_M f2, L1[r6]
- mov eax, r14d
- and eax, 16376
- cvtdq2pd xmm12, qword ptr [rsi+rax]
- subpd xmm2, xmm12
- ; FPMUL_R e0, a0
- mulpd xmm4, xmm8
- ; IXOR_M r4, L2[r7]
- mov eax, r15d
- and eax, 262136
- xor r12, qword ptr [rsi+rax]
- ; FPSUB_R f3, a3
- subpd xmm3, xmm11
- ; ISMULH_R r1, r6
- mov rax, r9
- imul r14
- mov r9, rdx
- ; COND_R r4, be(r7, 224524971)
- xor ecx, ecx
- cmp r15d, 224524971
- setbe cl
- add r12, rcx
- ; FPADD_M f2, L1[r1]
- mov eax, r9d
- and eax, 16376
- cvtdq2pd xmm12, qword ptr [rsi+rax]
- addpd xmm2, xmm12
- ; IMUL_R r5, r4
- imul r13, r12
- ; IADD_RC r1, r5, 370966979
- lea r9, [r9+r13+370966979]
- ; IADD_RC r7, r3, -1762209698
- lea r15, [r15+r11-1762209698]
- ; FPMUL_M e3, L2[r2]
- mov eax, r10d
- and eax, 262136
- cvtdq2pd xmm12, qword ptr [rsi+rax]
- mulpd xmm7, xmm12
- maxpd xmm7, xmm13
- ; ISUB_R r2, r7
- sub r10, r15
- ; IMUL_9C r3, 171157280
- lea r11, [r11+r11*8+171157280]
- ; ISUB_R r3, r5
- sub r11, r13
- ; FPNEG_R f3
- xorps xmm3, xmm15
- ; FPNEG_R f2
- xorps xmm2, xmm15
- ; ISTORE L1[r4], r1
- mov eax, r12d
- and eax, 16376
- mov qword ptr [rsi+rax], r9
- ; IADD_R r0, r2
- add r8, r10
- ; IXOR_R r7, r6
- xor r15, r14
- ; IROR_R r0, r4
- mov ecx, r12d
- ror r8, cl
- ; FPMUL_R e3, a2
- mulpd xmm7, xmm10
- ; IXOR_M r4, L1[r7]
- mov eax, r15d
- and eax, 16376
- xor r12, qword ptr [rsi+rax]
- ; ISTORE L1[r5], r7
- mov eax, r13d
- and eax, 16376
- mov qword ptr [rsi+rax], r15
- ; IMUL_9C r7, -1206742834
- lea r15, [r15+r15*8-1206742834]
- ; ISMULH_R r0, r4
+ ; IMULH_R r0, r4
mov rax, r8
- imul r12
+ mul r12
mov r8, rdx
- ; FPADD_R f2, a0
- addpd xmm2, xmm8
- ; FPSUB_R f1, a0
- subpd xmm1, xmm8
- ; INEG_R r7
- neg r15
- ; COND_M r0, of(L1[r5], -2056260506)
- xor ecx, ecx
- mov eax, r13d
- and eax, 16376
- cmp dword ptr [rsi+rax], -2056260506
- seto cl
- add r8, rcx
- ; FPSQRT_R e2
+ ; IMUL_R r5, r3
+ imul r13, r11
+ ; FSQRT_R e2
sqrtpd xmm6, xmm6
- ; IMUL_R r3, r4
- imul r11, r12
- ; FPNEG_R f1
- xorps xmm1, xmm15
- ; FPADD_M f2, L1[r5]
- mov eax, r13d
- and eax, 16376
- cvtdq2pd xmm12, qword ptr [rsi+rax]
- addpd xmm2, xmm12
- ; FPSUB_R f3, a0
- subpd xmm3, xmm8
- ; FPNEG_R f3
- xorps xmm3, xmm15
- ; FPMUL_M e3, L2[r5]
- mov eax, r13d
- and eax, 262136
- cvtdq2pd xmm12, qword ptr [rsi+rax]
- mulpd xmm7, xmm12
- maxpd xmm7, xmm13
- ; ISTORE L1[r2], r2
- mov eax, r10d
- and eax, 16376
- mov qword ptr [rsi+rax], r10
- ; IMUL_M r3, L2[r4]
- mov eax, r12d
- and eax, 262136
- imul r11, qword ptr [rsi+rax]
- ; IROL_R r5, r6
- mov ecx, r14d
- rol r13, cl
- ; IADD_RC r4, r3, -904431293
- lea r12, [r12+r11-904431293]
- ; FPSUB_R f1, a1
- subpd xmm1, xmm9
- ; IROL_R r7, r0
- mov ecx, r8d
- rol r15, cl
- ; ISTORE L2[r1], r7
- mov eax, r9d
- and eax, 262136
- mov qword ptr [rsi+rax], r15
- ; IROL_R r4, r3
- mov ecx, r11d
- rol r12, cl
- ; IADD_R r5, r2
- add r13, r10
- ; COND_R r3, ge(r6, -444806705)
- xor ecx, ecx
- cmp r14d, -444806705
- setge cl
- add r11, rcx
- ; FPADD_R f0, a1
- addpd xmm0, xmm9
- ; IROL_R r0, 57
- rol r8, 57
- ; IADD_R r0, r2
- add r8, r10
- ; IADD_R r7, r4
- add r15, r12
- ; IROL_R r1, r7
- mov ecx, r15d
- rol r9, cl
- ; IXOR_M r7, L2[r5]
- mov eax, r13d
- and eax, 262136
- xor r15, qword ptr [rsi+rax]
- ; ISTORE L1[r2], r0
- mov eax, r10d
- and eax, 16376
- mov qword ptr [rsi+rax], r8
- ; FPADD_R f1, a2
- addpd xmm1, xmm10
- ; ISUB_R r1, r4
- sub r9, r12
- ; IXOR_R r5, r0
- xor r13, r8
- ; IXOR_M r7, L2[r1]
- mov eax, r9d
- and eax, 262136
- xor r15, qword ptr [rsi+rax]
- ; FPSUB_R f0, a0
- subpd xmm0, xmm8
- ; IXOR_M r1, L1[r4]
- mov eax, r12d
- and eax, 16376
- xor r9, qword ptr [rsi+rax]
- ; FPMUL_R e3, a0
- mulpd xmm7, xmm8
- ; ISDIV_C r1, 1473744194
- mov rax, -5006799265644655925
- imul r9
- xor eax, eax
- add rdx, r9
- sar rdx, 30
- sets al
- add rdx, rax
- add r9, rdx
- ; IMUL_9C r1, 1626151459
- lea r9, [r9+r9*8+1626151459]
- ; IXOR_M r6, L1[r4]
- mov eax, r12d
- and eax, 16376
- xor r14, qword ptr [rsi+rax]
- ; FPADD_R f0, a0
- addpd xmm0, xmm8
- ; FPADD_R f3, a2
- addpd xmm3, xmm10
- ; ISUB_R r6, r7
- sub r14, r15
- ; IADD_RC r1, r5, 2075955307
- lea r9, [r9+r13+2075955307]
- ; IROL_R r6, r3
- mov ecx, r11d
- rol r14, cl
- ; IMULH_R r2, -1135671124
- mov eax, -1135671124
- mul r10
- add r10, rdx
- ; ISUB_R r5, r2
- sub r13, r10
- ; IMULH_R r3, r5
- mov rax, r11
- mul r13
- mov r11, rdx
- ; IADD_M r4, L3[386040]
- add r12, qword ptr [rsi+386040]
- ; COND_R r6, ge(r4, 1518758207)
- xor ecx, ecx
- cmp r12d, 1518758207
- setge cl
- add r14, rcx
- ; FPDIV_R e3, a1
- divpd xmm7, xmm9
- maxpd xmm7, xmm13
- ; FPNEG_R f2
- xorps xmm2, xmm15
- ; FPADD_M f1, L1[r4]
- mov eax, r12d
- and eax, 16376
- cvtdq2pd xmm12, qword ptr [rsi+rax]
- addpd xmm1, xmm12
- ; FPMUL_M e0, L1[r4]
- mov eax, r12d
- and eax, 16376
- cvtdq2pd xmm12, qword ptr [rsi+rax]
- mulpd xmm4, xmm12
- maxpd xmm4, xmm13
- ; FPSQRT_R e2
- sqrtpd xmm6, xmm6
- ; IROL_R r5, r1
- mov ecx, r9d
- rol r13, cl
- ; FPADD_R f3, a0
+ ; FADD_R f3, a0
addpd xmm3, xmm8
- ; IROL_R r3, r0
- mov ecx, r8d
- rol r11, cl
- ; FPMUL_R e3, a1
- mulpd xmm7, xmm9
- ; IROR_R r0, r7
- mov ecx, r15d
- ror r8, cl
- ; FPADD_R f2, a2
- addpd xmm2, xmm10
- ; IXOR_R r7, r0
- xor r15, r8
- ; ISTORE L1[r4], r1
+ ; IADD_R r3, r2
+ add r11, r10
+ ; FADD_R f1, a0
+ addpd xmm1, xmm8
+ ; FMUL_R e3, a2
+ mulpd xmm7, xmm10
+ ; FADD_R f0, a1
+ addpd xmm0, xmm9
+ ; IMUL_R r5, r6
+ imul r13, r14
+ ; IADD_RC r1, r2, -1263285243
+ lea r9, [r9+r10-1263285243]
+ ; ISUB_M r4, L1[r6]
+ mov eax, r14d
+ and eax, 16376
+ sub r12, qword ptr [rsi+rax]
+ ; IROL_R r7, r2
+ mov ecx, r10d
+ rol r15, cl
+ ; IMUL_R r0, r7
+ imul r8, r15
+ ; IXOR_R r1, r6
+ xor r9, r14
+ ; IXOR_M r2, L1[r4]
mov eax, r12d
and eax, 16376
- mov qword ptr [rsi+rax], r9
- ; ISTORE L2[r0], r4
+ xor r10, qword ptr [rsi+rax]
+ ; FSUB_R f3, a1
+ subpd xmm3, xmm9
+ ; ISTORE L1[r0], r5
mov eax, r8d
- and eax, 262136
- mov qword ptr [rsi+rax], r12
- ; FPDIV_R e3, a3
- divpd xmm7, xmm11
- maxpd xmm7, xmm13
- ; ISTORE L2[r4], r6
- mov eax, r12d
- and eax, 262136
- mov qword ptr [rsi+rax], r14
- ; IMUL_R r3, r1
- imul r11, r9
- ; IXOR_R r2, r4
- xor r10, r12
- ; ISTORE L2[r3], r5
+ and eax, 16376
+ mov qword ptr [rsi+rax], r13
+ ; FDIV_M e2, L2[r3]
mov eax, r11d
and eax, 262136
- mov qword ptr [rsi+rax], r13
- ; FPMUL_M e2, L2[r4]
- mov eax, r12d
- and eax, 262136
cvtdq2pd xmm12, qword ptr [rsi+rax]
- mulpd xmm6, xmm12
+ andps xmm12, xmm14
+ divpd xmm6, xmm12
maxpd xmm6, xmm13
- ; FPSUB_R f3, a0
- subpd xmm3, xmm8
- ; COND_R r1, ab(r7, -229570354)
- xor ecx, ecx
- cmp r15d, -229570354
- seta cl
- add r9, rcx
- ; IROR_R r7, r3
- mov ecx, r11d
- ror r15, cl
- ; FPDIV_R e2, a0
- divpd xmm6, xmm8
- maxpd xmm6, xmm13
- ; IADD_R r2, r5
- add r10, r13
- ; FPDIV_R e1, a3
- divpd xmm5, xmm11
- maxpd xmm5, xmm13
- ; FPSQRT_R e2
- sqrtpd xmm6, xmm6
- ; ISUB_R r3, r7
- sub r11, r15
- ; FPADD_R f0, a0
- addpd xmm0, xmm8
- ; IMUL_M r0, L3[98136]
- imul r8, qword ptr [rsi+98136]
- ; IMUL_9C r5, -895487055
- lea r13, [r13+r13*8-895487055]
- ; IMULH_R r2, r7
- mov rax, r10
- mul r15
- mov r10, rdx
- ; IADD_R r4, r1
- add r12, r9
- ; ISDIV_C r0, 494395999
- mov rax, 5007888582388710937
- imul r8
+ ; IROL_R r2, r0
+ mov ecx, r8d
+ rol r10, cl
+ ; IADD_R r7, r5
+ add r15, r13
+ ; FDIV_M e0, L1[r4]
+ mov eax, r12d
+ and eax, 16376
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ andps xmm12, xmm14
+ divpd xmm4, xmm12
+ maxpd xmm4, xmm13
+ ; FADD_R f3, a1
+ addpd xmm3, xmm9
+ ; FADD_R f0, a3
+ addpd xmm0, xmm11
+ ; IADD_R r2, r0
+ add r10, r8
+ ; ISTORE L1[r3], r6
+ mov eax, r11d
+ and eax, 16376
+ mov qword ptr [rsi+rax], r14
+ ; IXOR_R r1, r7
+ xor r9, r15
+ ; ISUB_M r5, L2[r7]
+ mov eax, r15d
+ and eax, 262136
+ sub r13, qword ptr [rsi+rax]
+ ; ISDIV_C r7, 266992378
+ mov rax, -9173520256920442565
+ imul r15
xor eax, eax
+ add rdx, r15
sar rdx, 27
sets al
add rdx, rax
- add r8, rdx
- ; FPSWAP_R e0
- shufpd xmm4, xmm4, 1
- ; IXOR_R r1, r5
- xor r9, r13
- ; COND_R r2, ab(r3, 1932234501)
+ add r15, rdx
+ ; FDIV_M e3, L1[r4]
+ mov eax, r12d
+ and eax, 16376
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ andps xmm12, xmm14
+ divpd xmm7, xmm12
+ maxpd xmm7, xmm13
+ ; IMUL_R r2, r0
+ imul r10, r8
+ ; FMUL_R e3, a2
+ mulpd xmm7, xmm10
+ ; IMUL_R r0, r6
+ imul r8, r14
+ ; ISTORE L1[r0], r7
+ mov eax, r8d
+ and eax, 16376
+ mov qword ptr [rsi+rax], r15
+ ; FSUB_R f0, a1
+ subpd xmm0, xmm9
+ ; FADD_R f3, a1
+ addpd xmm3, xmm9
+ ; IXOR_R r5, r4
+ xor r13, r12
+ ; ISTORE L2[r7], r2
+ mov eax, r15d
+ and eax, 262136
+ mov qword ptr [rsi+rax], r10
+ ; ISWAP_R r6, r7
+ xchg r14, r15
+ ; FADD_R f3, a2
+ addpd xmm3, xmm10
+ ; ISMULH_R r5, r0
+ mov rax, r13
+ imul r8
+ mov r13, rdx
+ ; IADD_M r0, L1[r4]
+ mov eax, r12d
+ and eax, 16376
+ add r8, qword ptr [rsi+rax]
+ ; COND_R r7, ge(r6, -1972898485)
xor ecx, ecx
- cmp r11d, 1932234501
- seta cl
+ cmp r14d, -1972898485
+ setge cl
+ add r15, rcx
+ ; FADD_R f2, a2
+ addpd xmm2, xmm10
+ ; IROR_R r7, r6
+ mov ecx, r14d
+ ror r15, cl
+ ; IADD_RC r2, r4, -117457973
+ lea r10, [r10+r12-117457973]
+ ; IMUL_R r0, -1500893068
+ imul r8, -1500893068
+ ; IADD_R r2, r3
+ add r10, r11
+ ; FSQRT_R e2
+ sqrtpd xmm6, xmm6
+ ; IROR_R r7, r4
+ mov ecx, r12d
+ ror r15, cl
+ ; IMUL_9C r4, 381194890
+ lea r12, [r12+r12*8+381194890]
+ ; IADD_RC r3, r7, 1050899263
+ lea r11, [r11+r15+1050899263]
+ ; IADD_R r2, r7
+ add r10, r15
+ ; FMUL_R e3, a0
+ mulpd xmm7, xmm8
+ ; IADD_RC r6, r6, 540663146
+ lea r14, [r14+r14+540663146]
+ ; IROR_R r5, 58
+ ror r13, 58
+ ; FSWAP_R f2
+ shufpd xmm2, xmm2, 1
+ ; FSWAP_R f2
+ shufpd xmm2, xmm2, 1
+ ; FMUL_R e1, a2
+ mulpd xmm5, xmm10
+ ; ISWAP_R r5, r6
+ xchg r13, r14
+ ; IADD_R r5, r3
+ add r13, r11
+ ; IADD_R r7, -1780268176
+ add r15, -1780268176
+ ; IADD_RC r7, r0, -1497756854
+ lea r15, [r15+r8-1497756854]
+ ; ISTORE L2[r0], r7
+ mov eax, r8d
+ and eax, 262136
+ mov qword ptr [rsi+rax], r15
+ ; ISMULH_R r2, r4
+ mov rax, r10
+ imul r12
+ mov r10, rdx
+ ; FSUB_R f0, a2
+ subpd xmm0, xmm10
+ ; ISMULH_R r2, r3
+ mov rax, r10
+ imul r11
+ mov r10, rdx
+ ; IADD_R r0, r3
+ add r8, r11
+ ; ISUB_R r7, r2
+ sub r15, r10
+ ; FADD_R f2, a0
+ addpd xmm2, xmm8
+ ; FMUL_R e0, a2
+ mulpd xmm4, xmm10
+ ; FADD_R f2, a3
+ addpd xmm2, xmm11
+ ; IMUL_R r1, r2
+ imul r9, r10
+ ; IMUL_M r7, L1[r5]
+ mov eax, r13d
+ and eax, 16376
+ imul r15, qword ptr [rsi+rax]
+ ; IMUL_R r3, r2
+ imul r11, r10
+ ; IXOR_R r1, r0
+ xor r9, r8
+ ; FSUB_M f0, L1[r5]
+ mov eax, r13d
+ and eax, 16376
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ subpd xmm0, xmm12
+ ; IADD_RC r4, r4, 1456841848
+ lea r12, [r12+r12+1456841848]
+ ; IXOR_R r3, r2
+ xor r11, r10
+ ; COND_R r0, of(r4, 1678513610)
+ xor ecx, ecx
+ cmp r12d, 1678513610
+ seto cl
+ add r8, rcx
+ ; ISMULH_R r4, -1620573087
+ mov rax, -1620573087
+ imul r12
+ add r12, rdx
+ ; IMUL_R r4, r1
+ imul r12, r9
+ ; FSWAP_R e1
+ shufpd xmm5, xmm5, 1
+ ; FADD_M f2, L1[r0]
+ mov eax, r8d
+ and eax, 16376
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ addpd xmm2, xmm12
+ ; FMUL_R e1, a2
+ mulpd xmm5, xmm10
+ ; FSUB_R f0, a3
+ subpd xmm0, xmm11
+ ; IXOR_R r0, r7
+ xor r8, r15
+ ; ISTORE L2[r1], r4
+ mov eax, r9d
+ and eax, 262136
+ mov qword ptr [rsi+rax], r12
+ ; IXOR_M r7, L1[r6]
+ mov eax, r14d
+ and eax, 16376
+ xor r15, qword ptr [rsi+rax]
+ ; ISUB_R r2, r4
+ sub r10, r12
+ ; ISUB_M r4, L1[r6]
+ mov eax, r14d
+ and eax, 16376
+ sub r12, qword ptr [rsi+rax]
+ ; FADD_R f2, a2
+ addpd xmm2, xmm10
+ ; FSUB_R f3, a0
+ subpd xmm3, xmm8
+ ; IXOR_R r7, r2
+ xor r15, r10
+ ; IXOR_R r0, r5
+ xor r8, r13
+ ; FSWAP_R f1
+ shufpd xmm1, xmm1, 1
+ ; FMUL_R e3, a2
+ mulpd xmm7, xmm10
+ ; ISWAP_R r7, r1
+ xchg r15, r9
+ ; ISWAP_R r1, r4
+ xchg r9, r12
+ ; COND_R r2, ge(r2, -226330940)
+ xor ecx, ecx
+ cmp r10d, -226330940
+ setge cl
add r10, rcx
- ; FPMUL_R e1, a0
- mulpd xmm5, xmm8
- ; FPSUB_M f1, L1[r1]
+ ; FMUL_R e2, a3
+ mulpd xmm6, xmm11
+ ; FSUB_R f2, a1
+ subpd xmm2, xmm9
+ ; FADD_R f1, a0
+ addpd xmm1, xmm8
+ ; ISUB_R r7, r5
+ sub r15, r13
+ ; ISUB_M r0, L1[r1]
+ mov eax, r9d
+ and eax, 16376
+ sub r8, qword ptr [rsi+rax]
+ ; FSUB_R f3, a1
+ subpd xmm3, xmm9
+ ; IROL_R r3, r5
+ mov ecx, r13d
+ rol r11, cl
+ ; IADD_RC r5, r2, 795784298
+ lea r13, [r13+r10+795784298]
+ ; IADD_RC r0, r4, -2050178553
+ lea r8, [r8+r12-2050178553]
+ ; IMUL_9C r5, 1062534001
+ lea r13, [r13+r13*8+1062534001]
+ ; FADD_R f0, a2
+ addpd xmm0, xmm10
+ ; FMUL_R e3, a1
+ mulpd xmm7, xmm9
+ ; IDIV_C r3, 1662492575
+ mov rax, 11914062610815620875
+ mul r11
+ shr rdx, 30
+ add r11, rdx
+ ; IMUL_M r5, L1[r0]
+ mov eax, r8d
+ and eax, 16376
+ imul r13, qword ptr [rsi+rax]
+ ; IDIV_C r4, 1963597892
+ mov rax, r12
+ shr rax, 2
+ mov rcx, 1260889558222626443
+ mul rcx
+ shr rdx, 25
+ add r12, rdx
+ ; IMUL_9C r7, 1820045218
+ lea r15, [r15+r15*8+1820045218]
+ ; IMUL_M r0, L1[r3]
+ mov eax, r11d
+ and eax, 16376
+ imul r8, qword ptr [rsi+rax]
+ ; IXOR_R r3, r7
+ xor r11, r15
+ ; ISMULH_R r4, r2
+ mov rax, r12
+ imul r10
+ mov r12, rdx
+ ; IROL_R r3, r0
+ mov ecx, r8d
+ rol r11, cl
+ ; IXOR_R r2, r0
+ xor r10, r8
+ ; IXOR_M r0, L2[r1]
+ mov eax, r9d
+ and eax, 262136
+ xor r8, qword ptr [rsi+rax]
+ ; ISDIV_C r7, -935446980
+ mov rax, 7859804860668271393
+ imul r15
+ xor eax, eax
+ sub rdx, r15
+ sar rdx, 29
+ sets al
+ add rdx, rax
+ add r15, rdx
+ ; IMUL_M r6, L1[r2]
+ mov eax, r10d
+ and eax, 16376
+ imul r14, qword ptr [rsi+rax]
+ ; FSUB_M f3, L1[r6]
+ mov eax, r14d
+ and eax, 16376
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ subpd xmm3, xmm12
+ ; IADD_RC r4, r2, 1704868083
+ lea r12, [r12+r10+1704868083]
+ ; FADD_R f2, a0
+ addpd xmm2, xmm8
+ ; ISTORE L1[r0], r0
+ mov eax, r8d
+ and eax, 16376
+ mov qword ptr [rsi+rax], r8
+ ; FADD_R f0, a3
+ addpd xmm0, xmm11
+ ; FMUL_R e0, a3
+ mulpd xmm4, xmm11
+ ; FSUB_R f3, a2
+ subpd xmm3, xmm10
+ ; IADD_RC r7, r7, 1302457878
+ lea r15, [r15+r15+1302457878]
+ ; ISUB_R r1, 1330165941
+ sub r9, 1330165941
+ ; FSUB_R f1, a3
+ subpd xmm1, xmm11
+ ; IROR_R r0, r4
+ mov ecx, r12d
+ ror r8, cl
+ ; FSUB_R f1, a0
+ subpd xmm1, xmm8
+ ; IROR_R r5, r6
+ mov ecx, r14d
+ ror r13, cl
+ ; COND_R r0, ab(r1, -310933871)
+ xor ecx, ecx
+ cmp r9d, -310933871
+ seta cl
+ add r8, rcx
+ ; COND_R r4, ab(r7, 757929676)
+ xor ecx, ecx
+ cmp r15d, 757929676
+ seta cl
+ add r12, rcx
+ ; FMUL_R e0, a1
+ mulpd xmm4, xmm9
+ ; IMUL_R r1, r3
+ imul r9, r11
+ ; ISUB_R r3, r2
+ sub r11, r10
+ ; FSUB_R f3, a2
+ subpd xmm3, xmm10
+ ; FDIV_M e1, L1[r4]
+ mov eax, r12d
+ and eax, 16376
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ andps xmm12, xmm14
+ divpd xmm5, xmm12
+ maxpd xmm5, xmm13
+ ; IROL_R r1, 5
+ rol r9, 5
+ ; IADD_R r7, -1421188024
+ add r15, -1421188024
+ ; FSUB_R f3, a2
+ subpd xmm3, xmm10
+ ; FSUB_R f2, a3
+ subpd xmm2, xmm11
+ ; FADD_M f3, L1[r1]
mov eax, r9d
and eax, 16376
cvtdq2pd xmm12, qword ptr [rsi+rax]
- subpd xmm1, xmm12
- ; FPSUB_R f0, a0
- subpd xmm0, xmm8
- ; IROL_R r1, r7
- mov ecx, r15d
- rol r9, cl
- ; IADD_RC r0, r5, -2051588680
- lea r8, [r8+r13-2051588680]
- ; COND_R r6, of(r5, -795593984)
+ addpd xmm3, xmm12
+ ; FMUL_R e1, a3
+ mulpd xmm5, xmm11
+ ; IADD_RC r2, r4, -317832028
+ lea r10, [r10+r12-317832028]
+ ; IMUL_M r4, L1[r5]
+ mov eax, r13d
+ and eax, 16376
+ imul r12, qword ptr [rsi+rax]
+ ; FDIV_M e1, L1[r7]
+ mov eax, r15d
+ and eax, 16376
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ andps xmm12, xmm14
+ divpd xmm5, xmm12
+ maxpd xmm5, xmm13
+ ; IADD_R r5, r2
+ add r13, r10
+ ; ISUB_R r4, 401020510
+ sub r12, 401020510
+ ; IROR_R r3, r0
+ mov ecx, r8d
+ ror r11, cl
+ ; ISTORE L1[r7], r0
+ mov eax, r15d
+ and eax, 16376
+ mov qword ptr [rsi+rax], r8
+ ; FSUB_R f2, a1
+ subpd xmm2, xmm9
+ ; FMUL_R e3, a1
+ mulpd xmm7, xmm9
+ ; IMUL_9C r3, 720965215
+ lea r11, [r11+r11*8+720965215]
+ ; IMUL_9C r6, 74948046
+ lea r14, [r14+r14*8+74948046]
+ ; ISTORE L1[r7], r3
+ mov eax, r15d
+ and eax, 16376
+ mov qword ptr [rsi+rax], r11
+ ; IXOR_R r2, r6
+ xor r10, r14
+ ; FMUL_R e3, a1
+ mulpd xmm7, xmm9
+ ; ISUB_R r4, r1
+ sub r12, r9
+ ; ISUB_R r3, r0
+ sub r11, r8
+ ; IROL_R r7, r5
+ mov ecx, r13d
+ rol r15, cl
+ ; IMUL_R r2, r6
+ imul r10, r14
+ ; COND_R r2, ge(r2, -1892157506)
xor ecx, ecx
- cmp r13d, -795593984
- seto cl
- add r14, rcx
- ; FPADD_R f1, a0
- addpd xmm1, xmm8
- ; IMULH_R r7, r3
- mov rax, r15
- mul r11
- mov r15, rdx
- ; ISUB_R r7, r4
- sub r15, r12
- ; IROL_R r0, r6
- mov ecx, r14d
- rol r8, cl
- ; ISDIV_C r1, -675825513
- mov rax, -7326980207007250257
- imul r9
+ cmp r10d, -1892157506
+ setge cl
+ add r10, rcx
+ ; FADD_R f1, a3
+ addpd xmm1, xmm11
+ ; IADD_R r7, r0
+ add r15, r8
+ ; IDIV_C r1, 624867857
+ mov rax, 15848983434401622933
+ mul r9
+ shr rdx, 29
+ add r9, rdx
+ ; FADD_R f0, a1
+ addpd xmm0, xmm9
+ ; IADD_RC r5, r7, -477591118
+ lea r13, [r13+r15-477591118]
+ ; FSUB_R f0, a3
+ subpd xmm0, xmm11
+ ; ISUB_M r6, L1[r2]
+ mov eax, r10d
+ and eax, 16376
+ sub r14, qword ptr [rsi+rax]
+ ; FMUL_R e3, a1
+ mulpd xmm7, xmm9
+ ; IADD_R r0, r4
+ add r8, r12
+ ; FSUB_R f3, a1
+ subpd xmm3, xmm9
+ ; FSUB_R f2, a0
+ subpd xmm2, xmm8
+ ; ISDIV_C r2, -396711688
+ mov rax, 5964731804029407733
+ imul r10
xor eax, eax
+ sub rdx, r10
sar rdx, 28
sets al
add rdx, rax
- add r9, rdx
- ; ISTORE L1[r6], r3
- mov eax, r14d
- and eax, 16376
- mov qword ptr [rsi+rax], r11
- ; IROR_R r4, r3
- mov ecx, r11d
- ror r12, cl
- ; IDIV_C r4, 3919226376
- mov rax, r12
- shr rax, 3
- mov rcx, 2526906936258851663
- mul rcx
- shr rdx, 26
- add r12, rdx
- ; FPSUB_R f1, a1
- subpd xmm1, xmm9
- ; FPSUB_R f0, a0
+ add r10, rdx
+ ; FSUB_R f2, a2
+ subpd xmm2, xmm10
+ ; FSUB_R f3, a2
+ subpd xmm3, xmm10
+ ; FADD_R f1, a3
+ addpd xmm1, xmm11
+ ; IMUL_R r3, r2
+ imul r11, r10
+ ; FADD_R f0, a3
+ addpd xmm0, xmm11
+ ; ISMULH_R r5, r2
+ mov rax, r13
+ imul r10
+ mov r13, rdx
+ ; IMULH_R r6, r2
+ mov rax, r14
+ mul r10
+ mov r14, rdx
+ ; FADD_R f3, a3
+ addpd xmm3, xmm11
+ ; IMUL_R r6, r7
+ imul r14, r15
+ ; FSUB_R f0, a0
subpd xmm0, xmm8
- ; IADD_R r0, r2
- add r8, r10
- ; IADD_M r4, L1[r2]
- mov eax, r10d
+ ; FSUB_R f2, a0
+ subpd xmm2, xmm8
+ ; ISUB_R r6, r4
+ sub r14, r12
+ ; FSWAP_R f1
+ shufpd xmm1, xmm1, 1
+ ; IXOR_R r0, r5
+ xor r8, r13
+ ; FADD_R f2, a1
+ addpd xmm2, xmm9
+ ; IROL_R r7, r5
+ mov ecx, r13d
+ rol r15, cl
+ ; FMUL_R e3, a2
+ mulpd xmm7, xmm10
+ ; IADD_RC r3, r6, -1317630728
+ lea r11, [r11+r14-1317630728]
+ ; IMUL_R r2, r3
+ imul r10, r11
+ ; IADD_RC r1, r4, 894105694
+ lea r9, [r9+r12+894105694]
+ ; IMUL_9C r7, 504293473
+ lea r15, [r15+r15*8+504293473]
+ ; FADD_M f1, L2[r0]
+ mov eax, r8d
+ and eax, 262136
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ addpd xmm1, xmm12
+ ; IMUL_R r7, r1
+ imul r15, r9
+ ; IXOR_R r2, r4
+ xor r10, r12
+ ; IADD_RC r0, r1, 392362094
+ lea r8, [r8+r9+392362094]
+ ; IDIV_C r4, 1645771433
+ mov rax, 376097195048767223
+ mul r12
+ shr rdx, 25
+ add r12, rdx
+ ; ISUB_R r4, r3
+ sub r12, r11
+ ; ISUB_M r7, L1[r4]
+ mov eax, r12d
and eax, 16376
- add r12, qword ptr [rsi+rax]
- ; ISTORE L1[r7], r2
+ sub r15, qword ptr [rsi+rax]
+ ; IMUL_M r5, L1[r7]
mov eax, r15d
and eax, 16376
- mov qword ptr [rsi+rax], r10
- ; FPSQRT_R e1
- sqrtpd xmm5, xmm5
- ; IADD_R r5, r4
- add r13, r12
- ; IXOR_R r6, r7
- xor r14, r15
- ; ISMULH_R r4, r7
- mov rax, r12
- imul r15
- mov r12, rdx
- ; FPSQRT_R e1
- sqrtpd xmm5, xmm5
+ imul r13, qword ptr [rsi+rax]
+ ; IROR_R r1, r7
+ mov ecx, r15d
+ ror r9, cl
+ ; INEG_R r4
+ neg r12
+ ; IMUL_R r3, 1863959234
+ imul r11, 1863959234
+ ; IROR_R r4, 59
+ ror r12, 59
+ ; IMUL_M r1, L3[363256]
+ imul r9, qword ptr [rsi+363256]
+ ; ISTORE L2[r6], r7
+ mov eax, r14d
+ and eax, 262136
+ mov qword ptr [rsi+rax], r15
+ ; ISTORE L1[r1], r5
+ mov eax, r9d
+ and eax, 16376
+ mov qword ptr [rsi+rax], r13
+ ; FSUB_M f0, L1[r1]
+ mov eax, r9d
+ and eax, 16376
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ subpd xmm0, xmm12
+ ; FSQRT_R e2
+ sqrtpd xmm6, xmm6
+ ; FMUL_R e0, a3
+ mulpd xmm4, xmm11
+ ; FMUL_R e3, a2
+ mulpd xmm7, xmm10
+ ; IROL_R r5, r2
+ mov ecx, r10d
+ rol r13, cl
+ ; IADD_R r0, r4
+ add r8, r12