Random accesses - JIT compiler

This commit is contained in:
tevador
2019-01-10 22:04:55 +01:00
parent b71e0eec65
commit d1a808643d
24 changed files with 341 additions and 341 deletions

View File

@@ -169,11 +169,12 @@ namespace RandomX {
asmCode << "\t" << instrx86 << " xmm0, " << regF[instr.regb % RegistersCount] << std::endl;
}
void AssemblyGeneratorX86::gencr(Instruction& instr) {
void AssemblyGeneratorX86::gencr(Instruction& instr, bool rax = true) {
switch (instr.locc & 7)
{
case 0:
asmCode << "\tmov rcx, rax" << std::endl;
if(rax)
asmCode << "\tmov rcx, rax" << std::endl;
asmCode << "\tmov eax, " << regR32[instr.regc % RegistersCount] << std::endl;
asmCode << "\txor eax, 0" << std::hex << instr.addrc << "h" << std::dec << std::endl;
asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl;
@@ -186,7 +187,8 @@ namespace RandomX {
case 1:
case 2:
case 3:
asmCode << "\tmov rcx, rax" << std::endl;
if (rax)
asmCode << "\tmov rcx, rax" << std::endl;
asmCode << "\tmov eax, " << regR32[instr.regc % RegistersCount] << std::endl;
asmCode << "\txor eax, 0" << std::hex << instr.addrc << "h" << std::dec << std::endl;
asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl;
@@ -197,9 +199,9 @@ namespace RandomX {
return;
default:
asmCode << "\tmov " << regR[instr.regc % RegistersCount] << ", rax" << std::endl;
asmCode << "\tmov " << regR[instr.regc % RegistersCount] << ", " << (rax ? "rax" : "rcx") << std::endl;
if (trace) {
asmCode << "\tmov qword ptr [" << regScratchpadAddr << " + " << regIc << " * 8 + 262136], rax" << std::endl;
asmCode << "\tmov qword ptr [" << regScratchpadAddr << " + " << regIc << " * 8 + 262136], " << (rax ? "rax" : "rcx") << std::endl;
}
return;
}
@@ -208,7 +210,7 @@ namespace RandomX {
void AssemblyGeneratorX86::gencf(Instruction& instr, bool move = true) {
if(move)
asmCode << "\tmovaps " << regF[instr.regc % RegistersCount] << ", xmm0" << std::endl;
const char* store = (instr.locc & 8) ? "movhpd" : "movlpd";
const char* store = (instr.locc & 128) ? "movhpd" : "movlpd";
switch (instr.locc & 7)
{
case 4:
@@ -463,14 +465,13 @@ namespace RandomX {
void AssemblyGeneratorX86::h_FPROUND(Instruction& instr, int i) {
genar(instr, i);
//asmCode << "\tmov rcx, rax" << std::endl;
asmCode << "\tmov rcx, rax" << std::endl;
asmCode << "\tshl eax, 13" << std::endl;
//asmCode << "\tand rcx, -2048" << std::endl;
asmCode << "\tand eax, 24576" << std::endl;
//asmCode << "\tmovaps " << regF[instr.regc % RegistersCount] << ", xmm0" << std::endl;
asmCode << "\tor eax, 40896" << std::endl;
asmCode << "\tmov dword ptr [rsp - 8], eax" << std::endl;
asmCode << "\tldmxcsr dword ptr [rsp - 8]" << std::endl;
gencr(instr, false);
}
static inline const char* jumpCondition(Instruction& instr, bool invert = false) {

View File

@@ -44,7 +44,7 @@ namespace RandomX {
void genbr1(Instruction&);
void genbr132(Instruction&);
void genbf(Instruction&, const char*);
void gencr(Instruction&);
void gencr(Instruction&, bool);
void gencf(Instruction&, bool);
void generateCode(Instruction&, int);

View File

@@ -47,8 +47,8 @@ namespace RandomX {
}
void CompiledVirtualMachine::execute() {
executeProgram(reg, mem, scratchpad, readDataset);
//compiler.getProgramFunc()(reg, mem, scratchpad);
//executeProgram(reg, mem, scratchpad, readDataset);
compiler.getProgramFunc()(reg, mem, scratchpad);
#ifdef TRACEVM
for (int32_t i = InstructionCount - 1; i >= 0; --i) {
std::cout << std::hex << tracepad[i].u64 << std::endl;

View File

@@ -197,6 +197,17 @@ namespace RandomX {
#define ALU_RETIRE(x) x(a, b, c); \
if(trace) std::cout << std::hex << /*a.u64 << " " << b.u64 << " " <<*/ c.u64 << std::endl;
#define CHECK_NOP_FPDIV(b, c)
#ifndef STATS
#define CHECK_NOP_FPADD(b, c)
#define CHECK_NOP_FPSUB(b, c)
#define CHECK_NOP_FPMUL(b, c)
#else
#define CHECK_NOP_FPADD(b, c) bool loeq = (b.lo.u64 == c.lo.u64); bool hieq = (b.hi.u64 == c.hi.u64); count_FPADD_nop += loeq + hieq; if(loeq && hieq) count_FPADD_nop2++;
#define CHECK_NOP_FPSUB(b, c) bool loeq = ((b.lo.u64 & INT64_MAX) == (c.lo.u64 & INT64_MAX)); bool hieq = ((b.hi.u64 & INT64_MAX) == (c.hi.u64 & INT64_MAX)); count_FPSUB_nop += loeq + hieq; if(loeq && hieq) count_FPSUB_nop2++;
#define CHECK_NOP_FPMUL(b, c) bool loeq = (b.lo.u64 == c.lo.u64); bool hieq = (b.hi.u64 == c.hi.u64); count_FPMUL_nop += loeq + hieq; if(loeq && hieq) count_FPMUL_nop2++;
#endif
#define FPU_RETIRE(x) x(a, b, c); \
writecf(inst, c); \
if(trace) { \
@@ -248,8 +259,10 @@ namespace RandomX {
INC_COUNT(x) \
convertible_t a = loada(inst); \
fpu_reg_t& b = reg.f[inst.regb % RegistersCount]; \
fpu_reg_t btemp = b; \
fpu_reg_t& c = reg.f[inst.regc % RegistersCount]; \
FPU_RETIRE(x) \
CHECK_NOP_##x(btemp, c) \
}
#define FPU_INST_NB(x) void InterpretedVirtualMachine::h_##x(Instruction& inst) { \

View File

@@ -83,6 +83,12 @@ namespace RandomX {
int count_retdepth_max = 0;
int count_endstack = 0;
int count_instructions[ProgramLength] = { 0 };
int count_FPADD_nop = 0;
int count_FPADD_nop2 = 0;
int count_FPSUB_nop = 0;
int count_FPSUB_nop2 = 0;
int count_FPMUL_nop = 0;
int count_FPMUL_nop2 = 0;
#endif
convertible_t loada(Instruction&);

View File

@@ -29,9 +29,12 @@
.global DECL(randomx_program_prologue)
.global DECL(randomx_program_begin)
.global DECL(randomx_program_epilogue)
.global DECL(randomx_program_read_r)
.global DECL(randomx_program_read_f)
.global DECL(randomx_program_read_l1)
.global DECL(randomx_program_read_l2)
.global DECL(randomx_program_end)
.global DECL(randomx_program_transform)
#define db .byte
.align 64
DECL(randomx_program_prologue):
@@ -45,14 +48,26 @@ DECL(randomx_program_begin):
DECL(randomx_program_epilogue):
#include "asm/program_epilogue_linux.inc"
.align 64
DECL(randomx_program_read_r):
#include "asm/program_read_r.inc"
#define scratchpad_mask and ecx, 2040
.align 64
DECL(randomx_program_read_f):
#include "asm/program_read_f.inc"
DECL(randomx_program_read_l1):
#include "asm/program_read.inc"
#undef scratchpad_mask
#define scratchpad_mask and ecx, 32760
.align 64
DECL(randomx_program_read_l2):
#include "asm/program_read.inc"
#undef scratchpad_mask
.align 64
DECL(randomx_program_end):
nop
nop
.align 8
DECL(randomx_program_transform):
#include "asm/program_transform_address.inc"

View File

@@ -20,9 +20,11 @@ _RANDOMX_JITX86_STATIC SEGMENT PAGE READ EXECUTE
PUBLIC randomx_program_prologue
PUBLIC randomx_program_begin
PUBLIC randomx_program_epilogue
PUBLIC randomx_program_read_r
PUBLIC randomx_program_read_f
PUBLIC randomx_program_read_l1
PUBLIC randomx_program_read_l2
PUBLIC randomx_program_end
PUBLIC randomx_program_transform
ALIGN 64
randomx_program_prologue PROC
@@ -39,21 +41,34 @@ randomx_program_epilogue PROC
include asm/program_epilogue_win64.inc
randomx_program_epilogue ENDP
ALIGN 64
randomx_program_read_r PROC
include asm/program_read_r.inc
randomx_program_read_r ENDP
scratchpad_mask MACRO
and ecx, 2040
ENDM
ALIGN 64
randomx_program_read_f PROC
include asm/program_read_f.inc
randomx_program_read_f ENDP
randomx_program_read_l1 PROC
include asm/program_read.inc
randomx_program_read_l1 ENDP
scratchpad_mask MACRO
and ecx, 32760
ENDM
ALIGN 64
randomx_program_read_l2 PROC
include asm/program_read.inc
randomx_program_read_l2 ENDP
ALIGN 64
randomx_program_end PROC
nop
randomx_program_end ENDP
ALIGN 8
randomx_program_transform PROC
include asm/program_transform_address.inc
randomx_program_transform ENDP
_RANDOMX_JITX86_STATIC ENDS
END

View File

@@ -21,7 +21,8 @@ extern "C" {
void randomx_program_prologue();
void randomx_program_begin();
void randomx_program_epilogue();
void randomx_program_read_r();
void randomx_program_read_f();
void randomx_program_transform();
void randomx_program_read_l1();
void randomx_program_read_l2();
void randomx_program_end();
}

View File

@@ -48,12 +48,12 @@ namespace RandomX {
REGISTER ALLOCATION:
rax -> temporary
rbx -> MemoryRegisters& memory
rbx -> "ic"
rcx -> temporary
rdx -> temporary
rsi -> convertible_t* scratchpad
rdi -> "ic" (instruction counter)
rbp -> beginning of VM stack
rdi -> beginning of VM stack
rbp -> "ma", "mx"
rsp -> end of VM stack
r8 -> "r0"
r9 -> "r1"
@@ -82,7 +82,8 @@ namespace RandomX {
| saved registers
|
v
[rbp] RegisterFile& registerFile
[rdi+8] RegisterFile& registerFile
[rdi] uint8_t* dataset
|
|
| VM stack
@@ -97,18 +98,19 @@ namespace RandomX {
const uint8_t* codePrologue = (uint8_t*)&randomx_program_prologue;
const uint8_t* codeProgramBegin = (uint8_t*)&randomx_program_begin;
const uint8_t* codeEpilogue = (uint8_t*)&randomx_program_epilogue;
const uint8_t* codeReadDatasetR = (uint8_t*)&randomx_program_read_r;
const uint8_t* codeReadDatasetF = (uint8_t*)&randomx_program_read_f;
const uint8_t* codeReadDatasetL1 = (uint8_t*)&randomx_program_read_l1;
const uint8_t* codeReadDatasetL2 = (uint8_t*)&randomx_program_read_l2;
const uint8_t* codeProgramEnd = (uint8_t*)&randomx_program_end;
const uint32_t* addressTransformations = (uint32_t*)&randomx_program_transform;
const int32_t prologueSize = codeProgramBegin - codePrologue;
const int32_t epilogueSize = codeReadDatasetR - codeEpilogue;
const int32_t readDatasetRSize = codeReadDatasetF - codeReadDatasetR;
const int32_t readDatasetFSize = codeProgramEnd - codeReadDatasetF;
const int32_t epilogueSize = codeReadDatasetL1 - codeEpilogue;
const int32_t readDatasetL1Size = codeReadDatasetL2 - codeReadDatasetL1;
const int32_t readDatasetL2Size = codeProgramEnd - codeReadDatasetL2;
const int32_t readDatasetFOffset = CodeSize - readDatasetFSize;
const int32_t readDatasetROffset = readDatasetFOffset - readDatasetRSize;
const int32_t epilogueOffset = readDatasetROffset - epilogueSize;
const int32_t readDatasetL2Offset = CodeSize - readDatasetL2Size;
const int32_t readDatasetL1Offset = readDatasetL2Offset - readDatasetL1Size;
const int32_t epilogueOffset = readDatasetL1Offset - epilogueSize;
JitCompilerX86::JitCompilerX86() {
#ifdef _WIN32
@@ -121,9 +123,9 @@ namespace RandomX {
throw std::runtime_error("mmap failed");
#endif
memcpy(code, codePrologue, prologueSize);
memcpy(code + CodeSize - readDatasetRSize - readDatasetFSize - epilogueSize, codeEpilogue, epilogueSize);
memcpy(code + CodeSize - readDatasetRSize - readDatasetFSize, codeReadDatasetR, readDatasetRSize);
memcpy(code + CodeSize - readDatasetFSize, codeReadDatasetF, readDatasetFSize);
memcpy(code + CodeSize - epilogueSize - readDatasetL1Size - readDatasetL2Size, codeEpilogue, epilogueSize);
memcpy(code + CodeSize - readDatasetL1Size - readDatasetL2Size, codeReadDatasetL1, readDatasetL1Size);
memcpy(code + CodeSize - readDatasetL2Size, codeReadDatasetL2, readDatasetL2Size);
}
void JitCompilerX86::generateProgram(Pcg32& gen) {
@@ -140,12 +142,33 @@ namespace RandomX {
emitByte(0xe9);
emit(instructionOffsets[0] - (codePos + 4));
fixCallOffsets();
uint32_t transformL1 = addressTransformations[gen.getUniform(0, TransformationCount - 1)];
uint32_t transformL2 = addressTransformations[gen.getUniform(0, TransformationCount - 1)];
*reinterpret_cast<uint32_t*>(code + readDatasetL1Offset + 1) = transformL1;
*reinterpret_cast<uint32_t*>(code + readDatasetL2Offset + 1) = transformL2;
}
void JitCompilerX86::generateCode(Instruction& instr, int i) {
instructionOffsets.push_back(codePos);
emit(0x840fcfff); //dec edx; jz <epilogue>
emit(0x840fcbff); //dec ebx; jz <epilogue>
emit(epilogueOffset - (codePos + 4)); //jump offset (RIP-relative)
emit(uint16_t(0x8149)); //xor
emitByte(0xf0 + (instr.rega % RegistersCount));
emit(instr.addra);
emit(uint16_t(0x8b41)); //mov
emitByte(0xc8 + (instr.rega % RegistersCount)); //ecx, rega
emit(0x753fc3f6); //test bl,0x3f; jne
emit(uint16_t(0xe805));
if (instr.loca & 3) { //A.LOC.W
emit(readDatasetL1Offset - (codePos + 4));
}
else {
emit(readDatasetL2Offset - (codePos + 4));
}
if ((instr.loca & 192) == 0) { //A.LOC.X
emit(uint16_t(0x3348));
emitByte(0xe9); //xor rbp, rcx
}
auto generator = engine[instr.opcode];
(this->*generator)(instr, i);
}
@@ -157,73 +180,26 @@ namespace RandomX {
}
void JitCompilerX86::genar(Instruction& instr) {
emit(uint16_t(0x8149)); //xor
emitByte(0xf0 + (instr.rega % RegistersCount));
emit(instr.addra);
switch (instr.loca & 7)
{
case 0:
case 1:
case 2:
case 3:
emit(uint16_t(0x8b41)); //mov
emitByte(0xc8 + (instr.rega % RegistersCount)); //ecx, rega
emitByte(0xe8); //call
emit(readDatasetROffset - (codePos + 4));
return;
case 4:
emit(uint16_t(0x8b41)); //mov
emitByte(0xc0 + (instr.rega % RegistersCount)); //eax, rega
emitByte(0x25); //and
emit(ScratchpadL2 - 1); //whole scratchpad
emit(0xc6048b48); // mov rax,QWORD PTR [rsi+rax*8]
return;
default:
emit(uint16_t(0x8b41)); //mov
emitByte(0xc0 + (instr.rega % RegistersCount)); //eax, rega
emitByte(0x25); //and
emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad
emit(0xc6048b48); // mov rax,QWORD PTR [rsi+rax*8]
return;
emit(uint16_t(0xe181)); //and ecx,
if (instr.loca & 3) {
emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad
}
else {
emit(ScratchpadL2 - 1); //whole scratchpad
}
emit(0xce048b48); //mov rax,QWORD PTR [rsi+rcx*8]
}
void JitCompilerX86::genaf(Instruction& instr) {
emit(uint16_t(0x8149)); //xor
emitByte(0xf0 + (instr.rega % RegistersCount));
emit(instr.addra);
switch (instr.loca & 7)
{
case 0:
case 1:
case 2:
case 3:
emit(uint16_t(0x8b41)); //mov
emitByte(0xc8 + (instr.rega % RegistersCount)); //ecx, rega
emitByte(0xe8); //call
emit(readDatasetFOffset - (codePos + 4));
return;
case 4:
emit(uint16_t(0x8b41)); //mov
emitByte(0xc0 + (instr.rega % RegistersCount)); //eax, rega
emitByte(0x25); //and
emit(ScratchpadL2 - 1); //whole scratchpad
emitByte(0xf3);
emit(0xc604e60f); //cvtdq2pd xmm0,QWORD PTR [rsi+rax*8]
return;
default:
emit(uint16_t(0x8b41)); //mov
emitByte(0xc0 + (instr.rega % RegistersCount)); //eax, rega
emitByte(0x25); //and
emit(uint16_t(0xe181)); //and ecx,
if (instr.loca & 3) {
emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad
emitByte(0xf3);
emit(0xc604e60f); //cvtdq2pd xmm0,QWORD PTR [rsi+rax*8]
return;
}
else {
emit(ScratchpadL2 - 1); //whole scratchpad
}
emitByte(0xf3);
emit(0xce04e60f); //cvtdq2pd xmm0,QWORD PTR [rsi+rcx*8]
}
void JitCompilerX86::genbr0(Instruction& instr, uint16_t opcodeReg, uint16_t opcodeImm) {
@@ -274,8 +250,13 @@ namespace RandomX {
}
void JitCompilerX86::scratchpadStoreR(Instruction& instr, uint32_t scratchpadSize) {
emit(0x41c88b48); //mov rcx, rax; REX
void JitCompilerX86::scratchpadStoreR(Instruction& instr, uint32_t scratchpadSize, bool rax) {
if (rax) {
emit(0x41c88b48); //mov rcx, rax; REX
}
else {
emitByte(0x41);
}
emitByte(0x8b); // mov
emitByte(0xc0 + (instr.regc % RegistersCount)); //eax, regc
emitByte(0x35); // xor eax
@@ -285,22 +266,27 @@ namespace RandomX {
emit(0xc60c8948); // mov QWORD PTR [rsi+rax*8],rcx
}
void JitCompilerX86::gencr(Instruction& instr) {
void JitCompilerX86::gencr(Instruction& instr, bool rax = true) {
switch (instr.locc & 7)
{
case 0:
scratchpadStoreR(instr, ScratchpadL2);
scratchpadStoreR(instr, ScratchpadL2, rax);
break;
case 1:
case 2:
case 3:
scratchpadStoreR(instr, ScratchpadL1);
scratchpadStoreR(instr, ScratchpadL1, rax);
break;
default:
emit(uint16_t(0x8b4c)); //mov
emitByte(0xc0 + 8 * (instr.regc % RegistersCount)); //regc, rax
if (rax) {
emitByte(0xc0 + 8 * (instr.regc % RegistersCount)); //regc, rax
}
else {
emitByte(0xc1 + 8 * (instr.regc % RegistersCount)); //regc, rcx
}
break;
}
}
@@ -322,29 +308,21 @@ namespace RandomX {
emitByte(0xc6);
}
void JitCompilerX86::gencf(Instruction& instr, bool alwaysLow = false) {
void JitCompilerX86::gencf(Instruction& instr) {
int regc = (instr.regc % RegistersCount);
if (!alwaysLow) {
if (regc <= 1) {
emitByte(0x44); //REX
}
emit(uint16_t(0x280f)); //movaps
emitByte(0xc0 + 8 * regc); // regc, xmm0
if (regc <= 1) {
emitByte(0x44); //REX
}
switch (instr.locc & 7)
emit(uint16_t(0x280f)); //movaps
emitByte(0xc0 + 8 * regc); // regc, xmm0
if (instr.locc & 4) //C.LOC.R
{
case 4:
scratchpadStoreF(instr, regc, ScratchpadL2, !alwaysLow && (instr.locc & 8));
break;
case 5:
case 6:
case 7:
scratchpadStoreF(instr, regc, ScratchpadL1, !alwaysLow && (instr.locc & 8));
break;
default:
break;
if (instr.locc & 3) { //C.LOC.W
scratchpadStoreF(instr, regc, ScratchpadL1, (instr.locc & 128)); //first 16 KiB of scratchpad
}
else {
scratchpadStoreF(instr, regc, ScratchpadL2, (instr.locc & 128)); //whole scratchpad
}
}
}
@@ -596,24 +574,11 @@ namespace RandomX {
void JitCompilerX86::h_FPROUND(Instruction& instr, int i) {
genar(instr);
emit(0x81480de0c1c88b48);
emit(0x600025fffff800e1);
emit(uint16_t(0x0000));
emitByte(0xf2);
int regc = (instr.regc % RegistersCount);
if (regc <= 1) {
emitByte(0x4c); //REX
}
else {
emitByte(0x48); //REX
}
emit(uint16_t(0x2a0f));
emitByte(0xc1 + 8 * regc);
emitByte(0x0d);
emit(0xf824448900009fc0);
emit(0x2454ae0f); //ldmxcsr DWORD PTR [rsp-0x8]
emit(0x00250de0c1c88b48); //mov rcx,rax; shl eax,0xd
emit(0x00009fc00d000060); //and eax,0x6000; or eax,0x9fc0
emit(0x2454ae0ff8244489); //ldmxcsr DWORD PTR [rsp-0x8]
emitByte(0xf8);
gencf(instr, true);
gencr(instr, false); //result in rcx
}
static inline uint8_t jumpCondition(Instruction& instr, bool invert = false) {
@@ -670,7 +635,7 @@ namespace RandomX {
if ((instr.locc & 7) <= 3) {
crlen = 17;
}
emit(0x74e53b48); //cmp rsp, rbp; je
emit(0x74e73b48); //cmp rsp, rdi; je
emitByte(11 + crlen);
emitByte(0x48);
emit(0x08244433); //xor rax,QWORD PTR [rsp+0x8]

View File

@@ -64,10 +64,10 @@ namespace RandomX {
void genbr1(Instruction&, uint16_t, uint16_t);
void genbr132(Instruction&, uint16_t, uint8_t);
void genbf(Instruction&, uint8_t);
void scratchpadStoreR(Instruction&, uint32_t);
void scratchpadStoreR(Instruction&, uint32_t, bool);
void scratchpadStoreF(Instruction&, int, uint32_t, bool);
void gencr(Instruction&);
void gencf(Instruction&, bool);
void gencr(Instruction&, bool);
void gencf(Instruction&);
void generateCode(Instruction&, int);
void fixCallOffsets();

View File

@@ -1,8 +1,9 @@
;# unroll VM stack
mov rsp, rbp
mov rsp, rdi
;# save VM register values
pop rcx
pop rcx
mov qword ptr [rcx+0], r8
mov qword ptr [rcx+8], r9
mov qword ptr [rcx+16], r10

View File

@@ -7,9 +7,11 @@
push r15
;# function arguments
push rdi ;# RegisterFile& registerFile
mov rbx, rsi ;# MemoryRegisters& memory
mov rsi, rdx ;# convertible_t* scratchpad
push rdi ;# RegisterFile& registerFile
mov rbp, qword ptr [rsi] ;# "mx", "ma"
mov rax, qword ptr [rsi+8] ;# uint8_t* dataset
push rax
mov rsi, rdx ;# convertible_t* scratchpad
mov rcx, rdi
#include "program_prologue_load.inc"

View File

@@ -1,5 +1,5 @@
mov rbp, rsp ;# beginning of VM stack
mov rdi, 1048577 ;# number of VM instructions to execute + 1
mov rdi, rsp ;# beginning of VM stack
mov ebx, 1048577 ;# number of VM instructions to execute + 1
xorps xmm10, xmm10
cmpeqpd xmm10, xmm10

View File

@@ -15,9 +15,11 @@
movdqu xmmword ptr [rsp+0], xmm10
;# function arguments
push rcx ;# RegisterFile& registerFile
mov rbx, rdx ;# MemoryRegisters& memory
mov rsi, r8 ;# convertible_t* scratchpad
push rcx ;# RegisterFile& registerFile
mov rbp, qword ptr [rdx] ;# "mx", "ma"
mov rax, qword ptr [rdx+8] ;# uint8_t* dataset
push rax
mov rsi, r8 ;# convertible_t* scratchpad
include program_prologue_load.inc

32
src/asm/program_read.inc Normal file
View File

@@ -0,0 +1,32 @@
;# Dataset read routine body, included once per scratchpad level; the including
;# file defines scratchpad_mask before each include.
;# Steps: XOR rcx into "mx", prefetch the dataset cache line addressed by "mx",
;# swap "ma" and "mx", then XOR the 64-byte dataset line addressed by "ma" into
;# the scratchpad line selected by rcx.
;# In:  rcx = address value, rsi = scratchpad base, [rdi] = dataset pointer,
;#      rbp = "ma" (high dword) and "mx" (low dword)
;# Out: rcx preserved, rax and rdx clobbered, rbp updated
push rcx ;# preserve rcx (restored by the pop before ret)
db 0, 0, 0, 0 ;# 4-byte TransformAddress placeholder, overwritten by the JIT compiler at offset 1 of this routine
mov rax, qword ptr [rdi] ;# load the dataset address
xor rbp, rcx ;# modify "mx"
;# prefetch cacheline "mx"
and rbp, -64 ;# clear the low 6 bits to align "mx" to the start of a cache line
mov edx, ebp ;# edx = mx (low dword of rbp)
prefetchnta byte ptr [rax+rdx] ;# prefetch dataset + mx
;# read cacheline "ma"
ror rbp, 32 ;# swap "ma" and "mx"
mov edx, ebp ;# edx = ma (low dword of rbp after the swap)
scratchpad_mask ;# limit address to the specified scratchpad size aligned to multiple of 8
lea rcx, [rsi+rcx*8] ;# scratchpad cache line (8 quadwords = 64 bytes)
lea rax, [rax+rdx] ;# dataset cache line
mov rdx, qword ptr [rax+0] ;# load first dataset quadword (prefetched into the cache by now)
xor qword ptr [rcx+0], rdx ;# XOR the dataset item with a scratchpad item, repeat for the rest of the cacheline
mov rdx, qword ptr [rax+8]
xor qword ptr [rcx+8], rdx
mov rdx, qword ptr [rax+16]
xor qword ptr [rcx+16], rdx
mov rdx, qword ptr [rax+24]
xor qword ptr [rcx+24], rdx
mov rdx, qword ptr [rax+32]
xor qword ptr [rcx+32], rdx
mov rdx, qword ptr [rax+40]
xor qword ptr [rcx+40], rdx
mov rdx, qword ptr [rax+48]
xor qword ptr [rcx+48], rdx
mov rdx, qword ptr [rax+56]
xor qword ptr [rcx+56], rdx
pop rcx ;# restore rcx saved on entry
ret

View File

@@ -1,13 +0,0 @@
;# Dataset read (floating point variant): loads two packed int32 values from
;# dataset + "ma" into xmm0 as doubles, advances "ma" by 8 and XORs ecx into "mx".
;# In: rbx = memory registers ([rbx] = ma, [rbx+4] = mx, [rbx+8] = dataset pointer),
;#     ecx = address value
mov edx, dword ptr [rbx] ;# edx = ma
mov rax, qword ptr [rbx+8] ;# rax = dataset pointer
cvtdq2pd xmm0, qword ptr [rax+rdx] ;# xmm0 = two int32 values at dataset + ma, converted to doubles
add dword ptr [rbx], 8 ;# ma += 8
xor ecx, dword ptr [rbx+4] ;# ecx ^= mx
mov dword ptr [rbx+4], ecx ;# store the new mx
test ecx, 65528 ;# check bits 3-15 of the new mx
jne short rx_read_dataset_f_ret ;# any of them set: return without changing ma further
and ecx, -8 ;# round mx down to a multiple of 8
mov dword ptr [rbx], ecx ;# ma = aligned mx
prefetcht0 byte ptr [rax+rcx] ;# prefetch the dataset location at the new ma
rx_read_dataset_f_ret:
ret 0

View File

@@ -1,13 +0,0 @@
;# Dataset read (integer variant): loads the quadword at dataset + "ma" into rax,
;# advances "ma" by 8 and XORs ecx into "mx".
;# In: rbx = memory registers ([rbx] = ma, [rbx+4] = mx, [rbx+8] = dataset pointer),
;#     ecx = address value
mov eax, dword ptr [rbx] ;# eax = ma
mov rdx, qword ptr [rbx+8] ;# rdx = dataset pointer
mov rax, qword ptr [rdx+rax] ;# rax = quadword at dataset + ma
add dword ptr [rbx], 8 ;# ma += 8
xor ecx, dword ptr [rbx+4] ;# ecx ^= mx
mov dword ptr [rbx+4], ecx ;# store the new mx
test ecx, 65528 ;# check bits 3-15 of the new mx
jne short rx_read_dataset_r_ret ;# any of them set: return without changing ma further
and ecx, -8 ;# round mx down to a multiple of 8
mov dword ptr [rbx], ecx ;# ma = aligned mx
prefetcht0 byte ptr [rdx+rcx] ;# prefetch the dataset location at the new ma
rx_read_dataset_r_ret:
ret 0

View File

@@ -77,6 +77,7 @@ namespace RandomX {
constexpr uint32_t ScratchpadLength = ScratchpadSize / sizeof(convertible_t);
constexpr uint32_t ScratchpadL1 = ScratchpadSize / 16 / sizeof(convertible_t);
constexpr uint32_t ScratchpadL2 = ScratchpadSize / sizeof(convertible_t);
constexpr uint32_t TransformationCount = 90;
constexpr int RegistersCount = 8;
class Cache;

View File

@@ -158,10 +158,14 @@ executeProgram PROC
pslldq xmm7, 8
cvtsi2sd xmm7, qword ptr [rcx+112]
; program body
jmp program_begin
; program body
ALIGN 64
program_begin:
include program.inc
ALIGN 64
rx_finish:
; unroll the stack
mov rsp, rdi

View File

@@ -277,10 +277,6 @@ int main(int argc, char** argv) {
if(programCount == 1000)
std::cout << "Reference result: 3e1c5f9b9d0bf8ffa250f860bf5f7ab76ac823b206ddee6a592660119a3640c6" << std::endl;
std::cout << "Performance: " << programCount / elapsed << " programs per second" << std::endl;
/*if (threadCount == 1 && !compiled) {
auto ivm = (RandomX::InterpretedVirtualMachine*)vms[0];
std::cout << ivm->getProgam();
}*/
}
catch (std::exception& e) {
std::cout << "ERROR: " << e.what() << std::endl;

View File

@@ -76,11 +76,13 @@ rx_body_3:
xor rbp, rcx
and ecx, 2047
mov rax, qword ptr [rsi+rcx*8]
mov rcx, rax
shl eax, 13
and eax, 24576
or eax, 40896
mov dword ptr [rsp - 8], eax
ldmxcsr dword ptr [rsp - 8]
mov r8, rcx
rx_i_4: ;MULH_64
dec ebx
@@ -153,7 +155,7 @@ rx_body_7:
mov eax, r14d
xor eax, 057c8c41bh
and eax, 32767
movhpd qword ptr [rsi + rax * 8], xmm6
movlpd qword ptr [rsi + rax * 8], xmm6
rx_i_8: ;SHL_64
dec ebx
@@ -218,7 +220,7 @@ rx_body_11:
mov eax, r12d
xor eax, 0852d40d8h
and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm4
movlpd qword ptr [rsi + rax * 8], xmm4
rx_i_12: ;CALL
dec ebx
@@ -355,7 +357,7 @@ rx_body_18:
mov eax, r11d
xor eax, 0869baa81h
and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm3
movlpd qword ptr [rsi + rax * 8], xmm3
rx_i_19: ;FPSUB
dec ebx
@@ -372,7 +374,7 @@ rx_body_19:
subpd xmm0, xmm8
movaps xmm7, xmm0
rx_i_20: ;FPMUL
rx_i_20: ;FPSUB
dec ebx
jz rx_finish
xor r13, 0ecca967dh
@@ -383,15 +385,12 @@ rx_i_20: ;FPMUL
rx_body_20:
and ecx, 2047
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm2
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
subpd xmm0, xmm2
movaps xmm7, xmm0
mov eax, r15d
xor eax, 0aad81365h
and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm7
movlpd qword ptr [rsi + rax * 8], xmm7
rx_i_21: ;FPADD
dec ebx
@@ -482,7 +481,7 @@ rx_body_25:
mov eax, r14d
xor eax, 0baf5c2d4h
and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm6
movlpd qword ptr [rsi + rax * 8], xmm6
rx_i_26: ;IMUL_32
dec ebx
@@ -580,7 +579,7 @@ rx_body_31:
mov eax, r14d
xor eax, 01e2da792h
and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm6
movlpd qword ptr [rsi + rax * 8], xmm6
rx_i_32: ;XOR_64
dec ebx
@@ -668,7 +667,7 @@ rx_body_36:
andps xmm0, xmm1
movaps xmm7, xmm0
rx_i_37: ;FPMUL
rx_i_37: ;FPSUB
dec ebx
jz rx_finish
xor r12, 0d0706601h
@@ -679,10 +678,7 @@ rx_i_37: ;FPMUL
rx_body_37:
and ecx, 2047
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm2
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
subpd xmm0, xmm2
movaps xmm9, xmm0
mov eax, r9d
xor eax, 0bca81c78h
@@ -764,7 +760,7 @@ taken_call_41:
push rax
call rx_i_127
rx_i_42: ;FPSUB
rx_i_42: ;FPADD
dec ebx
jz rx_finish
xor r15, 0bc1de9f6h
@@ -776,7 +772,7 @@ rx_body_42:
xor rbp, rcx
and ecx, 2047
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
subpd xmm0, xmm6
addpd xmm0, xmm6
movaps xmm6, xmm0
rx_i_43: ;SUB_64
@@ -887,7 +883,7 @@ rx_body_48:
and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm9
rx_i_49: ;FPMUL
rx_i_49: ;FPSUB
dec ebx
jz rx_finish
xor r8, 0f96c6a45h
@@ -898,10 +894,7 @@ rx_i_49: ;FPMUL
rx_body_49:
and ecx, 2047
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm3
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
subpd xmm0, xmm3
movaps xmm5, xmm0
rx_i_50: ;OR_32
@@ -1018,7 +1011,7 @@ rx_body_55:
mov eax, r11d
xor eax, 07c79cddh
and eax, 2047
movlpd qword ptr [rsi + rax * 8], xmm3
movhpd qword ptr [rsi + rax * 8], xmm3
rx_i_56: ;AND_64
dec ebx
@@ -1144,7 +1137,7 @@ taken_call_61:
push rax
call rx_i_120
rx_i_62: ;FPMUL
rx_i_62: ;FPSUB
dec ebx
jz rx_finish
xor r15, 0c3089414h
@@ -1155,17 +1148,14 @@ rx_i_62: ;FPMUL
rx_body_62:
and ecx, 2047
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm8
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
subpd xmm0, xmm8
movaps xmm2, xmm0
mov eax, r10d
xor eax, 05c4789e3h
and eax, 2047
movlpd qword ptr [rsi + rax * 8], xmm2
movhpd qword ptr [rsi + rax * 8], xmm2
rx_i_63: ;FPMUL
rx_i_63: ;FPSUB
dec ebx
jz rx_finish
xor r9, 065cf272eh
@@ -1176,10 +1166,7 @@ rx_i_63: ;FPMUL
rx_body_63:
and ecx, 2047
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm7
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
subpd xmm0, xmm7
movaps xmm8, xmm0
rx_i_64: ;SUB_64
@@ -1253,7 +1240,7 @@ taken_call_67:
push rax
call rx_i_79
rx_i_68: ;FPSUB
rx_i_68: ;FPADD
dec ebx
jz rx_finish
xor r13, 03aa5c3a4h
@@ -1264,7 +1251,7 @@ rx_i_68: ;FPSUB
rx_body_68:
and ecx, 2047
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
subpd xmm0, xmm2
addpd xmm0, xmm2
movaps xmm4, xmm0
mov eax, r12d
xor eax, 03c51ef39h
@@ -1354,11 +1341,16 @@ rx_i_73: ;FPROUND
rx_body_73:
and ecx, 32767
mov rax, qword ptr [rsi+rcx*8]
mov rcx, rax
shl eax, 13
and eax, 24576
or eax, 40896
mov dword ptr [rsp - 8], eax
ldmxcsr dword ptr [rsp - 8]
mov eax, r10d
xor eax, 040624270h
and eax, 2047
mov qword ptr [rsi + rax * 8], rcx
rx_i_74: ;MUL_64
dec ebx
@@ -1722,7 +1714,7 @@ rx_body_93:
mov eax, r10d
xor eax, 07e48a0d8h
and eax, 2047
movlpd qword ptr [rsi + rax * 8], xmm2
movhpd qword ptr [rsi + rax * 8], xmm2
rx_i_94: ;RET
dec ebx
@@ -1830,7 +1822,7 @@ rx_body_99:
mov eax, r12d
xor eax, 04c21df83h
and eax, 2047
movlpd qword ptr [rsi + rax * 8], xmm4
movhpd qword ptr [rsi + rax * 8], xmm4
rx_i_100: ;ADD_64
dec ebx
@@ -1955,7 +1947,7 @@ rx_body_106:
mov eax, r12d
xor eax, 03cb2505h
and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm4
movlpd qword ptr [rsi + rax * 8], xmm4
rx_i_107: ;CALL
dec ebx
@@ -1999,7 +1991,7 @@ rx_body_108:
mov eax, r9d
xor eax, 0678b65beh
and eax, 32767
movlpd qword ptr [rsi + rax * 8], xmm9
movhpd qword ptr [rsi + rax * 8], xmm9
rx_i_109: ;FPADD
dec ebx
@@ -2207,7 +2199,7 @@ rx_body_120:
addpd xmm0, xmm4
movaps xmm8, xmm0
rx_i_121: ;FPMUL
rx_i_121: ;FPSUB
dec ebx
jz rx_finish
xor r9, 03ab8f73h
@@ -2218,10 +2210,7 @@ rx_i_121: ;FPMUL
rx_body_121:
and ecx, 2047
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm5
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
subpd xmm0, xmm5
movaps xmm8, xmm0
rx_i_122: ;RET
@@ -2813,7 +2802,7 @@ rx_body_153:
mov eax, r8d
xor eax, 09111c981h
and eax, 2047
movlpd qword ptr [rsi + rax * 8], xmm8
movhpd qword ptr [rsi + rax * 8], xmm8
rx_i_154: ;MUL_32
dec ebx
@@ -3196,11 +3185,13 @@ rx_i_174: ;FPROUND
rx_body_174:
and ecx, 2047
mov rax, qword ptr [rsi+rcx*8]
mov rcx, rax
shl eax, 13
and eax, 24576
or eax, 40896
mov dword ptr [rsp - 8], eax
ldmxcsr dword ptr [rsp - 8]
mov r14, rcx
rx_i_175: ;SAR_64
dec ebx
@@ -3431,7 +3422,7 @@ rx_body_187:
andps xmm0, xmm1
movaps xmm5, xmm0
rx_i_188: ;FPMUL
rx_i_188: ;FPSUB
dec ebx
jz rx_finish
xor r9, 04659becbh
@@ -3443,10 +3434,7 @@ rx_body_188:
xor rbp, rcx
and ecx, 2047
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm3
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
subpd xmm0, xmm3
movaps xmm4, xmm0
rx_i_189: ;FPROUND
@@ -3460,11 +3448,16 @@ rx_i_189: ;FPROUND
rx_body_189:
and ecx, 2047
mov rax, qword ptr [rsi+rcx*8]
mov rcx, rax
shl eax, 13
and eax, 24576
or eax, 40896
mov dword ptr [rsp - 8], eax
ldmxcsr dword ptr [rsp - 8]
mov eax, r13d
xor eax, 0e6f1a3b7h
and eax, 2047
mov qword ptr [rsi + rax * 8], rcx
rx_i_190: ;RET
dec ebx
@@ -3761,7 +3754,7 @@ rx_body_205:
andps xmm0, xmm1
movaps xmm5, xmm0
rx_i_206: ;FPMUL
rx_i_206: ;FPSUB
dec ebx
jz rx_finish
xor r11, 0e836a177h
@@ -3773,10 +3766,7 @@ rx_body_206:
xor rbp, rcx
and ecx, 2047
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm7
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
subpd xmm0, xmm7
movaps xmm4, xmm0
rx_i_207: ;AND_32
@@ -4085,7 +4075,7 @@ rx_body_223:
mov eax, r10d
xor eax, 07fca59eeh
and eax, 32767
movlpd qword ptr [rsi + rax * 8], xmm2
movhpd qword ptr [rsi + rax * 8], xmm2
rx_i_224: ;SAR_64
dec ebx
@@ -4171,7 +4161,7 @@ rx_body_227:
mov eax, r11d
xor eax, 0aabe2a0ah
and eax, 2047
movlpd qword ptr [rsi + rax * 8], xmm3
movhpd qword ptr [rsi + rax * 8], xmm3
rx_i_228: ;CALL
dec ebx
@@ -4313,11 +4303,16 @@ rx_i_234: ;FPROUND
rx_body_234:
and ecx, 2047
mov rax, qword ptr [rsi+rcx*8]
mov rcx, rax
shl eax, 13
and eax, 24576
or eax, 40896
mov dword ptr [rsp - 8], eax
ldmxcsr dword ptr [rsp - 8]
mov eax, r12d
xor eax, 04d2e9e7dh
and eax, 2047
mov qword ptr [rsi + rax * 8], rcx
rx_i_235: ;IMUL_32
dec ebx
@@ -4438,7 +4433,7 @@ rx_body_241:
mov eax, r15d
xor eax, 0bc2423ebh
and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm7
movlpd qword ptr [rsi + rax * 8], xmm7
rx_i_242: ;MULH_64
dec ebx
@@ -4734,7 +4729,7 @@ rx_body_257:
mov eax, r11d
xor eax, 0373b1b6fh
and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm3
movlpd qword ptr [rsi + rax * 8], xmm3
rx_i_258: ;MUL_32
dec ebx
@@ -4771,7 +4766,7 @@ rx_body_259:
addpd xmm0, xmm9
movaps xmm3, xmm0
rx_i_260: ;FPMUL
rx_i_260: ;FPSUB
dec ebx
jz rx_finish
xor r13, 0f94e9fa9h
@@ -4783,10 +4778,7 @@ rx_body_260:
xor rbp, rcx
and ecx, 32767
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm5
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
subpd xmm0, xmm5
movaps xmm9, xmm0
rx_i_261: ;FPSQRT
@@ -4806,7 +4798,7 @@ rx_body_261:
mov eax, r11d
xor eax, 0745a48e9h
and eax, 2047
movlpd qword ptr [rsi + rax * 8], xmm3
movhpd qword ptr [rsi + rax * 8], xmm3
rx_i_262: ;OR_32
dec ebx
@@ -5044,7 +5036,7 @@ rx_body_274:
mov eax, r14d
xor eax, 06a2b2b5bh
and eax, 2047
movlpd qword ptr [rsi + rax * 8], xmm6
movhpd qword ptr [rsi + rax * 8], xmm6
rx_i_275: ;OR_64
dec ebx
@@ -5121,7 +5113,7 @@ rx_body_278:
mov eax, r12d
xor eax, 02d00ad10h
and eax, 2047
movlpd qword ptr [rsi + rax * 8], xmm4
movhpd qword ptr [rsi + rax * 8], xmm4
rx_i_279: ;FPSUB
dec ebx
@@ -5139,7 +5131,7 @@ rx_body_279:
mov eax, r9d
xor eax, 0475ade01h
and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm9
movlpd qword ptr [rsi + rax * 8], xmm9
rx_i_280: ;AND_64
dec ebx
@@ -5210,7 +5202,7 @@ rx_body_283:
and eax, 2047
mov qword ptr [rsi + rax * 8], rcx
rx_i_284: ;FPSUB
rx_i_284: ;FPADD
dec ebx
jz rx_finish
xor r15, 0e68f36ach
@@ -5222,7 +5214,7 @@ rx_body_284:
xor rbp, rcx
and ecx, 2047
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
subpd xmm0, xmm6
addpd xmm0, xmm6
movaps xmm9, xmm0
mov eax, r9d
xor eax, 0936f2960h
@@ -5313,7 +5305,7 @@ rx_body_289:
andps xmm0, xmm1
movaps xmm8, xmm0
rx_i_290: ;FPMUL
rx_i_290: ;FPSUB
dec ebx
jz rx_finish
xor r15, 060665748h
@@ -5324,10 +5316,7 @@ rx_i_290: ;FPMUL
rx_body_290:
and ecx, 32767
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm8
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
subpd xmm0, xmm8
movaps xmm9, xmm0
rx_i_291: ;RET
@@ -5531,7 +5520,7 @@ rx_body_301:
mov eax, r15d
xor eax, 0433cf2d6h
and eax, 32767
movlpd qword ptr [rsi + rax * 8], xmm7
movhpd qword ptr [rsi + rax * 8], xmm7
rx_i_302: ;ADD_64
dec ebx
@@ -5937,7 +5926,7 @@ rx_body_324:
mov eax, r9d
xor eax, 0944856d4h
and eax, 32767
movlpd qword ptr [rsi + rax * 8], xmm9
movhpd qword ptr [rsi + rax * 8], xmm9
rx_i_325: ;SHL_64
dec ebx
@@ -6076,7 +6065,7 @@ rx_body_332:
mov eax, r11d
xor eax, 0116c919eh
and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm3
movlpd qword ptr [rsi + rax * 8], xmm3
rx_i_333: ;XOR_64
dec ebx
@@ -6222,7 +6211,7 @@ rx_body_341:
and eax, 2047
mov qword ptr [rsi + rax * 8], rcx
rx_i_342: ;FPMUL
rx_i_342: ;FPSUB
dec ebx
jz rx_finish
xor r9, 09ccc7abah
@@ -6233,10 +6222,7 @@ rx_i_342: ;FPMUL
rx_body_342:
and ecx, 32767
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm2
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
subpd xmm0, xmm2
movaps xmm3, xmm0
rx_i_343: ;SHR_64
@@ -6258,7 +6244,7 @@ rx_body_343:
and eax, 32767
mov qword ptr [rsi + rax * 8], rcx
rx_i_344: ;FPMUL
rx_i_344: ;FPSUB
dec ebx
jz rx_finish
xor r10, 03ef9bcc4h
@@ -6269,10 +6255,7 @@ rx_i_344: ;FPMUL
rx_body_344:
and ecx, 32767
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm6
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
subpd xmm0, xmm6
movaps xmm5, xmm0
rx_i_345: ;MULH_64
@@ -6343,7 +6326,7 @@ rx_body_348:
mov eax, r9d
xor eax, 039c35461h
and eax, 2047
movlpd qword ptr [rsi + rax * 8], xmm9
movhpd qword ptr [rsi + rax * 8], xmm9
rx_i_349: ;XOR_32
dec ebx
@@ -6413,9 +6396,9 @@ rx_body_352:
mov eax, r10d
xor eax, 03bf686f2h
and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm2
movlpd qword ptr [rsi + rax * 8], xmm2
rx_i_353: ;FPMUL
rx_i_353: ;FPSUB
dec ebx
jz rx_finish
xor r13, 02e65278bh
@@ -6426,15 +6409,12 @@ rx_i_353: ;FPMUL
rx_body_353:
and ecx, 2047
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm2
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
subpd xmm0, xmm2
movaps xmm7, xmm0
mov eax, r15d
xor eax, 0b3c9f7aeh
and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm7
movlpd qword ptr [rsi + rax * 8], xmm7
rx_i_354: ;MULH_64
dec ebx
@@ -6535,7 +6515,7 @@ rx_body_359:
mov eax, r12d
xor eax, 0f16b9be3h
and eax, 32767
movhpd qword ptr [rsi + rax * 8], xmm4
movlpd qword ptr [rsi + rax * 8], xmm4
rx_i_360: ;FPMUL
dec ebx
@@ -6570,7 +6550,7 @@ rx_body_361:
mov eax, r14d
xor eax, 0ad0b81f5h
and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm6
movlpd qword ptr [rsi + rax * 8], xmm6
rx_i_362: ;SUB_64
dec ebx
@@ -6726,7 +6706,7 @@ rx_body_370:
mov eax, r14d
xor eax, 0a120e0edh
and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm6
movlpd qword ptr [rsi + rax * 8], xmm6
rx_i_371: ;FPADD
dec ebx
@@ -6948,7 +6928,7 @@ rx_body_383:
mov eax, r13d
xor eax, 0c9f5cc22h
and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm5
movlpd qword ptr [rsi + rax * 8], xmm5
rx_i_384: ;SHR_64
dec ebx
@@ -7256,7 +7236,7 @@ rx_body_400:
and eax, 32767
mov qword ptr [rsi + rax * 8], rcx
rx_i_401: ;FPMUL
rx_i_401: ;FPSUB
dec ebx
jz rx_finish
xor r13, 032e81f25h
@@ -7267,15 +7247,12 @@ rx_i_401: ;FPMUL
rx_body_401:
and ecx, 2047
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm4
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
subpd xmm0, xmm4
movaps xmm6, xmm0
mov eax, r14d
xor eax, 03ea60344h
and eax, 32767
movhpd qword ptr [rsi + rax * 8], xmm6
movlpd qword ptr [rsi + rax * 8], xmm6
rx_i_402: ;RET
dec ebx
@@ -7382,13 +7359,15 @@ rx_i_406: ;FPROUND
; FPROUND body: loads a 64-bit value from the rsi-based buffer, uses its two
; lowest bits to select the SSE rounding mode, and copies the loaded value to r9.
rx_body_406:
and ecx, 32767                      ; keep the low 15 bits of ecx as the qword index
mov rax, qword ptr [rsi+rcx*8]      ; load the 64-bit operand at rsi + index*8
mov rcx, rax                        ; keep an unmodified copy; eax is overwritten below
shl eax, 13                         ; move bits 0-1 of the operand up to bits 13-14
and eax, 24576                      ; 6000h: keep only bits 13-14 (MXCSR rounding-control field)
or eax, 40896                       ; 9FC0h: set bit 15 (FTZ), bits 7-12 (exception masks) and bit 6 (DAZ)
mov dword ptr [rsp - 8], eax        ; ldmxcsr takes a memory operand, so stage the value below rsp
ldmxcsr dword ptr [rsp - 8]         ; install the new MXCSR value
mov r9, rcx                         ; write the preserved operand to r9
rx_i_407: ;FPMUL
rx_i_407: ;FPSUB
dec ebx
jz rx_finish
xor r14, 09699566fh
@@ -7400,10 +7379,7 @@ rx_body_407:
xor rbp, rcx
and ecx, 32767
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm9
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
subpd xmm0, xmm9
movaps xmm8, xmm0
rx_i_408: ;MUL_64
@@ -7493,7 +7469,7 @@ rx_body_412:
mov eax, r11d
xor eax, 0bbd2640ah
and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm3
movlpd qword ptr [rsi + rax * 8], xmm3
rx_i_413: ;FPDIV
dec ebx
@@ -7704,7 +7680,7 @@ rx_body_424:
mov eax, r9d
xor eax, 0565ae8aah
and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm9
movlpd qword ptr [rsi + rax * 8], xmm9
rx_i_425: ;IMUL_32
dec ebx
@@ -7887,7 +7863,7 @@ rx_body_434:
mov eax, r9d
xor eax, 08c1cfc74h
and eax, 2047
movlpd qword ptr [rsi + rax * 8], xmm9
movhpd qword ptr [rsi + rax * 8], xmm9
rx_i_435: ;MUL_64
dec ebx
@@ -8068,7 +8044,7 @@ not_taken_ret_443:
and eax, 2047
mov qword ptr [rsi + rax * 8], rcx
rx_i_444: ;FPMUL
rx_i_444: ;FPSUB
dec ebx
jz rx_finish
xor r8, 042455dd8h
@@ -8079,15 +8055,12 @@ rx_i_444: ;FPMUL
rx_body_444:
and ecx, 32767
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm7
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
subpd xmm0, xmm7
movaps xmm5, xmm0
mov eax, r13d
xor eax, 0ce416070h
and eax, 2047
movlpd qword ptr [rsi + rax * 8], xmm5
movhpd qword ptr [rsi + rax * 8], xmm5
rx_i_445: ;ADD_64
dec ebx
@@ -8128,7 +8101,7 @@ rx_body_446:
and eax, 2047
mov qword ptr [rsi + rax * 8], rcx
rx_i_447: ;FPSUB
rx_i_447: ;FPADD
dec ebx
jz rx_finish
xor r8, 01596d0e8h
@@ -8139,12 +8112,12 @@ rx_i_447: ;FPSUB
rx_body_447:
and ecx, 2047
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
subpd xmm0, xmm7
addpd xmm0, xmm7
movaps xmm5, xmm0
mov eax, r13d
xor eax, 0b384d4afh
and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm5
movlpd qword ptr [rsi + rax * 8], xmm5
rx_i_448: ;FPSUB
dec ebx
@@ -8668,7 +8641,7 @@ rx_body_477:
mov eax, r14d
xor eax, 0e81fc7a6h
and eax, 2047
movlpd qword ptr [rsi + rax * 8], xmm6
movhpd qword ptr [rsi + rax * 8], xmm6
rx_i_478: ;MUL_64
dec ebx
@@ -9143,7 +9116,7 @@ rx_body_504:
and eax, 32767
movhpd qword ptr [rsi + rax * 8], xmm4
rx_i_505: ;FPMUL
rx_i_505: ;FPSUB
dec ebx
jz rx_finish
xor r12, 032c0a28ah
@@ -9154,17 +9127,14 @@ rx_i_505: ;FPMUL
rx_body_505:
and ecx, 32767
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm4
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
subpd xmm0, xmm4
movaps xmm8, xmm0
mov eax, r8d
xor eax, 021b54eaeh
and eax, 32767
movlpd qword ptr [rsi + rax * 8], xmm8
movhpd qword ptr [rsi + rax * 8], xmm8
rx_i_506: ;FPMUL
rx_i_506: ;FPSUB
dec ebx
jz rx_finish
xor r9, 0a973d58ch
@@ -9175,10 +9145,7 @@ rx_i_506: ;FPMUL
rx_body_506:
and ecx, 2047
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm9
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
subpd xmm0, xmm9
movaps xmm3, xmm0
rx_i_507: ;RET
@@ -9238,7 +9205,7 @@ taken_call_509:
push rax
call rx_i_42
rx_i_510: ;FPSUB
rx_i_510: ;FPADD
dec ebx
jz rx_finish
xor r8, 0db65513ch
@@ -9249,7 +9216,7 @@ rx_i_510: ;FPSUB
rx_body_510:
and ecx, 2047
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
subpd xmm0, xmm2
addpd xmm0, xmm2
movaps xmm9, xmm0
rx_i_511: ;ROL_64

View File

@@ -74,21 +74,21 @@ void setPrivilege(const char* pszPrivilege, BOOL bEnable) {
}
#endif
// Allocates a memory region with read, write and execute permissions and
// returns its address.
//   bytes - size passed to the underlying allocation call.
// Throws std::runtime_error when the underlying call fails.
// NOTE(review): this view is a diff with the +/- markers stripped, so the
// signature and the mmap call each appear twice (old line, then new line).
void* allocExecutableMemory(size_t bytes) {
void* allocExecutableMemory(std::size_t bytes) {
void* mem;
#ifdef _WIN32
// Windows: commit pages that are readable, writable and executable.
mem = VirtualAlloc(nullptr, bytes, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
if (mem == nullptr)
throw std::runtime_error(getErrorMessage("allocExecutableMemory - VirtualAlloc"));
#else
// Elsewhere: anonymous private mapping with read/write/execute protection.
// The first call maps CodeSize bytes regardless of the argument; the second
// maps the caller-supplied byte count.
mem = mmap(nullptr, CodeSize, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
mem = mmap(nullptr, bytes, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
// mmap signals failure with MAP_FAILED, not a null pointer.
if (mem == MAP_FAILED)
throw std::runtime_error("allocExecutableMemory - mmap failed");
#endif
return mem;
}
void* allocLargePagesMemory(size_t bytes) {
void* allocLargePagesMemory(std::size_t bytes) {
void* mem;
#ifdef _WIN32
setPrivilege("SeLockMemoryPrivilege", 1);

View File

@@ -19,5 +19,7 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
#pragma once
void* allocExecutableMemory(size_t);
void* allocLargePagesMemory(size_t);
#include <cstddef>
// Returns a region mapped readable, writable and executable; the argument is
// the requested size in bytes. The definition throws std::runtime_error when
// the underlying VirtualAlloc/mmap call fails.
void* allocExecutableMemory(std::size_t);
// Takes the requested size in bytes.
// NOTE(review): the definition is only partly visible in this change (on
// _WIN32 it starts by calling setPrivilege("SeLockMemoryPrivilege", 1));
// confirm the failure behaviour against the full source.
void* allocLargePagesMemory(std::size_t);