diff --git a/.gitignore b/.gitignore index 0f69877..35c1e9a 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,4 @@ obj/ *.user *.suo .vs - +x64 diff --git a/makefile b/makefile index cd49f88..7dde5ae 100644 --- a/makefile +++ b/makefile @@ -9,7 +9,7 @@ OBJDIR=obj LDFLAGS=-lpthread CPPSRC=src/argon2_core.c src/Cache.cpp src/divideByConstantCodegen.c src/Instruction.cpp src/JitCompilerX86.cpp src/Program.cpp src/VirtualMachine.cpp src/argon2_ref.c src/CompiledVirtualMachine.cpp src/executeProgram-linux.cpp src/instructionsPortable.cpp src/LightClientAsyncWorker.cpp src/softAes.cpp src/virtualMemory.cpp src/AssemblyGeneratorX86.cpp src/dataset.cpp src/hashAes1Rx4.cpp src/InterpretedVirtualMachine.cpp src/main.cpp src/TestAluFpu.cpp src/blake2/blake2b.c TOBJS=$(addprefix $(OBJDIR)/,instructionsPortable.o TestAluFpu.o) -ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o CompiledLightVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o Cache.o virtualMemory.o reciprocal.o LightClientAsyncWorker.o hashAes1Rx4.o) +ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o CompiledLightVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o softAes.o VirtualMachine.o Cache.o virtualMemory.o reciprocal.o LightClientAsyncWorker.o hashAes1Rx4.o LightProgramGenerator.o) ifeq ($(PLATFORM),amd64) ROBJS += $(OBJDIR)/JitCompilerX86-static.o $(OBJDIR)/squareHash.o CXXFLAGS += -maes @@ -99,6 +99,9 @@ $(OBJDIR)/InterpretedVirtualMachine.o: $(addprefix $(SRCDIR)/,InterpretedVirtual $(OBJDIR)/LightClientAsyncWorker.o: $(addprefix $(SRCDIR)/,LightClientAsyncWorker.cpp LightClientAsyncWorker.hpp common.hpp) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/LightClientAsyncWorker.cpp -o $@ + 
+$(OBJDIR)/LightProgramGenerator.o: $(addprefix $(SRCDIR)/,LightProgramGenerator.cpp LightProgramGenerator.hpp Program.hpp blake2/blake2.h blake2/endian.h configuration.h) | $(OBJDIR) + $(CXX) $(CXXFLAGS) -c $(SRCDIR)/LightProgramGenerator.cpp -o $@ $(OBJDIR)/main.o: $(addprefix $(SRCDIR)/,main.cpp InterpretedVirtualMachine.hpp Stopwatch.hpp blake2/blake2.h VirtualMachine.hpp common.hpp blake2/endian.h Program.hpp Instruction.hpp intrinPortable.h CompiledVirtualMachine.hpp JitCompilerX86.hpp AssemblyGeneratorX86.hpp dataset.hpp Cache.hpp virtualMemory.hpp hashAes1Rx4.hpp softAes.h configuration.h) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/main.cpp -o $@ diff --git a/randomx.sln b/randomx.sln new file mode 100644 index 0000000..c4d5a2a --- /dev/null +++ b/randomx.sln @@ -0,0 +1,57 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.28307.572 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "randomx", "vcxproj\randomx.vcxproj", "{3346A4AD-C438-4324-8B77-47A16452954B}" +EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "tests", "tests", "{4A4A689F-86AF-41C0-A974-1080506D0923}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "superscalar-avalanche", "vcxproj\superscalar-avalanche.vcxproj", "{CF34A7EF-7DC9-4077-94A5-76F5425EA938}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "superscalar-init", "vcxproj\superscalar-init.vcxproj", "{E59DC709-9B12-4A53-BAF3-79398821C376}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Debug|x86 = Debug|x86 + Release|x64 = Release|x64 + Release|x86 = Release|x86 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {3346A4AD-C438-4324-8B77-47A16452954B}.Debug|x64.ActiveCfg = Debug|x64 + {3346A4AD-C438-4324-8B77-47A16452954B}.Debug|x64.Build.0 = Debug|x64 + 
{3346A4AD-C438-4324-8B77-47A16452954B}.Debug|x86.ActiveCfg = Debug|Win32 + {3346A4AD-C438-4324-8B77-47A16452954B}.Debug|x86.Build.0 = Debug|Win32 + {3346A4AD-C438-4324-8B77-47A16452954B}.Release|x64.ActiveCfg = Release|x64 + {3346A4AD-C438-4324-8B77-47A16452954B}.Release|x64.Build.0 = Release|x64 + {3346A4AD-C438-4324-8B77-47A16452954B}.Release|x86.ActiveCfg = Release|Win32 + {3346A4AD-C438-4324-8B77-47A16452954B}.Release|x86.Build.0 = Release|Win32 + {CF34A7EF-7DC9-4077-94A5-76F5425EA938}.Debug|x64.ActiveCfg = Debug|x64 + {CF34A7EF-7DC9-4077-94A5-76F5425EA938}.Debug|x64.Build.0 = Debug|x64 + {CF34A7EF-7DC9-4077-94A5-76F5425EA938}.Debug|x86.ActiveCfg = Debug|Win32 + {CF34A7EF-7DC9-4077-94A5-76F5425EA938}.Debug|x86.Build.0 = Debug|Win32 + {CF34A7EF-7DC9-4077-94A5-76F5425EA938}.Release|x64.ActiveCfg = Release|x64 + {CF34A7EF-7DC9-4077-94A5-76F5425EA938}.Release|x64.Build.0 = Release|x64 + {CF34A7EF-7DC9-4077-94A5-76F5425EA938}.Release|x86.ActiveCfg = Release|Win32 + {CF34A7EF-7DC9-4077-94A5-76F5425EA938}.Release|x86.Build.0 = Release|Win32 + {E59DC709-9B12-4A53-BAF3-79398821C376}.Debug|x64.ActiveCfg = Debug|x64 + {E59DC709-9B12-4A53-BAF3-79398821C376}.Debug|x64.Build.0 = Debug|x64 + {E59DC709-9B12-4A53-BAF3-79398821C376}.Debug|x86.ActiveCfg = Debug|Win32 + {E59DC709-9B12-4A53-BAF3-79398821C376}.Debug|x86.Build.0 = Debug|Win32 + {E59DC709-9B12-4A53-BAF3-79398821C376}.Release|x64.ActiveCfg = Release|x64 + {E59DC709-9B12-4A53-BAF3-79398821C376}.Release|x64.Build.0 = Release|x64 + {E59DC709-9B12-4A53-BAF3-79398821C376}.Release|x86.ActiveCfg = Release|Win32 + {E59DC709-9B12-4A53-BAF3-79398821C376}.Release|x86.Build.0 = Release|Win32 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(NestedProjects) = preSolution + {CF34A7EF-7DC9-4077-94A5-76F5425EA938} = {4A4A689F-86AF-41C0-A974-1080506D0923} + {E59DC709-9B12-4A53-BAF3-79398821C376} = {4A4A689F-86AF-41C0-A974-1080506D0923} + EndGlobalSection + 
GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {4EBC03DB-AE37-4141-8147-692F16E0ED02} + EndGlobalSection +EndGlobal diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index fd7ee06..b3511c1 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -23,6 +23,7 @@ along with RandomX. If not, see. #include "common.hpp" #include "reciprocal.h" #include "Program.hpp" +#include "superscalarGenerator.hpp" namespace RandomX { @@ -46,6 +47,179 @@ namespace RandomX { static const char* regDatasetAddr = "rdi"; static const char* regScratchpadAddr = "rsi"; + void AssemblyGeneratorX86::generateProgram(Program& prog) { + for (unsigned i = 0; i < 8; ++i) { + registerUsage[i] = -1; + } + asmCode.str(std::string()); //clear + for (unsigned i = 0; i < prog.getSize(); ++i) { + asmCode << "randomx_isn_" << i << ":" << std::endl; + Instruction& instr = prog(i); + instr.src %= RegistersCount; + instr.dst %= RegistersCount; + generateCode(instr, i); + //asmCode << std::endl; + } + } + + void AssemblyGeneratorX86::generateAsm(SuperscalarProgram& prog) { + asmCode.str(std::string()); //clear + asmCode << "ALIGN 16" << std::endl; + for (unsigned i = 0; i < prog.getSize(); ++i) { + Instruction& instr = prog(i); + switch (instr.opcode) + { + case RandomX::SuperscalarInstructionType::ISUB_R: + asmCode << "sub " << regR[instr.dst] << ", " << regR[instr.src] << std::endl; + break; + case RandomX::SuperscalarInstructionType::IXOR_R: + asmCode << "xor " << regR[instr.dst] << ", " << regR[instr.src] << std::endl; + break; + case RandomX::SuperscalarInstructionType::IADD_RS: + asmCode << "lea " << regR[instr.dst] << ", [" << regR[instr.dst] << "+" << regR[instr.src] << "*" << (1 << (instr.mod % 4)) << "]" << std::endl; + break; + case RandomX::SuperscalarInstructionType::IMUL_R: + asmCode << "imul " << regR[instr.dst] << ", " << regR[instr.src] << std::endl; + break; + case RandomX::SuperscalarInstructionType::IROR_C: + asmCode << "ror 
" << regR[instr.dst] << ", " << instr.getImm32() << std::endl; + break; + case RandomX::SuperscalarInstructionType::IADD_C7: + asmCode << "add " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl; + break; + case RandomX::SuperscalarInstructionType::IXOR_C7: + asmCode << "xor " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl; + break; + case RandomX::SuperscalarInstructionType::IADD_C8: + asmCode << "add " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl; + asmCode << "nop" << std::endl; + break; + case RandomX::SuperscalarInstructionType::IXOR_C8: + asmCode << "xor " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl; + asmCode << "nop" << std::endl; + break; + case RandomX::SuperscalarInstructionType::IADD_C9: + asmCode << "add " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl; + asmCode << "xchg ax, ax ;nop" << std::endl; + break; + case RandomX::SuperscalarInstructionType::IXOR_C9: + asmCode << "xor " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl; + asmCode << "xchg ax, ax ;nop" << std::endl; + break; + case RandomX::SuperscalarInstructionType::IMULH_R: + asmCode << "mov rax, " << regR[instr.dst] << std::endl; + asmCode << "mul " << regR[instr.src] << std::endl; + asmCode << "mov " << regR[instr.dst] << ", rdx" << std::endl; + break; + case RandomX::SuperscalarInstructionType::ISMULH_R: + asmCode << "mov rax, " << regR[instr.dst] << std::endl; + asmCode << "imul " << regR[instr.src] << std::endl; + asmCode << "mov " << regR[instr.dst] << ", rdx" << std::endl; + break; + case RandomX::SuperscalarInstructionType::IMUL_RCP: + asmCode << "mov rax, " << (int64_t)reciprocal(instr.getImm32()) << std::endl; + asmCode << "imul " << regR[instr.dst] << ", rax" << std::endl; + break; + default: + UNREACHABLE; + } + } + } + + void AssemblyGeneratorX86::generateC(SuperscalarProgram& prog) { + asmCode.str(std::string()); //clear + asmCode << "#include " << 
std::endl; + asmCode << "#if defined(__SIZEOF_INT128__)" << std::endl; + asmCode << " static inline uint64_t mulh(uint64_t a, uint64_t b) {" << std::endl; + asmCode << " return ((unsigned __int128)a * b) >> 64;" << std::endl; + asmCode << " }" << std::endl; + asmCode << " static inline int64_t smulh(int64_t a, int64_t b) {" << std::endl; + asmCode << " return ((__int128)a * b) >> 64;" << std::endl; + asmCode << " }" << std::endl; + asmCode << " #define HAVE_MULH" << std::endl; + asmCode << " #define HAVE_SMULH" << std::endl; + asmCode << "#endif" << std::endl; + asmCode << "#if defined(_MSC_VER)" << std::endl; + asmCode << " #define HAS_VALUE(X) X ## 0" << std::endl; + asmCode << " #define EVAL_DEFINE(X) HAS_VALUE(X)" << std::endl; + asmCode << " #include " << std::endl; + asmCode << " #include " << std::endl; + asmCode << " static __inline uint64_t rotr(uint64_t x , int c) {" << std::endl; + asmCode << " return _rotr64(x, c);" << std::endl; + asmCode << " }" << std::endl; + asmCode << " #define HAVE_ROTR" << std::endl; + asmCode << " #if EVAL_DEFINE(__MACHINEARM64_X64(1))" << std::endl; + asmCode << " static __inline uint64_t mulh(uint64_t a, uint64_t b) {" << std::endl; + asmCode << " return __umulh(a, b);" << std::endl; + asmCode << " }" << std::endl; + asmCode << " #define HAVE_MULH" << std::endl; + asmCode << " #endif" << std::endl; + asmCode << " #if EVAL_DEFINE(__MACHINEX64(1))" << std::endl; + asmCode << " static __inline int64_t smulh(int64_t a, int64_t b) {" << std::endl; + asmCode << " int64_t hi;" << std::endl; + asmCode << " _mul128(a, b, &hi);" << std::endl; + asmCode << " return hi;" << std::endl; + asmCode << " }" << std::endl; + asmCode << " #define HAVE_SMULH" << std::endl; + asmCode << " #endif" << std::endl; + asmCode << "#endif" << std::endl; + asmCode << "#ifndef HAVE_ROTR" << std::endl; + asmCode << " static inline uint64_t rotr(uint64_t a, int b) {" << std::endl; + asmCode << " return (a >> b) | (a << (64 - b));" << std::endl; + asmCode << " 
}" << std::endl; + asmCode << " #define HAVE_ROTR" << std::endl; + asmCode << "#endif" << std::endl; + asmCode << "#if !defined(HAVE_MULH) || !defined(HAVE_SMULH) || !defined(HAVE_ROTR)" << std::endl; + asmCode << " #error \"Required functions are not defined\"" << std::endl; + asmCode << "#endif" << std::endl; + asmCode << "void superScalar(uint64_t r[8]) {" << std::endl; + asmCode << "uint64_t r8 = r[0], r9 = r[1], r10 = r[2], r11 = r[3], r12 = r[4], r13 = r[5], r14 = r[6], r15 = r[7];" << std::endl; + for (unsigned i = 0; i < prog.getSize(); ++i) { + Instruction& instr = prog(i); + switch (instr.opcode) + { + case RandomX::SuperscalarInstructionType::ISUB_R: + asmCode << regR[instr.dst] << " -= " << regR[instr.src] << ";" << std::endl; + break; + case RandomX::SuperscalarInstructionType::IXOR_R: + asmCode << regR[instr.dst] << " ^= " << regR[instr.src] << ";" << std::endl; + break; + case RandomX::SuperscalarInstructionType::IADD_RS: + asmCode << regR[instr.dst] << " += " << regR[instr.src] << "*" << (1 << (instr.mod % 4)) << ";" << std::endl; + break; + case RandomX::SuperscalarInstructionType::IMUL_R: + asmCode << regR[instr.dst] << " *= " << regR[instr.src] << ";" << std::endl; + break; + case RandomX::SuperscalarInstructionType::IROR_C: + asmCode << regR[instr.dst] << " = rotr(" << regR[instr.dst] << ", " << instr.getImm32() << ");" << std::endl; + break; + case RandomX::SuperscalarInstructionType::IADD_C7: + case RandomX::SuperscalarInstructionType::IADD_C8: + case RandomX::SuperscalarInstructionType::IADD_C9: + asmCode << regR[instr.dst] << " += " << (int32_t)instr.getImm32() << ";" << std::endl; + break; + case RandomX::SuperscalarInstructionType::IXOR_C7: + case RandomX::SuperscalarInstructionType::IXOR_C8: + case RandomX::SuperscalarInstructionType::IXOR_C9: + asmCode << regR[instr.dst] << " ^= " << (int32_t)instr.getImm32() << ";" << std::endl; + break; + case RandomX::SuperscalarInstructionType::IMULH_R: + asmCode << regR[instr.dst] << " = mulh(" << 
regR[instr.dst] << ", " << regR[instr.src] << ");" << std::endl; + break; + case RandomX::SuperscalarInstructionType::ISMULH_R: + asmCode << regR[instr.dst] << " = smulh(" << regR[instr.dst] << ", " << regR[instr.src] << ");" << std::endl; + break; + case RandomX::SuperscalarInstructionType::IMUL_RCP: + asmCode << regR[instr.dst] << " *= " << (int64_t)reciprocal(instr.getImm32()) << ";" << std::endl; + break; + default: + UNREACHABLE; + } + } + asmCode << "r[0] = r8; r[1] = r9; r[2] = r10; r[3] = r11; r[4] = r12; r[5] = r13; r[6] = r14; r[7] = r15;" << std::endl; + asmCode << "}" << std::endl; + } + int AssemblyGeneratorX86::getConditionRegister() { int min = INT_MAX; int minIndex; @@ -58,21 +232,6 @@ namespace RandomX { return minIndex; } - void AssemblyGeneratorX86::generateProgram(Program& prog) { - for (unsigned i = 0; i < 8; ++i) { - registerUsage[i] = -1; - } - asmCode.str(std::string()); //clear - for (unsigned i = 0; i < RANDOMX_PROGRAM_SIZE; ++i) { - asmCode << "randomx_isn_" << i << ":" << std::endl; - Instruction& instr = prog(i); - instr.src %= RegistersCount; - instr.dst %= RegistersCount; - generateCode(instr, i); - //asmCode << std::endl; - } - } - void AssemblyGeneratorX86::traceint(Instruction& instr) { if (trace) { asmCode << "\tpush " << regR[instr.dst] << std::endl; @@ -112,14 +271,12 @@ namespace RandomX { } //1 uOP - void AssemblyGeneratorX86::h_IADD_R(Instruction& instr, int i) { + void AssemblyGeneratorX86::h_IADD_RS(Instruction& instr, int i) { registerUsage[instr.dst] = i; - if (instr.src != instr.dst) { - asmCode << "\tadd " << regR[instr.dst] << ", " << regR[instr.src] << std::endl; - } - else { - asmCode << "\tadd " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl; - } + if(instr.dst == 5) + asmCode << "\tlea " << regR[instr.dst] << ", [" << regR[instr.dst] << "+" << regR[instr.src] << "*" << (1 << (instr.mod % 4)) << std::showpos << (int32_t)instr.getImm32() << std::noshowpos << "]" << std::endl; + else + asmCode << 
"\tlea " << regR[instr.dst] << ", [" << regR[instr.dst] << "+" << regR[instr.src] << "*" << (1 << (instr.mod % 4)) << "]" << std::endl; traceint(instr); } @@ -490,7 +647,7 @@ namespace RandomX { //4 uOPs void AssemblyGeneratorX86::h_COND_R(Instruction& instr, int i) { handleCondition(instr, i); - asmCode << "\txor ecx, ecx" << std::endl; + asmCode << "\txor rcx, rcx" << std::endl; asmCode << "\tcmp " << regR32[instr.src] << ", " << (int32_t)instr.getImm32() << std::endl; asmCode << "\tset" << condition(instr) << " cl" << std::endl; asmCode << "\tadd " << regR[instr.dst] << ", rcx" << std::endl; @@ -500,7 +657,7 @@ namespace RandomX { //6 uOPs void AssemblyGeneratorX86::h_COND_M(Instruction& instr, int i) { handleCondition(instr, i); - asmCode << "\txor ecx, ecx" << std::endl; + asmCode << "\txor rcx, rcx" << std::endl; genAddressReg(instr); asmCode << "\tcmp dword ptr [rsi+rax], " << (int32_t)instr.getImm32() << std::endl; asmCode << "\tset" << condition(instr) << " cl" << std::endl; @@ -532,7 +689,7 @@ namespace RandomX { InstructionGenerator AssemblyGeneratorX86::engine[256] = { //Integer - INST_HANDLE(IADD_R) + INST_HANDLE(IADD_RS) INST_HANDLE(IADD_M) INST_HANDLE(IADD_RC) INST_HANDLE(ISUB_R) diff --git a/src/AssemblyGeneratorX86.hpp b/src/AssemblyGeneratorX86.hpp index 62a6081..4b777e6 100644 --- a/src/AssemblyGeneratorX86.hpp +++ b/src/AssemblyGeneratorX86.hpp @@ -20,18 +20,23 @@ along with RandomX. If not, see. 
#pragma once #include "Instruction.hpp" +#include "configuration.h" +#include "common.hpp" #include namespace RandomX { class Program; + class SuperscalarProgram; class AssemblyGeneratorX86; typedef void(AssemblyGeneratorX86::*InstructionGenerator)(Instruction&, int); class AssemblyGeneratorX86 { public: - void generateProgram(Program&); + void generateProgram(Program& prog); + void generateAsm(SuperscalarProgram& prog); + void generateC(SuperscalarProgram& prog); void printCode(std::ostream& os) { os << asmCode.rdbuf(); } @@ -52,7 +57,7 @@ namespace RandomX { void traceflt(Instruction&); void tracenop(Instruction&); - void h_IADD_R(Instruction&, int); + void h_IADD_RS(Instruction&, int); void h_IADD_M(Instruction&, int); void h_IADD_RC(Instruction&, int); void h_ISUB_R(Instruction&, int); diff --git a/src/Blake2Generator.cpp b/src/Blake2Generator.cpp new file mode 100644 index 0000000..2879088 --- /dev/null +++ b/src/Blake2Generator.cpp @@ -0,0 +1,51 @@ +/* +Copyright (c) 2019 tevador + +This file is part of RandomX. + +RandomX is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +RandomX is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with RandomX. If not, see. 
+*/ + +#include "blake2/blake2.h" +#include "blake2/endian.h" +#include "Blake2Generator.hpp" +#include "common.hpp" + +namespace RandomX { + + Blake2Generator::Blake2Generator(const void* seed, int nonce) : dataIndex(sizeof(data)) { + memset(data, 0, sizeof(data)); + memcpy(data, seed, SeedSize); + store32(&data[60], nonce); + } + + uint8_t Blake2Generator::getByte() { + checkData(1); + return data[dataIndex++]; + } + + uint32_t Blake2Generator::getInt32() { + checkData(4); + auto ret = load32(&data[dataIndex]); + dataIndex += 4; + return ret; + } + + void Blake2Generator::checkData(const size_t bytesNeeded) { + if (dataIndex + bytesNeeded > sizeof(data)) { + blake2b(data, sizeof(data), data, sizeof(data), nullptr, 0); + dataIndex = 0; + } + } +} \ No newline at end of file diff --git a/src/Blake2Generator.hpp b/src/Blake2Generator.hpp new file mode 100644 index 0000000..24f2fca --- /dev/null +++ b/src/Blake2Generator.hpp @@ -0,0 +1,36 @@ +/* +Copyright (c) 2019 tevador + +This file is part of RandomX. + +RandomX is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +RandomX is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with RandomX. If not, see. 
+*/ + +#pragma once +#include + +namespace RandomX { + + class Blake2Generator { + public: + Blake2Generator(const void* seed, int nonce); + uint8_t getByte(); + uint32_t getInt32(); + private: + uint8_t data[64]; + size_t dataIndex; + + void checkData(const size_t); + }; +} \ No newline at end of file diff --git a/src/Cache.hpp b/src/Cache.hpp index 5656baf..bfc7ddf 100644 --- a/src/Cache.hpp +++ b/src/Cache.hpp @@ -34,7 +34,7 @@ namespace RandomX { return (uint8_t*)allocLargePagesMemory(size); } else { - void* ptr = _mm_malloc(size, sizeof(__m128i)); + void* ptr = _mm_malloc(size, CacheLineSize); if (ptr == nullptr) throw std::bad_alloc(); return (uint8_t*)ptr; diff --git a/src/CompiledLightVirtualMachine.cpp b/src/CompiledLightVirtualMachine.cpp index 49e593c..11bedf8 100644 --- a/src/CompiledLightVirtualMachine.cpp +++ b/src/CompiledLightVirtualMachine.cpp @@ -23,18 +23,25 @@ along with RandomX. If not, see. namespace RandomX { - CompiledLightVirtualMachine::CompiledLightVirtualMachine() { - } - - void CompiledLightVirtualMachine::setDataset(dataset_t ds, uint64_t size) { + template + void CompiledLightVirtualMachine::setDataset(dataset_t ds, uint64_t size, SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]) { mem.ds = ds; datasetRange = (size - RANDOMX_DATASET_SIZE + CacheLineSize) / CacheLineSize; + if(superscalar) + compiler.generateSuperScalarHash(programs); //datasetBasePtr = ds.dataset.memory; } - void CompiledLightVirtualMachine::initialize() { + template void CompiledLightVirtualMachine::setDataset(dataset_t ds, uint64_t size, SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]); + template void CompiledLightVirtualMachine::setDataset(dataset_t ds, uint64_t size, SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]); + + template + void CompiledLightVirtualMachine::initialize() { VirtualMachine::initialize(); - compiler.generateProgramLight(program); + compiler.generateProgramLight(program); //mem.ds.dataset.memory = datasetBasePtr + (datasetBase 
* CacheLineSize); } + + template void CompiledLightVirtualMachine::initialize(); + template void CompiledLightVirtualMachine::initialize(); } \ No newline at end of file diff --git a/src/CompiledLightVirtualMachine.hpp b/src/CompiledLightVirtualMachine.hpp index 9ac52be..1d4b78e 100644 --- a/src/CompiledLightVirtualMachine.hpp +++ b/src/CompiledLightVirtualMachine.hpp @@ -26,6 +26,7 @@ along with RandomX. If not, see. namespace RandomX { + template class CompiledLightVirtualMachine : public CompiledVirtualMachine { public: void* operator new(size_t size) { @@ -37,8 +38,8 @@ namespace RandomX { void operator delete(void* ptr) { _mm_free(ptr); } - CompiledLightVirtualMachine(); - void setDataset(dataset_t ds, uint64_t size) override; + CompiledLightVirtualMachine() {} + void setDataset(dataset_t ds, uint64_t size, SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]) override; void initialize() override; }; } \ No newline at end of file diff --git a/src/CompiledVirtualMachine.cpp b/src/CompiledVirtualMachine.cpp index c313209..3e44476 100644 --- a/src/CompiledVirtualMachine.cpp +++ b/src/CompiledVirtualMachine.cpp @@ -29,7 +29,7 @@ namespace RandomX { CompiledVirtualMachine::CompiledVirtualMachine() { } - void CompiledVirtualMachine::setDataset(dataset_t ds, uint64_t size) { + void CompiledVirtualMachine::setDataset(dataset_t ds, uint64_t size, SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]) { mem.ds = ds; datasetRange = (size - RANDOMX_DATASET_SIZE + CacheLineSize) / CacheLineSize; datasetBasePtr = ds.dataset.memory; diff --git a/src/CompiledVirtualMachine.hpp b/src/CompiledVirtualMachine.hpp index 9deb621..a2866ca 100644 --- a/src/CompiledVirtualMachine.hpp +++ b/src/CompiledVirtualMachine.hpp @@ -42,7 +42,7 @@ namespace RandomX { _mm_free(ptr); } CompiledVirtualMachine(); - void setDataset(dataset_t ds, uint64_t size) override; + void setDataset(dataset_t ds, uint64_t size, SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]) override; void initialize() 
override; virtual void execute() override; void* getProgram() { diff --git a/src/Instruction.cpp b/src/Instruction.cpp index 7069926..e8ddc64 100644 --- a/src/Instruction.cpp +++ b/src/Instruction.cpp @@ -40,9 +40,9 @@ namespace RandomX { os << "L3" << "[" << (getImm32() & ScratchpadL3Mask) << "]"; } - void Instruction::h_IADD_R(std::ostream& os) const { + void Instruction::h_IADD_RS(std::ostream& os) const { if (src != dst) { - os << "r" << (int)dst << ", r" << (int)src << std::endl; + os << "r" << (int)dst << ", r" << (int)src << ", LSH " << (int)(mod % 4) << std::endl; } else { os << "r" << (int)dst << ", " << (int32_t)getImm32() << std::endl; @@ -302,13 +302,13 @@ namespace RandomX { } void Instruction::h_COND_R(std::ostream& os) const { - os << "r" << (int)dst << ", " << condition((mod >> 2) & 7) << "(r" << (int)src << ", " << (int32_t)getImm32() << "), " << (int)(mod >> 5) << std::endl; + os << "r" << (int)dst << ", " << condition((mod >> 2) & 7) << "(r" << (int)src << ", " << (int32_t)getImm32() << "), LSH " << (int)(mod >> 5) << std::endl; } void Instruction::h_COND_M(std::ostream& os) const { os << "r" << (int)dst << ", " << condition((mod >> 2) & 7) << "("; genAddressReg(os); - os << ", " << (int32_t)getImm32() << "), " << (int)(mod >> 5) << std::endl; + os << ", " << (int32_t)getImm32() << "), LSH " << (int)(mod >> 5) << std::endl; } void Instruction::h_ISTORE(std::ostream& os) const { @@ -333,7 +333,7 @@ namespace RandomX { const char* Instruction::names[256] = { //Integer - INST_NAME(IADD_R) + INST_NAME(IADD_RS) INST_NAME(IADD_M) INST_NAME(IADD_RC) INST_NAME(ISUB_R) @@ -379,7 +379,7 @@ namespace RandomX { InstructionVisualizer Instruction::engine[256] = { //Integer - INST_HANDLE(IADD_R) + INST_HANDLE(IADD_RS) INST_HANDLE(IADD_M) INST_HANDLE(IADD_RC) INST_HANDLE(ISUB_R) diff --git a/src/Instruction.hpp b/src/Instruction.hpp index 7987ea4..9baf8ce 100644 --- a/src/Instruction.hpp +++ b/src/Instruction.hpp @@ -30,7 +30,7 @@ namespace RandomX { typedef 
void(Instruction::*InstructionVisualizer)(std::ostream&) const; namespace InstructionType { - constexpr int IADD_R = 0; + constexpr int IADD_RS = 0; constexpr int IADD_M = 1; constexpr int IADD_RC = 2; constexpr int ISUB_R = 3; @@ -78,6 +78,9 @@ namespace RandomX { uint32_t getImm32() const { return load32(&imm32); } + void setImm32(uint32_t val) { + return store32(&imm32, val); + } const char* getName() const { return names[opcode]; } @@ -95,7 +98,7 @@ namespace RandomX { void genAddressImm(std::ostream& os) const; void genAddressRegDst(std::ostream&) const; - void h_IADD_R(std::ostream&) const; + void h_IADD_RS(std::ostream&) const; void h_IADD_M(std::ostream&) const; void h_IADD_RC(std::ostream&) const; void h_ISUB_R(std::ostream&) const; diff --git a/src/InterpretedVirtualMachine.cpp b/src/InterpretedVirtualMachine.cpp index 15a5049..132a2c9 100644 --- a/src/InterpretedVirtualMachine.cpp +++ b/src/InterpretedVirtualMachine.cpp @@ -22,7 +22,6 @@ along with RandomX. If not, see. #include "InterpretedVirtualMachine.hpp" #include "dataset.hpp" #include "Cache.hpp" -#include "LightClientAsyncWorker.hpp" #include #include #include @@ -36,6 +35,7 @@ along with RandomX. If not, see. 
#ifdef STATS #include #endif +#include "superscalarGenerator.hpp" #ifdef FPUCHECK constexpr bool fpuCheck = true; @@ -45,17 +45,20 @@ constexpr bool fpuCheck = false; namespace RandomX { - InterpretedVirtualMachine::~InterpretedVirtualMachine() { - - } - - void InterpretedVirtualMachine::setDataset(dataset_t ds, uint64_t size) { + template + void InterpretedVirtualMachine::setDataset(dataset_t ds, uint64_t size, SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]) { mem.ds = ds; readDataset = &datasetReadLight; datasetRange = (size - RANDOMX_DATASET_SIZE + CacheLineSize) / CacheLineSize; + if(superscalar) + precompileSuperscalar(programs); } - void InterpretedVirtualMachine::initialize() { + template void InterpretedVirtualMachine::setDataset(dataset_t ds, uint64_t size, SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]); + template void InterpretedVirtualMachine::setDataset(dataset_t ds, uint64_t size, SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]); + + template + void InterpretedVirtualMachine::initialize() { VirtualMachine::initialize(); for (unsigned i = 0; i < RANDOMX_PROGRAM_SIZE; ++i) { program(i).src %= RegistersCount; @@ -63,12 +66,19 @@ namespace RandomX { } } - void InterpretedVirtualMachine::executeBytecode(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) { + template void InterpretedVirtualMachine::initialize(); + template void InterpretedVirtualMachine::initialize(); + + template + void InterpretedVirtualMachine::executeBytecode(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) { for (int ic = 0; ic < RANDOMX_PROGRAM_SIZE; ++ic) { executeBytecode(ic, r, f, e, a); } } + template void InterpretedVirtualMachine::executeBytecode(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]); + template void InterpretedVirtualMachine::executeBytecode(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]); + static void print(int_reg_t r) { std::cout << std::hex << std::setw(16) << 
std::setfill('0') << r << std::endl; } @@ -98,14 +108,15 @@ namespace RandomX { return std::fpclassify(x) == FP_SUBNORMAL; } - FORCE_INLINE void InterpretedVirtualMachine::executeBytecode(int& ic, int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) { + template + FORCE_INLINE void InterpretedVirtualMachine::executeBytecode(int& ic, int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) { auto& ibc = byteCode[ic]; if (trace) std::cout << std::dec << std::setw(3) << ic << " " << program(ic); //if(trace) printState(r, f, e, a); switch (ibc.type) { - case InstructionType::IADD_R: { - *ibc.idst += *ibc.isrc; + case InstructionType::IADD_RS: { + *ibc.idst += (*ibc.isrc << ibc.shift) + ibc.imm; } break; case InstructionType::IADD_M: { @@ -289,7 +300,8 @@ namespace RandomX { #endif } - void InterpretedVirtualMachine::execute() { + template + void InterpretedVirtualMachine::execute() { int_reg_t r[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; __m128d f[4]; __m128d e[4]; @@ -350,11 +362,16 @@ namespace RandomX { mem.mx ^= r[readReg2] ^ r[readReg3]; mem.mx &= CacheLineAlignMask; - Cache& cache = mem.ds.cache; - uint64_t datasetLine[CacheLineSize / sizeof(uint64_t)]; - initBlock(cache, (uint8_t*)datasetLine, datasetBase + mem.ma / CacheLineSize, RANDOMX_CACHE_ACCESSES / 8); - for (int i = 0; i < RegistersCount; ++i) - r[i] ^= datasetLine[i]; + if (superscalar) { + executeSuperscalar(datasetBase + mem.ma / CacheLineSize, r); + } + else { + Cache& cache = mem.ds.cache; + uint64_t datasetLine[CacheLineSize / sizeof(uint64_t)]; + initBlock(cache, (uint8_t*)datasetLine, datasetBase + mem.ma / CacheLineSize, RANDOMX_CACHE_ACCESSES / 8); + for (int i = 0; i < RegistersCount; ++i) + r[i] ^= datasetLine[i]; + } std::swap(mem.mx, mem.ma); if (trace) { @@ -419,6 +436,9 @@ namespace RandomX { _mm_store_pd(®.e[3].lo, e[3]); } + template void InterpretedVirtualMachine::execute(); + template void InterpretedVirtualMachine::execute(); + static int 
getConditionRegister(int(®isterUsage)[8]) { int min = INT_MAX; int minIndex; @@ -431,9 +451,127 @@ namespace RandomX { return minIndex; } + constexpr uint64_t superscalarMul0 = 6364136223846793005ULL; + constexpr uint64_t superscalarAdd1 = 9298410992540426748ULL; + constexpr uint64_t superscalarAdd2 = 12065312585734608966ULL; + constexpr uint64_t superscalarAdd3 = 9306329213124610396ULL; + constexpr uint64_t superscalarAdd4 = 5281919268842080866ULL; + constexpr uint64_t superscalarAdd5 = 10536153434571861004ULL; + constexpr uint64_t superscalarAdd6 = 3398623926847679864ULL; + constexpr uint64_t superscalarAdd7 = 9549104520008361294ULL; + + static uint8_t* getMixBlock(uint64_t registerValue, Cache& cache) { + uint8_t* mixBlock; + if (RANDOMX_ARGON_GROWTH == 0) { + constexpr uint32_t mask = (RANDOMX_ARGON_MEMORY * ArgonBlockSize / CacheLineSize - 1); + mixBlock = cache.memory + (registerValue & mask) * CacheLineSize; + } + else { + const uint32_t modulus = cache.size / CacheLineSize; + mixBlock = cache.memory + (registerValue % modulus) * CacheLineSize; + } + return mixBlock; + } + + template + void InterpretedVirtualMachine::executeSuperscalar(int_reg_t(&r)[8], SuperscalarProgram& prog, std::vector& reciprocals) { + for (unsigned j = 0; j < prog.getSize(); ++j) { + Instruction& instr = prog(j); + switch (instr.opcode) + { + case RandomX::SuperscalarInstructionType::ISUB_R: + r[instr.dst] -= r[instr.src]; + break; + case RandomX::SuperscalarInstructionType::IXOR_R: + r[instr.dst] ^= r[instr.src]; + break; + case RandomX::SuperscalarInstructionType::IADD_RS: + r[instr.dst] += r[instr.src] << (instr.mod % 4); + break; + case RandomX::SuperscalarInstructionType::IMUL_R: + r[instr.dst] *= r[instr.src]; + break; + case RandomX::SuperscalarInstructionType::IROR_C: + r[instr.dst] = rotr(r[instr.dst], instr.getImm32()); + break; + case RandomX::SuperscalarInstructionType::IADD_C7: + case RandomX::SuperscalarInstructionType::IADD_C8: + case 
RandomX::SuperscalarInstructionType::IADD_C9: + r[instr.dst] += signExtend2sCompl(instr.getImm32()); + break; + case RandomX::SuperscalarInstructionType::IXOR_C7: + case RandomX::SuperscalarInstructionType::IXOR_C8: + case RandomX::SuperscalarInstructionType::IXOR_C9: + r[instr.dst] ^= signExtend2sCompl(instr.getImm32()); + break; + case RandomX::SuperscalarInstructionType::IMULH_R: + r[instr.dst] = mulh(r[instr.dst], r[instr.src]); + break; + case RandomX::SuperscalarInstructionType::ISMULH_R: + r[instr.dst] = smulh(r[instr.dst], r[instr.src]); + break; + case RandomX::SuperscalarInstructionType::IMUL_RCP: + if(superscalar) + r[instr.dst] *= reciprocals[instr.getImm32()]; + else + r[instr.dst] *= reciprocal(instr.getImm32()); + break; + default: + UNREACHABLE; + } + } + } + + template + void InterpretedVirtualMachine::executeSuperscalar(uint32_t blockNumber, int_reg_t(&r)[8]) { + int_reg_t rl[8]; + uint8_t* mixBlock; + uint64_t registerValue = blockNumber; + rl[0] = (blockNumber + 1) * superscalarMul0; + rl[1] = rl[0] ^ superscalarAdd1; + rl[2] = rl[0] ^ superscalarAdd2; + rl[3] = rl[0] ^ superscalarAdd3; + rl[4] = rl[0] ^ superscalarAdd4; + rl[5] = rl[0] ^ superscalarAdd5; + rl[6] = rl[0] ^ superscalarAdd6; + rl[7] = rl[0] ^ superscalarAdd7; + Cache& cache = mem.ds.cache; + for (unsigned i = 0; i < RANDOMX_CACHE_ACCESSES; ++i) { + mixBlock = getMixBlock(registerValue, cache); + SuperscalarProgram& prog = superScalarPrograms[i]; + + executeSuperscalar(rl, prog, reciprocals); + + for(unsigned q = 0; q < 8; ++q) + rl[q] ^= load64(mixBlock + 8 * q); + + registerValue = rl[prog.getAddressRegister()]; + } + + for (unsigned q = 0; q < 8; ++q) + r[q] ^= rl[q]; + } + + template + void InterpretedVirtualMachine::precompileSuperscalar(SuperscalarProgram* programs) { + memcpy(superScalarPrograms, programs, sizeof(superScalarPrograms)); + reciprocals.clear(); + for (unsigned i = 0; i < RANDOMX_CACHE_ACCESSES; ++i) { + for (unsigned j = 0; j < superScalarPrograms[i].getSize(); 
++j) { + Instruction& instr = superScalarPrograms[i](j); + if (instr.opcode == SuperscalarInstructionType::IMUL_RCP) { + auto rcp = reciprocal(instr.getImm32()); + instr.setImm32(reciprocals.size()); + reciprocals.push_back(rcp); + } + } + } + } + #include "instructionWeights.hpp" - void InterpretedVirtualMachine::precompileProgram(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) { + template + void InterpretedVirtualMachine::precompileProgram(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) { int registerUsage[8]; for (unsigned i = 0; i < 8; ++i) { registerUsage[i] = -1; @@ -442,17 +580,20 @@ namespace RandomX { auto& instr = program(i); auto& ibc = byteCode[i]; switch (instr.opcode) { - CASE_REP(IADD_R) { + CASE_REP(IADD_RS) { auto dst = instr.dst % RegistersCount; auto src = instr.src % RegistersCount; - ibc.type = InstructionType::IADD_R; + ibc.type = InstructionType::IADD_RS; ibc.idst = &r[dst]; - if (src != dst) { + if (dst != 5) { ibc.isrc = &r[src]; + ibc.shift = instr.mod % 4; + ibc.imm = 0; } else { + ibc.isrc = &r[src]; + ibc.shift = instr.mod % 4; ibc.imm = signExtend2sCompl(instr.getImm32()); - ibc.isrc = &ibc.imm; } registerUsage[instr.dst] = i; } break; diff --git a/src/InterpretedVirtualMachine.hpp b/src/InterpretedVirtualMachine.hpp index d6da7e3..3632112 100644 --- a/src/InterpretedVirtualMachine.hpp +++ b/src/InterpretedVirtualMachine.hpp @@ -23,23 +23,17 @@ along with RandomX. If not, see. 
#include "VirtualMachine.hpp" #include "Program.hpp" #include "intrinPortable.h" +#include namespace RandomX { - class ITransform { - public: - virtual int32_t apply(int32_t) const = 0; - virtual const char* getName() const = 0; - virtual std::ostream& printAsm(std::ostream&) const = 0; - virtual std::ostream& printCxx(std::ostream&) const = 0; - }; - struct InstructionByteCode; - class InterpretedVirtualMachine; + template class InterpretedVirtualMachine; - typedef void(InterpretedVirtualMachine::*InstructionHandler)(Instruction&); + template + using InstructionHandler = void(InterpretedVirtualMachine::*)(Instruction&); - struct alignas(8) InstructionByteCode { + struct InstructionByteCode { union { int_reg_t* idst; __m128d* fdst; @@ -62,6 +56,7 @@ namespace RandomX { constexpr int asedwfagdewsa = sizeof(InstructionByteCode); + template class InterpretedVirtualMachine : public VirtualMachine { public: void* operator new(size_t size) { @@ -74,16 +69,18 @@ namespace RandomX { _mm_free(ptr); } InterpretedVirtualMachine(bool soft) : softAes(soft) {} - ~InterpretedVirtualMachine(); - void setDataset(dataset_t ds, uint64_t size) override; + ~InterpretedVirtualMachine() {} + void setDataset(dataset_t ds, uint64_t size, SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]) override; void initialize() override; void execute() override; + static void executeSuperscalar(int_reg_t(&r)[8], SuperscalarProgram& prog, std::vector& reciprocals); private: - static InstructionHandler engine[256]; + static InstructionHandler engine[256]; DatasetReadFunc readDataset; bool softAes; InstructionByteCode byteCode[RANDOMX_PROGRAM_SIZE]; - + std::vector reciprocals; + alignas(64) SuperscalarProgram superScalarPrograms[RANDOMX_CACHE_ACCESSES]; #ifdef STATS int count_ADD_64 = 0; int count_ADD_32 = 0; @@ -131,7 +128,9 @@ namespace RandomX { int datasetAccess[256] = { 0 }; #endif void precompileProgram(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]); + void 
precompileSuperscalar(SuperscalarProgram*); void executeBytecode(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]); void executeBytecode(int& i, int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]); + void executeSuperscalar(uint32_t blockNumber, int_reg_t(&r)[8]); }; } \ No newline at end of file diff --git a/src/JitCompilerX86-static.S b/src/JitCompilerX86-static.S index 9ccdb16..e78dbe7 100644 --- a/src/JitCompilerX86-static.S +++ b/src/JitCompilerX86-static.S @@ -32,10 +32,18 @@ .global DECL(randomx_program_start) .global DECL(randomx_program_read_dataset) .global DECL(randomx_program_read_dataset_light) +.global DECL(randomx_program_read_dataset_sshash_init) +.global DECL(randomx_program_read_dataset_sshash_fin) +.global DECL(randomx_program_read_dataset_light_sub) +.global DECL(randomx_dataset_init) .global DECL(randomx_program_loop_store) .global DECL(randomx_program_loop_end) .global DECL(randomx_program_read_dataset_light_sub) .global DECL(randomx_program_epilogue) +.global DECL(randomx_sshash_load) +.global DECL(randomx_sshash_prefetch) +.global DECL(randomx_sshash_end) +.global DECL(randomx_sshash_init) .global DECL(randomx_program_end) #define db .byte @@ -63,6 +71,12 @@ DECL(randomx_program_read_dataset): DECL(randomx_program_read_dataset_light): #include "asm/program_read_dataset_light.inc" +DECL(randomx_program_read_dataset_sshash_init): + #include "asm/program_read_dataset_sshash_init.inc" + +DECL(randomx_program_read_dataset_sshash_fin): + #include "asm/program_read_dataset_sshash_fin.inc" + DECL(randomx_program_loop_store): #include "asm/program_loop_store.inc" @@ -75,10 +89,84 @@ DECL(randomx_program_read_dataset_light_sub): squareHashSub: #include "asm/squareHash.inc" +.balign 64 +DECL(randomx_dataset_init): + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + ;# cache in rdi + ;# dataset in rsi + mov rbp, rdx ;# block index + push rcx ;# max. 
block index +init_block_loop: + prefetchw byte ptr [rsi] + mov rbx, rbp + .byte 232 ;# 0xE8 = call + ;# .set CALL_LOC, + .int 32768 - (call_offset - DECL(randomx_dataset_init)) +call_offset: + mov qword ptr [rsi+0], r8 + mov qword ptr [rsi+8], r9 + mov qword ptr [rsi+16], r10 + mov qword ptr [rsi+24], r11 + mov qword ptr [rsi+32], r12 + mov qword ptr [rsi+40], r13 + mov qword ptr [rsi+48], r14 + mov qword ptr [rsi+56], r15 + add rbp, 1 + add rsi, 64 + cmp rbp, qword ptr [rsp] + jb init_block_loop + pop rcx + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + ret + .balign 64 DECL(randomx_program_epilogue): #include "asm/program_epilogue_linux.inc" +.balign 64 +DECL(randomx_sshash_load): + #include "asm/program_sshash_load.inc" + +DECL(randomx_sshash_prefetch): + #include "asm/program_sshash_prefetch.inc" + +DECL(randomx_sshash_end): + nop + +.balign 64 +DECL(randomx_sshash_init): + lea r8, [rbx+1] + #include "asm/program_sshash_prefetch.inc" + imul r8, qword ptr r0_mul[rip] + mov r9, qword ptr r1_add[rip] + xor r9, r8 + mov r10, qword ptr r2_add[rip] + xor r10, r8 + mov r11, qword ptr r3_add[rip] + xor r11, r8 + mov r12, qword ptr r4_add[rip] + xor r12, r8 + mov r13, qword ptr r5_add[rip] + xor r13, r8 + mov r14, qword ptr r6_add[rip] + xor r14, r8 + mov r15, qword ptr r7_add[rip] + xor r15, r8 + jmp DECL(randomx_program_end) + +.balign 64 + #include "asm/program_sshash_constants.inc" + .balign 64 DECL(randomx_program_end): nop diff --git a/src/JitCompilerX86-static.asm b/src/JitCompilerX86-static.asm index ffac80c..ab29312 100644 --- a/src/JitCompilerX86-static.asm +++ b/src/JitCompilerX86-static.asm @@ -25,10 +25,17 @@ PUBLIC randomx_program_loop_load PUBLIC randomx_program_start PUBLIC randomx_program_read_dataset PUBLIC randomx_program_read_dataset_light +PUBLIC randomx_program_read_dataset_sshash_init +PUBLIC randomx_program_read_dataset_sshash_fin PUBLIC randomx_program_read_dataset_light_sub +PUBLIC randomx_dataset_init PUBLIC 
randomx_program_loop_store PUBLIC randomx_program_loop_end PUBLIC randomx_program_epilogue +PUBLIC randomx_sshash_load +PUBLIC randomx_sshash_prefetch +PUBLIC randomx_sshash_end +PUBLIC randomx_sshash_init PUBLIC randomx_program_end ALIGN 64 @@ -60,6 +67,14 @@ randomx_program_read_dataset_light PROC include asm/program_read_dataset_light.inc randomx_program_read_dataset_light ENDP +randomx_program_read_dataset_sshash_init PROC + include asm/program_read_dataset_sshash_init.inc +randomx_program_read_dataset_sshash_init ENDP + +randomx_program_read_dataset_sshash_fin PROC + include asm/program_read_dataset_sshash_fin.inc +randomx_program_read_dataset_sshash_fin ENDP + randomx_program_loop_store PROC include asm/program_loop_store.inc randomx_program_loop_store ENDP @@ -75,11 +90,93 @@ randomx_program_read_dataset_light_sub PROC include asm/squareHash.inc randomx_program_read_dataset_light_sub ENDP +ALIGN 64 +randomx_dataset_init PROC + push rbx + push rbp + push rdi + push rsi + push r12 + push r13 + push r14 + push r15 + mov rdi, rcx ;# cache + mov rsi, rdx ;# dataset + mov rbp, r8 ;# block index + push r9 ;# max. 
block index +init_block_loop: + prefetchw byte ptr [rsi] + mov rbx, rbp + db 232 ;# 0xE8 = call + dd 32768 - distance + distance equ $ - offset randomx_dataset_init + mov qword ptr [rsi+0], r8 + mov qword ptr [rsi+8], r9 + mov qword ptr [rsi+16], r10 + mov qword ptr [rsi+24], r11 + mov qword ptr [rsi+32], r12 + mov qword ptr [rsi+40], r13 + mov qword ptr [rsi+48], r14 + mov qword ptr [rsi+56], r15 + add rbp, 1 + add rsi, 64 + cmp rbp, qword ptr [rsp] + jb init_block_loop + pop r9 + pop r15 + pop r14 + pop r13 + pop r12 + pop rsi + pop rdi + pop rbp + pop rbx + ret +randomx_dataset_init ENDP + ALIGN 64 randomx_program_epilogue PROC include asm/program_epilogue_win64.inc randomx_program_epilogue ENDP +ALIGN 64 +randomx_sshash_load PROC + include asm/program_sshash_load.inc +randomx_sshash_load ENDP + +randomx_sshash_prefetch PROC + include asm/program_sshash_prefetch.inc +randomx_sshash_prefetch ENDP + +randomx_sshash_end PROC + nop +randomx_sshash_end ENDP + +ALIGN 64 +randomx_sshash_init PROC + lea r8, [rbx+1] + include asm/program_sshash_prefetch.inc + imul r8, qword ptr [r0_mul] + mov r9, qword ptr [r1_add] + xor r9, r8 + mov r10, qword ptr [r2_add] + xor r10, r8 + mov r11, qword ptr [r3_add] + xor r11, r8 + mov r12, qword ptr [r4_add] + xor r12, r8 + mov r13, qword ptr [r5_add] + xor r13, r8 + mov r14, qword ptr [r6_add] + xor r14, r8 + mov r15, qword ptr [r7_add] + xor r15, r8 + jmp randomx_program_end +randomx_sshash_init ENDP + +ALIGN 64 + include asm/program_sshash_constants.inc + ALIGN 64 randomx_program_end PROC nop diff --git a/src/JitCompilerX86-static.hpp b/src/JitCompilerX86-static.hpp index 3d835b6..3bb56ac 100644 --- a/src/JitCompilerX86-static.hpp +++ b/src/JitCompilerX86-static.hpp @@ -24,9 +24,16 @@ extern "C" { void randomx_program_start(); void randomx_program_read_dataset(); void randomx_program_read_dataset_light(); + void randomx_program_read_dataset_sshash_init(); + void randomx_program_read_dataset_sshash_fin(); void 
randomx_program_loop_store(); void randomx_program_loop_end(); void randomx_program_read_dataset_light_sub(); + void randomx_dataset_init(); void randomx_program_epilogue(); + void randomx_sshash_load(); + void randomx_sshash_prefetch(); + void randomx_sshash_end(); + void randomx_sshash_init(); void randomx_program_end(); } \ No newline at end of file diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index 5ddc382..ad7c85a 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -87,6 +87,9 @@ namespace RandomX { */ #include "JitCompilerX86-static.hpp" +#include "superscalarGenerator.hpp" + +#define NOP_TEST true const uint8_t* codePrologue = (uint8_t*)&randomx_program_prologue; const uint8_t* codeLoopBegin = (uint8_t*)&randomx_program_loop_begin; @@ -94,23 +97,36 @@ namespace RandomX { const uint8_t* codeProgamStart = (uint8_t*)&randomx_program_start; const uint8_t* codeReadDataset = (uint8_t*)&randomx_program_read_dataset; const uint8_t* codeReadDatasetLight = (uint8_t*)&randomx_program_read_dataset_light; + const uint8_t* codeReadDatasetLightSshInit = (uint8_t*)&randomx_program_read_dataset_sshash_init; + const uint8_t* codeReadDatasetLightSshFin = (uint8_t*)&randomx_program_read_dataset_sshash_fin; + const uint8_t* codeDatasetInit = (uint8_t*)&randomx_dataset_init; const uint8_t* codeLoopStore = (uint8_t*)&randomx_program_loop_store; const uint8_t* codeLoopEnd = (uint8_t*)&randomx_program_loop_end; const uint8_t* codeReadDatasetLightSub = (uint8_t*)&randomx_program_read_dataset_light_sub; const uint8_t* codeEpilogue = (uint8_t*)&randomx_program_epilogue; const uint8_t* codeProgramEnd = (uint8_t*)&randomx_program_end; + const uint8_t* codeShhLoad = (uint8_t*)&randomx_sshash_load; + const uint8_t* codeShhPrefetch = (uint8_t*)&randomx_sshash_prefetch; + const uint8_t* codeShhEnd = (uint8_t*)&randomx_sshash_end; + const uint8_t* codeShhInit = (uint8_t*)&randomx_sshash_init; const int32_t prologueSize = codeLoopBegin - codePrologue; - const 
int32_t epilogueSize = codeProgramEnd - codeEpilogue; - const int32_t loopLoadSize = codeProgamStart - codeLoopLoad; const int32_t readDatasetSize = codeReadDatasetLight - codeReadDataset; - const int32_t readDatasetLightSize = codeLoopStore - codeReadDatasetLight; + const int32_t readDatasetLightSize = codeReadDatasetLightSshInit - codeReadDatasetLight; + const int32_t readDatasetLightInitSize = codeReadDatasetLightSshFin - codeReadDatasetLightSshInit; + const int32_t readDatasetLightFinSize = codeLoopStore - codeReadDatasetLightSshFin; const int32_t loopStoreSize = codeLoopEnd - codeLoopStore; - const int32_t readDatasetLightSubSize = codeEpilogue - codeReadDatasetLightSub; + const int32_t readDatasetLightSubSize = codeDatasetInit - codeReadDatasetLightSub; + const int32_t datasetInitSize = codeEpilogue - codeDatasetInit; + const int32_t epilogueSize = codeShhLoad - codeEpilogue; + const int32_t codeSshLoadSize = codeShhPrefetch - codeShhLoad; + const int32_t codeSshPrefetchSize = codeShhEnd - codeShhPrefetch; + const int32_t codeSshInitSize = codeProgramEnd - codeShhInit; const int32_t epilogueOffset = CodeSize - epilogueSize; const int32_t readDatasetLightSubOffset = epilogueOffset - readDatasetLightSubSize; + constexpr int32_t superScalarHashOffset = 32768; static const uint8_t REX_ADD_RR[] = { 0x4d, 0x03 }; static const uint8_t REX_ADD_RM[] = { 0x4c, 0x03 }; @@ -166,7 +182,7 @@ namespace RandomX { static const uint8_t SQRTPD[] = { 0x66, 0x0f, 0x51 }; static const uint8_t AND_OR_MOV_LDMXCSR[] = { 0x25, 0x00, 0x60, 0x00, 0x00, 0x0D, 0xC0, 0x9F, 0x00, 0x00, 0x89, 0x44, 0x24, 0xF8, 0x0F, 0xAE, 0x54, 0x24, 0xF8 }; static const uint8_t ROL_RAX[] = { 0x48, 0xc1, 0xc0 }; - static const uint8_t XOR_ECX_ECX[] = { 0x33, 0xC9 }; + static const uint8_t XOR_RCX_RCX[] = { 0x48, 0x33, 0xC9 }; static const uint8_t REX_CMP_R32I[] = { 0x41, 0x81 }; static const uint8_t REX_CMP_M32I[] = { 0x81, 0x3c, 0x06 }; static const uint8_t MOVAPD[] = { 0x66, 0x0f, 0x29 }; @@ -184,6 +200,18 
@@ namespace RandomX { static const uint8_t REX_ADD_I[] = { 0x49, 0x81 }; static const uint8_t REX_TEST[] = { 0x49, 0xF7 }; static const uint8_t JZ[] = { 0x0f, 0x84 }; + static const uint8_t RET = 0xc3; + + static const uint8_t NOP1[] = { 0x90 }; + static const uint8_t NOP2[] = { 0x66, 0x90 }; + static const uint8_t NOP3[] = { 0x66, 0x66, 0x90 }; + static const uint8_t NOP4[] = { 0x0F, 0x1F, 0x40, 0x00 }; + static const uint8_t NOP5[] = { 0x0F, 0x1F, 0x44, 0x00, 0x00 }; + static const uint8_t NOP6[] = { 0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00 }; + static const uint8_t NOP7[] = { 0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00 }; + static const uint8_t NOP8[] = { 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 }; + + static const uint8_t* NOPX[] = { NOP1, NOP2, NOP3, NOP4, NOP5, NOP6, NOP7, NOP8 }; size_t JitCompilerX86::getCodeSize() { return codePos - prologueSize; @@ -196,6 +224,10 @@ namespace RandomX { memcpy(code + readDatasetLightSubOffset, codeReadDatasetLightSub, readDatasetLightSubSize); } + JitCompilerX86::~JitCompilerX86() { + freePagedMemory(code, CodeSize); + } + void JitCompilerX86::generateProgram(Program& prog) { generateProgramPrologue(prog); memcpy(code + codePos, codeReadDataset, readDatasetSize); @@ -203,19 +235,67 @@ namespace RandomX { generateProgramEpilogue(prog); } + template void JitCompilerX86::generateProgramLight(Program& prog) { if (RANDOMX_CACHE_ACCESSES != 8) throw std::runtime_error("JIT compiler: Unsupported value of RANDOMX_CACHE_ACCESSES"); if (RANDOMX_ARGON_GROWTH != 0) throw std::runtime_error("JIT compiler: Unsupported value of RANDOMX_ARGON_GROWTH"); generateProgramPrologue(prog); - memcpy(code + codePos, codeReadDatasetLight, readDatasetLightSize); - codePos += readDatasetLightSize; - emitByte(CALL); - emit32(readDatasetLightSubOffset - (codePos + 4)); + if (superscalar) { + emit(codeReadDatasetLightSshInit, readDatasetLightInitSize); + emitByte(CALL); + emit32(superScalarHashOffset - (codePos + 4)); + emit(codeReadDatasetLightSshFin, 
readDatasetLightFinSize); + } + else { + memcpy(code + codePos, codeReadDatasetLight, readDatasetLightSize); + codePos += readDatasetLightSize; + emitByte(CALL); + emit32(readDatasetLightSubOffset - (codePos + 4)); + } generateProgramEpilogue(prog); } + template void JitCompilerX86::generateProgramLight(Program& prog); + template void JitCompilerX86::generateProgramLight(Program& prog); + + template + void JitCompilerX86::generateSuperScalarHash(SuperscalarProgram(&programs)[N]) { + memcpy(code + superScalarHashOffset, codeShhInit, codeSshInitSize); + codePos = superScalarHashOffset + codeSshInitSize; + for (unsigned j = 0; j < N; ++j) { + SuperscalarProgram& prog = programs[j]; + for (unsigned i = 0; i < prog.getSize(); ++i) { + Instruction& instr = prog(i); + instr.src %= RegistersCount; + instr.dst %= RegistersCount; + generateCode(instr, i); + } + emit(codeShhLoad, codeSshLoadSize); + if (j < N - 1) { + emit(REX_MOV_RR64); + emitByte(0xd8 + prog.getAddressRegister()); + emit(codeShhPrefetch, codeSshPrefetchSize); + int align = (codePos % 16); + while (align != 0) { + int nopSize = 16 - align; + if (nopSize > 8) nopSize = 8; + emit(NOPX[nopSize - 1], nopSize); + align = (codePos % 16); + } + } + } + emitByte(RET); + } + + template + void JitCompilerX86::generateSuperScalarHash(SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]); + + void JitCompilerX86::generateDatasetInitCode() { + memcpy(code, codeDatasetInit, datasetInitSize); + } + void JitCompilerX86::generateProgramPrologue(Program& prog) { #ifdef RANDOMX_JUMP instructionOffsets.clear(); @@ -238,12 +318,7 @@ namespace RandomX { emitByte(0xc0 + readReg1); memcpy(code + codePos, codeLoopLoad, loopLoadSize); codePos += loopLoadSize; - for (unsigned i = 0; i < RANDOMX_PROGRAM_SIZE; ++i) { - Instruction& instr = prog(i); - instr.src %= RegistersCount; - instr.dst %= RegistersCount; - generateCode(instr, i); - } + generateCode(prog); emit(REX_MOV_RR); emitByte(0xc0 + readReg2); emit(REX_XOR_EAX); @@ -258,9 
+333,9 @@ namespace RandomX { emit32(prologueSize - codePos - 4); emitByte(JMP); emit32(epilogueOffset - codePos - 4); - emitByte(0x90); } + template void JitCompilerX86::generateCode(Instruction& instr, int i) { #ifdef RANDOMX_JUMP instructionOffsets.push_back(codePos); @@ -269,6 +344,95 @@ namespace RandomX { (this->*generator)(instr, i); } + template<> + void JitCompilerX86::generateCode(Instruction& instr, int i) { + switch (instr.opcode) + { + case RandomX::SuperscalarInstructionType::ISUB_R: + emit(REX_SUB_RR); + emitByte(0xc0 + 8 * instr.dst + instr.src); + break; + case RandomX::SuperscalarInstructionType::IXOR_R: + emit(REX_XOR_RR); + emitByte(0xc0 + 8 * instr.dst + instr.src); + break; + case RandomX::SuperscalarInstructionType::IADD_RS: + emit(REX_LEA); + emitByte(0x04 + 8 * instr.dst); + genSIB(instr.mod % 4, instr.src, instr.dst); + break; + case RandomX::SuperscalarInstructionType::IMUL_R: + emit(REX_IMUL_RR); + emitByte(0xc0 + 8 * instr.dst + instr.src); + break; + case RandomX::SuperscalarInstructionType::IROR_C: + emit(REX_ROT_I8); + emitByte(0xc8 + instr.dst); + emitByte(instr.getImm32() & 63); + break; + case RandomX::SuperscalarInstructionType::IADD_C7: + emit(REX_81); + emitByte(0xc0 + instr.dst); + emit32(instr.getImm32()); + break; + case RandomX::SuperscalarInstructionType::IXOR_C7: + emit(REX_XOR_RI); + emitByte(0xf0 + instr.dst); + emit32(instr.getImm32()); + break; + case RandomX::SuperscalarInstructionType::IADD_C8: + emit(REX_81); + emitByte(0xc0 + instr.dst); + emit32(instr.getImm32()); + emit(NOP1); + break; + case RandomX::SuperscalarInstructionType::IXOR_C8: + emit(REX_XOR_RI); + emitByte(0xf0 + instr.dst); + emit32(instr.getImm32()); + emit(NOP1); + break; + case RandomX::SuperscalarInstructionType::IADD_C9: + emit(REX_81); + emitByte(0xc0 + instr.dst); + emit32(instr.getImm32()); + emit(NOP2); + break; + case RandomX::SuperscalarInstructionType::IXOR_C9: + emit(REX_XOR_RI); + emitByte(0xf0 + instr.dst); + emit32(instr.getImm32()); 
+ emit(NOP2); + break; + case RandomX::SuperscalarInstructionType::IMULH_R: + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.dst); + emit(REX_MUL_R); + emitByte(0xe0 + instr.src); + emit(REX_MOV_R64R); + emitByte(0xc2 + 8 * instr.dst); + break; + case RandomX::SuperscalarInstructionType::ISMULH_R: + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.dst); + emit(REX_MUL_R); + emitByte(0xe8 + instr.src); + emit(REX_MOV_R64R); + emitByte(0xc2 + 8 * instr.dst); + break; + case RandomX::SuperscalarInstructionType::IMUL_RCP: + emit(MOV_RAX_I); + emit64(reciprocal(instr.getImm32())); + emit(REX_IMUL_RM); + emitByte(0xc0 + 8 * instr.dst); + break; + default: + UNREACHABLE; + } + } + + template void JitCompilerX86::generateCode(Instruction& instr, int i); + void JitCompilerX86::genAddressReg(Instruction& instr, bool rax = true) { emit(REX_MOV_RR); emitByte((rax ? 0xc0 : 0xc8) + instr.src); @@ -292,9 +456,9 @@ namespace RandomX { emit32(instr.getImm32() & ScratchpadL3Mask); } - void JitCompilerX86::h_IADD_R(Instruction& instr, int i) { + void JitCompilerX86::h_IADD_RS(Instruction& instr, int i) { registerUsage[instr.dst] = i; - if (instr.src != instr.dst) { + /*if (instr.src != instr.dst) { emit(REX_ADD_RR); emitByte(0xc0 + 8 * instr.dst + instr.src); } @@ -302,7 +466,19 @@ namespace RandomX { emit(REX_81); emitByte(0xc0 + instr.dst); emit32(instr.getImm32()); + }*/ + if (false && NOP_TEST) { + emit(NOP4); + return; } + emit(REX_LEA); + if (instr.dst == 5) //rbp,r13 cannot be the base register without offset + emitByte(0xac); + else + emitByte(0x04 + 8 * instr.dst); + genSIB(instr.mod % 4, instr.src, instr.dst); + if (instr.dst == 5) + emit32(instr.getImm32()); } void JitCompilerX86::h_IADD_M(Instruction& instr, int i) { @@ -335,10 +511,18 @@ namespace RandomX { void JitCompilerX86::h_ISUB_R(Instruction& instr, int i) { registerUsage[instr.dst] = i; if (instr.src != instr.dst) { + if (false && NOP_TEST) { + emit(NOP3); + return; + } emit(REX_SUB_RR); emitByte(0xc0 + 8 * instr.dst + 
instr.src); } else { + if (false && NOP_TEST) { + emit(NOP7); + return; + } emit(REX_81); emitByte(0xe8 + instr.dst); emit32(instr.getImm32()); @@ -371,10 +555,18 @@ namespace RandomX { void JitCompilerX86::h_IMUL_R(Instruction& instr, int i) { registerUsage[instr.dst] = i; if (instr.src != instr.dst) { + if (false && NOP_TEST) { + emit(NOP4); + return; + } emit(REX_IMUL_RR); emitByte(0xc0 + 8 * instr.dst + instr.src); } else { + if (false && NOP_TEST) { + emit(NOP7); + return; + } emit(REX_IMUL_RRI); emitByte(0xc0 + 9 * instr.dst); emit32(instr.getImm32()); @@ -398,6 +590,12 @@ namespace RandomX { void JitCompilerX86::h_IMULH_R(Instruction& instr, int i) { registerUsage[instr.dst] = i; + if (false && NOP_TEST) { + emit(NOP3); + emit(NOP3); + emit(NOP3); + return; + } emit(REX_MOV_RR64); emitByte(0xc0 + instr.dst); emit(REX_MUL_R); @@ -427,6 +625,12 @@ namespace RandomX { void JitCompilerX86::h_ISMULH_R(Instruction& instr, int i) { registerUsage[instr.dst] = i; + if (false && NOP_TEST) { + emit(NOP3); + emit(NOP3); + emit(NOP3); + return; + } emit(REX_MOV_RR64); emitByte(0xc0 + instr.dst); emit(REX_MUL_R); @@ -456,6 +660,13 @@ namespace RandomX { void JitCompilerX86::h_IMUL_RCP(Instruction& instr, int i) { if (instr.getImm32() != 0) { + if (false && NOP_TEST) { + emitByte(0x66); + emitByte(0x66); + emit(NOP8); + emit(NOP4); + return; + } registerUsage[instr.dst] = i; emit(MOV_RAX_I); emit64(reciprocal(instr.getImm32())); @@ -477,10 +688,18 @@ namespace RandomX { void JitCompilerX86::h_IXOR_R(Instruction& instr, int i) { registerUsage[instr.dst] = i; if (instr.src != instr.dst) { + if (false && NOP_TEST) { + emit(NOP3); + return; + } emit(REX_XOR_RR); emitByte(0xc0 + 8 * instr.dst + instr.src); } else { + if (false && NOP_TEST) { + emit(NOP7); + return; + } emit(REX_XOR_RI); emitByte(0xf0 + instr.dst); emit32(instr.getImm32()); @@ -505,12 +724,21 @@ namespace RandomX { void JitCompilerX86::h_IROR_R(Instruction& instr, int i) { registerUsage[instr.dst] = i; if 
(instr.src != instr.dst) { + if (false && NOP_TEST) { + emit(NOP3); + emit(NOP3); + return; + } emit(REX_MOV_RR); emitByte(0xc8 + instr.src); emit(REX_ROT_CL); emitByte(0xc8 + instr.dst); } else { + if (false && NOP_TEST) { + emit(NOP4); + return; + } emit(REX_ROT_I8); emitByte(0xc8 + instr.dst); emitByte(instr.getImm32() & 63); @@ -705,14 +933,21 @@ namespace RandomX { const int conditionMask = ((1 << RANDOMX_CONDITION_BITS) - 1) << shift; int reg = getConditionRegister(); int target = registerUsage[reg] + 1; - emit(REX_ADD_I); - emitByte(0xc0 + reg); - emit32(1 << shift); - emit(REX_TEST); - emitByte(0xc0 + reg); - emit32(conditionMask); - emit(JZ); - emit32(instructionOffsets[target] - (codePos + 4)); + if (false && NOP_TEST) { + emit(NOP7); + emit(NOP7); + emit(NOP6); + } + else { + emit(REX_ADD_I); + emitByte(0xc0 + reg); + emit32(1 << shift); + emit(REX_TEST); + emitByte(0xc0 + reg); + emit32(conditionMask); + emit(JZ); + emit32(instructionOffsets[target] - (codePos + 4)); + } for (unsigned j = 0; j < 8; ++j) { //mark all registers as used registerUsage[j] = i; } @@ -722,7 +957,14 @@ namespace RandomX { #ifdef RANDOMX_JUMP handleCondition(instr, i); #endif - emit(XOR_ECX_ECX); + if (false && NOP_TEST) { + emit(NOP3); + emit(NOP7); + emit(NOP3); + emit(NOP3); + return; + } + emit(XOR_RCX_RCX); emit(REX_CMP_R32I); emitByte(0xf8 + instr.src); emit32(instr.getImm32()); @@ -737,7 +979,7 @@ namespace RandomX { #ifdef RANDOMX_JUMP handleCondition(instr, i); #endif - emit(XOR_ECX_ECX); + emit(XOR_RCX_RCX); genAddressReg(instr); emit(REX_CMP_M32I); emit32(instr.getImm32()); @@ -770,7 +1012,7 @@ namespace RandomX { #define INST_HANDLE(x) REPN(&JitCompilerX86::h_##x, WT(x)) InstructionGeneratorX86 JitCompilerX86::engine[256] = { - INST_HANDLE(IADD_R) + INST_HANDLE(IADD_RS) INST_HANDLE(IADD_M) INST_HANDLE(IADD_RC) INST_HANDLE(ISUB_R) diff --git a/src/JitCompilerX86.hpp b/src/JitCompilerX86.hpp index e127a40..2908b04 100644 --- a/src/JitCompilerX86.hpp +++ 
b/src/JitCompilerX86.hpp @@ -27,6 +27,7 @@ along with RandomX. If not, see. namespace RandomX { class Program; + class SuperscalarProgram; class JitCompilerX86; typedef void(JitCompilerX86::*InstructionGeneratorX86)(Instruction&, int); @@ -36,11 +37,19 @@ namespace RandomX { class JitCompilerX86 { public: JitCompilerX86(); + ~JitCompilerX86(); void generateProgram(Program&); + template void generateProgramLight(Program&); + template + void generateSuperScalarHash(SuperscalarProgram (&programs)[N]); ProgramFunc getProgramFunc() { return (ProgramFunc)code; } + DatasetInitFunc getDatasetInitFunc() { + generateDatasetInitCode(); + return (DatasetInitFunc)code; + } uint8_t* getCode() { return code; } @@ -52,6 +61,18 @@ namespace RandomX { uint8_t* code; int32_t codePos; + template + void generateCode(P& prog) { + for (unsigned i = 0; i < prog.getSize(); ++i) { + Instruction& instr = prog(i); + instr.src %= RegistersCount; + instr.dst %= RegistersCount; + generateCode

(instr, i); + } + } + + void generateDatasetInitCode(); + void generateProgramPrologue(Program&); void generateProgramEpilogue(Program&); int getConditionRegister(); @@ -61,6 +82,8 @@ namespace RandomX { void genSIB(int scale, int index, int base); void handleCondition(Instruction&, int); + + template void generateCode(Instruction&, int); void emitByte(uint8_t val) { @@ -90,13 +113,15 @@ namespace RandomX { template void emit(const uint8_t (&src)[N]) { - for (unsigned i = 0; i < N; ++i) { - code[codePos + i] = src[i]; - } - codePos += N; + emit(src, N); } - void h_IADD_R(Instruction&, int); + void emit(const uint8_t* src, size_t count) { + memcpy(code + codePos, src, count); + codePos += count; + } + + void h_IADD_RS(Instruction&, int); void h_IADD_M(Instruction&, int); void h_IADD_RC(Instruction&, int); void h_ISUB_R(Instruction&, int); diff --git a/src/LightClientAsyncWorker.cpp b/src/LightClientAsyncWorker.cpp deleted file mode 100644 index fbba713..0000000 --- a/src/LightClientAsyncWorker.cpp +++ /dev/null @@ -1,113 +0,0 @@ -/* -Copyright (c) 2019 tevador - -This file is part of RandomX. - -RandomX is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -RandomX is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with RandomX. If not, see. 
-*/ - -#include "LightClientAsyncWorker.hpp" -#include "dataset.hpp" -#include "Cache.hpp" - -namespace RandomX { - - LightClientAsyncWorker::LightClientAsyncWorker(const Cache& c) : ILightClientAsyncWorker(c), output(nullptr), hasWork(false), -#ifdef TRACE - sw(true), -#endif - workerThread(&LightClientAsyncWorker::runWorker, this) { - - } - - void LightClientAsyncWorker::prepareBlock(addr_t addr) { -#ifdef TRACE - std::cout << sw.getElapsed() << ": prepareBlock-enter " << addr / CacheLineSize << std::endl; -#endif - { - std::lock_guard lk(mutex); - startBlock = addr / CacheLineSize; - blockCount = 1; - output = currentLine.data(); - hasWork = true; - } -#ifdef TRACE - std::cout << sw.getElapsed() << ": prepareBlock-notify " << startBlock << "/" << blockCount << std::endl; -#endif - notifier.notify_one(); - } - - const uint64_t* LightClientAsyncWorker::getBlock(addr_t addr) { -#ifdef TRACE - std::cout << sw.getElapsed() << ": getBlock-enter " << addr / CacheLineSize << std::endl; -#endif - uint32_t currentBlock = addr / CacheLineSize; - if (currentBlock != startBlock || output != currentLine.data()) { - initBlock(cache, (uint8_t*)currentLine.data(), currentBlock, RANDOMX_CACHE_ACCESSES / 8); - } - else { - sync(); - } -#ifdef TRACE - std::cout << sw.getElapsed() << ": getBlock-return " << addr / CacheLineSize << std::endl; -#endif - return currentLine.data(); - } - - void LightClientAsyncWorker::prepareBlocks(void* out, uint32_t startBlock, uint32_t blockCount) { -#ifdef TRACE - std::cout << sw.getElapsed() << ": prepareBlocks-enter " << startBlock << "/" << blockCount << std::endl; -#endif - { - std::lock_guard lk(mutex); - this->startBlock = startBlock; - this->blockCount = blockCount; - output = out; - hasWork = true; - notifier.notify_one(); - } - } - - void LightClientAsyncWorker::getBlocks(void* out, uint32_t startBlock, uint32_t blockCount) { - for (uint32_t i = 0; i < blockCount; ++i) { - initBlock(cache, (uint8_t*)out + CacheLineSize * i, startBlock + i, 
RANDOMX_CACHE_ACCESSES / 8); - } - } - - void LightClientAsyncWorker::sync() { - std::unique_lock lk(mutex); - notifier.wait(lk, [this] { return !hasWork; }); - } - - void LightClientAsyncWorker::runWorker() { -#ifdef TRACE - std::cout << sw.getElapsed() << ": runWorker-enter " << std::endl; -#endif - for (;;) { - std::unique_lock lk(mutex); - notifier.wait(lk, [this] { return hasWork; }); -#ifdef TRACE - std::cout << sw.getElapsed() << ": runWorker-getBlocks " << startBlock << "/" << blockCount << std::endl; -#endif - //getBlocks(output, startBlock, blockCount); - initBlock(cache, (uint8_t*)output, startBlock, RANDOMX_CACHE_ACCESSES / 8); - hasWork = false; -#ifdef TRACE - std::cout << sw.getElapsed() << ": runWorker-finished " << startBlock << "/" << blockCount << std::endl; -#endif - lk.unlock(); - notifier.notify_one(); - } - } -} \ No newline at end of file diff --git a/src/LightClientAsyncWorker.hpp b/src/LightClientAsyncWorker.hpp deleted file mode 100644 index 7c45e53..0000000 --- a/src/LightClientAsyncWorker.hpp +++ /dev/null @@ -1,57 +0,0 @@ -/* -Copyright (c) 2019 tevador - -This file is part of RandomX. - -RandomX is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -RandomX is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with RandomX. If not, see. 
-*/ - -//#define TRACE -#include "common.hpp" - -#include -#include -#include -#include -#ifdef TRACE -#include "Stopwatch.hpp" -#include -#endif - -namespace RandomX { - - using DatasetLine = std::array; - - class LightClientAsyncWorker : public ILightClientAsyncWorker { - public: - LightClientAsyncWorker(const Cache&); - void prepareBlock(addr_t) final; - void prepareBlocks(void* out, uint32_t startBlock, uint32_t blockCount) final; - const uint64_t* getBlock(addr_t) final; - void getBlocks(void* out, uint32_t startBlock, uint32_t blockCount) final; - void sync() final; - private: - void runWorker(); - std::condition_variable notifier; - std::mutex mutex; - alignas(16) DatasetLine currentLine; - void* output; - uint32_t startBlock, blockCount; - bool hasWork; -#ifdef TRACE - Stopwatch sw; -#endif - std::thread workerThread; - }; -} \ No newline at end of file diff --git a/src/Program.cpp b/src/Program.cpp index ebd271d..2b10f0b 100644 --- a/src/Program.cpp +++ b/src/Program.cpp @@ -21,7 +21,8 @@ along with RandomX. If not, see. 
#include "hashAes1Rx4.hpp" namespace RandomX { - void Program::print(std::ostream& os) const { + template + void ProgramBase::print(std::ostream& os) const { for (int i = 0; i < RANDOMX_PROGRAM_SIZE; ++i) { auto instr = programBuffer[i]; os << instr; diff --git a/src/Program.hpp b/src/Program.hpp index 621b614..2f2a402 100644 --- a/src/Program.hpp +++ b/src/Program.hpp @@ -39,11 +39,61 @@ namespace RandomX { uint64_t getEntropy(int i) { return load64(&entropyBuffer[i]); } + uint32_t getSize() { + return RANDOMX_PROGRAM_SIZE; + } private: - void print(std::ostream&) const; + void print(std::ostream& os) const { + for (int i = 0; i < RANDOMX_PROGRAM_SIZE; ++i) { + auto instr = programBuffer[i]; + os << instr; + } + } uint64_t entropyBuffer[16]; Instruction programBuffer[RANDOMX_PROGRAM_SIZE]; }; static_assert(sizeof(Program) % 64 == 0, "Invalid size of class Program"); + + class SuperscalarProgram { + public: + Instruction& operator()(int pc) { + return programBuffer[pc]; + } + friend std::ostream& operator<<(std::ostream& os, const SuperscalarProgram& p) { + p.print(os); + return os; + } + uint32_t getSize() { + return size; + } + void setSize(uint32_t val) { + size = val; + } + int getAddressRegister() { + return addrReg; + } + void setAddressRegister(uint32_t val) { + addrReg = val; + } + double ipc; + int codeSize; + int macroOps; + int decodeCycles; + int cpuLatency; + int asicLatency; + int mulCount; + int cpuLatencies[8]; + int asicLatencies[8]; + private: + void print(std::ostream& os) const { + for (unsigned i = 0; i < size; ++i) { + auto instr = programBuffer[i]; + os << instr; + } + } + Instruction programBuffer[RANDOMX_SUPERSCALAR_MAX_SIZE]; + uint32_t size; + int addrReg; + }; } diff --git a/src/VirtualMachine.hpp b/src/VirtualMachine.hpp index 00a14de..7352933 100644 --- a/src/VirtualMachine.hpp +++ b/src/VirtualMachine.hpp @@ -24,13 +24,11 @@ along with RandomX. If not, see. 
namespace RandomX { - - class VirtualMachine { public: VirtualMachine(); virtual ~VirtualMachine() {} - virtual void setDataset(dataset_t ds, uint64_t size) = 0; + virtual void setDataset(dataset_t ds, uint64_t size, SuperscalarProgram (&programs)[RANDOMX_CACHE_ACCESSES]) = 0; void setScratchpad(void* ptr) { scratchpad = (uint8_t*)ptr; } diff --git a/src/asm/program_read_dataset_sshash_fin.inc b/src/asm/program_read_dataset_sshash_fin.inc new file mode 100644 index 0000000..f5a067d --- /dev/null +++ b/src/asm/program_read_dataset_sshash_fin.inc @@ -0,0 +1,10 @@ + mov rbx, qword ptr [rsp+64] + xor r8, qword ptr [rsp+56] + xor r9, qword ptr [rsp+48] + xor r10, qword ptr [rsp+40] + xor r11, qword ptr [rsp+32] + xor r12, qword ptr [rsp+24] + xor r13, qword ptr [rsp+16] + xor r14, qword ptr [rsp+8] + xor r15, qword ptr [rsp+0] + add rsp, 72 \ No newline at end of file diff --git a/src/asm/program_read_dataset_sshash_init.inc b/src/asm/program_read_dataset_sshash_init.inc new file mode 100644 index 0000000..a186d2e --- /dev/null +++ b/src/asm/program_read_dataset_sshash_init.inc @@ -0,0 +1,16 @@ + sub rsp, 72 + mov qword ptr [rsp+64], rbx + mov qword ptr [rsp+56], r8 + mov qword ptr [rsp+48], r9 + mov qword ptr [rsp+40], r10 + mov qword ptr [rsp+32], r11 + mov qword ptr [rsp+24], r12 + mov qword ptr [rsp+16], r13 + mov qword ptr [rsp+8], r14 + mov qword ptr [rsp+0], r15 + xor rbp, rax ;# modify "mx" + ror rbp, 32 ;# swap "ma" and "mx" + mov ebx, ebp ;# ebx = ma + and ebx, 2147483584 ;# align "ma" to the start of a cache line + shr ebx, 6 ;# ebx = Dataset block number + ;# call 32768 \ No newline at end of file diff --git a/src/asm/program_sshash_constants.inc b/src/asm/program_sshash_constants.inc new file mode 100644 index 0000000..2044a0e --- /dev/null +++ b/src/asm/program_sshash_constants.inc @@ -0,0 +1,24 @@ +r0_mul: + ;#/ 6364136223846793005 + db 45, 127, 149, 76, 45, 244, 81, 88 +r1_add: + ;#/ 9298410992540426748 + db 252, 161, 245, 89, 136, 151, 10, 129 +r2_add: 
+ ;#/ 12065312585734608966 + db 70, 216, 194, 56, 223, 153, 112, 167 +r3_add: + ;#/ 9306329213124610396 + db 92, 9, 34, 191, 28, 185, 38, 129 +r4_add: + ;#/ 5281919268842080866 + db 98, 138, 159, 23, 151, 37, 77, 73 +r5_add: + ;#/ 10536153434571861004 + db 12, 236, 170, 206, 185, 239, 55, 146 +r6_add: + ;#/ 3398623926847679864 + db 120, 45, 230, 108, 116, 86, 42, 47 +r7_add: + ;#/ 9549104520008361294 + db 78, 229, 44, 182, 247, 59, 133, 132 \ No newline at end of file diff --git a/src/asm/program_sshash_load.inc b/src/asm/program_sshash_load.inc new file mode 100644 index 0000000..5351356 --- /dev/null +++ b/src/asm/program_sshash_load.inc @@ -0,0 +1,8 @@ + xor r8, qword ptr [rbx+0] + xor r9, qword ptr [rbx+8] + xor r10, qword ptr [rbx+16] + xor r11, qword ptr [rbx+24] + xor r12, qword ptr [rbx+32] + xor r13, qword ptr [rbx+40] + xor r14, qword ptr [rbx+48] + xor r15, qword ptr [rbx+56] \ No newline at end of file diff --git a/src/asm/program_sshash_prefetch.inc b/src/asm/program_sshash_prefetch.inc new file mode 100644 index 0000000..96ec35a --- /dev/null +++ b/src/asm/program_sshash_prefetch.inc @@ -0,0 +1,4 @@ + and rbx, 4194303 + shl rbx, 6 + add rbx, rdi + prefetchnta byte ptr [rbx] \ No newline at end of file diff --git a/src/common.hpp b/src/common.hpp index 118f053..034c10f 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -41,7 +41,7 @@ namespace RandomX { static_assert((RANDOMX_SCRATCHPAD_L1 & (RANDOMX_SCRATCHPAD_L1 - 1)) == 0, "RANDOMX_SCRATCHPAD_L1 must be a power of 2."); static_assert(RANDOMX_CACHE_ACCESSES > 1, "RANDOMX_CACHE_ACCESSES must be greater than 1"); - constexpr int wtSum = RANDOMX_FREQ_IADD_R + RANDOMX_FREQ_IADD_M + RANDOMX_FREQ_IADD_RC + RANDOMX_FREQ_ISUB_R + \ + constexpr int wtSum = RANDOMX_FREQ_IADD_RS + RANDOMX_FREQ_IADD_M + RANDOMX_FREQ_IADD_RC + RANDOMX_FREQ_ISUB_R + \ RANDOMX_FREQ_ISUB_M + RANDOMX_FREQ_IMUL_9C + RANDOMX_FREQ_IMUL_R + RANDOMX_FREQ_IMUL_M + RANDOMX_FREQ_IMULH_R + \ RANDOMX_FREQ_IMULH_M + RANDOMX_FREQ_ISMULH_R + 
RANDOMX_FREQ_ISMULH_M + RANDOMX_FREQ_IMUL_RCP + \ RANDOMX_FREQ_INEG_R + RANDOMX_FREQ_IXOR_R + RANDOMX_FREQ_IXOR_M + RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_ISWAP_R + \ @@ -95,6 +95,7 @@ namespace RandomX { constexpr int ScratchpadL3Mask = (ScratchpadL3 - 1) * 8; constexpr int ScratchpadL3Mask64 = (ScratchpadL3 / 8 - 1) * 64; constexpr int RegistersCount = 8; + constexpr int LimitedAddressRegister = 5; //x86 r13 register struct Cache { uint8_t* memory; @@ -141,6 +142,7 @@ namespace RandomX { typedef void(*DatasetReadFunc)(addr_t, MemoryRegisters&, int_reg_t(&reg)[RegistersCount]); typedef void(*ProgramFunc)(RegisterFile&, MemoryRegisters&, uint8_t* /* scratchpad */, uint64_t); + typedef void(*DatasetInitFunc)(uint8_t* cache, uint8_t* dataset, uint32_t startBlock, uint32_t endBlock); } std::ostream& operator<<(std::ostream& os, const RandomX::RegisterFile& rf); diff --git a/src/configuration.h b/src/configuration.h index 8780998..80cf0c4 100644 --- a/src/configuration.h +++ b/src/configuration.h @@ -37,6 +37,9 @@ along with RandomX. If not, see. //Number of random Cache accesses per Dataset block. Minimum is 2. #define RANDOMX_CACHE_ACCESSES 8 +#define RANDOMX_SUPERSCALAR_LATENCY 170 +#define RANDOMX_SUPERSCALAR_MAX_SIZE 512 + //Dataset size in bytes. Must be a power of 2. #define RANDOMX_DATASET_SIZE (2ULL * 1024 * 1024 * 1024) @@ -75,12 +78,12 @@ Instruction frequencies (per 256 opcodes) Total sum of frequencies must be 256 */ -#define RANDOMX_FREQ_IADD_R 12 +#define RANDOMX_FREQ_IADD_RS 32 #define RANDOMX_FREQ_IADD_M 7 -#define RANDOMX_FREQ_IADD_RC 16 -#define RANDOMX_FREQ_ISUB_R 12 +#define RANDOMX_FREQ_IADD_RC 0 +#define RANDOMX_FREQ_ISUB_R 17 #define RANDOMX_FREQ_ISUB_M 7 -#define RANDOMX_FREQ_IMUL_9C 9 +#define RANDOMX_FREQ_IMUL_9C 0 #define RANDOMX_FREQ_IMUL_R 16 #define RANDOMX_FREQ_IMUL_M 4 #define RANDOMX_FREQ_IMULH_R 4 diff --git a/src/main.cpp b/src/main.cpp index 6c3f9e8..1092268 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -36,6 +36,8 @@ along with RandomX. 
If not, see. #include "dataset.hpp" #include "Cache.hpp" #include "hashAes1Rx4.hpp" +#include "superscalarGenerator.hpp" +#include "JitCompilerX86.hpp" const uint8_t seed[32] = { 191, 182, 222, 175, 249, 89, 134, 104, 241, 68, 191, 62, 162, 166, 61, 64, 123, 191, 227, 193, 118, 60, 188, 53, 223, 133, 175, 24, 123, 230, 55, 74 }; @@ -176,7 +178,6 @@ void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, Atomi fillAes1Rx4((void*)hash, RANDOMX_SCRATCHPAD_L3, scratchpad); vm->resetRoundingMode(); vm->setScratchpad(scratchpad); - //dump((char*)scratchpad, RandomX::ScratchpadSize, "spad-before.txt"); for (int chain = 0; chain < RANDOMX_PROGRAM_COUNT - 1; ++chain) { fillAes1Rx4((void*)hash, sizeof(RandomX::Program), vm->getProgramBuffer()); vm->initialize(); @@ -193,6 +194,7 @@ void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, Atomi } }*/ vm->getResult(scratchpad, RANDOMX_SCRATCHPAD_L3, hash); + //dump((char*)scratchpad, RANDOMX_SCRATCHPAD_L3, "spad.txt"); result.xorWith(hash); if (RandomX::trace) { std::cout << "Nonce: " << nonce << " "; @@ -203,8 +205,10 @@ void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, Atomi } } + + int main(int argc, char** argv) { - bool softAes, genAsm, miningMode, verificationMode, help, largePages, async, genNative, jit; + bool softAes, genAsm, miningMode, verificationMode, help, largePages, async, genNative, jit, genSuperscalar, legacy; int programCount, threadCount, initThreadCount, epoch; readOption("--softAes", argc, argv, softAes); @@ -219,6 +223,19 @@ int main(int argc, char** argv) { readOption("--jit", argc, argv, jit); readOption("--genNative", argc, argv, genNative); readOption("--help", argc, argv, help); + readOption("--genSuperscalar", argc, argv, genSuperscalar); + readOption("--legacy", argc, argv, legacy); + + if (genSuperscalar) { + RandomX::SuperscalarProgram p; + RandomX::Blake2Generator gen(seed, programCount); + RandomX::generateSuperscalar(p, gen); + RandomX::AssemblyGeneratorX86 
asmX86; + asmX86.generateAsm(p); + //std::ofstream file("lightProg2.asm"); + asmX86.printCode(std::cout); + return 0; + } if (genAsm) { if (softAes) @@ -252,6 +269,7 @@ int main(int argc, char** argv) { const uint64_t cacheSize = (RANDOMX_ARGON_MEMORY + RANDOMX_ARGON_GROWTH * epoch) * RandomX::ArgonBlockSize; const uint64_t datasetSize = (RANDOMX_DATASET_SIZE + RANDOMX_DS_GROWTH * epoch); dataset.cache.size = cacheSize; + RandomX::SuperscalarProgram programs[RANDOMX_CACHE_ACCESSES]; std::cout << "RandomX - " << (miningMode ? "mining" : "verification") << " mode" << std::endl; @@ -268,6 +286,12 @@ int main(int argc, char** argv) { outputHex(std::cout, (char*)dataset.cache.memory, sizeof(__m128i)); std::cout << std::endl; } + if (!legacy) { + RandomX::Blake2Generator gen(seed, programCount); + for (int i = 0; i < RANDOMX_CACHE_ACCESSES; ++i) { + RandomX::generateSuperscalar(programs[i], gen); + } + } if (!miningMode) { std::cout << "Cache (" << cacheSize << " bytes) initialized in " << sw.getElapsed() << " s" << std::endl; } @@ -276,19 +300,27 @@ int main(int argc, char** argv) { dataset.dataset.size = datasetSize; RandomX::datasetAlloc(dataset, largePages); const uint64_t datasetBlockCount = datasetSize / RandomX::CacheLineSize; - if (initThreadCount > 1) { - auto perThread = datasetBlockCount / initThreadCount; - auto remainder = datasetBlockCount % initThreadCount; - for (int i = 0; i < initThreadCount; ++i) { - auto count = perThread + (i == initThreadCount - 1 ? 
remainder : 0); - threads.push_back(std::thread(&RandomX::datasetInit, std::ref(cache), std::ref(dataset.dataset), i * perThread, count)); - } - for (unsigned i = 0; i < threads.size(); ++i) { - threads[i].join(); - } + if (!legacy) { + RandomX::JitCompilerX86 jit86; + jit86.generateSuperScalarHash(programs); + jit86.getDatasetInitFunc()(cache.memory, dataset.dataset.memory, 0, datasetBlockCount); + //dump((const char*)dataset.dataset.memory, RANDOMX_DATASET_SIZE, "dataset.dat"); } else { - RandomX::datasetInit(cache, dataset.dataset, 0, datasetBlockCount); + if (initThreadCount > 1) { + auto perThread = datasetBlockCount / initThreadCount; + auto remainder = datasetBlockCount % initThreadCount; + for (int i = 0; i < initThreadCount; ++i) { + auto count = perThread + (i == initThreadCount - 1 ? remainder : 0); + threads.push_back(std::thread(&RandomX::datasetInit, std::ref(cache), std::ref(dataset.dataset), i * perThread, count)); + } + for (unsigned i = 0; i < threads.size(); ++i) { + threads[i].join(); + } + } + else { + RandomX::datasetInit(cache, dataset.dataset, 0, datasetBlockCount); + } } RandomX::deallocCache(cache, largePages); threads.clear(); @@ -301,12 +333,16 @@ int main(int argc, char** argv) { vm = new RandomX::CompiledVirtualMachine(); } else { - if (jit) - vm = new RandomX::CompiledLightVirtualMachine(); + if (jit && !legacy) + vm = new RandomX::CompiledLightVirtualMachine(); + else if (jit) + vm = new RandomX::CompiledLightVirtualMachine(); + else if (!legacy) + vm = new RandomX::InterpretedVirtualMachine(softAes); else - vm = new RandomX::InterpretedVirtualMachine(softAes); + vm = new RandomX::InterpretedVirtualMachine(softAes); } - vm->setDataset(dataset, datasetSize); + vm->setDataset(dataset, datasetSize, programs); vms.push_back(vm); } uint8_t* scratchpadMem; @@ -340,8 +376,8 @@ int main(int argc, char** argv) { double elapsed = sw.getElapsed(); std::cout << "Calculated result: "; result.print(std::cout); - if(programCount == 1000) - 
std::cout << "Reference result: 83875c55fb9ff4a75205a744b82926ebbe23219c6291889c9ee91603c845c597" << std::endl; + if(!legacy && programCount == 1000) + std::cout << "Reference result: 4a74a376d490c8b41d42887e86d4addb5a95572e0c663d1e81aec928e4e094e1" << std::endl; if (!miningMode) { std::cout << "Performance: " << 1000 * elapsed / programCount << " ms per hash" << std::endl; } diff --git a/src/program.inc b/src/program.inc index 46d8093..97a8122 100644 --- a/src/program.inc +++ b/src/program.inc @@ -1,3 +1,5 @@ + mov ebx, 111 ; Start marker bytes + db 064h, 067h, 090h ; Start marker bytes randomx_isn_0: ; IROR_R r3, 30 ror r11, 30 @@ -1001,3 +1003,5 @@ randomx_isn_255: ; IROR_R r7, r3 mov ecx, r11d ror r15, cl + mov ebx, 222 ; End marker bytes + db 064h, 067h, 090h ; End marker bytes \ No newline at end of file diff --git a/src/superscalarGenerator.cpp b/src/superscalarGenerator.cpp new file mode 100644 index 0000000..d4fd32a --- /dev/null +++ b/src/superscalarGenerator.cpp @@ -0,0 +1,846 @@ +/* +Copyright (c) 2019 tevador + +This file is part of RandomX. + +RandomX is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +RandomX is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with RandomX. If not, see. 
+*/ + +#include +#include "configuration.h" +#include "Program.hpp" +#include "blake2/endian.h" +#include +#include +#include +#include +#include +#include "superscalarGenerator.hpp" + +namespace RandomX { + + static bool isMultiplication(int type) { + return type == SuperscalarInstructionType::IMUL_R || type == SuperscalarInstructionType::IMULH_R || type == SuperscalarInstructionType::ISMULH_R || type == SuperscalarInstructionType::IMUL_RCP; + } + + //uOPs (micro-ops) are represented only by the execution port they can go to + namespace ExecutionPort { + using type = int; + constexpr type Null = 0; + constexpr type P0 = 1; + constexpr type P1 = 2; + constexpr type P5 = 4; + constexpr type P01 = P0 | P1; + constexpr type P05 = P0 | P5; + constexpr type P015 = P0 | P1 | P5; + } + + //Macro-operation as output of the x86 decoder + //Usually one macro-op = one x86 instruction, but 2 instructions are sometimes fused into 1 macro-op + //Macro-op can consist of 1 or 2 uOPs. + class MacroOp { + public: + MacroOp(const char* name, int size) + : name_(name), size_(size), latency_(0), uop1_(ExecutionPort::Null), uop2_(ExecutionPort::Null) {} + MacroOp(const char* name, int size, int latency, ExecutionPort::type uop) + : name_(name), size_(size), latency_(latency), uop1_(uop), uop2_(ExecutionPort::Null) {} + MacroOp(const char* name, int size, int latency, ExecutionPort::type uop1, ExecutionPort::type uop2) + : name_(name), size_(size), latency_(latency), uop1_(uop1), uop2_(uop2) {} + MacroOp(const MacroOp& parent, bool dependent) + : name_(parent.name_), size_(parent.size_), latency_(parent.latency_), uop1_(parent.uop1_), uop2_(parent.uop2_), dependent_(dependent) {} + const char* getName() const { + return name_; + } + int getSize() const { + return size_; + } + int getLatency() const { + return latency_; + } + ExecutionPort::type getUop1() const { + return uop1_; + } + ExecutionPort::type getUop2() const { + return uop2_; + } + bool isSimple() const { + return uop2_ == 
ExecutionPort::Null; + } + bool isEliminated() const { + return uop1_ == ExecutionPort::Null; + } + bool isDependent() const { + return dependent_; + } + static const MacroOp Add_rr; + static const MacroOp Add_ri; + static const MacroOp Lea_sib; + static const MacroOp Sub_rr; + static const MacroOp Imul_rr; + static const MacroOp Imul_r; + static const MacroOp Mul_r; + static const MacroOp Mov_rr; + static const MacroOp Mov_ri64; + static const MacroOp Xor_rr; + static const MacroOp Xor_ri; + static const MacroOp Ror_rcl; + static const MacroOp Ror_ri; + static const MacroOp TestJz_fused; + static const MacroOp Xor_self; + static const MacroOp Cmp_ri; + static const MacroOp Setcc_r; + private: + const char* name_; + int size_; + int latency_; + ExecutionPort::type uop1_; + ExecutionPort::type uop2_; + bool dependent_ = false; + }; + + //Size: 3 bytes + const MacroOp MacroOp::Add_rr = MacroOp("add r,r", 3, 1, ExecutionPort::P015); + const MacroOp MacroOp::Sub_rr = MacroOp("sub r,r", 3, 1, ExecutionPort::P015); + const MacroOp MacroOp::Xor_rr = MacroOp("xor r,r", 3, 1, ExecutionPort::P015); + const MacroOp MacroOp::Imul_r = MacroOp("imul r", 3, 4, ExecutionPort::P1, ExecutionPort::P5); + const MacroOp MacroOp::Mul_r = MacroOp("mul r", 3, 3, ExecutionPort::P1, ExecutionPort::P5); + const MacroOp MacroOp::Mov_rr = MacroOp("mov r,r", 3); + + //Size: 4 bytes + const MacroOp MacroOp::Lea_sib = MacroOp("lea r,r+r*s", 4, 1, ExecutionPort::P01); + const MacroOp MacroOp::Imul_rr = MacroOp("imul r,r", 4, 3, ExecutionPort::P1); + const MacroOp MacroOp::Ror_ri = MacroOp("ror r,i", 4, 1, ExecutionPort::P05); + + //Size: 7 bytes (can be optionally padded with nop to 8 or 9 bytes) + const MacroOp MacroOp::Add_ri = MacroOp("add r,i", 7, 1, ExecutionPort::P015); + const MacroOp MacroOp::Xor_ri = MacroOp("xor r,i", 7, 1, ExecutionPort::P015); + + //Size: 10 bytes + const MacroOp MacroOp::Mov_ri64 = MacroOp("mov rax,i64", 10, 1, ExecutionPort::P015); + + //Unused: + const MacroOp 
MacroOp::Ror_rcl = MacroOp("ror r,cl", 3, 1, ExecutionPort::P0, ExecutionPort::P5); + const MacroOp MacroOp::Xor_self = MacroOp("xor rcx,rcx", 3); + const MacroOp MacroOp::Cmp_ri = MacroOp("cmp r,i", 7, 1, ExecutionPort::P015); + const MacroOp MacroOp::Setcc_r = MacroOp("setcc cl", 3, 1, ExecutionPort::P05); + const MacroOp MacroOp::TestJz_fused = MacroOp("testjz r,i", 13, 0, ExecutionPort::P5); + + const MacroOp IMULH_R_ops_array[] = { MacroOp::Mov_rr, MacroOp::Mul_r, MacroOp::Mov_rr }; + const MacroOp ISMULH_R_ops_array[] = { MacroOp::Mov_rr, MacroOp::Imul_r, MacroOp::Mov_rr }; + const MacroOp IMUL_RCP_ops_array[] = { MacroOp::Mov_ri64, MacroOp(MacroOp::Imul_rr, true) }; + + class SuperscalarInstructionInfo { + public: + const char* getName() const { + return name_; + } + int getSize() const { + return ops_.size(); + } + bool isSimple() const { + return getSize() == 1; + } + int getLatency() const { + return latency_; + } + const MacroOp& getOp(int index) const { + return ops_[index]; + } + int getType() const { + return type_; + } + int getResultOp() const { + return resultOp_; + } + int getDstOp() const { + return dstOp_; + } + int getSrcOp() const { + return srcOp_; + } + static const SuperscalarInstructionInfo ISUB_R; + static const SuperscalarInstructionInfo IXOR_R; + static const SuperscalarInstructionInfo IADD_RS; + static const SuperscalarInstructionInfo IMUL_R; + static const SuperscalarInstructionInfo IROR_C; + static const SuperscalarInstructionInfo IADD_C7; + static const SuperscalarInstructionInfo IXOR_C7; + static const SuperscalarInstructionInfo IADD_C8; + static const SuperscalarInstructionInfo IXOR_C8; + static const SuperscalarInstructionInfo IADD_C9; + static const SuperscalarInstructionInfo IXOR_C9; + static const SuperscalarInstructionInfo IMULH_R; + static const SuperscalarInstructionInfo ISMULH_R; + static const SuperscalarInstructionInfo IMUL_RCP; + static const SuperscalarInstructionInfo NOP; + private: + const char* name_; + int type_; + 
std::vector ops_; + int latency_; + int resultOp_ = 0; + int dstOp_ = 0; + int srcOp_; + + SuperscalarInstructionInfo(const char* name) + : name_(name), type_(-1), latency_(0) {} + SuperscalarInstructionInfo(const char* name, int type, const MacroOp& op, int srcOp) + : name_(name), type_(type), latency_(op.getLatency()), srcOp_(srcOp) { + ops_.push_back(MacroOp(op)); + } + template + SuperscalarInstructionInfo(const char* name, int type, const MacroOp(&arr)[N], int resultOp, int dstOp, int srcOp) + : name_(name), type_(type), latency_(0), resultOp_(resultOp), dstOp_(dstOp), srcOp_(srcOp) { + for (unsigned i = 0; i < N; ++i) { + ops_.push_back(MacroOp(arr[i])); + latency_ += ops_.back().getLatency(); + } + static_assert(N > 1, "Invalid array size"); + } + }; + + const SuperscalarInstructionInfo SuperscalarInstructionInfo::ISUB_R = SuperscalarInstructionInfo("ISUB_R", SuperscalarInstructionType::ISUB_R, MacroOp::Sub_rr, 0); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IXOR_R = SuperscalarInstructionInfo("IXOR_R", SuperscalarInstructionType::IXOR_R, MacroOp::Xor_rr, 0); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IADD_RS = SuperscalarInstructionInfo("IADD_RS", SuperscalarInstructionType::IADD_RS, MacroOp::Lea_sib, 0); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IMUL_R = SuperscalarInstructionInfo("IMUL_R", SuperscalarInstructionType::IMUL_R, MacroOp::Imul_rr, 0); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IROR_C = SuperscalarInstructionInfo("IROR_C", SuperscalarInstructionType::IROR_C, MacroOp::Ror_ri, -1); + + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IADD_C7 = SuperscalarInstructionInfo("IADD_C7", SuperscalarInstructionType::IADD_C7, MacroOp::Add_ri, -1); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IXOR_C7 = SuperscalarInstructionInfo("IXOR_C7", SuperscalarInstructionType::IXOR_C7, MacroOp::Xor_ri, -1); + const SuperscalarInstructionInfo 
SuperscalarInstructionInfo::IADD_C8 = SuperscalarInstructionInfo("IADD_C8", SuperscalarInstructionType::IADD_C8, MacroOp::Add_ri, -1); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IXOR_C8 = SuperscalarInstructionInfo("IXOR_C8", SuperscalarInstructionType::IXOR_C8, MacroOp::Xor_ri, -1); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IADD_C9 = SuperscalarInstructionInfo("IADD_C9", SuperscalarInstructionType::IADD_C9, MacroOp::Add_ri, -1); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IXOR_C9 = SuperscalarInstructionInfo("IXOR_C9", SuperscalarInstructionType::IXOR_C9, MacroOp::Xor_ri, -1); + + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IMULH_R = SuperscalarInstructionInfo("IMULH_R", SuperscalarInstructionType::IMULH_R, IMULH_R_ops_array, 1, 0, 1); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::ISMULH_R = SuperscalarInstructionInfo("ISMULH_R", SuperscalarInstructionType::ISMULH_R, ISMULH_R_ops_array, 1, 0, 1); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IMUL_RCP = SuperscalarInstructionInfo("IMUL_RCP", SuperscalarInstructionType::IMUL_RCP, IMUL_RCP_ops_array, 1, 1, -1); + + const SuperscalarInstructionInfo SuperscalarInstructionInfo::NOP = SuperscalarInstructionInfo("NOP"); + + //these are some of the options how to split a 16-byte window into 3 or 4 x86 instructions. + //RandomX uses instructions with a native size of 3 (sub, xor, mul, mov), 4 (lea, mul), 7 (xor, add immediate) or 10 bytes (mov 64-bit immediate). + //Slots with sizes of 8 or 9 bytes need to be padded with a nop instruction. 
+ const int buffer0[] = { 4, 8, 4 }; + const int buffer1[] = { 7, 3, 3, 3 }; + const int buffer2[] = { 3, 7, 3, 3 }; + const int buffer3[] = { 4, 9, 3 }; + const int buffer4[] = { 4, 4, 4, 4 }; + const int buffer5[] = { 3, 3, 10 }; + + class DecoderBuffer { + public: + static const DecoderBuffer Default; + template<size_t N> + DecoderBuffer(const char* name, int index, const int(&arr)[N]) + : name_(name), index_(index), counts_(arr), opsCount_(N) {} + const int* getCounts() const { + return counts_; + } + int getSize() const { + return opsCount_; + } + int getIndex() const { + return index_; + } + const char* getName() const { + return name_; + } + const DecoderBuffer* fetchNext(int instrType, int cycle, int mulCount, Blake2Generator& gen) const { + //If the current RandomX instruction is "IMULH", the next fetch configuration must be 3-3-10 + //because the full 128-bit multiplication instruction is 3 bytes long and decodes to 2 uOPs on Intel CPUs. + //Intel CPUs can decode at most 4 uOPs per cycle, so this requires a 2-1-1 configuration for a total of 3 macro ops. + if (instrType == SuperscalarInstructionType::IMULH_R || instrType == SuperscalarInstructionType::ISMULH_R) + return &decodeBuffer3310; + + //To make sure that the multiplication port is saturated, a 4-4-4-4 configuration is generated if the number of multiplications + //is lower than the number of cycles. + if (mulCount < cycle + 1) + return &decodeBuffer4444; + + //If the current RandomX instruction is "IMUL_RCP", the next buffer must begin with a 4-byte slot for multiplication. + if(instrType == SuperscalarInstructionType::IMUL_RCP) + return (gen.getByte() & 1) ? &decodeBuffer484 : &decodeBuffer493; + + //Default: select a random fetch configuration. 
+ return fetchNextDefault(gen); + } + private: + const char* name_; + int index_; + const int* counts_; + int opsCount_; + DecoderBuffer() : index_(-1) {} + static const DecoderBuffer decodeBuffer484; + static const DecoderBuffer decodeBuffer7333; + static const DecoderBuffer decodeBuffer3733; + static const DecoderBuffer decodeBuffer493; + static const DecoderBuffer decodeBuffer4444; + static const DecoderBuffer decodeBuffer3310; + static const DecoderBuffer* decodeBuffers[4]; + const DecoderBuffer* fetchNextDefault(Blake2Generator& gen) const { + return decodeBuffers[gen.getByte() & 3]; + } + }; + + const DecoderBuffer DecoderBuffer::decodeBuffer484 = DecoderBuffer("4,8,4", 0, buffer0); + const DecoderBuffer DecoderBuffer::decodeBuffer7333 = DecoderBuffer("7,3,3,3", 1, buffer1); + const DecoderBuffer DecoderBuffer::decodeBuffer3733 = DecoderBuffer("3,7,3,3", 2, buffer2); + const DecoderBuffer DecoderBuffer::decodeBuffer493 = DecoderBuffer("4,9,3", 3, buffer3); + const DecoderBuffer DecoderBuffer::decodeBuffer4444 = DecoderBuffer("4,4,4,4", 4, buffer4); + const DecoderBuffer DecoderBuffer::decodeBuffer3310 = DecoderBuffer("3,3,10", 5, buffer5); + + const DecoderBuffer* DecoderBuffer::decodeBuffers[4] = { + &DecoderBuffer::decodeBuffer484, + &DecoderBuffer::decodeBuffer7333, + &DecoderBuffer::decodeBuffer3733, + &DecoderBuffer::decodeBuffer493, + }; + + const DecoderBuffer DecoderBuffer::Default = DecoderBuffer(); + + const SuperscalarInstructionInfo* slot_3[] = { &SuperscalarInstructionInfo::ISUB_R, &SuperscalarInstructionInfo::IXOR_R }; + const SuperscalarInstructionInfo* slot_3L[] = { &SuperscalarInstructionInfo::ISUB_R, &SuperscalarInstructionInfo::IXOR_R, &SuperscalarInstructionInfo::IMULH_R, &SuperscalarInstructionInfo::ISMULH_R }; + const SuperscalarInstructionInfo* slot_4[] = { &SuperscalarInstructionInfo::IROR_C, &SuperscalarInstructionInfo::IADD_RS }; + const SuperscalarInstructionInfo* slot_7[] = { &SuperscalarInstructionInfo::IXOR_C7, 
&SuperscalarInstructionInfo::IADD_C7 }; + const SuperscalarInstructionInfo* slot_8[] = { &SuperscalarInstructionInfo::IXOR_C8, &SuperscalarInstructionInfo::IADD_C8 }; + const SuperscalarInstructionInfo* slot_9[] = { &SuperscalarInstructionInfo::IXOR_C9, &SuperscalarInstructionInfo::IADD_C9 }; + const SuperscalarInstructionInfo* slot_10 = &SuperscalarInstructionInfo::IMUL_RCP; + + static bool selectRegister(std::vector<int>& availableRegisters, Blake2Generator& gen, int& reg) { + int index; + if (availableRegisters.size() == 0) + return false; + + if (availableRegisters.size() > 1) { + index = gen.getInt32() % availableRegisters.size(); + } + else { + index = 0; + } + reg = availableRegisters[index]; + return true; + } + + class RegisterInfo { + public: + RegisterInfo() : latency(0), lastOpGroup(-1), lastOpPar(-1), value(0) {} + int latency; + int lastOpGroup; + int lastOpPar; + int value; + }; + + //"SuperscalarInstruction" consists of one or more macro-ops + class SuperscalarInstruction { + public: + void toInstr(Instruction& instr) { //translate to a RandomX instruction format + instr.opcode = getType(); + instr.dst = dst_; + instr.src = src_ >= 0 ? 
src_ : dst_; + instr.mod = mod_; + instr.setImm32(imm32_); + } + + void createForSlot(Blake2Generator& gen, int slotSize, int fetchType, bool isLast, bool isFirst) { + switch (slotSize) + { + case 3: + //if this is the last slot, we can also select "IMULH" instructions + if (isLast) { + create(slot_3L[gen.getByte() & 3], gen); + } + else { + create(slot_3[gen.getByte() & 1], gen); + } + break; + case 4: + //if this is the 4-4-4-4 buffer, issue multiplications as the first 3 instructions + if (fetchType == 4 && !isLast) { + create(&SuperscalarInstructionInfo::IMUL_R, gen); + } + else { + create(slot_4[gen.getByte() & 1], gen); + } + break; + case 7: + create(slot_7[gen.getByte() & 1], gen); + break; + case 8: + create(slot_8[gen.getByte() & 1], gen); + break; + case 9: + create(slot_9[gen.getByte() & 1], gen); + break; + case 10: + create(slot_10, gen); + break; + default: + UNREACHABLE; + } + } + + void create(const SuperscalarInstructionInfo* info, Blake2Generator& gen) { + info_ = info; + reset(); + switch (info->getType()) + { + case SuperscalarInstructionType::ISUB_R: { + mod_ = 0; + imm32_ = 0; + opGroup_ = SuperscalarInstructionType::IADD_RS; + groupParIsSource_ = true; + } break; + + case SuperscalarInstructionType::IXOR_R: { + mod_ = 0; + imm32_ = 0; + opGroup_ = SuperscalarInstructionType::IXOR_R; + groupParIsSource_ = true; + } break; + + case SuperscalarInstructionType::IADD_RS: { + mod_ = gen.getByte(); + imm32_ = 0; + opGroup_ = SuperscalarInstructionType::IADD_RS; + groupParIsSource_ = true; + } break; + + case SuperscalarInstructionType::IMUL_R: { + mod_ = 0; + imm32_ = 0; + opGroup_ = SuperscalarInstructionType::IMUL_R; + groupParIsSource_ = true; + } break; + + case SuperscalarInstructionType::IROR_C: { + mod_ = 0; + do { + imm32_ = gen.getByte() & 63; + } while (imm32_ == 0); + opGroup_ = SuperscalarInstructionType::IROR_C; + opGroupPar_ = -1; + } break; + + case SuperscalarInstructionType::IADD_C7: + case SuperscalarInstructionType::IADD_C8: + 
case SuperscalarInstructionType::IADD_C9: { + mod_ = 0; + imm32_ = gen.getInt32(); + opGroup_ = SuperscalarInstructionType::IADD_C7; + opGroupPar_ = -1; + } break; + + case SuperscalarInstructionType::IXOR_C7: + case SuperscalarInstructionType::IXOR_C8: + case SuperscalarInstructionType::IXOR_C9: { + mod_ = 0; + imm32_ = gen.getInt32(); + opGroup_ = SuperscalarInstructionType::IXOR_C7; + opGroupPar_ = -1; + } break; + + case SuperscalarInstructionType::IMULH_R: { + canReuse_ = true; + mod_ = 0; + imm32_ = 0; + opGroup_ = SuperscalarInstructionType::IMULH_R; + opGroupPar_ = gen.getInt32(); + } break; + + case SuperscalarInstructionType::ISMULH_R: { + canReuse_ = true; + mod_ = 0; + imm32_ = 0; + opGroup_ = SuperscalarInstructionType::ISMULH_R; + opGroupPar_ = gen.getInt32(); + } break; + + case SuperscalarInstructionType::IMUL_RCP: { + mod_ = 0; + do { + imm32_ = gen.getInt32(); + } while ((imm32_ & (imm32_ - 1)) == 0); + opGroup_ = SuperscalarInstructionType::IMUL_RCP; + opGroupPar_ = -1; + } break; + + default: + break; + } + } + + bool selectDestination(int cycle, bool allowChainedMul, RegisterInfo (&registers)[8], Blake2Generator& gen) { + /*if (allowChainedMultiplication && opGroup_ == SuperscalarInstructionType::IMUL_R) + std::cout << "Selecting destination with chained MUL enabled" << std::endl;*/ + std::vector<int> availableRegisters; + //Conditions for the destination register: + // * value must be ready at the required cycle + // * cannot be the same as the source register unless the instruction allows it + // - this avoids optimizable instructions such as "xor r, r" or "sub r, r" + // * register cannot be multiplied twice in a row unless allowChainedMul is true + // - this avoids accumulation of trailing zeroes in registers due to excessive multiplication + // - allowChainedMul is set to true if an attempt to find source/destination registers failed (this is quite rare, but prevents a catastrophic failure of the generator) + // * either the last instruction 
applied to the register or its source must be different than this instruction + // - this avoids optimizable instruction sequences such as "xor r1, r2; xor r1, r2" or "ror r, C1; ror r, C2" or "add r, C1; add r, C2" + // * register r5 cannot be the destination of the IADD_RS instruction (limitation of the x86 lea instruction) + for (unsigned i = 0; i < 8; ++i) { + if (registers[i].latency <= cycle && (canReuse_ || i != src_) && (allowChainedMul || opGroup_ != SuperscalarInstructionType::IMUL_R || registers[i].lastOpGroup != SuperscalarInstructionType::IMUL_R) && (registers[i].lastOpGroup != opGroup_ || registers[i].lastOpPar != opGroupPar_) && (info_->getType() != SuperscalarInstructionType::IADD_RS || i != LimitedAddressRegister)) + availableRegisters.push_back(i); + } + return selectRegister(availableRegisters, gen, dst_); + } + + bool selectSource(int cycle, RegisterInfo(&registers)[8], Blake2Generator& gen) { + std::vector<int> availableRegisters; + //all registers that are ready at the cycle + for (unsigned i = 0; i < 8; ++i) { + if (registers[i].latency <= cycle) + availableRegisters.push_back(i); + } + //if there are only 2 available registers for IADD_RS and one of them is r5, select it as the source because it cannot be the destination + if (availableRegisters.size() == 2 && info_->getType() == SuperscalarInstructionType::IADD_RS) { + if (availableRegisters[0] == LimitedAddressRegister || availableRegisters[1] == LimitedAddressRegister) { + opGroupPar_ = src_ = LimitedAddressRegister; + return true; + } + } + if (selectRegister(availableRegisters, gen, src_)) { + if (groupParIsSource_) + opGroupPar_ = src_; + return true; + } + return false; + } + + int getType() { + return info_->getType(); + } + int getSource() { + return src_; + } + int getDestination() { + return dst_; + } + int getGroup() { + return opGroup_; + } + int getGroupPar() { + return opGroupPar_; + } + + const SuperscalarInstructionInfo& getInfo() const { + return *info_; + } + + static const 
SuperscalarInstruction Null; + + private: + const SuperscalarInstructionInfo* info_; + int src_ = -1; + int dst_ = -1; + int mod_; + uint32_t imm32_; + int opGroup_; + int opGroupPar_; + bool canReuse_ = false; + bool groupParIsSource_ = false; + + void reset() { + src_ = dst_ = -1; + canReuse_ = groupParIsSource_ = false; + } + + SuperscalarInstruction(const SuperscalarInstructionInfo* info) : info_(info) { + } + }; + + const SuperscalarInstruction SuperscalarInstruction::Null = SuperscalarInstruction(&SuperscalarInstructionInfo::NOP); + + constexpr int CYCLE_MAP_SIZE = RANDOMX_SUPERSCALAR_LATENCY + 4; + constexpr int LOOK_FORWARD_CYCLES = 4; + constexpr int MAX_THROWAWAY_COUNT = 256; + +#ifndef _DEBUG + constexpr bool TRACE = false; + constexpr bool INFO = false; +#else + constexpr bool TRACE = true; + constexpr bool INFO = true; +#endif + + template + static int scheduleUop(ExecutionPort::type uop, ExecutionPort::type(&portBusy)[CYCLE_MAP_SIZE][3], int cycle) { + //The scheduling here is done optimistically by checking port availability in order P5 -> P0 -> P1 to not overload + //port P1 (multiplication) by instructions that can go to any port. 
+ for (; cycle < CYCLE_MAP_SIZE; ++cycle) { + if ((uop & ExecutionPort::P5) != 0 && !portBusy[cycle][2]) { + if (commit) { + if (TRACE) std::cout << "; P5 at cycle " << cycle << std::endl; + portBusy[cycle][2] = uop; + } + return cycle; + } + if ((uop & ExecutionPort::P0) != 0 && !portBusy[cycle][0]) { + if (commit) { + if (TRACE) std::cout << "; P0 at cycle " << cycle << std::endl; + portBusy[cycle][0] = uop; + } + return cycle; + } + if ((uop & ExecutionPort::P1) != 0 && !portBusy[cycle][1]) { + if (commit) { + if (TRACE) std::cout << "; P1 at cycle " << cycle << std::endl; + portBusy[cycle][1] = uop; + } + return cycle; + } + } + return -1; + } + + template + static int scheduleMop(const MacroOp& mop, ExecutionPort::type(&portBusy)[CYCLE_MAP_SIZE][3], int cycle, int depCycle) { + //if this macro-op depends on the previous one, increase the starting cycle if needed + //this handles an explicit dependency chain in IMUL_RCP + if (mop.isDependent()) { + cycle = std::max(cycle, depCycle); + } + //move instructions are eliminated and don't need an execution unit + if (mop.isEliminated()) { + if (commit) + if (TRACE) std::cout << "; (eliminated)" << std::endl; + return cycle; + } + else if (mop.isSimple()) { + //this macro-op has only one uOP + return scheduleUop(mop.getUop1(), portBusy, cycle); + } + else { + //macro-ops with 2 uOPs are scheduled conservatively by requiring both uOPs to execute in the same cycle + for (; cycle < CYCLE_MAP_SIZE; ++cycle) { + + int cycle1 = scheduleUop(mop.getUop1(), portBusy, cycle); + int cycle2 = scheduleUop(mop.getUop2(), portBusy, cycle); + + if (cycle1 == cycle2) { + if (commit) { + scheduleUop(mop.getUop1(), portBusy, cycle1); + scheduleUop(mop.getUop2(), portBusy, cycle2); + } + return cycle1; + } + } + } + + return -1; + } + + void generateSuperscalar(SuperscalarProgram& prog, Blake2Generator& gen) { + + ExecutionPort::type portBusy[CYCLE_MAP_SIZE][3]; + memset(portBusy, 0, sizeof(portBusy)); + RegisterInfo registers[8]; + + 
const DecoderBuffer* decodeBuffer = &DecoderBuffer::Default; + SuperscalarInstruction currentInstruction = SuperscalarInstruction::Null; + int macroOpIndex = 0; + int codeSize = 0; + int macroOpCount = 0; + int cycle = 0; + int depCycle = 0; + int retireCycle = 0; + bool portsSaturated = false; + int programSize = 0; + int mulCount = 0; + int decodeCycle; + int throwAwayCount = 0; + + //decode instructions for RANDOMX_SUPERSCALAR_LATENCY cycles or until an execution port is saturated. + //Each decode cycle decodes 16 bytes of x86 code. + //Since a decode cycle produces on average 3.45 macro-ops and there are only 3 ALU ports, execution ports are always + //saturated first. The cycle limit is present only to guarantee loop termination. + //Program size is limited to RANDOMX_SUPERSCALAR_MAX_SIZE instructions. + for (decodeCycle = 0; decodeCycle < RANDOMX_SUPERSCALAR_LATENCY && !portsSaturated && programSize < RANDOMX_SUPERSCALAR_MAX_SIZE; ++decodeCycle) { + + //select a decode configuration + decodeBuffer = decodeBuffer->fetchNext(currentInstruction.getType(), decodeCycle, mulCount, gen); + if (TRACE) std::cout << "; ------------- fetch cycle " << cycle << " (" << decodeBuffer->getName() << ")" << std::endl; + + int bufferIndex = 0; + + //fill all instruction slots in the current decode buffer + while (bufferIndex < decodeBuffer->getSize()) { + int topCycle = cycle; + + //if we have issued all macro-ops for the current RandomX instruction, create a new instruction + if (macroOpIndex >= currentInstruction.getInfo().getSize()) { + if (portsSaturated) + break; + //select an instruction so that the first macro-op fits into the current slot + currentInstruction.createForSlot(gen, decodeBuffer->getCounts()[bufferIndex], decodeBuffer->getIndex(), decodeBuffer->getSize() == bufferIndex + 1, bufferIndex == 0); + macroOpIndex = 0; + if (TRACE) std::cout << "; " << currentInstruction.getInfo().getName() << std::endl; + } + const MacroOp& mop = 
currentInstruction.getInfo().getOp(macroOpIndex); + if (TRACE) std::cout << mop.getName() << " "; + + //calculate the earliest cycle when this macro-op (all of its uOPs) can be scheduled for execution + int scheduleCycle = scheduleMop(mop, portBusy, cycle, depCycle); + if (scheduleCycle < 0) { + /*if (TRACE)*/ std::cout << "Unable to map operation '" << mop.getName() << "' to execution port (cycle " << cycle << ")" << std::endl; + //__debugbreak(); + portsSaturated = true; + break; + } + + //find a source register (if applicable) that will be ready when this instruction executes + if (macroOpIndex == currentInstruction.getInfo().getSrcOp()) { + int forward; + //if no suitable operand is ready, look up to LOOK_FORWARD_CYCLES forward + for (forward = 0; forward < LOOK_FORWARD_CYCLES && !currentInstruction.selectSource(scheduleCycle, registers, gen); ++forward) { + if (TRACE) std::cout << "; src STALL at cycle " << cycle << std::endl; + ++scheduleCycle; + ++cycle; + } + //if no register was found, throw the instruction away and try another one + if (forward == LOOK_FORWARD_CYCLES) { + if (throwAwayCount < MAX_THROWAWAY_COUNT) { + throwAwayCount++; + macroOpIndex = currentInstruction.getInfo().getSize(); + if (TRACE) std::cout << "; THROW away " << currentInstruction.getInfo().getName() << std::endl; + //cycle = topCycle; + continue; + } + //abort this decode buffer + /*if (TRACE)*/ std::cout << "Aborting at cycle " << cycle << " with decode buffer " << decodeBuffer->getName() << " - source registers not available for operation " << currentInstruction.getInfo().getName() << std::endl; + currentInstruction = SuperscalarInstruction::Null; + break; + } + if (TRACE) std::cout << "; src = r" << currentInstruction.getSource() << std::endl; + } + //find a destination register that will be ready when this instruction executes + if (macroOpIndex == currentInstruction.getInfo().getDstOp()) { + int forward; + for (forward = 0; forward < LOOK_FORWARD_CYCLES && 
!currentInstruction.selectDestination(scheduleCycle, throwAwayCount > 0, registers, gen); ++forward) { + if (TRACE) std::cout << "; dst STALL at cycle " << cycle << std::endl; + ++scheduleCycle; + ++cycle; + } + if (forward == LOOK_FORWARD_CYCLES) { //throw instruction away + if (throwAwayCount < MAX_THROWAWAY_COUNT) { + throwAwayCount++; + macroOpIndex = currentInstruction.getInfo().getSize(); + if (TRACE) std::cout << "; THROW away " << currentInstruction.getInfo().getName() << std::endl; + //cycle = topCycle; + continue; + } + //abort this decode buffer + /*if (TRACE)*/ std::cout << "Aborting at cycle " << cycle << " with decode buffer " << decodeBuffer->getName() << " - destination registers not available" << std::endl; + currentInstruction = SuperscalarInstruction::Null; + break; + } + if (TRACE) std::cout << "; dst = r" << currentInstruction.getDestination() << std::endl; + } + throwAwayCount = 0; + + //recalculate when the instruction can be scheduled for execution based on operand availability + scheduleCycle = scheduleMop(mop, portBusy, scheduleCycle, scheduleCycle); + + //calculate when the result will be ready + depCycle = scheduleCycle + mop.getLatency(); + + //if this instruction writes the result, modify register information + // RegisterInfo.latency - which cycle the register will be ready + // RegisterInfo.lastOpGroup - the last operation that was applied to the register + // RegisterInfo.lastOpPar - the last operation source value (-1 = constant, 0-7 = register) + if (macroOpIndex == currentInstruction.getInfo().getResultOp()) { + int dst = currentInstruction.getDestination(); + RegisterInfo& ri = registers[dst]; + retireCycle = depCycle; + ri.latency = retireCycle; + ri.lastOpGroup = currentInstruction.getGroup(); + ri.lastOpPar = currentInstruction.getGroupPar(); + if (TRACE) std::cout << "; RETIRED at cycle " << retireCycle << std::endl; + } + codeSize += mop.getSize(); + bufferIndex++; + macroOpIndex++; + macroOpCount++; + + //terminating 
condition + if (scheduleCycle >= RANDOMX_SUPERSCALAR_LATENCY) { + portsSaturated = true; + } + cycle = topCycle; + + //when all macro-ops of the current instruction have been issued, add the instruction into the program + if (macroOpIndex >= currentInstruction.getInfo().getSize()) { + currentInstruction.toInstr(prog(programSize++)); + mulCount += isMultiplication(currentInstruction.getType()); + } + } + ++cycle; + } + + double ipc = (macroOpCount / (double)retireCycle); + + memset(prog.asicLatencies, 0, sizeof(prog.asicLatencies)); + + //Calculate ASIC latency: + //Assumes 1 cycle latency for all operations and unlimited parallelization. + for (int i = 0; i < programSize; ++i) { + Instruction& instr = prog(i); + int latDst = prog.asicLatencies[instr.dst] + 1; + int latSrc = instr.dst != instr.src ? prog.asicLatencies[instr.src] + 1 : 0; + prog.asicLatencies[instr.dst] = std::max(latDst, latSrc); + } + + //address register is the register with the highest ASIC latency + int asicLatencyMax = 0; + int addressReg = 0; + for (int i = 0; i < 8; ++i) { + if (prog.asicLatencies[i] > asicLatencyMax) { + asicLatencyMax = prog.asicLatencies[i]; + addressReg = i; + } + prog.cpuLatencies[i] = registers[i].latency; + } + + prog.setSize(programSize); + prog.setAddressRegister(addressReg); + + prog.cpuLatency = retireCycle; + prog.asicLatency = asicLatencyMax; + prog.codeSize = codeSize; + prog.macroOps = macroOpCount; + prog.decodeCycles = decodeCycle; + prog.ipc = ipc; + prog.mulCount = mulCount; + + + /*if(INFO) std::cout << "; ALU port utilization:" << std::endl; + if (INFO) std::cout << "; (* = in use, _ = idle)" << std::endl; + + int portCycles = 0; + for (int i = 0; i < CYCLE_MAP_SIZE; ++i) { + std::cout << "; " << std::setw(3) << i << " "; + for (int j = 0; j < 3; ++j) { + std::cout << (portBusy[i][j] ? 
'*' : '_'); + portCycles += !!portBusy[i][j]; + } + std::cout << std::endl; + }*/ + } +} \ No newline at end of file diff --git a/src/superscalarGenerator.hpp b/src/superscalarGenerator.hpp new file mode 100644 index 0000000..a64e80d --- /dev/null +++ b/src/superscalarGenerator.hpp @@ -0,0 +1,47 @@ +/* +Copyright (c) 2019 tevador + +This file is part of RandomX. + +RandomX is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +RandomX is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with RandomX. If not, see. +*/ + +#pragma once +#include "Program.hpp" +#include "Blake2Generator.hpp" + +namespace RandomX { + // Intel Ivy Bridge reference + namespace SuperscalarInstructionType { //uOPs (decode) execution ports latency code size + constexpr int ISUB_R = 0; //1 p015 1 3 (sub) + constexpr int IXOR_R = 1; //1 p015 1 3 (xor) + constexpr int IADD_RS = 2; //1 p01 1 4 (lea) + constexpr int IMUL_R = 3; //1 p1 3 4 (imul) + constexpr int IROR_C = 4; //1 p05 1 4 (ror) + constexpr int IADD_C7 = 5; //1 p015 1 7 (add) + constexpr int IXOR_C7 = 6; //1 p015 1 7 (xor) + constexpr int IADD_C8 = 7; //1+0 p015 1 7+1 (add+nop) + constexpr int IXOR_C8 = 8; //1+0 p015 1 7+1 (xor+nop) + constexpr int IADD_C9 = 9; //1+0 p015 1 7+2 (add+nop) + constexpr int IXOR_C9 = 10; //1+0 p015 1 7+2 (xor+nop) + constexpr int IMULH_R = 11; //1+2+1 0+(p1,p5)+0 3 3+3+3 (mov+mul+mov) + constexpr int ISMULH_R = 12; //1+2+1 0+(p1,p5)+0 3 3+3+3 (mov+imul+mov) + constexpr int IMUL_RCP = 13; //1+1 p015+p1 4 10+4 (mov+imul) + + constexpr int COUNT = 14; + constexpr int INVALID = -1; + } + + 
void generateSuperscalar(SuperscalarProgram& prog, Blake2Generator& gen); +} \ No newline at end of file diff --git a/src/tests/superscalar-avalanche.cpp b/src/tests/superscalar-avalanche.cpp new file mode 100644 index 0000000..9fa1613 --- /dev/null +++ b/src/tests/superscalar-avalanche.cpp @@ -0,0 +1,69 @@ +/* +Copyright (c) 2019 tevador + +This file is part of RandomX. + +RandomX is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +RandomX is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with RandomX. If not, see. +*/ + +#include +#include +#include +#include "../superscalarGenerator.hpp" +#include "../InterpretedVirtualMachine.hpp" +#include "../intrinPortable.h" +#include "../Blake2Generator.hpp" + +const uint8_t seed[32] = { 191, 182, 222, 175, 249, 89, 134, 104, 241, 68, 191, 62, 162, 166, 61, 64, 123, 191, 227, 193, 118, 60, 188, 53, 223, 133, 175, 24, 123, 230, 55, 74 }; + +int main() { + + int insensitiveProgCount[64] = { 0 }; + std::vector dummy; + for (int bit = 0; bit < 64; ++bit) { + for (int i = 0; i < 10000; ++i) { + uint64_t ra[8] = { + 6364136223846793005ULL, + 9298410992540426048ULL, + 12065312585734608966ULL, + 9306329213124610396ULL, + 5281919268842080866ULL, + 10536153434571861004ULL, + 3398623926847679864ULL, + 9549104520008361294ULL, + }; + uint64_t rb[8]; + memcpy(rb, ra, sizeof rb); + rb[0] ^= (1ULL << bit); + RandomX::SuperscalarProgram p; + RandomX::Blake2Generator gen(seed, i); + RandomX::generateSuperscalar(p, gen); + RandomX::InterpretedVirtualMachine::executeSuperscalar(ra, p, dummy); + 
RandomX::InterpretedVirtualMachine::executeSuperscalar(rb, p, dummy); + uint64_t diff = 0; + for (int j = 0; j < 8; ++j) { + diff += __popcnt64(ra[j] ^ rb[j]); + } + if (diff < 192 || diff > 320) { + std::cout << "Seed: " << i << " diff = " << diff << std::endl; + insensitiveProgCount[bit]++; + } + } + } + for (int bit = 0; bit < 64; ++bit) { + std::cout << bit << " " << insensitiveProgCount[bit] << std::endl; + } + + return 0; +} \ No newline at end of file diff --git a/src/tests/superscalar-init.cpp b/src/tests/superscalar-init.cpp new file mode 100644 index 0000000..a7c1208 --- /dev/null +++ b/src/tests/superscalar-init.cpp @@ -0,0 +1,78 @@ +/* +Copyright (c) 2019 tevador + +This file is part of RandomX. + +RandomX is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +RandomX is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with RandomX. If not, see. 
+*/ + +#include +#include +#include +#include +#include "../superscalarGenerator.hpp" +#include "../InterpretedVirtualMachine.hpp" +#include "../intrinPortable.h" +#include "../configuration.h" + +const uint8_t seed[32] = { 191, 182, 222, 175, 249, 89, 134, 104, 241, 68, 191, 62, 162, 166, 61, 64, 123, 191, 227, 193, 118, 60, 188, 53, 223, 133, 175, 24, 123, 230, 55, 74 }; + +int main() { + std::cout << "THIS PROGRAM REQUIRES MORE THAN 10 GB OF RAM TO COMPLETE" << std::endl; + std::vector dummy; + constexpr uint64_t superscalarMul0 = 6364136223846793005ULL; + constexpr uint64_t superscalarAdd1 = 9298410992540426748ULL; //9298410992540426048ULL + constexpr uint64_t superscalarAdd2 = 12065312585734608966ULL; + constexpr uint64_t superscalarAdd3 = 9306329213124610396ULL; + constexpr uint64_t superscalarAdd4 = 5281919268842080866ULL; + constexpr uint64_t superscalarAdd5 = 10536153434571861004ULL; + constexpr uint64_t superscalarAdd6 = 3398623926847679864ULL; + constexpr uint64_t superscalarAdd7 = 9549104520008361294ULL; + constexpr uint32_t totalBlocks = RANDOMX_DATASET_SIZE / RandomX::CacheLineSize; + std::unordered_set registerValues; + registerValues.reserve(totalBlocks); + registerValues.rehash(totalBlocks); + int collisionCount[9] = { 0 }; + for (uint32_t blockNumber = 0; blockNumber < totalBlocks; ++blockNumber) { + uint64_t rl[8]; + rl[0] = (blockNumber + 1) * superscalarMul0; + rl[1] = rl[0] ^ superscalarAdd1; + rl[2] = rl[0] ^ superscalarAdd2; + rl[3] = rl[0] ^ superscalarAdd3; + rl[4] = rl[0] ^ superscalarAdd4; + rl[5] = rl[0] ^ superscalarAdd5; + rl[6] = rl[0] ^ superscalarAdd6; + rl[7] = rl[0] ^ superscalarAdd7; + int blockCollisions = 0; + for (int i = 0; i < 8; ++i) { + uint64_t reducedValue = rl[i] & 0x3FFFFFFFFFFFF8; //bits 3-53 only + if (registerValues.find(reducedValue) != registerValues.end()) { + blockCollisions++; + std::cout << "Block " << blockNumber << ": collision of register r" << i << std::endl; + } + else { + 
registerValues.insert(reducedValue); + } + } + collisionCount[blockCollisions]++; + if ((blockNumber % (320 * 1024)) == 0) + std::cout << "Block " << blockNumber << " processed" << std::endl; + } + + for (int i = 0; i < 9; ++i) { + std::cout << i << " register(s) collide in " << collisionCount[i] << " blocks" << std::endl; + } + + return 0; +} \ No newline at end of file diff --git a/src/variant4_random_math.h b/src/variant4_random_math.h new file mode 100644 index 0000000..3ae1841 --- /dev/null +++ b/src/variant4_random_math.h @@ -0,0 +1,441 @@ +#ifndef VARIANT4_RANDOM_MATH_H +#define VARIANT4_RANDOM_MATH_H + +// Register size can be configured to either 32 bit (uint32_t) or 64 bit (uint64_t) +typedef uint32_t v4_reg; + +enum V4_Settings +{ + // Generate code with minimal theoretical latency = 45 cycles, which is equivalent to 15 multiplications + TOTAL_LATENCY = 15 * 3, + + // Always generate at least 60 instructions + NUM_INSTRUCTIONS_MIN = 60, + + // Never generate more than 70 instructions (final RET instruction doesn't count here) + NUM_INSTRUCTIONS_MAX = 70, + + // Available ALUs for MUL + // Modern CPUs typically have only 1 ALU which can do multiplications + ALU_COUNT_MUL = 1, + + // Total available ALUs + // Modern CPUs have 4 ALUs, but we use only 3 because random math executes together with other main loop code + ALU_COUNT = 3, +}; + +enum V4_InstructionList +{ + MUL, // a*b + ADD, // a+b + C, C is an unsigned 32-bit constant + SUB, // a-b + ROR, // rotate right "a" by "b & 31" bits + ROL, // rotate left "a" by "b & 31" bits + XOR, // a^b + RET, // finish execution + V4_INSTRUCTION_COUNT = RET, +}; + +// V4_InstructionDefinition is used to generate code from random data +// Every random sequence of bytes is a valid code +// +// There are 9 registers in total: +// - 4 variable registers +// - 5 constant registers initialized from loop variables +// This is why dst_index is 2 bits +enum V4_InstructionDefinition +{ + V4_OPCODE_BITS = 3, + V4_DST_INDEX_BITS 
= 2, + V4_SRC_INDEX_BITS = 3, +}; + +struct V4_Instruction +{ + uint8_t opcode; + uint8_t dst_index; + uint8_t src_index; + uint32_t C; +}; + +#ifndef FORCEINLINE +#if defined(__GNUC__) +#define FORCEINLINE __attribute__((always_inline)) inline +#elif defined(_MSC_VER) +#define FORCEINLINE __forceinline +#else +#define FORCEINLINE inline +#endif +#endif + +#ifndef UNREACHABLE_CODE +#if defined(__GNUC__) +#define UNREACHABLE_CODE __builtin_unreachable() +#elif defined(_MSC_VER) +#define UNREACHABLE_CODE __assume(false) +#else +#define UNREACHABLE_CODE +#endif +#endif + +// Random math interpreter's loop is fully unrolled and inlined to achieve 100% branch prediction on CPU: +// every switch-case will point to the same destination on every iteration of Cryptonight main loop +// +// This is about as fast as it can get without using low-level machine code generation +static FORCEINLINE void v4_random_math(const struct V4_Instruction* code, v4_reg* r) +{ + enum + { + REG_BITS = sizeof(v4_reg) * 8, + }; + +#define V4_EXEC(i) \ + { \ + const struct V4_Instruction* op = code + i; \ + const v4_reg src = r[op->src_index]; \ + v4_reg* dst = r + op->dst_index; \ + switch (op->opcode) \ + { \ + case MUL: \ + *dst *= src; \ + break; \ + case ADD: \ + *dst += src + op->C; \ + break; \ + case SUB: \ + *dst -= src; \ + break; \ + case ROR: \ + { \ + const uint32_t shift = src % REG_BITS; \ + *dst = (*dst >> shift) | (*dst << ((REG_BITS - shift) % REG_BITS)); \ + } \ + break; \ + case ROL: \ + { \ + const uint32_t shift = src % REG_BITS; \ + *dst = (*dst << shift) | (*dst >> ((REG_BITS - shift) % REG_BITS)); \ + } \ + break; \ + case XOR: \ + *dst ^= src; \ + break; \ + case RET: \ + return; \ + default: \ + UNREACHABLE_CODE; \ + break; \ + } \ + } + +#define V4_EXEC_10(j) \ + V4_EXEC(j + 0) \ + V4_EXEC(j + 1) \ + V4_EXEC(j + 2) \ + V4_EXEC(j + 3) \ + V4_EXEC(j + 4) \ + V4_EXEC(j + 5) \ + V4_EXEC(j + 6) \ + V4_EXEC(j + 7) \ + V4_EXEC(j + 8) \ + V4_EXEC(j + 9) + + // Generated 
program can have 60 + a few more (usually 2-3) instructions to achieve required latency + // I've checked all block heights < 10,000,000 and here is the distribution of program sizes: + // + // 60 27960 + // 61 105054 + // 62 2452759 + // 63 5115997 + // 64 1022269 + // 65 1109635 + // 66 153145 + // 67 8550 + // 68 4529 + // 69 102 + + // Unroll 70 instructions here + V4_EXEC_10(0); // instructions 0-9 + V4_EXEC_10(10); // instructions 10-19 + V4_EXEC_10(20); // instructions 20-29 + V4_EXEC_10(30); // instructions 30-39 + V4_EXEC_10(40); // instructions 40-49 + V4_EXEC_10(50); // instructions 50-59 + V4_EXEC_10(60); // instructions 60-69 + +#undef V4_EXEC_10 +#undef V4_EXEC +} + +// If we don't have enough data available, generate more +static FORCEINLINE void check_data(size_t* data_index, const size_t bytes_needed, int8_t* data, const size_t data_size) +{ + if (*data_index + bytes_needed > data_size) + { + hash_extra_blake(data, data_size, (char*) data); + *data_index = 0; + } +} + +// Generates as many random math operations as possible with given latency and ALU restrictions +// "code" array must have space for NUM_INSTRUCTIONS_MAX+1 instructions +static inline int v4_random_math_init(struct V4_Instruction* code, const uint64_t height) +{ + // MUL is 3 cycles, 3-way addition and rotations are 2 cycles, SUB/XOR are 1 cycle + // These latencies match real-life instruction latencies for Intel CPUs starting from Sandy Bridge and up to Skylake/Coffee lake + // + // AMD Ryzen has the same latencies except 1-cycle ROR/ROL, so it'll be a bit faster than Intel Sandy Bridge and newer processors + // Surprisingly, Intel Nehalem also has 1-cycle ROR/ROL, so it'll also be faster than Intel Sandy Bridge and newer processors + // AMD Bulldozer has 4 cycles latency for MUL (slower than Intel) and 1 cycle for ROR/ROL (faster than Intel), so average performance will be the same + // Source: https://www.agner.org/optimize/instruction_tables.pdf + const int 
op_latency[V4_INSTRUCTION_COUNT] = { 3, 2, 1, 2, 2, 1 }; + + // Instruction latencies for theoretical ASIC implementation + const int asic_op_latency[V4_INSTRUCTION_COUNT] = { 3, 1, 1, 1, 1, 1 }; + + // Available ALUs for each instruction + const int op_ALUs[V4_INSTRUCTION_COUNT] = { ALU_COUNT_MUL, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT }; + + int8_t data[32]; + memset(data, 0, sizeof(data)); + uint64_t tmp = SWAP64LE(height); + memcpy(data, &tmp, sizeof(uint64_t)); + data[20] = -38; // change seed + + // Set data_index past the last byte in data + // to trigger full data update with blake hash + // before we start using it + size_t data_index = sizeof(data); + + int code_size; + + // There is a small chance (1.8%) that register R8 won't be used in the generated program + // So we keep track of it and try again if it's not used + bool r8_used; + do { + int latency[9]; + int asic_latency[9]; + + // Tracks previous instruction and value of the source operand for registers R0-R3 throughout code execution + // byte 0: current value of the destination register + // byte 1: instruction opcode + // byte 2: current value of the source register + // + // Registers R4-R8 are constant and are treated as having the same value because when we do + // the same operation twice with two constant source registers, it can be optimized into a single operation + uint32_t inst_data[9] = { 0, 1, 2, 3, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF }; + + bool alu_busy[TOTAL_LATENCY + 1][ALU_COUNT]; + bool is_rotation[V4_INSTRUCTION_COUNT]; + bool rotated[4]; + int rotate_count = 0; + + memset(latency, 0, sizeof(latency)); + memset(asic_latency, 0, sizeof(asic_latency)); + memset(alu_busy, 0, sizeof(alu_busy)); + memset(is_rotation, 0, sizeof(is_rotation)); + memset(rotated, 0, sizeof(rotated)); + is_rotation[ROR] = true; + is_rotation[ROL] = true; + + int num_retries = 0; + code_size = 0; + + int total_iterations = 0; + r8_used = false; + + // Generate random code to 
achieve minimal required latency for our abstract CPU + // Try to get this latency for all 4 registers + while (((latency[0] < TOTAL_LATENCY) || (latency[1] < TOTAL_LATENCY) || (latency[2] < TOTAL_LATENCY) || (latency[3] < TOTAL_LATENCY)) && (num_retries < 64)) + { + // Fail-safe to guarantee loop termination + ++total_iterations; + if (total_iterations > 256) + break; + + check_data(&data_index, 1, data, sizeof(data)); + + const uint8_t c = ((uint8_t*)data)[data_index++]; + + // MUL = opcodes 0-2 + // ADD = opcode 3 + // SUB = opcode 4 + // ROR/ROL = opcode 5, shift direction is selected randomly + // XOR = opcodes 6-7 + uint8_t opcode = c & ((1 << V4_OPCODE_BITS) - 1); + if (opcode == 5) + { + check_data(&data_index, 1, data, sizeof(data)); + opcode = (data[data_index++] >= 0) ? ROR : ROL; + } + else if (opcode >= 6) + { + opcode = XOR; + } + else + { + opcode = (opcode <= 2) ? MUL : (opcode - 2); + } + + uint8_t dst_index = (c >> V4_OPCODE_BITS) & ((1 << V4_DST_INDEX_BITS) - 1); + uint8_t src_index = (c >> (V4_OPCODE_BITS + V4_DST_INDEX_BITS)) & ((1 << V4_SRC_INDEX_BITS) - 1); + + const int a = dst_index; + int b = src_index; + + // Don't do ADD/SUB/XOR with the same register + if (((opcode == ADD) || (opcode == SUB) || (opcode == XOR)) && (a == b)) + { + // Use register R8 as source instead + b = 8; + src_index = 8; + } + + // Don't do rotation with the same destination twice because it's equal to a single rotation + if (is_rotation[opcode] && rotated[a]) + { + continue; + } + + // Don't do the same instruction (except MUL) with the same source value twice because all other cases can be optimized: + // 2xADD(a, b, C) = ADD(a, b*2, C1+C2), same for SUB and rotations + // 2xXOR(a, b) = NOP + if ((opcode != MUL) && ((inst_data[a] & 0xFFFF00) == (opcode << 8) + ((inst_data[b] & 255) << 16))) + { + continue; + } + + // Find which ALU is available (and when) for this instruction + int next_latency = (latency[a] > latency[b]) ? 
latency[a] : latency[b]; + int alu_index = -1; + while (next_latency < TOTAL_LATENCY) + { + for (int i = op_ALUs[opcode] - 1; i >= 0; --i) + { + if (!alu_busy[next_latency][i]) + { + // ADD is implemented as two 1-cycle instructions on a real CPU, so do an additional availability check + if ((opcode == ADD) && alu_busy[next_latency + 1][i]) + { + continue; + } + + // Rotation can only start when previous rotation is finished, so do an additional availability check + if (is_rotation[opcode] && (next_latency < rotate_count * op_latency[opcode])) + { + continue; + } + + alu_index = i; + break; + } + } + if (alu_index >= 0) + { + break; + } + ++next_latency; + } + + // Don't generate instructions that leave some register unchanged for more than 7 cycles + if (next_latency > latency[a] + 7) + { + continue; + } + + next_latency += op_latency[opcode]; + + if (next_latency <= TOTAL_LATENCY) + { + if (is_rotation[opcode]) + { + ++rotate_count; + } + + // Mark ALU as busy only for the first cycle when it starts executing the instruction because ALUs are fully pipelined + alu_busy[next_latency - op_latency[opcode]][alu_index] = true; + latency[a] = next_latency; + + // ASIC is supposed to have enough ALUs to run as many independent instructions per cycle as possible, so latency calculation for ASIC is simple + asic_latency[a] = ((asic_latency[a] > asic_latency[b]) ? 
asic_latency[a] : asic_latency[b]) + asic_op_latency[opcode]; + + rotated[a] = is_rotation[opcode]; + + inst_data[a] = code_size + (opcode << 8) + ((inst_data[b] & 255) << 16); + + code[code_size].opcode = opcode; + code[code_size].dst_index = dst_index; + code[code_size].src_index = src_index; + code[code_size].C = 0; + + if (src_index == 8) + { + r8_used = true; + } + + if (opcode == ADD) + { + // ADD instruction is implemented as two 1-cycle instructions on a real CPU, so mark ALU as busy for the next cycle too + alu_busy[next_latency - op_latency[opcode] + 1][alu_index] = true; + + // ADD instruction requires 4 more random bytes for 32-bit constant "C" in "a = a + b + C" + check_data(&data_index, sizeof(uint32_t), data, sizeof(data)); + uint32_t t; + memcpy(&t, data + data_index, sizeof(uint32_t)); + code[code_size].C = SWAP32LE(t); + data_index += sizeof(uint32_t); + } + + ++code_size; + if (code_size >= NUM_INSTRUCTIONS_MIN) + { + break; + } + } + else + { + ++num_retries; + } + } + + // ASIC has more execution resources and can extract as much parallelism from the code as possible + // We need to add a few more MUL and ROR instructions to achieve minimal required latency for ASIC + // Get this latency for at least 1 of the 4 registers + const int prev_code_size = code_size; + while ((code_size < NUM_INSTRUCTIONS_MAX) && (asic_latency[0] < TOTAL_LATENCY) && (asic_latency[1] < TOTAL_LATENCY) && (asic_latency[2] < TOTAL_LATENCY) && (asic_latency[3] < TOTAL_LATENCY)) + { + int min_idx = 0; + int max_idx = 0; + for (int i = 1; i < 4; ++i) + { + if (asic_latency[i] < asic_latency[min_idx]) min_idx = i; + if (asic_latency[i] > asic_latency[max_idx]) max_idx = i; + } + + const uint8_t pattern[3] = { ROR, MUL, MUL }; + const uint8_t opcode = pattern[(code_size - prev_code_size) % 3]; + latency[min_idx] = latency[max_idx] + op_latency[opcode]; + asic_latency[min_idx] = asic_latency[max_idx] + asic_op_latency[opcode]; + + code[code_size].opcode = opcode; + 
code[code_size].dst_index = min_idx; + code[code_size].src_index = max_idx; + code[code_size].C = 0; + ++code_size; + } + + // There is ~98.15% chance that loop condition is false, so this loop will execute only 1 iteration most of the time + // It never does more than 4 iterations for all block heights < 10,000,000 + } while (!r8_used || (code_size < NUM_INSTRUCTIONS_MIN) || (code_size > NUM_INSTRUCTIONS_MAX)); + + // It's guaranteed that NUM_INSTRUCTIONS_MIN <= code_size <= NUM_INSTRUCTIONS_MAX here + // Add final instruction to stop the interpreter + code[code_size].opcode = RET; + code[code_size].dst_index = 0; + code[code_size].src_index = 0; + code[code_size].C = 0; + + return code_size; +} + +#endif \ No newline at end of file diff --git a/vcxproj/randomx.vcxproj b/vcxproj/randomx.vcxproj new file mode 100644 index 0000000..d646143 --- /dev/null +++ b/vcxproj/randomx.vcxproj @@ -0,0 +1,185 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + Debug + x64 + + + Release + x64 + + + + 15.0 + {3346A4AD-C438-4324-8B77-47A16452954B} + randomx + 10.0.17763.0 + + + + Application + true + v141 + MultiByte + + + Application + false + v141 + true + MultiByte + + + Application + true + v141 + MultiByte + + + Application + false + v141 + true + MultiByte + + + + + + + + + + + + + + + + + + + + + + + + Level3 + Disabled + false + true + + + + + Level4 + Disabled + false + true + + + + + Level3 + MaxSpeed + true + true + false + true + + + true + true + UseLinkTimeCodeGeneration + false + + + + + Level3 + MaxSpeed + true + true + false + true + AssemblyCode + + + true + true + UseLinkTimeCodeGeneration + false + + + 4194304 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/vcxproj/randomx.vcxproj.filters b/vcxproj/randomx.vcxproj.filters new file mode 100644 index 0000000..77939bd --- /dev/null +++ b/vcxproj/randomx.vcxproj.filters @@ -0,0 +1,170 @@ + + + + + 
{4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hh;hpp;hxx;hm;inl;inc;ipp;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + + + Source Files + + + Source Files + + + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + \ No newline at end of file diff --git a/vcxproj/superscalar-avalanche.vcxproj b/vcxproj/superscalar-avalanche.vcxproj new file mode 100644 index 0000000..1cac62b --- /dev/null +++ b/vcxproj/superscalar-avalanche.vcxproj @@ -0,0 +1,143 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + Debug + x64 + + + Release + x64 + + + + 15.0 + {CF34A7EF-7DC9-4077-94A5-76F5425EA938} + superscalaravalanche + 10.0.17763.0 + + + + Application + true + v141 + MultiByte + + + Application + false + v141 + true + MultiByte + + + Application + true + v141 + MultiByte + + + Application + false + v141 + true + MultiByte + + + + + + + + + + + + + + + + + + + + + + + + Level3 + MaxSpeed + true + true + false + true + + + true + true + + + + + Level3 + Disabled + 
true + true + + + + + Level3 + Disabled + true + true + + + + + Level3 + MaxSpeed + true + true + true + true + + + true + true + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/vcxproj/superscalar-avalanche.vcxproj.filters b/vcxproj/superscalar-avalanche.vcxproj.filters new file mode 100644 index 0000000..93b3838 --- /dev/null +++ b/vcxproj/superscalar-avalanche.vcxproj.filters @@ -0,0 +1,72 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hh;hpp;hxx;hm;inl;inc;ipp;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + + + Source Files + + + \ No newline at end of file diff --git a/vcxproj/superscalar-init.vcxproj b/vcxproj/superscalar-init.vcxproj new file mode 100644 index 0000000..d765f85 --- /dev/null +++ b/vcxproj/superscalar-init.vcxproj @@ -0,0 +1,143 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + Debug + x64 + + + Release + x64 + + + + 15.0 + {E59DC709-9B12-4A53-BAF3-79398821C376} + superscalarinit + 10.0.17763.0 + + + + Application + true + v141 + MultiByte + + + Application + false + v141 + true + MultiByte + + + Application + true + v141 + MultiByte + + + Application + false + v141 + true + MultiByte + + + + + + + + + + + + + + + + + + + + + + + + Level3 + MaxSpeed + true + true + false + true + + + true + true + + + + + Level3 + Disabled + false + true + + + + + Level3 + Disabled + false + true + + + + + Level3 + MaxSpeed + true + true + false + true + + + true + true + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + \ No newline at end of file diff --git a/vcxproj/superscalar-init.vcxproj.filters b/vcxproj/superscalar-init.vcxproj.filters new file mode 100644 index 0000000..cad6e2b --- /dev/null +++ b/vcxproj/superscalar-init.vcxproj.filters @@ -0,0 +1,72 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hh;hpp;hxx;hm;inl;inc;ipp;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + + + Source Files + + + \ No newline at end of file