;# Copyright (c) 2018 tevador
;#
;# This file is part of RandomX.
;#
;# RandomX is free software: you can redistribute it and/or modify
;# it under the terms of the GNU General Public License as published by
;# the Free Software Foundation, either version 3 of the License, or
;# (at your option) any later version.
;#
;# RandomX is distributed in the hope that it will be useful,
;# but WITHOUT ANY WARRANTY; without even the implied warranty of
;# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;# GNU General Public License for more details.
;#
;# You should have received a copy of the GNU General Public License
;# along with RandomX.  If not, see <http://www.gnu.org/licenses/>.

;# Syntax: MASM (ml64).  ABI: Microsoft x64.
;# RAX is only a known symbol to the 64-bit assembler, so the whole file
;# assembles to nothing elsewhere.
IFDEF RAX

_RANDOMX_EXECUTE_PROGRAM SEGMENT PAGE READ EXECUTE

PUBLIC executeProgram

;-----------------------------------------------------------------------
; executeProgram
;
; Runs the program body from program.inc in a loop, mixing the VM
; registers with the scratchpad and the dataset around every iteration,
; then writes the final VM register values back to the register file.
;
; In:    rcx = RegisterFile& registerFile
;              (+0   r0-r7, +64  f0-f3, +128 e0-e3 are written on exit;
;               +192 a0-a3 are read on entry)
;        rdx = pointer to a structure holding
;              +0  qword: "mx" in the low 32 bits, "ma" in the high 32 bits
;              +8  uint8_t* dataset
;        r8  = convertible_t* scratchpad
;        r9  = loop counter (only the low 32 bits are used; the body runs
;              once before the counter is tested, so 0 does not mean
;              "no iterations")
; Out:   none in registers; results are stored through rcx.
; Saves: rbx, rbp, rdi, rsi, r12-r15, xmm6-xmm15 (restored before ret).
;
; Alignment: the register file is accessed with movapd/movdqa and the
; scratchpad with movapd at 64-byte-aligned offsets, so both base
; pointers are assumed to be at least 16-byte aligned - TODO confirm
; against the callers.
;
; Stack layout while the loop runs (top of stack first):
;        [second scratchpad offset]   pushed at program_begin
;        [first scratchpad offset]    pushed at program_begin
;        registerFile pointer (rcx)
;        xmm15..xmm11                 (80 bytes)
;        xmm10..xmm6                  (80 bytes)
;        r15, r14, r13, r12, rsi, rdi, rbp, rbx
;        return address
;-----------------------------------------------------------------------
executeProgram PROC
	; REGISTER ALLOCATION:
	; rax -> temporary
	; rbx -> "ic"
	; rcx -> temporary
	; rdx -> temporary
	; rsi -> scratchpad pointer
	; rdi -> dataset pointer
	; rbp -> "ma", "mx"
	; rsp -> stack pointer
	; r8  -> "r0"
	; r9  -> "r1"
	; r10 -> "r2"
	; r11 -> "r3"
	; r12 -> "r4"
	; r13 -> "r5"
	; r14 -> "r6"
	; r15 -> "r7"
	; xmm0 -> "f0"
	; xmm1 -> "f1"
	; xmm2 -> "f2"
	; xmm3 -> "f3"
	; xmm4 -> "e0"
	; xmm5 -> "e1"
	; xmm6 -> "e2"
	; xmm7 -> "e3"
	; xmm8 -> "a0"
	; xmm9 -> "a1"
	; xmm10 -> "a2"
	; xmm11 -> "a3"
	; xmm12 -> temporary
	; xmm13 -> DBL_MIN
	; xmm14 -> absolute value mask
	; xmm15 -> sign mask

	; store callee-saved registers
	push rbx
	push rbp
	push rdi
	push rsi
	push r12
	push r13
	push r14
	push r15
	; rsp is not 16-byte aligned here, hence the unaligned movdqu stores
	sub rsp, 80
	movdqu xmmword ptr [rsp+64], xmm6
	movdqu xmmword ptr [rsp+48], xmm7
	movdqu xmmword ptr [rsp+32], xmm8
	movdqu xmmword ptr [rsp+16], xmm9
	movdqu xmmword ptr [rsp+0], xmm10
	sub rsp, 80
	movdqu xmmword ptr [rsp+64], xmm11
	movdqu xmmword ptr [rsp+48], xmm12
	movdqu xmmword ptr [rsp+32], xmm13
	movdqu xmmword ptr [rsp+16], xmm14
	movdqu xmmword ptr [rsp+0], xmm15

	; function arguments
	push rcx                    ; RegisterFile& registerFile (popped at rx_finish)
	mov rbp, qword ptr [rdx]    ; "mx", "ma"
	mov eax, ebp                ; "mx" (seeds the first scratchpad address)
	mov rdi, qword ptr [rdx+8]  ; uint8_t* dataset
	mov rsi, r8                 ; convertible_t* scratchpad
	mov rbx, r9                 ; loop counter

	;# zero integer registers
	xor r8, r8
	xor r9, r9
	xor r10, r10
	xor r11, r11
	xor r12, r12
	xor r13, r13
	xor r14, r14
	xor r15, r15

	;# load constant registers
	; rcx is biased by 120 so that a0-a3 (registerFile+192..+255) are
	; addressed as rcx+72..rcx+120
	lea rcx, [rcx+120]
	movapd xmm8, xmmword ptr [rcx+72]
	movapd xmm9, xmmword ptr [rcx+88]
	movapd xmm10, xmmword ptr [rcx+104]
	movapd xmm11, xmmword ptr [rcx+120]
	movapd xmm13, xmmword ptr [minDbl]
	movapd xmm14, xmmword ptr [absMask]
	movapd xmm15, xmmword ptr [signMask]

	jmp program_begin           ; skip over the constants embedded below

	; 16-byte constants, one value replicated in both 64-bit lanes
	ALIGN 64
minDbl:
	db 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0, 16, 0
absMask:
	db 255, 255, 255, 255, 255, 255, 255, 127, 255, 255, 255, 255, 255, 255, 255, 127
signMask:
	db 0, 0, 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0, 0, 128

	ALIGN 64
program_begin:
	; On entry rax = "mx" (first iteration) or 0 (later iterations).
	xor rax, r8                 ;# read address register 1
	xor rax, r9
	mov rdx, rax
	; 1048512 = 2^20 - 64: keeps a 64-byte-aligned offset below 1 MiB
	and eax, 1048512
	push rax                    ; first scratchpad offset (from the low 32 bits)
	lea rcx, [rsi+rax]
	xor r8, qword ptr [rcx+0]
	xor r9, qword ptr [rcx+8]
	xor r10, qword ptr [rcx+16]
	xor r11, qword ptr [rcx+24]
	xor r12, qword ptr [rcx+32]
	xor r13, qword ptr [rcx+40]
	xor r14, qword ptr [rcx+48]
	xor r15, qword ptr [rcx+56]
	ror rdx, 32                 ; bring the high 32 bits down
	and edx, 1048512
	push rdx                    ; second scratchpad offset (from the high 32 bits)
	lea rcx, [rsi+rdx]
	; each load converts two packed int32 values into two doubles
	cvtdq2pd xmm0, qword ptr [rcx+0]
	cvtdq2pd xmm1, qword ptr [rcx+8]
	cvtdq2pd xmm2, qword ptr [rcx+16]
	cvtdq2pd xmm3, qword ptr [rcx+24]
	cvtdq2pd xmm4, qword ptr [rcx+32]
	cvtdq2pd xmm5, qword ptr [rcx+40]
	cvtdq2pd xmm6, qword ptr [rcx+48]
	cvtdq2pd xmm7, qword ptr [rcx+56]
	; clear the sign bits of e0-e3
	andps xmm4, xmm14
	andps xmm5, xmm14
	andps xmm6, xmm14
	andps xmm7, xmm14

	;# 256 instructions
	include program.inc

	mov eax, r8d                ;# read address register 1
	xor eax, r9d                ;# read address register 2
	xor rbp, rax                ;# modify "mx"
	and rbp, -64                ;# align "mx" to the start of a cache line
	mov edx, ebp                ;# edx = mx
	prefetchnta byte ptr [rdi+rdx]
	ror rbp, 32                 ;# swap "ma" and "mx"
	mov edx, ebp                ;# edx = ma
	lea rcx, [rdi+rdx]          ;# dataset cache line
	xor r8, qword ptr [rcx+0]
	xor r9, qword ptr [rcx+8]
	xor r10, qword ptr [rcx+16]
	xor r11, qword ptr [rcx+24]
	xor r12, qword ptr [rcx+32]
	xor r13, qword ptr [rcx+40]
	xor r14, qword ptr [rcx+48]
	xor r15, qword ptr [rcx+56]
	; The offsets come off the stack in reverse order: the integer
	; registers go to the second offset, f0-f3 go to the first one.
	pop rax                     ; second scratchpad offset
	lea rcx, [rsi+rax]
	mov qword ptr [rcx+0], r8
	mov qword ptr [rcx+8], r9
	mov qword ptr [rcx+16], r10
	mov qword ptr [rcx+24], r11
	mov qword ptr [rcx+32], r12
	mov qword ptr [rcx+40], r13
	mov qword ptr [rcx+48], r14
	mov qword ptr [rcx+56], r15
	pop rax                     ; first scratchpad offset
	lea rcx, [rsi+rax]
	mulpd xmm0, xmm4            ; f0-f3 *= e0-e3 before they are stored
	mulpd xmm1, xmm5
	mulpd xmm2, xmm6
	mulpd xmm3, xmm7
	movapd xmmword ptr [rcx+0], xmm0
	movapd xmmword ptr [rcx+16], xmm1
	movapd xmmword ptr [rcx+32], xmm2
	movapd xmmword ptr [rcx+48], xmm3
	xor eax, eax                ; next iteration starts from rax = 0
	dec ebx
	jnz program_begin

rx_finish:
	; save VM register values
	pop rcx                     ; registerFile pointer pushed in the prologue
	mov qword ptr [rcx+0], r8
	mov qword ptr [rcx+8], r9
	mov qword ptr [rcx+16], r10
	mov qword ptr [rcx+24], r11
	mov qword ptr [rcx+32], r12
	mov qword ptr [rcx+40], r13
	mov qword ptr [rcx+48], r14
	mov qword ptr [rcx+56], r15
	movdqa xmmword ptr [rcx+64], xmm0
	movdqa xmmword ptr [rcx+80], xmm1
	movdqa xmmword ptr [rcx+96], xmm2
	movdqa xmmword ptr [rcx+112], xmm3
	lea rcx, [rcx+64]           ; e0-e3 land at registerFile+128..+191
	movdqa xmmword ptr [rcx+64], xmm4
	movdqa xmmword ptr [rcx+80], xmm5
	movdqa xmmword ptr [rcx+96], xmm6
	movdqa xmmword ptr [rcx+112], xmm7

	; load callee-saved registers
	movdqu xmm15, xmmword ptr [rsp]
	movdqu xmm14, xmmword ptr [rsp+16]
	movdqu xmm13, xmmword ptr [rsp+32]
	movdqu xmm12, xmmword ptr [rsp+48]
	movdqu xmm11, xmmword ptr [rsp+64]
	add rsp, 80
	movdqu xmm10, xmmword ptr [rsp]
	movdqu xmm9, xmmword ptr [rsp+16]
	movdqu xmm8, xmmword ptr [rsp+32]
	movdqu xmm7, xmmword ptr [rsp+48]
	movdqu xmm6, xmmword ptr [rsp+64]
	add rsp, 80
	pop r15
	pop r14
	pop r13
	pop r12
	pop rsi
	pop rdi
	pop rbp
	pop rbx

	; return
	ret

TransformAddress MACRO reg32, reg64
;# Transforms the address in the register so that the transformed address
;# lies in a different cache line than the original address (mod 2^N).
;# This is done to prevent a load-store dependency.
;# There are 3 different transformations that can be used: x -> 9*x+C, x -> x+C, x -> x^C
;# Only the x -> x+C form is active; the other two are kept commented out.
	;lea reg32, [reg64+reg64*8+127] ;# C = -119 -110 -101 -92 -83 -74 -65 -55 -46 -37 -28 -19 -10 -1 9 18 27 36 45 54 63 73 82 91 100 109 118 127
	; Raw byte 40h emitted directly in front of the add (an empty REX prefix
	; in 64-bit mode).  NOTE(review): presumably pads the encoding - confirm.
	db 64
	add reg32, -39                  ;# C = all except -7 to +7
	;xor reg32, -8                  ;# C = all except 0 to 7
ENDM

	ALIGN 64
rx_read:
;# Mixes "mx" with a transformed address, prefetches dataset line "mx",
;# swaps "ma"/"mx" and XORs dataset line "ma" into r8-r15.
;# IN     eax = random 32-bit address
;# GLOBAL rdi = dataset pointer (same convention as the main loop)
;# GLOBAL rsi = address of the scratchpad
;# GLOBAL rbp = low 32 bits = "mx", high 32 bits = "ma"
;# MODIFY rcx, rdx (also rax, rbp, r8-r15 and the flags)
	TransformAddress eax, rax         ;# TransformAddress function
	; rdi already holds the dataset pointer (the prologue loads it from
	; [rdx+8]), so it is indexed directly instead of being dereferenced.
	xor rbp, rax                      ;# modify "mx"
	;# prefetch cacheline "mx"
	and rbp, -64                      ;# align "mx" to the start of a cache line
	mov edx, ebp                      ;# edx = mx
	prefetchnta byte ptr [rdi+rdx]
	;# read cacheline "ma"
	ror rbp, 32                       ;# swap "ma" and "mx"
	mov edx, ebp                      ;# edx = ma
	lea rcx, [rdi+rdx]                ;# dataset cache line
	xor r8, qword ptr [rcx+0]
	xor r9, qword ptr [rcx+8]
	xor r10, qword ptr [rcx+16]
	xor r11, qword ptr [rcx+24]
	xor r12, qword ptr [rcx+32]
	xor r13, qword ptr [rcx+40]
	xor r14, qword ptr [rcx+48]
	xor r15, qword ptr [rcx+56]
	ret

executeProgram ENDP

_RANDOMX_EXECUTE_PROGRAM ENDS

ENDIF

END