Abstracted away from x86 intrinsics

2026-03-05 06:07:33 -05:00 · 2019-05-14 09:13:38 +02:00
parent 3dd21ea93d
commit 1aa7865619
10 changed files with 267 additions and 249 deletions
--- a/src/intrin_portable.h
+++ b/src/intrin_portable.h
@@ -20,6 +20,7 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #pragma once

 #include <cstdint>
+#include "blake2/endian.h"

 constexpr int32_t unsigned32ToSigned2sCompl(uint32_t x) {
 	return (-1 == ~0) ? (int32_t)x : (x > INT32_MAX ? (-(int32_t)(UINT32_MAX - x) - 1) : (int32_t)x);
@@ -33,6 +34,11 @@ constexpr uint64_t signExtend2sCompl(uint32_t x) {
 	return (-1 == ~0) ? (int64_t)(int32_t)(x) : (x > INT32_MAX ? (x | 0xffffffff00000000ULL) : (uint64_t)x);
 }

+constexpr int RoundToNearest = 0; // rounding-mode codes 0..3; same numbering as the 2-bit x86 MXCSR rounding-control field
+constexpr int RoundDown = 1; // NOTE(review): presumably these are the `mode` values given to rx_set_rounding_mode - confirm at call sites
+constexpr int RoundUp = 2;
+constexpr int RoundToZero = 3;
+
 #if defined(_MSC_VER)
 #if defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP == 2)
 #define __SSE2__ 1
@@ -46,185 +52,230 @@ constexpr uint64_t signExtend2sCompl(uint32_t x) {
 #include <intrin.h>
 #endif

-#define PREFETCHNTA(x) _mm_prefetch((const char *)(x), _MM_HINT_NTA)
+typedef __m128i rx_vec_i128; // 128-bit integer vector: the native intrinsic type in this branch of the #if
+typedef __m128d rx_vec_f128; // vector of two doubles: the native intrinsic type in this branch of the #if
+
+#define rx_aligned_alloc(a, b) _mm_malloc(a,b) // forwards both arguments; per _mm_malloc, a = size in bytes and b = alignment
+#define rx_aligned_free(a) _mm_free(a) // releases memory obtained through rx_aligned_alloc
+#define rx_prefetch_nta(x) _mm_prefetch((const char *)(x), _MM_HINT_NTA) // prefetch with the non-temporal (NTA) hint
+
+#define rx_load_vec_f128 _mm_load_pd // each rx_* name in this group is a direct alias of one SSE2 intrinsic
+#define rx_store_vec_f128 _mm_store_pd
+#define rx_shuffle_vec_f128 _mm_shuffle_pd
+#define rx_add_vec_f128 _mm_add_pd
+#define rx_sub_vec_f128 _mm_sub_pd
+#define rx_mul_vec_f128 _mm_mul_pd
+#define rx_div_vec_f128 _mm_div_pd
+#define rx_sqrt_vec_f128 _mm_sqrt_pd
+#define rx_set1_long_vec_i128 _mm_set1_epi64x // broadcasts one 64-bit value to both 64-bit lanes
+#define rx_vec_i128_vec_f128 _mm_castsi128_pd // reinterprets the bits as two doubles; no numeric conversion
+
+FORCE_INLINE rx_vec_f128 rx_set_vec_f128(uint64_t x1, uint64_t x0) { // builds a double vector from raw 64-bit patterns
+	return _mm_castsi128_pd(_mm_set_epi64x(x1, x0)); // x1 becomes the high lane, x0 the low lane; the cast only reinterprets bits
+}
+
+FORCE_INLINE rx_vec_f128 rx_set1_vec_f128(uint64_t x) { // same raw 64-bit pattern placed in both lanes
+	return _mm_castsi128_pd(_mm_set1_epi64x(x)); // the cast only reinterprets bits; x is not converted numerically
+}
+
+#define rx_xor_vec_f128 _mm_xor_pd // bitwise operations on the raw bits of the double vector
+#define rx_and_vec_f128 _mm_and_pd
+#define rx_or_vec_f128 _mm_or_pd
+#define rx_aesenc_vec_i128 _mm_aesenc_si128 // direct alias of the AES-NI encryption-round intrinsic
+#define rx_aesdec_vec_i128 _mm_aesdec_si128 // direct alias of the AES-NI decryption-round intrinsic
+
+FORCE_INLINE int rx_vec_i128_x(rx_vec_i128 a) { // returns the lowest 32-bit lane (lane 0) as int
+	return _mm_cvtsi128_si32(a);
+}
+
+FORCE_INLINE int rx_vec_i128_y(rx_vec_i128 a) { // returns 32-bit lane 1 as int
+	return _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55)); // 0x55 selects lane 1 for every output lane; lane 0 is then extracted
+}
+
+FORCE_INLINE int rx_vec_i128_z(rx_vec_i128 a) { // returns 32-bit lane 2 as int
+	return _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xaa)); // 0xaa selects lane 2 for every output lane; lane 0 is then extracted
+}
+
+FORCE_INLINE int rx_vec_i128_w(rx_vec_i128 a) { // returns the highest 32-bit lane (lane 3) as int
+	return _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xff)); // 0xff selects lane 3 for every output lane; lane 0 is then extracted
+}
+
+#define rx_set_int_vec_i128 _mm_set_epi32 // arguments run from the highest lane to the lowest, as in _mm_set_epi32
+#define rx_xor_vec_i128 _mm_xor_si128
+#define rx_load_vec_i128 _mm_load_si128 // direct alias of the SSE2 128-bit integer load
+#define rx_store_vec_i128 _mm_store_si128 // direct alias of the SSE2 128-bit integer store
+
+FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) { // two packed signed 32-bit integers at addr -> two doubles
+	__m128i ix = _mm_loadl_epi64((const __m128i*)addr); // loads 64 bits into the low half of the vector
+	return _mm_cvtepi32_pd(ix); // converts the two low 32-bit lanes to doubles
+}
+
+constexpr uint32_t rx_mxcsr_default = 0x9FC0; //Flush to zero (bit 15), denormals are zero (bit 6), round to nearest (bits 13-14 clear), all exceptions masked (bits 7-12 set)
+
+FORCE_INLINE void rx_reset_float_state() { // overwrites the whole MXCSR register with rx_mxcsr_default
+	_mm_setcsr(rx_mxcsr_default);
+}
+
+FORCE_INLINE void rx_set_rounding_mode(uint32_t mode) { // rewrites MXCSR as rx_mxcsr_default plus the requested rounding mode
+	_mm_setcsr(rx_mxcsr_default | (mode << 13)); // mode is shifted to bit 13 and is not masked here; NOTE(review): presumably callers pass only 0..3 - confirm
+}

 #else
 #include <cstdint>
 #include <stdexcept>
 #include <cstdlib>
 #include <cmath>
-#include "blake2/endian.h"
-
-#define _mm_malloc(a,b) malloc(a)
-#define _mm_free(a) free(a)
-#define PREFETCHNTA(x)

 typedef union {
 	uint64_t u64[2];
 	uint32_t u32[4];
 	uint16_t u16[8];
 	uint8_t u8[16];
-} __m128i;
+} rx_vec_i128; // portable 128-bit integer vector: the same 16 bytes viewed as 2x64, 4x32, 8x16 or 16x8-bit lanes

 typedef union {
 	struct {
 		double lo;
 		double hi;
 	};
-	__m128i i;
-} __m128d;
+	rx_vec_i128 i; // raw integer view of the same 128 bits as lo/hi
+} rx_vec_f128; // portable vector of two doubles (lo, hi)

-inline __m128d _mm_load_pd(const double* pd) {
-	__m128d x;
+#define rx_aligned_alloc(a, b) malloc(a) // NOTE: the alignment argument b is dropped; only malloc's default alignment applies
+#define rx_aligned_free(a) free(a)
+#define rx_prefetch_nta(x) // expands to nothing on this branch
+
+FORCE_INLINE rx_vec_f128 rx_load_vec_f128(const double* pd) { // reads pd[0] and pd[1] as raw 64-bit words via load64 (presumably from blake2/endian.h)
+	rx_vec_f128 x;
 	x.i.u64[0] = load64(pd + 0);
 	x.i.u64[1] = load64(pd + 1);
 	return x;
 }

-inline void _mm_store_pd(double* mem_addr, __m128d a) {
+FORCE_INLINE void rx_store_vec_f128(double* mem_addr, rx_vec_f128 a) { // writes both lanes to mem_addr[0], mem_addr[1] as raw 64-bit words via store64
 	store64(mem_addr + 0, a.i.u64[0]);
 	store64(mem_addr + 1, a.i.u64[1]);
 }

-inline __m128d _mm_shuffle_pd(__m128d a, __m128d b, int imm8) {
-	__m128d x;
+FORCE_INLINE rx_vec_f128 rx_shuffle_vec_f128(rx_vec_f128 a, rx_vec_f128 b, int imm8) { // imm8 bit 0 picks a.hi or a.lo for the low lane, bit 1 picks b.hi or b.lo for the high lane
+	rx_vec_f128 x;
 	x.lo = (imm8 & 1) ? a.hi : a.lo;
 	x.hi = (imm8 & 2) ? b.hi : b.lo;
 	return x;
 }

-inline __m128d _mm_add_pd(__m128d a, __m128d b) {
-	__m128d x;
+FORCE_INLINE rx_vec_f128 rx_add_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { // lane-wise a + b on the lo and hi doubles
+	rx_vec_f128 x;
 	x.lo = a.lo + b.lo;
 	x.hi = a.hi + b.hi;
 	return x;
 }

-inline __m128d _mm_sub_pd(__m128d a, __m128d b) {
-	__m128d x;
+FORCE_INLINE rx_vec_f128 rx_sub_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { // lane-wise a - b on the lo and hi doubles
+	rx_vec_f128 x;
 	x.lo = a.lo - b.lo;
 	x.hi = a.hi - b.hi;
 	return x;
 }

-inline __m128d _mm_mul_pd(__m128d a, __m128d b) {
-	__m128d x;
+FORCE_INLINE rx_vec_f128 rx_mul_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { // lane-wise a * b on the lo and hi doubles
+	rx_vec_f128 x;
 	x.lo = a.lo * b.lo;
 	x.hi = a.hi * b.hi;
 	return x;
 }

-inline __m128d _mm_div_pd(__m128d a, __m128d b) {
-	__m128d x;
+FORCE_INLINE rx_vec_f128 rx_div_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { // lane-wise a / b on the lo and hi doubles
+	rx_vec_f128 x;
 	x.lo = a.lo / b.lo;
 	x.hi = a.hi / b.hi;
 	return x;
 }

-inline __m128d _mm_sqrt_pd(__m128d a) {
-	__m128d x;
+FORCE_INLINE rx_vec_f128 rx_sqrt_vec_f128(rx_vec_f128 a) { // lane-wise square root via sqrt() on the lo and hi doubles
+	rx_vec_f128 x;
 	x.lo = sqrt(a.lo);
 	x.hi = sqrt(a.hi);
 	return x;
 }

-inline __m128i _mm_set1_epi64x(uint64_t a) {
-	__m128i x;
+FORCE_INLINE rx_vec_i128 rx_set1_long_vec_i128(uint64_t a) { // copies a into both 64-bit lanes
+	rx_vec_i128 x;
 	x.u64[0] = a;
 	x.u64[1] = a;
 	return x;
 }

-inline __m128d _mm_castsi128_pd(__m128i a) {
-	__m128d x;
+FORCE_INLINE rx_vec_f128 rx_vec_i128_vec_f128(rx_vec_i128 a) { // reinterprets the 128 bits as two doubles by assigning through the union's integer member
+	rx_vec_f128 x;
 	x.i = a;
 	return x;
 }

-inline __m128d _mm_abs(__m128d xd) {
-	xd.lo = std::fabs(xd.lo);
-	xd.hi = std::fabs(xd.hi);
-	return xd;
+FORCE_INLINE rx_vec_f128 rx_set_vec_f128(uint64_t x1, uint64_t x0) { // stores raw bit patterns, no numeric conversion
+	rx_vec_f128 v;
+	v.i.u64[0] = x0; // x0 -> low 64 bits
+	v.i.u64[1] = x1; // x1 -> high 64 bits
+	return v;
 }

-inline __m128d _mm_xor_pd(__m128d a, __m128d b) {
-	__m128d x;
+FORCE_INLINE rx_vec_f128 rx_set1_vec_f128(uint64_t x) { // same raw 64-bit pattern written to both lanes, no numeric conversion
+	rx_vec_f128 v;
+	v.i.u64[0] = x;
+	v.i.u64[1] = x;
+	return v;
+}
+
+
+FORCE_INLINE rx_vec_f128 rx_xor_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { // bitwise XOR of the raw 64-bit patterns, lane by lane
+	rx_vec_f128 x;
 	x.i.u64[0] = a.i.u64[0] ^ b.i.u64[0];
 	x.i.u64[1] = a.i.u64[1] ^ b.i.u64[1];
 	return x;
 }

-inline __m128d _mm_and_pd(__m128d a, __m128d b) {
-	__m128d x;
+FORCE_INLINE rx_vec_f128 rx_and_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { // bitwise AND of the raw 64-bit patterns, lane by lane
+	rx_vec_f128 x;
 	x.i.u64[0] = a.i.u64[0] & b.i.u64[0];
 	x.i.u64[1] = a.i.u64[1] & b.i.u64[1];
 	return x;
 }

-inline __m128d _mm_or_pd(__m128d a, __m128d b) {
-	__m128d x;
+FORCE_INLINE rx_vec_f128 rx_or_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { // bitwise OR of the raw 64-bit patterns, lane by lane
+	rx_vec_f128 x;
 	x.i.u64[0] = a.i.u64[0] | b.i.u64[0];
 	x.i.u64[1] = a.i.u64[1] | b.i.u64[1];
 	return x;
 }

-inline __m128d _mm_set_pd(double e1, double e0) {
-	__m128d x;
-	x.lo = e0;
-	x.hi = e1;
-	return x;
-}
-
-inline __m128d _mm_max_pd(__m128d a, __m128d b) {
-	__m128d x;
-	x.lo = a.lo > b.lo ? a.lo : b.lo;
-	x.hi = a.hi > b.hi ? a.hi : b.hi;
-	return x;
-}
-
-inline __m128d _mm_cvtepi32_pd(__m128i a) {
-	__m128d x;
-	x.lo = (double)unsigned32ToSigned2sCompl(a.u32[0]);
-	x.hi = (double)unsigned32ToSigned2sCompl(a.u32[1]);
-	return x;
-}
-
 static const char* platformError = "Platform doesn't support hardware AES";

-inline __m128i _mm_aeskeygenassist_si128(__m128i key, uint8_t rcon) {
+FORCE_INLINE rx_vec_i128 rx_aesenc_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) { // no software implementation on this branch: always throws std::runtime_error, arguments unused
 	throw std::runtime_error(platformError);
 }

-inline __m128i _mm_aesenc_si128(__m128i v, __m128i rkey) {
+FORCE_INLINE rx_vec_i128 rx_aesdec_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) { // no software implementation on this branch: always throws std::runtime_error, arguments unused
 	throw std::runtime_error(platformError);
 }

-inline __m128i _mm_aesdec_si128(__m128i v, __m128i rkey) {
-	throw std::runtime_error(platformError);
+FORCE_INLINE int rx_vec_i128_x(rx_vec_i128 a) { // returns 32-bit lane 0
+	return a.u32[0]; // uint32_t lane implicitly converted to the int return type
 }

-inline int _mm_cvtsi128_si32(__m128i v) {
-	return v.u32[0];
+FORCE_INLINE int rx_vec_i128_y(rx_vec_i128 a) { // returns 32-bit lane 1
+	return a.u32[1]; // uint32_t lane implicitly converted to the int return type
 }

-inline __m128i _mm_cvtsi32_si128(int si32) {
-	__m128i v;
-	v.u32[0] = si32;
-	v.u32[1] = 0;
-	v.u32[2] = 0;
-	v.u32[3] = 0;
-	return v;
+FORCE_INLINE int rx_vec_i128_z(rx_vec_i128 a) { // returns 32-bit lane 2
+	return a.u32[2]; // uint32_t lane implicitly converted to the int return type
 }

-inline  __m128i _mm_set_epi64x(int64_t _I1, int64_t _I0) {
-	__m128i v;
-	v.u64[0] = _I0;
-	v.u64[1] = _I1;
-	return v;
+FORCE_INLINE int rx_vec_i128_w(rx_vec_i128 a) { // returns 32-bit lane 3
+	return a.u32[3]; // uint32_t lane implicitly converted to the int return type
 }

-inline __m128i _mm_set_epi32(int _I3, int _I2, int _I1, int _I0) {
-	__m128i v;
+FORCE_INLINE rx_vec_i128 rx_set_int_vec_i128(int _I3, int _I2, int _I1, int _I0) {
+	rx_vec_i128 v;
 	v.u32[0] = _I0;
 	v.u32[1] = _I1;
 	v.u32[2] = _I2;
@@ -232,8 +283,8 @@ inline __m128i _mm_set_epi32(int _I3, int _I2, int _I1, int _I0) {
 	return v;
 };

-inline __m128i _mm_xor_si128(__m128i _A, __m128i _B) {
-	__m128i c;
+FORCE_INLINE rx_vec_i128 rx_xor_vec_i128(rx_vec_i128 _A, rx_vec_i128 _B) {
+	rx_vec_i128 c;
 	c.u32[0] = _A.u32[0] ^ _B.u32[0];
 	c.u32[1] = _A.u32[1] ^ _B.u32[1];
 	c.u32[2] = _A.u32[2] ^ _B.u32[2];
@@ -241,21 +292,12 @@ inline __m128i _mm_xor_si128(__m128i _A, __m128i _B) {
 	return c;
 }

-inline __m128i _mm_shuffle_epi32(__m128i _A, int _Imm) {
-	__m128i c;
-	c.u32[0] = _A.u32[_Imm & 3];
-	c.u32[1] = _A.u32[(_Imm >> 2) & 3];
-	c.u32[2] = _A.u32[(_Imm >> 4) & 3];
-	c.u32[3] = _A.u32[(_Imm >> 6) & 3];
-	return c;
-}
-
-inline __m128i _mm_load_si128(__m128i const*_P) {
+FORCE_INLINE rx_vec_i128 rx_load_vec_i128(rx_vec_i128 const*_P) {
 #if defined(NATIVE_LITTLE_ENDIAN)
 	return *_P;
 #else
 	uint32_t* ptr = (uint32_t*)_P;
-	__m128i c;
+	rx_vec_i128 c;
 	c.u32[0] = load32(ptr + 0);
 	c.u32[1] = load32(ptr + 1);
 	c.u32[2] = load32(ptr + 2);
@@ -264,7 +306,7 @@ inline __m128i _mm_load_si128(__m128i const*_P) {
 #endif
 }

-inline void _mm_store_si128(__m128i *_P, __m128i _B) {
+FORCE_INLINE void rx_store_vec_i128(rx_vec_i128 *_P, rx_vec_i128 _B) {
 #if defined(NATIVE_LITTLE_ENDIAN)
 	*_P = _B;
 #else
@@ -276,46 +318,23 @@ inline void _mm_store_si128(__m128i *_P, __m128i _B) {
 #endif
 }

-inline __m128i _mm_slli_si128(__m128i _A, int _Imm) {
-	_Imm &= 255;
-	if (_Imm > 15) {
-		_A.u64[0] = 0;
-		_A.u64[1] = 0;
-	}
-	else {
-		for (int i = 15; i >= _Imm; --i) {
-			_A.u8[i] = _A.u8[i - _Imm];
-		}
-		for (int i = 0; i < _Imm; ++i) {
-			_A.u8[i] = 0;
-		}
-	}
-	return _A;
-}
-
-inline __m128i _mm_loadl_epi64(__m128i const* mem_addr) {
-	__m128i x;
-	x.u32[0] = load32((uint8_t*)mem_addr + 0);
-	x.u32[1] = load32((uint8_t*)mem_addr + 4);
+FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) { // portable form: two 32-bit values at addr -> two doubles
+	rx_vec_f128 x;
+	x.lo = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 0)); // bytes 0..3, reinterpreted as a signed 32-bit value before widening to double
+	x.hi = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 4)); // bytes 4..7, same treatment
 	return x;
 }

+#define RANDOMX_DEFAULT_FENV // NOTE(review): presumably tells the implementation file to supply the two functions declared below - confirm
+
+void rx_reset_float_state(); // declaration only on this branch; no definition in the visible part of this header
+
+void rx_set_rounding_mode(uint32_t mode); // declaration only on this branch; no definition in the visible part of this header
+
 #endif

-constexpr int RoundToNearest = 0;
-constexpr int RoundDown = 1;
-constexpr int RoundUp = 2;
-constexpr int RoundToZero = 3;
-
-inline __m128d load_cvt_i32x2(const void* addr) {
-	__m128i ix = _mm_loadl_epi64((const __m128i*)addr);
-	return _mm_cvtepi32_pd(ix);
-}
-
 double loadDoublePortable(const void* addr);
 uint64_t mulh(uint64_t, uint64_t);
 int64_t smulh(int64_t, int64_t);
 uint64_t rotl(uint64_t, int);
 uint64_t rotr(uint64_t, int);
-void initFpu();
-void setRoundMode(uint32_t);