1
0
mirror of https://github.com/alliedmodders/hl2sdk.git synced 2025-09-20 20:46:03 +08:00

Port GetCPUInformation and mathlib from sdk2013

This commit is contained in:
Nick Hastings
2024-04-20 13:34:13 -04:00
parent b099570391
commit 0d247b9566
35 changed files with 3188 additions and 693 deletions

View File

@ -1,4 +1,4 @@
//========= Copyright © 1996-2005, Valve Corporation, All rights reserved. ============//
//========= Copyright © 1996-2005, Valve Corporation, All rights reserved. ============//
//
// Purpose: SSE Math primitives.
//
@ -16,7 +16,10 @@
// memdbgon must be the last include file in a .cpp file!!!
#include "tier0/memdbgon.h"
#if defined ( _WIN32 ) && !defined ( _WIN64 )
#ifndef COMPILER_MSVC64
// Implement for 64-bit Windows if needed.
#ifdef _WIN32
static const uint32 _sincos_masks[] = { (uint32)0x0, (uint32)~0x0 };
static const uint32 _sincos_inv_masks[] = { (uint32)~0x0, (uint32)0x0 };
#endif
@ -37,21 +40,21 @@ static const uint32 _sincos_inv_masks[] = { (uint32)~0x0, (uint32)0x0 };
#define _PS_CONST(Name, Val) \
static const __declspec(align(16)) float _ps_##Name[4] = { Val, Val, Val, Val }
#elif defined _LINUX || defined __APPLE__
#elif POSIX
#define _PS_EXTERN_CONST(Name, Val) \
const __attribute__((aligned(16))) float _ps_##Name[4] = { Val, Val, Val, Val }
const float _ps_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val }
#define _PS_EXTERN_CONST_TYPE(Name, Type, Val) \
const __attribute__((aligned(16))) Type _ps_##Name[4] = { Val, Val, Val, Val }; \
const Type _ps_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val }; \
#define _EPI32_CONST(Name, Val) \
static const __attribute__((aligned(16))) int32 _epi32_##Name[4] = { Val, Val, Val, Val }
static const int32 _epi32_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val }
#define _PS_CONST(Name, Val) \
static const __attribute__((aligned(16))) float _ps_##Name[4] = { Val, Val, Val, Val }
static const float _ps_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val }
#endif
#if defined ( _WIN32 ) && !defined ( _WIN64 )
#ifdef _WIN32
_PS_EXTERN_CONST(am_0, 0.0f);
_PS_EXTERN_CONST(am_1, 1.0f);
_PS_EXTERN_CONST(am_m1, -1.0f);
@ -62,8 +65,8 @@ _PS_EXTERN_CONST(am_pi_o_2, (float)(M_PI / 2.0));
_PS_EXTERN_CONST(am_2_o_pi, (float)(2.0 / M_PI));
_PS_EXTERN_CONST(am_pi_o_4, (float)(M_PI / 4.0));
_PS_EXTERN_CONST(am_4_o_pi, (float)(4.0 / M_PI));
_PS_EXTERN_CONST_TYPE(am_sign_mask, int32, (int32)0x80000000);
_PS_EXTERN_CONST_TYPE(am_inv_sign_mask, int32, ~0x80000000);
_PS_EXTERN_CONST_TYPE(am_sign_mask, int32, static_cast<int32>(0x80000000));
_PS_EXTERN_CONST_TYPE(am_inv_sign_mask, int32, static_cast<int32>(~0x80000000));
_PS_EXTERN_CONST_TYPE(am_min_norm_pos,int32, 0x00800000);
_PS_EXTERN_CONST_TYPE(am_mant_mask, int32, 0x7f800000);
_PS_EXTERN_CONST_TYPE(am_inv_mant_mask, int32, ~0x7f800000);
@ -86,9 +89,6 @@ void __cdecl _SSE_VectorMA( const float *start, float scale, const float *direc
//-----------------------------------------------------------------------------
float _SSE_Sqrt(float x)
{
#if defined( _WIN64 )
return std::sqrt(x);
#else
Assert( s_bMathlibInitialized );
float root = 0.f;
#ifdef _WIN32
@ -97,17 +97,10 @@ float _SSE_Sqrt(float x)
sqrtss xmm0, x
movss root, xmm0
}
#elif defined _LINUX || defined __APPLE__
__asm__ __volatile__(
"movss %1,%%xmm2\n"
"sqrtss %%xmm2,%%xmm1\n"
"movss %%xmm1,%0"
: "=m" (root)
: "m" (x)
);
#elif POSIX
_mm_store_ss( &root, _mm_sqrt_ss( _mm_load_ss( &x ) ) );
#endif
return root;
#endif // _WIN64
}
// Single iteration NewtonRaphson reciprocal square root:
@ -128,17 +121,21 @@ float _SSE_RSqrtAccurate(float x)
return (0.5f * rroot) * (3.f - (x * rroot) * rroot);
}
#else
#ifdef POSIX
const __m128 f3 = _mm_set_ss(3.0f); // 3 as SSE value
const __m128 f05 = _mm_set_ss(0.5f); // 0.5 as SSE value
#endif
// Intel / Kipps SSE RSqrt. Significantly faster than above.
float _SSE_RSqrtAccurate(float a)
{
#if defined( _WIN64 )
return std::sqrt(a);
#else
#ifdef _WIN32
float x;
float half = 0.5f;
float three = 3.f;
#ifdef _WIN32
__asm
{
movss xmm3, a;
@ -154,27 +151,25 @@ float _SSE_RSqrtAccurate(float a)
movss x, xmm1;
}
#elif defined _LINUX || defined __APPLE__
__asm__ __volatile__(
"movss %1, %%xmm3 \n\t"
"movss %2, %%xmm1 \n\t"
"movss %3, %%xmm2 \n\t"
"rsqrtss %%xmm3, %%xmm0 \n\t"
"mulss %%xmm0, %%xmm3 \n\t"
"mulss %%xmm0, %%xmm1 \n\t"
"mulss %%xmm0, %%xmm3 \n\t"
"subss %%xmm3, %%xmm2 \n\t"
"mulss %%xmm2, %%xmm1 \n\t"
"movss %%xmm1, %0 \n\t"
: "=m" (x)
: "m" (a), "m" (half), "m" (three)
);
return x;
#elif POSIX
__m128 xx = _mm_load_ss( &a );
__m128 xr = _mm_rsqrt_ss( xx );
__m128 xt;
xt = _mm_mul_ss( xr, xr );
xt = _mm_mul_ss( xt, xx );
xt = _mm_sub_ss( f3, xt );
xt = _mm_mul_ss( xt, f05 );
xr = _mm_mul_ss( xr, xt );
_mm_store_ss( &a, xr );
return a;
#else
#error "Not Implemented"
#endif
return x;
#endif // _WIN64
}
#endif
@ -182,54 +177,40 @@ float _SSE_RSqrtAccurate(float a)
// or so, so ok for closed transforms. (ie, computing lighting normals)
// Returns the value produced by the SSE rsqrtss instruction for x
// (an approximate reciprocal square root).
// NOTE(review): this span contains a _WIN64 branch, two declarations of rroot,
// and both a _LINUX/__APPLE__ and a POSIX asm branch next to each other; it
// looks like both sides of a change are shown together -- confirm which lines
// are live before building from this text.
float _SSE_RSqrtFast(float x)
{
#if defined( _WIN64 )
// NOTE(review): std::sqrt(x) is the square root, while every other branch
// below issues rsqrtss (reciprocal square root) -- confirm intent.
return std::sqrt(x);
#else
Assert( s_bMathlibInitialized );
float rroot = 0.0f;
float rroot;
#ifdef _WIN32
// MSVC inline asm: rsqrtss of x into xmm0, then store xmm0 into rroot.
_asm
{
rsqrtss xmm0, x
movss rroot, xmm0
}
#elif defined _LINUX || defined __APPLE__
// NOTE(review): the output operand (%0) is x and the input operand (%1) is
// rroot, so this branch never writes rroot, which is what gets returned.
__asm__ __volatile__(
"rsqrtss %1, %%xmm0 \n\t"
"movss %%xmm0, %0 \n\t"
: "=m" (x)
: "m" (rroot)
: "%xmm0"
);
#elif POSIX
// NOTE(review): %0 is the output (rroot) and %1 is the input (x). The other
// GCC asm blocks in this file put the destination last, so this template
// appears to read rroot and write x -- presumably it only yields the right
// value when both operands are given the same register; verify operand order.
__asm__ __volatile__( "rsqrtss %0, %1" : "=x" (rroot) : "x" (x) );
#else
#error
#endif
return rroot;
#endif // _WIN64
}
float FASTCALL _SSE_VectorNormalize (Vector& vec)
{
#if defined( _WIN64 )
float l = std::sqrt(vec.x * vec.x + vec.y * vec.y + vec.z * vec.z);
vec.x /= l;
vec.y /= l;
vec.z /= l;
return l;
#else
Assert( s_bMathlibInitialized );
// NOTE: This is necessary to prevent a memory overwrite...
// since vec only has 3 floats, we can't "movaps" directly into it.
#ifdef _WIN32
__declspec(align(16)) float result[4];
#elif defined _LINUX || defined __APPLE__
__attribute__((aligned(16))) float result[4];
#elif POSIX
float result[4] __attribute__((aligned(16)));
#endif
float *v = &vec[0];
#ifdef _WIN32
float *r = &result[0];
#endif
float radius = 0.f;
// Blah, get rid of these comparisons ... in reality, if you have all 3 as zero, it shouldn't
@ -237,7 +218,6 @@ float FASTCALL _SSE_VectorNormalize (Vector& vec)
if ( v[0] || v[1] || v[2] )
{
#ifdef _WIN32
float *r = &result[0];
_asm
{
mov eax, v
@ -262,7 +242,7 @@ float FASTCALL _SSE_VectorNormalize (Vector& vec)
mulps xmm4, xmm1 // r4 = vx * 1/radius, vy * 1/radius, vz * 1/radius, X
movaps [edx], xmm4 // v = vx * 1/radius, vy * 1/radius, vz * 1/radius, X
}
#elif defined _LINUX || defined __APPLE__
#elif POSIX
__asm__ __volatile__(
#ifdef ALIGNED_VECTOR
"movaps %2, %%xmm4 \n\t"
@ -285,6 +265,7 @@ float FASTCALL _SSE_VectorNormalize (Vector& vec)
"movaps %%xmm4, %1 \n\t"
: "=m" (radius), "=m" (result)
: "m" (*v)
: "xmm1", "xmm2", "xmm3", "xmm4"
);
#else
#error "Not Implemented"
@ -296,7 +277,6 @@ float FASTCALL _SSE_VectorNormalize (Vector& vec)
}
return radius;
#endif // _WIN64
}
void FASTCALL _SSE_VectorNormalizeFast (Vector& vec)
@ -310,10 +290,6 @@ void FASTCALL _SSE_VectorNormalizeFast (Vector& vec)
float _SSE_InvRSquared(const float* v)
{
#if defined( _WIN64 )
float r2 = DotProduct(v, v);
return r2 < 1.f ? 1.f : 1/r2;
#else
float inv_r2 = 1.f;
#ifdef _WIN32
_asm { // Intel SSE only routine
@ -331,12 +307,13 @@ float _SSE_InvRSquared(const float* v)
shufps xmm2, xmm2, 1 // x2 = vy * vy, X, X, X
addss xmm1, xmm2 // x1 = (vx * vx) + (vy * vy), X, X, X
addss xmm1, xmm3 // x1 = (vx * vx) + (vy * vy) + (vz * vz), X, X, X
maxss xmm1, xmm5 // x1 = MAX( 1.0, x1 )
rcpss xmm0, xmm1 // x0 = 1 / MAX( 1.0, x1 )
maxss xmm1, xmm5 // x1 = max( 1.0, x1 )
rcpss xmm0, xmm1 // x0 = 1 / max( 1.0, x1 )
movss inv_r2, xmm0 // inv_r2 = x0
}
#elif defined _LINUX || defined __APPLE__
#elif POSIX
__asm__ __volatile__(
"movss %0, %%xmm5 \n\t"
#ifdef ALIGNED_VECTOR
"movaps %1, %%xmm4 \n\t"
#else
@ -352,23 +329,64 @@ float _SSE_InvRSquared(const float* v)
"maxss %%xmm5, %%xmm1 \n\t"
"rcpss %%xmm1, %%xmm0 \n\t"
"movss %%xmm0, %0 \n\t"
: "=m" (inv_r2)
: "m" (*v), "m" (inv_r2)
: "+m" (inv_r2)
: "m" (*v)
: "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
#else
#error "Not Implemented"
#endif
return inv_r2;
#endif // _WIN64
}
#ifdef POSIX
// Constant tables and helper types referenced by the POSIX (intrinsics) code
// paths of _SSE_SinCos and _SSE_cos.
// NOTE(review): ALIGN16 / ALIGN16_POST are defined outside this view;
// presumably they request 16-byte alignment -- confirm.
// #define _PS_CONST(Name, Val) static const ALIGN16 float _ps_##Name[4] ALIGN16_POST = { Val, Val, Val, Val }
// _PS_CONST_TYPE(Name, Type, Val): defines a static const array _ps_<Name>
// holding four elements of Type, each initialised to static_cast<Type>(Val).
#define _PS_CONST_TYPE(Name, Type, Val) static const ALIGN16 Type _ps_##Name[4] ALIGN16_POST = { static_cast<Type>(Val), static_cast<Type>(Val), static_cast<Type>(Val), static_cast<Type>(Val) }
// Bit masks that the sin/cos code ANDs with a float vector to extract the sign
// bit (sign_mask) or to take the absolute value (inv_sign_mask).
_PS_CONST_TYPE(sign_mask, int, 0x80000000);
_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);
// _PI32_CONST(Name, Val): defines a static const array _pi32_<Name> of four
// ints, each set to Val. The sin/cos code reads these through a v2si (__m64)
// pointer cast.
#define _PI32_CONST(Name, Val) static const ALIGN16 int _pi32_##Name[4] ALIGN16_POST = { Val, Val, Val, Val }
_PI32_CONST(1, 1);
_PI32_CONST(inv1, ~1);
_PI32_CONST(2, 2);
_PI32_CONST(4, 4);
#ifdef _WIN32
_PI32_CONST(0x7f, 0x7f);
#endif
// Float constants; _PS_CONST is the macro defined earlier in this file.
_PS_CONST(1 , 1.0f);
_PS_CONST(0p5, 0.5f);
// Negated DP1..DP3 terms: the sin/cos code multiplies each by y and adds the
// products to x, i.e. x = ((x - y * DP1) - y * DP2) - y * DP3.
_PS_CONST(minus_cephes_DP1, -0.78515625);
_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
// Coefficients of the two polynomials evaluated by the sin/cos code
// (sincof_* for the one multiplied by x at the end, coscof_* for the other).
_PS_CONST(sincof_p0, -1.9515295891E-4);
_PS_CONST(sincof_p1, 8.3321608736E-3);
_PS_CONST(sincof_p2, -1.6666654611E-1);
_PS_CONST(coscof_p0, 2.443315711809948E-005);
_PS_CONST(coscof_p1, -1.388731625493765E-003);
_PS_CONST(coscof_p2, 4.166664568298827E-002);
_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
// Overlays one __m128 with an array of two __m64 values so the two
// representations can be converted through memory.
typedef union xmm_mm_union {
__m128 xmm;
__m64 mm[2];
} xmm_mm_union;
// Builds xmm_ from two __m64 values: mm0_ is written to mm[0], mm1_ to mm[1],
// and the union is then read back as an __m128.
#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; }
typedef __m128 v4sf; // vector of 4 float (sse1)
typedef __m64 v2si; // vector of 2 int (mmx)
#endif
void _SSE_SinCos(float x, float* s, float* c)
{
#if defined( _WIN64 )
*s = std::sin(x);
*c = std::cos(x);
#elif defined( _WIN32 )
#ifdef _WIN32
float t4, t8, t12;
__asm
@ -453,8 +471,121 @@ void _SSE_SinCos(float x, float* s, float* c)
movss [eax], xmm0
movss [edx], xmm4
}
#elif defined _LINUX || defined __APPLE__
// #warning "_SSE_sincos NOT implemented!"
#elif POSIX
Assert( "Needs testing, verify impl!\n" );
v4sf xx = _mm_load_ss( &x );
v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
v2si mm0, mm1, mm2, mm3, mm4, mm5;
sign_bit_sin = xx;
/* take the absolute value */
xx = _mm_and_ps(xx, *(v4sf*)_ps_inv_sign_mask);
/* extract the sign bit (upper one) */
sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);
/* scale by 4/Pi */
y = _mm_mul_ps(xx, *(v4sf*)_ps_cephes_FOPI);
/* store the integer part of y in mm2:mm3 */
xmm3 = _mm_movehl_ps(xmm3, y);
mm2 = _mm_cvttps_pi32(y);
mm3 = _mm_cvttps_pi32(xmm3);
/* j=(j+1) & (~1) (see the cephes sources) */
mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
y = _mm_cvtpi32x2_ps(mm2, mm3);
mm4 = mm2;
mm5 = mm3;
/* get the swap sign flag for the sine */
mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
mm0 = _mm_slli_pi32(mm0, 29);
mm1 = _mm_slli_pi32(mm1, 29);
v4sf swap_sign_bit_sin;
COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);
/* get the polynom selection mask for the sine */
mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
v4sf poly_mask;
COPY_MM_TO_XMM(mm2, mm3, poly_mask);
/* The magic pass: "Extended precision modular arithmetic"
x = ((x - y * DP1) - y * DP2) - y * DP3; */
xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
xmm1 = _mm_mul_ps(y, xmm1);
xmm2 = _mm_mul_ps(y, xmm2);
xmm3 = _mm_mul_ps(y, xmm3);
xx = _mm_add_ps(xx, xmm1);
xx = _mm_add_ps(xx, xmm2);
xx = _mm_add_ps(xx, xmm3);
/* get the sign flag for the cosine */
mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
mm4 = _mm_slli_pi32(mm4, 29);
mm5 = _mm_slli_pi32(mm5, 29);
v4sf sign_bit_cos;
COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
_mm_empty(); /* good-bye mmx */
sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
/* Evaluate the first polynom (0 <= x <= Pi/4) */
v4sf z = _mm_mul_ps(xx,xx);
y = *(v4sf*)_ps_coscof_p0;
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
y = _mm_mul_ps(y, z);
y = _mm_mul_ps(y, z);
v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
y = _mm_sub_ps(y, tmp);
y = _mm_add_ps(y, *(v4sf*)_ps_1);
/* Evaluate the second polynom (Pi/4 <= x <= 0) */
v4sf y2 = *(v4sf*)_ps_sincof_p0;
y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
y2 = _mm_mul_ps(y2, z);
y2 = _mm_mul_ps(y2, xx);
y2 = _mm_add_ps(y2, xx);
/* select the correct result from the two polynoms */
xmm3 = poly_mask;
v4sf ysin2 = _mm_and_ps(xmm3, y2);
v4sf ysin1 = _mm_andnot_ps(xmm3, y);
y2 = _mm_sub_ps(y2,ysin2);
y = _mm_sub_ps(y, ysin1);
xmm1 = _mm_add_ps(ysin1,ysin2);
xmm2 = _mm_add_ps(y,y2);
/* update the sign */
_mm_store_ss( s, _mm_xor_ps(xmm1, sign_bit_sin) );
_mm_store_ss( c, _mm_xor_ps(xmm2, sign_bit_cos) );
#else
#error "Not Implemented"
#endif
@ -462,9 +593,7 @@ void _SSE_SinCos(float x, float* s, float* c)
float _SSE_cos( float x )
{
#if defined ( _WIN64 )
return std::cos(x);
#elif defined( _WIN32 )
#ifdef _WIN32
float temp;
__asm
{
@ -513,8 +642,102 @@ float _SSE_cos( float x )
movss x, xmm0
}
#elif defined _LINUX || defined __APPLE__
// #warning "_SSE_cos NOT implemented!"
#elif POSIX
Assert( "Needs testing, verify impl!\n" );
v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
v2si mm0, mm1, mm2, mm3;
/* take the absolute value */
v4sf xx = _mm_load_ss( &x );
xx = _mm_and_ps(xx, *(v4sf*)_ps_inv_sign_mask);
/* scale by 4/Pi */
y = _mm_mul_ps(xx, *(v4sf*)_ps_cephes_FOPI);
/* store the integer part of y in mm2:mm3 */
xmm2 = _mm_movehl_ps(xmm2, y);
mm2 = _mm_cvttps_pi32(y);
mm3 = _mm_cvttps_pi32(xmm2);
/* j=(j+1) & (~1) (see the cephes sources) */
mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
y = _mm_cvtpi32x2_ps(mm2, mm3);
mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);
/* get the swap sign flag in mm0:mm1 and the
polynom selection mask in mm2:mm3 */
mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
mm0 = _mm_slli_pi32(mm0, 29);
mm1 = _mm_slli_pi32(mm1, 29);
mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
v4sf sign_bit, poly_mask;
COPY_MM_TO_XMM(mm0, mm1, sign_bit);
COPY_MM_TO_XMM(mm2, mm3, poly_mask);
_mm_empty(); /* good-bye mmx */
/* The magic pass: "Extended precision modular arithmetic"
x = ((x - y * DP1) - y * DP2) - y * DP3; */
xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
xmm1 = _mm_mul_ps(y, xmm1);
xmm2 = _mm_mul_ps(y, xmm2);
xmm3 = _mm_mul_ps(y, xmm3);
xx = _mm_add_ps(xx, xmm1);
xx = _mm_add_ps(xx, xmm2);
xx = _mm_add_ps(xx, xmm3);
/* Evaluate the first polynom (0 <= x <= Pi/4) */
y = *(v4sf*)_ps_coscof_p0;
v4sf z = _mm_mul_ps(xx,xx);
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
y = _mm_mul_ps(y, z);
y = _mm_mul_ps(y, z);
v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
y = _mm_sub_ps(y, tmp);
y = _mm_add_ps(y, *(v4sf*)_ps_1);
/* Evaluate the second polynom (Pi/4 <= x <= 0) */
v4sf y2 = *(v4sf*)_ps_sincof_p0;
y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
y2 = _mm_mul_ps(y2, z);
y2 = _mm_mul_ps(y2, xx);
y2 = _mm_add_ps(y2, xx);
/* select the correct result from the two polynoms */
xmm3 = poly_mask;
y2 = _mm_and_ps(xmm3, y2); //, xmm3);
y = _mm_andnot_ps(xmm3, y);
y = _mm_add_ps(y,y2);
/* update the sign */
_mm_store_ss( &x, _mm_xor_ps(y, sign_bit) );
#else
#error "Not Implemented"
#endif
@ -525,12 +748,10 @@ float _SSE_cos( float x )
//-----------------------------------------------------------------------------
// SSE2 implementations of optimized routines:
//-----------------------------------------------------------------------------
#ifdef PLATFORM_WINDOWS_PC32
void _SSE2_SinCos(float x, float* s, float* c) // any x
{
#if defined( _WIN64 )
*s = std::sin(x);
*c = std::cos(x);
#elif defined( _WIN32 )
#ifdef _WIN32
__asm
{
movss xmm0, x
@ -606,18 +827,19 @@ void _SSE2_SinCos(float x, float* s, float* c) // any x
movss [eax], xmm0
movss [edx], xmm6
}
#elif defined _LINUX || defined __APPLE__
// #warning "_SSE2_SinCos NOT implemented!"
#elif POSIX
#warning "_SSE2_SinCos NOT implemented!"
Assert( 0 );
#else
#error "Not Implemented"
#endif
}
#endif // PLATFORM_WINDOWS_PC32
#ifdef PLATFORM_WINDOWS_PC32
float _SSE2_cos(float x)
{
#if defined ( _WIN64 )
return std::cos(x);
#elif defined( _WIN32 )
#ifdef _WIN32
__asm
{
movss xmm0, x
@ -663,25 +885,25 @@ float _SSE2_cos(float x)
mulss xmm0, xmm1
movss x, xmm0
}
#elif defined _LINUX || defined __APPLE__
// #warning "_SSE2_cos NOT implemented!"
#elif POSIX
#warning "_SSE2_cos NOT implemented!"
Assert( 0 );
#else
#error "Not Implemented"
#endif
return x;
}
#endif // PLATFORM_WINDOWS_PC32
#if 0
// SSE Version of VectorTransform
void VectorTransformSSE(const float *in1, const matrix3x4_t& in2, float *out1)
{
Assert( s_bMathlibInitialized );
Assert( in1 != out1 );
#if defined ( _WIN64 )
out1[0] = DotProduct(in1, in2[0]) + in2[0][3];
out1[1] = DotProduct(in1, in2[1]) + in2[1][3];
out1[2] = DotProduct(in1, in2[2]) + in2[2][3];
#elif defined( _WIN32 )
#ifdef _WIN32
__asm
{
mov eax, in1;
@ -723,8 +945,8 @@ void VectorTransformSSE(const float *in1, const matrix3x4_t& in2, float *out1)
addss xmm0, [ecx+12]
movss [edx+8], xmm0;
}
#elif defined _LINUX || defined __APPLE__
// #warning "VectorTransformSSE C implementation only"
#elif POSIX
#warning "VectorTransformSSE C implementation only"
out1[0] = DotProduct(in1, in2[0]) + in2[0][3];
out1[1] = DotProduct(in1, in2[1]) + in2[1][3];
out1[2] = DotProduct(in1, in2[2]) + in2[2][3];
@ -732,16 +954,15 @@ void VectorTransformSSE(const float *in1, const matrix3x4_t& in2, float *out1)
#error "Not Implemented"
#endif
}
#endif
#if 0
void VectorRotateSSE( const float *in1, const matrix3x4_t& in2, float *out1 )
{
Assert( s_bMathlibInitialized );
Assert( in1 != out1 );
#if defined ( _WIN64 )
out1[0] = DotProduct( in1, in2[0] );
out1[1] = DotProduct( in1, in2[1] );
out1[2] = DotProduct( in1, in2[2] );
#elif defined( _WIN32 )
#ifdef _WIN32
__asm
{
mov eax, in1;
@ -780,8 +1001,8 @@ void VectorRotateSSE( const float *in1, const matrix3x4_t& in2, float *out1 )
addss xmm0, xmm2;
movss [edx+8], xmm0;
}
#elif defined _LINUX || defined __APPLE__
// #warning "VectorRotateSSE C implementation only"
#elif POSIX
#warning "VectorRotateSSE C implementation only"
out1[0] = DotProduct( in1, in2[0] );
out1[1] = DotProduct( in1, in2[1] );
out1[2] = DotProduct( in1, in2[2] );
@ -789,8 +1010,9 @@ void VectorRotateSSE( const float *in1, const matrix3x4_t& in2, float *out1 )
#error "Not Implemented"
#endif
}
#endif
#if defined( _WIN32 ) && !defined( _WIN64 )
#ifdef _WIN32
void _declspec(naked) _SSE_VectorMA( const float *start, float scale, const float *direction, float *dest )
{
// FIXME: This doesn't work!! It will overwrite memory in the write to dest
@ -821,7 +1043,7 @@ void _declspec(naked) _SSE_VectorMA( const float *start, float scale, const floa
}
#endif
#if defined( _WIN32 ) && !defined( _WIN64 )
#ifdef _WIN32
#ifdef PFN_VECTORMA
void _declspec(naked) __cdecl _SSE_VectorMA( const Vector &start, float scale, const Vector &direction, Vector &dest )
{
@ -886,4 +1108,6 @@ vec_t DotProduct (const vec_t *a, const vec_t *c)
ret
}
}
*/
*/
#endif // COMPILER_MSVC64