Port GetCPUInformation and mathlib from sdk2013

2025-09-20 20:46:03 +08:00 · 2024-04-20 13:34:13 -04:00
parent b099570391
commit 0d247b9566
35 changed files with 3188 additions and 693 deletions
--- a/mathlib/sse.cpp
+++ b/mathlib/sse.cpp
@ -1,4 +1,4 @@
-//========= Copyright <20> 1996-2005, Valve Corporation, All rights reserved. ============//
+//========= Copyright <20> 1996-2005, Valve Corporation, All rights reserved. ============//
 //
 // Purpose: SSE Math primitives.
 //
@ -16,7 +16,10 @@
 // memdbgon must be the last include file in a .cpp file!!!
 #include "tier0/memdbgon.h"

-#if defined ( _WIN32 ) && !defined ( _WIN64 )
+#ifndef COMPILER_MSVC64
+// Everything down to the matching #endif is compiled out when COMPILER_MSVC64 is defined; implement for 64-bit Windows if needed.
+
+#ifdef _WIN32
 static const uint32 _sincos_masks[]	  = { (uint32)0x0,  (uint32)~0x0 };
 static const uint32 _sincos_inv_masks[] = { (uint32)~0x0, (uint32)0x0 };
 #endif
@ -37,21 +40,21 @@ static const uint32 _sincos_inv_masks[] = { (uint32)~0x0, (uint32)0x0 };

 	#define _PS_CONST(Name, Val) \
 		static const __declspec(align(16)) float _ps_##Name[4] = { Val, Val, Val, Val }
-#elif defined _LINUX || defined __APPLE__
+#elif POSIX
 	#define _PS_EXTERN_CONST(Name, Val) \
-		const __attribute__((aligned(16))) float _ps_##Name[4] = { Val, Val, Val, Val }
+		const float _ps_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val }

 	#define _PS_EXTERN_CONST_TYPE(Name, Type, Val) \
-		const __attribute__((aligned(16))) Type _ps_##Name[4] = { Val, Val, Val, Val }; \
+		const Type _ps_##Name[4]  __attribute__((aligned(16))) = { Val, Val, Val, Val }; \

 	#define _EPI32_CONST(Name, Val) \
-		static const __attribute__((aligned(16))) int32 _epi32_##Name[4] = { Val, Val, Val, Val }
+		static const int32 _epi32_##Name[4]  __attribute__((aligned(16))) = { Val, Val, Val, Val }

 	#define _PS_CONST(Name, Val) \
-		static const __attribute__((aligned(16))) float _ps_##Name[4] = { Val, Val, Val, Val }
+		static const float _ps_##Name[4]  __attribute__((aligned(16))) = { Val, Val, Val, Val }
 #endif

-#if defined ( _WIN32 ) && !defined ( _WIN64 )
+#ifdef _WIN32
 _PS_EXTERN_CONST(am_0, 0.0f);
 _PS_EXTERN_CONST(am_1, 1.0f);
 _PS_EXTERN_CONST(am_m1, -1.0f);
@ -62,8 +65,8 @@ _PS_EXTERN_CONST(am_pi_o_2, (float)(M_PI / 2.0));
 _PS_EXTERN_CONST(am_2_o_pi, (float)(2.0 / M_PI));
 _PS_EXTERN_CONST(am_pi_o_4, (float)(M_PI / 4.0));
 _PS_EXTERN_CONST(am_4_o_pi, (float)(4.0 / M_PI));
-_PS_EXTERN_CONST_TYPE(am_sign_mask, int32, (int32)0x80000000);
-_PS_EXTERN_CONST_TYPE(am_inv_sign_mask, int32, ~0x80000000);
+_PS_EXTERN_CONST_TYPE(am_sign_mask, int32, static_cast<int32>(0x80000000));
+_PS_EXTERN_CONST_TYPE(am_inv_sign_mask, int32, static_cast<int32>(~0x80000000));
 _PS_EXTERN_CONST_TYPE(am_min_norm_pos,int32, 0x00800000);
 _PS_EXTERN_CONST_TYPE(am_mant_mask, int32, 0x7f800000);
 _PS_EXTERN_CONST_TYPE(am_inv_mant_mask, int32, ~0x7f800000);
@ -86,9 +89,6 @@ void  __cdecl _SSE_VectorMA( const float *start, float scale, const float *direc
 //-----------------------------------------------------------------------------
 float _SSE_Sqrt(float x)
 {
-#if defined( _WIN64 )
-	return std::sqrt(x);
-#else
 	Assert( s_bMathlibInitialized );
 	float	root = 0.f;
 #ifdef _WIN32
@ -97,17 +97,10 @@ float _SSE_Sqrt(float x)
 		sqrtss		xmm0, x
 		movss		root, xmm0
 	}
-#elif defined _LINUX || defined __APPLE__
-	__asm__ __volatile__(
-		"movss %1,%%xmm2\n"
-		"sqrtss %%xmm2,%%xmm1\n"
-		"movss %%xmm1,%0"
-       	: "=m" (root)
-		: "m" (x)
-	);
+#elif POSIX
+	_mm_store_ss( &root, _mm_sqrt_ss( _mm_load_ss( &x ) ) );
 #endif
 	return root;
-#endif // _WIN64
 }

 // Single iteration NewtonRaphson reciprocal square root:
@ -128,17 +121,21 @@ float _SSE_RSqrtAccurate(float x)
 	return (0.5f * rroot) * (3.f - (x * rroot) * rroot);
 }
 #else
+
+#ifdef POSIX
+const __m128  f3  = _mm_set_ss(3.0f);  // 3 as SSE value
+const __m128  f05 = _mm_set_ss(0.5f);  // 0.5 as SSE value
+#endif
+
 // Intel / Kipps SSE RSqrt.  Significantly faster than above.
 float _SSE_RSqrtAccurate(float a)
 {
-#if defined( _WIN64 )
-	return std::sqrt(a);
-#else
+
+#ifdef _WIN32
 	float x;
 	float half = 0.5f;
 	float three = 3.f;

-#ifdef _WIN32
 	__asm
 	{
 		movss   xmm3, a;
@ -154,27 +151,25 @@ float _SSE_RSqrtAccurate(float a)

 		movss   x,    xmm1;
 	}
-#elif defined _LINUX || defined __APPLE__
-	__asm__ __volatile__(
-		"movss   %1, %%xmm3 \n\t"
-        "movss   %2, %%xmm1 \n\t"
-        "movss   %3, %%xmm2 \n\t"
-        "rsqrtss %%xmm3, %%xmm0 \n\t"
-        "mulss   %%xmm0, %%xmm3 \n\t"
-        "mulss   %%xmm0, %%xmm1 \n\t"
-        "mulss   %%xmm0, %%xmm3 \n\t"
-        "subss   %%xmm3, %%xmm2 \n\t"
-        "mulss   %%xmm2, %%xmm1 \n\t"
-        "movss   %%xmm1, %0 \n\t"
-		: "=m" (x)
-		: "m" (a), "m" (half), "m" (three)
-);
+
+	return x;
+#elif POSIX	
+	__m128  xx = _mm_load_ss( &a );
+    __m128  xr = _mm_rsqrt_ss( xx );
+    __m128  xt;
+	
+    xt = _mm_mul_ss( xr, xr );
+    xt = _mm_mul_ss( xt, xx );
+    xt = _mm_sub_ss( f3, xt );
+    xt = _mm_mul_ss( xt, f05 );
+    xr = _mm_mul_ss( xr, xt );
+	
+    _mm_store_ss( &a, xr );
+    return a;
 #else
 	#error "Not Implemented"
 #endif

-	return x;
-#endif // _WIN64
 }
 #endif

@ -182,54 +177,40 @@ float _SSE_RSqrtAccurate(float a)
 // or so, so ok for closed transforms.  (ie, computing lighting normals)
 float _SSE_RSqrtFast(float x)
 {
-#if defined( _WIN64 )
-	return std::sqrt(x);
-#else
 	Assert( s_bMathlibInitialized );

-	float rroot = 0.0f;
+	float rroot;	// left uninitialized on purpose: each platform branch below writes it (other platforms hit #error)
 #ifdef _WIN32
 	_asm
 	{
 		rsqrtss	xmm0, x
 		movss	rroot, xmm0
 	}
-#elif defined _LINUX || defined __APPLE__
-	 __asm__ __volatile__(
-		"rsqrtss %1, %%xmm0 \n\t"
-		"movss %%xmm0, %0 \n\t"
-		: "=m" (x)
-		: "m" (rroot)
-		: "%xmm0"
-	);
+#elif POSIX
+	__asm__ __volatile__( "rsqrtss %1, %0" : "=x" (rroot) : "x" (x) );	// AT&T order is "src, dst": read x (%1), write rroot (%0)
 #else
 #error
 #endif

 	return rroot;
-#endif // _WIN64
 }

 float FASTCALL _SSE_VectorNormalize (Vector& vec)
 {
-#if defined( _WIN64 )
-	float l = std::sqrt(vec.x * vec.x + vec.y * vec.y + vec.z * vec.z);
-	vec.x /= l;
-	vec.y /= l;
-	vec.z /= l;
-	return l;
-#else
 	Assert( s_bMathlibInitialized );

 	// NOTE: This is necessary to prevent an memory overwrite...
 	// sice vec only has 3 floats, we can't "movaps" directly into it.
 #ifdef _WIN32
 	__declspec(align(16)) float result[4];
-#elif defined _LINUX || defined __APPLE__
-	__attribute__((aligned(16))) float result[4];
+#elif POSIX
+	 float result[4] __attribute__((aligned(16)));
 #endif

 	float *v = &vec[0];
+#ifdef _WIN32
+	float *r = &result[0];
+#endif

 	float	radius = 0.f;
 	// Blah, get rid of these comparisons ... in reality, if you have all 3 as zero, it shouldn't 
@ -237,7 +218,6 @@ float FASTCALL _SSE_VectorNormalize (Vector& vec)
 	if ( v[0] || v[1] || v[2] )
 	{
 #ifdef _WIN32
-	float *r = &result[0];
 	_asm
 		{
 			mov			eax, v
@ -262,7 +242,7 @@ float FASTCALL _SSE_VectorNormalize (Vector& vec)
 			mulps		xmm4, xmm1			// r4 = vx * 1/radius, vy * 1/radius, vz * 1/radius, X
 			movaps		[edx], xmm4			// v = vx * 1/radius, vy * 1/radius, vz * 1/radius, X
 		}
-#elif defined _LINUX || defined __APPLE__
+#elif POSIX
 		__asm__ __volatile__(
 #ifdef ALIGNED_VECTOR
            "movaps          %2, %%xmm4 \n\t"
@ -285,6 +265,7 @@ float FASTCALL _SSE_VectorNormalize (Vector& vec)
            "movaps          %%xmm4, %1 \n\t"
            : "=m" (radius), "=m" (result)
            : "m" (*v)
+            : "xmm1", "xmm2", "xmm3", "xmm4"
 		);
 #else
 	#error "Not Implemented"
@ -296,7 +277,6 @@ float FASTCALL _SSE_VectorNormalize (Vector& vec)
 	}

 	return radius;
-#endif // _WIN64
 }

 void FASTCALL _SSE_VectorNormalizeFast (Vector& vec)
@ -310,10 +290,6 @@ void FASTCALL _SSE_VectorNormalizeFast (Vector& vec)

 float _SSE_InvRSquared(const float* v)
 {
-#if defined( _WIN64 )
-	float	r2 = DotProduct(v, v);
-	return r2 < 1.f ? 1.f : 1/r2;
-#else
 	float	inv_r2 = 1.f;
 #ifdef _WIN32
 	_asm { // Intel SSE only routine
@ -331,12 +307,13 @@ float _SSE_InvRSquared(const float* v)
 		shufps		xmm2, xmm2, 1		// x2 = vy * vy, X, X, X
 		addss		xmm1, xmm2			// x1 = (vx * vx) + (vy * vy), X, X, X
 		addss		xmm1, xmm3			// x1 = (vx * vx) + (vy * vy) + (vz * vz), X, X, X
-		maxss		xmm1, xmm5			// x1 = MAX( 1.0, x1 )
-		rcpss		xmm0, xmm1			// x0 = 1 / MAX( 1.0, x1 )
+		maxss		xmm1, xmm5			// x1 = max( 1.0, x1 )
+		rcpss		xmm0, xmm1			// x0 = 1 / max( 1.0, x1 )
 		movss		inv_r2, xmm0		// inv_r2 = x0
 	}
-#elif defined _LINUX || defined __APPLE__
+#elif POSIX
 		__asm__ __volatile__(
+		"movss			 %0, %%xmm5 \n\t"
 #ifdef ALIGNED_VECTOR
 		"movaps          %1, %%xmm4 \n\t"
 #else
@ -352,23 +329,64 @@ float _SSE_InvRSquared(const float* v)
 		"maxss           %%xmm5, %%xmm1 \n\t"
        "rcpss           %%xmm1, %%xmm0 \n\t"
 		"movss           %%xmm0, %0 \n\t" 
-        : "=m" (inv_r2)
-        : "m" (*v), "m" (inv_r2)
+        : "+m" (inv_r2)
+        : "m" (*v)
+        : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
 		);
 #else
 	#error "Not Implemented"
 #endif

 	return inv_r2;
-#endif // _WIN64
 }

+
+#ifdef POSIX // constants, helper union and typedefs used by the POSIX branches of _SSE_SinCos / _SSE_cos
+// #define _PS_CONST(Name, Val) static const ALIGN16 float _ps_##Name[4] ALIGN16_POST = { Val, Val, Val, Val }
+#define _PS_CONST_TYPE(Name, Type, Val) static const ALIGN16 Type _ps_##Name[4] ALIGN16_POST = { static_cast<Type>(Val), static_cast<Type>(Val), static_cast<Type>(Val), static_cast<Type>(Val) }
+// Sign-bit masks: and-ing with sign_mask extracts the float sign bit, and-ing with inv_sign_mask clears it (absolute value).
+_PS_CONST_TYPE(sign_mask, int, 0x80000000);
+_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);
+
+// _PI32_CONST(Name, Val): static aligned array of four ints named _pi32_<Name>, every element equal to Val.
+#define _PI32_CONST(Name, Val)  static const ALIGN16 int _pi32_##Name[4]  ALIGN16_POST = { Val, Val, Val, Val }
+
+_PI32_CONST(1, 1);
+_PI32_CONST(inv1, ~1);
+_PI32_CONST(2, 2);
+_PI32_CONST(4, 4);
+#ifdef _WIN32 // NOTE(review): nested inside #ifdef POSIX, so this is compiled only when both macros are defined
+_PI32_CONST(0x7f, 0x7f);
+#endif
+_PS_CONST(1  , 1.0f);
+_PS_CONST(0p5, 0.5f);
+// Range-reduction constants (DP1..DP3, FOPI) and polynomial coefficients for the sine/cosine code below.
+_PS_CONST(minus_cephes_DP1, -0.78515625);
+_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
+_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
+_PS_CONST(sincof_p0, -1.9515295891E-4);
+_PS_CONST(sincof_p1,  8.3321608736E-3);
+_PS_CONST(sincof_p2, -1.6666654611E-1);
+_PS_CONST(coscof_p0,  2.443315711809948E-005);
+_PS_CONST(coscof_p1, -1.388731625493765E-003);
+_PS_CONST(coscof_p2,  4.166664568298827E-002);
+_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
+// Overlays one __m128 with two __m64 halves so that two MMX values can be reassembled into a single SSE value.
+typedef union xmm_mm_union {
+	__m128 xmm;
+	__m64 mm[2];
+} xmm_mm_union;
+
+#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; }
+
+typedef __m128 v4sf;  // vector of 4 float (sse1)
+typedef __m64 v2si;   // vector of 2 int (mmx)
+
+#endif // POSIX
+
 void _SSE_SinCos(float x, float* s, float* c)
 {
-#if defined( _WIN64 )
-	*s = std::sin(x);
-	*c = std::cos(x);
-#elif defined( _WIN32 )
+#ifdef _WIN32
 	float t4, t8, t12;

 	__asm
@ -453,8 +471,121 @@ void _SSE_SinCos(float x, float* s, float* c)
 		movss	[eax], xmm0
 		movss	[edx], xmm4
 	}
-#elif defined _LINUX || defined __APPLE__
-//	#warning "_SSE_sincos NOT implemented!"
+#elif POSIX
+	
+	Assert( "Needs testing, verify impl!\n" );
+	
+	v4sf  xx = _mm_load_ss( &x );
+	
+	v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
+	v2si mm0, mm1, mm2, mm3, mm4, mm5;
+	sign_bit_sin = xx;
+	/* take the absolute value */
+	xx = _mm_and_ps(xx, *(v4sf*)_ps_inv_sign_mask);
+	/* extract the sign bit (upper one) */
+	sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);
+	
+	/* scale by 4/Pi */
+	y = _mm_mul_ps(xx, *(v4sf*)_ps_cephes_FOPI);
+	
+	/* store the integer part of y in mm2:mm3 */
+	xmm3 = _mm_movehl_ps(xmm3, y);
+	mm2 = _mm_cvttps_pi32(y);
+	mm3 = _mm_cvttps_pi32(xmm3);
+	
+	/* j=(j+1) & (~1) (see the cephes sources) */
+	mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
+	mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
+	mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
+	mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
+	
+	y = _mm_cvtpi32x2_ps(mm2, mm3);
+	
+	mm4 = mm2;
+	mm5 = mm3;
+	
+	/* get the swap sign flag for the sine */
+	mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
+	mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
+	mm0 = _mm_slli_pi32(mm0, 29);
+	mm1 = _mm_slli_pi32(mm1, 29);
+	v4sf swap_sign_bit_sin;
+	COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);
+	
+	/* get the polynom selection mask for the sine */
+	
+	mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
+	mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
+	mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+	mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+	v4sf poly_mask;
+	COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+	
+	/* The magic pass: "Extended precision modular arithmetic" 
+	 x = ((x - y * DP1) - y * DP2) - y * DP3; */
+	xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
+	xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
+	xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
+	xmm1 = _mm_mul_ps(y, xmm1);
+	xmm2 = _mm_mul_ps(y, xmm2);
+	xmm3 = _mm_mul_ps(y, xmm3);
+	xx = _mm_add_ps(xx, xmm1);
+	xx = _mm_add_ps(xx, xmm2);
+	xx = _mm_add_ps(xx, xmm3);
+	
+	/* get the sign flag for the cosine */
+	mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
+	mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
+	mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
+	mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
+	mm4 = _mm_slli_pi32(mm4, 29);
+	mm5 = _mm_slli_pi32(mm5, 29);
+	v4sf sign_bit_cos;
+	COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
+	_mm_empty(); /* good-bye mmx */
+	
+	sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
+	
+	
+	/* Evaluate the first polynom  (0 <= x <= Pi/4) */
+	v4sf z = _mm_mul_ps(xx,xx);
+	y = *(v4sf*)_ps_coscof_p0;
+	
+	y = _mm_mul_ps(y, z);
+	y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
+	y = _mm_mul_ps(y, z);
+	y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
+	y = _mm_mul_ps(y, z);
+	y = _mm_mul_ps(y, z);
+	v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
+	y = _mm_sub_ps(y, tmp);
+	y = _mm_add_ps(y, *(v4sf*)_ps_1);
+	
+	/* Evaluate the second polynom  (Pi/4 <= x <= 0) */
+	
+	v4sf y2 = *(v4sf*)_ps_sincof_p0;
+	y2 = _mm_mul_ps(y2, z);
+	y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
+	y2 = _mm_mul_ps(y2, z);
+	y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
+	y2 = _mm_mul_ps(y2, z);
+	y2 = _mm_mul_ps(y2, xx);
+	y2 = _mm_add_ps(y2, xx);
+	
+	/* select the correct result from the two polynoms */  
+	xmm3 = poly_mask;
+	v4sf ysin2 = _mm_and_ps(xmm3, y2);
+	v4sf ysin1 = _mm_andnot_ps(xmm3, y);
+	y2 = _mm_sub_ps(y2,ysin2);
+	y = _mm_sub_ps(y, ysin1);
+	
+	xmm1 = _mm_add_ps(ysin1,ysin2);
+	xmm2 = _mm_add_ps(y,y2);
+	
+	/* update the sign */
+	_mm_store_ss( s, _mm_xor_ps(xmm1, sign_bit_sin) );
+	_mm_store_ss( c, _mm_xor_ps(xmm2, sign_bit_cos) );
+
 #else
 	#error "Not Implemented"
 #endif
@ -462,9 +593,7 @@ void _SSE_SinCos(float x, float* s, float* c)

 float _SSE_cos( float x )
 {
-#if defined ( _WIN64 )
-	return std::cos(x);
-#elif defined( _WIN32 )
+#ifdef _WIN32
 	float temp;
 	__asm
 	{
@ -513,8 +642,102 @@ float _SSE_cos( float x )
 		movss   x,    xmm0

 	}
-#elif defined _LINUX || defined __APPLE__
-//	#warning "_SSE_cos NOT implemented!"
+#elif POSIX
+
+	Assert( "Needs testing, verify impl!\n" );
+
+	v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
+	v2si mm0, mm1, mm2, mm3;
+	/* take the absolute value */
+	v4sf  xx = _mm_load_ss( &x );
+
+	xx = _mm_and_ps(xx, *(v4sf*)_ps_inv_sign_mask);
+		
+	/* scale by 4/Pi */
+	y = _mm_mul_ps(xx, *(v4sf*)_ps_cephes_FOPI);
+	
+	/* store the integer part of y in mm2:mm3 */
+	xmm2 = _mm_movehl_ps(xmm2, y);
+	mm2 = _mm_cvttps_pi32(y);
+	mm3 = _mm_cvttps_pi32(xmm2);
+	
+	/* j=(j+1) & (~1) (see the cephes sources) */
+	mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
+	mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
+	mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
+	mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
+	
+	y = _mm_cvtpi32x2_ps(mm2, mm3);
+	
+	
+	mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
+	mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);
+	
+	/* get the swap sign flag in mm0:mm1 and the 
+	 polynom selection mask in mm2:mm3 */
+	
+	mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
+	mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
+	mm0 = _mm_slli_pi32(mm0, 29);
+	mm1 = _mm_slli_pi32(mm1, 29);
+	
+	mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
+	mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
+	
+	mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+	mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+	
+	v4sf sign_bit, poly_mask;
+	COPY_MM_TO_XMM(mm0, mm1, sign_bit);
+	COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+	_mm_empty(); /* good-bye mmx */
+
+	/* The magic pass: "Extended precision modular arithmetic" 
+	 x = ((x - y * DP1) - y * DP2) - y * DP3; */
+	xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
+	xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
+	xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
+	xmm1 = _mm_mul_ps(y, xmm1);
+	xmm2 = _mm_mul_ps(y, xmm2);
+	xmm3 = _mm_mul_ps(y, xmm3);
+	xx = _mm_add_ps(xx, xmm1);
+	xx = _mm_add_ps(xx, xmm2);
+	xx = _mm_add_ps(xx, xmm3);
+	
+	/* Evaluate the first polynom  (0 <= x <= Pi/4) */
+	y = *(v4sf*)_ps_coscof_p0;
+	v4sf z = _mm_mul_ps(xx,xx);
+	
+	y = _mm_mul_ps(y, z);
+	y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
+	y = _mm_mul_ps(y, z);
+	y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
+	y = _mm_mul_ps(y, z);
+	y = _mm_mul_ps(y, z);
+	v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
+	y = _mm_sub_ps(y, tmp);
+	y = _mm_add_ps(y, *(v4sf*)_ps_1);
+	
+	/* Evaluate the second polynom  (Pi/4 <= x <= 0) */
+	
+	v4sf y2 = *(v4sf*)_ps_sincof_p0;
+	y2 = _mm_mul_ps(y2, z);
+	y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
+	y2 = _mm_mul_ps(y2, z);
+	y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
+	y2 = _mm_mul_ps(y2, z);
+	y2 = _mm_mul_ps(y2, xx);
+	y2 = _mm_add_ps(y2, xx);
+	
+	/* select the correct result from the two polynoms */  
+	xmm3 = poly_mask;
+	y2 = _mm_and_ps(xmm3, y2); //, xmm3);
+	y = _mm_andnot_ps(xmm3, y);
+	y = _mm_add_ps(y,y2);
+	/* update the sign */
+
+	_mm_store_ss( &x, _mm_xor_ps(y, sign_bit) );
+
 #else
 	#error "Not Implemented"
 #endif
@ -525,12 +748,10 @@ float _SSE_cos( float x )
 //-----------------------------------------------------------------------------
 // SSE2 implementations of optimized routines:
 //-----------------------------------------------------------------------------
+#ifdef PLATFORM_WINDOWS_PC32
 void _SSE2_SinCos(float x, float* s, float* c)  // any x
 {
-#if defined( _WIN64 )
-	*s = std::sin(x);
-	*c = std::cos(x);
-#elif defined( _WIN32 )
+#ifdef _WIN32
 	__asm
 	{
 		movss	xmm0, x
@ -606,18 +827,19 @@ void _SSE2_SinCos(float x, float* s, float* c)  // any x
 		movss	[eax], xmm0
 		movss	[edx], xmm6
 	}
-#elif defined _LINUX || defined __APPLE__
-//	#warning "_SSE2_SinCos NOT implemented!"
+#elif POSIX
+	#warning "_SSE2_SinCos NOT implemented!"
+	Assert( 0 );
 #else
 	#error "Not Implemented"
 #endif
 }
+#endif // PLATFORM_WINDOWS_PC32

+#ifdef PLATFORM_WINDOWS_PC32
 float _SSE2_cos(float x)  
 {
-#if defined ( _WIN64 )
-	return std::cos(x);
-#elif defined( _WIN32 )
+#ifdef _WIN32
 	__asm
 	{
 		movss	xmm0, x
@ -663,25 +885,25 @@ float _SSE2_cos(float x)
 		mulss	xmm0, xmm1
 		movss   x,    xmm0
 	}
-#elif defined _LINUX || defined __APPLE__
-//	#warning "_SSE2_cos NOT implemented!"
+#elif POSIX
+	#warning "_SSE2_cos NOT implemented!"
+	Assert( 0 );
 #else
 	#error "Not Implemented"
 #endif

 	return x;
 }
+#endif // PLATFORM_WINDOWS_PC32

+#if 0
 // SSE Version of VectorTransform
 void VectorTransformSSE(const float *in1, const matrix3x4_t& in2, float *out1)
 {
 	Assert( s_bMathlibInitialized );
 	Assert( in1 != out1 );
-#if defined ( _WIN64 )
-	out1[0] = DotProduct(in1, in2[0]) + in2[0][3];
-	out1[1] = DotProduct(in1, in2[1]) + in2[1][3];
-	out1[2] = DotProduct(in1, in2[2]) + in2[2][3];
-#elif defined( _WIN32 )
+
+#ifdef _WIN32
 	__asm
 	{
 		mov eax, in1;
@ -723,8 +945,8 @@ void VectorTransformSSE(const float *in1, const matrix3x4_t& in2, float *out1)
 		addss xmm0, [ecx+12]
 		movss [edx+8], xmm0;
 	}
-#elif defined _LINUX || defined __APPLE__
-//	#warning "VectorTransformSSE C implementation only"
+#elif POSIX
+	#warning "VectorTransformSSE C implementation only"
 		out1[0] = DotProduct(in1, in2[0]) + in2[0][3];
 		out1[1] = DotProduct(in1, in2[1]) + in2[1][3];
 		out1[2] = DotProduct(in1, in2[2]) + in2[2][3];
@ -732,16 +954,15 @@ void VectorTransformSSE(const float *in1, const matrix3x4_t& in2, float *out1)
 	#error "Not Implemented"
 #endif
 }
+#endif

+#if 0
 void VectorRotateSSE( const float *in1, const matrix3x4_t& in2, float *out1 )
 {
 	Assert( s_bMathlibInitialized );
 	Assert( in1 != out1 );
-#if defined ( _WIN64 )
-	out1[0] = DotProduct( in1, in2[0] );
-	out1[1] = DotProduct( in1, in2[1] );
-	out1[2] = DotProduct( in1, in2[2] );
-#elif defined( _WIN32 )
+
+#ifdef _WIN32
 	__asm
 	{
 		mov eax, in1;
@ -780,8 +1001,8 @@ void VectorRotateSSE( const float *in1, const matrix3x4_t& in2, float *out1 )
 		addss xmm0, xmm2;
 		movss [edx+8], xmm0;
 	}
-#elif defined _LINUX || defined __APPLE__
-//	#warning "VectorRotateSSE C implementation only"
+#elif POSIX
+	#warning "VectorRotateSSE C implementation only"
 		out1[0] = DotProduct( in1, in2[0] );
 		out1[1] = DotProduct( in1, in2[1] );
 		out1[2] = DotProduct( in1, in2[2] );
@ -789,8 +1010,9 @@ void VectorRotateSSE( const float *in1, const matrix3x4_t& in2, float *out1 )
 	#error "Not Implemented"
 #endif
 }
+#endif

-#if defined( _WIN32 ) && !defined( _WIN64 )
+#ifdef _WIN32
 void _declspec(naked) _SSE_VectorMA( const float *start, float scale, const float *direction, float *dest )
 {
 	// FIXME: This don't work!! It will overwrite memory in the write to dest
@ -821,7 +1043,7 @@ void _declspec(naked) _SSE_VectorMA( const float *start, float scale, const floa
 }
 #endif

-#if defined( _WIN32 ) && !defined( _WIN64 )
+#ifdef _WIN32
 #ifdef PFN_VECTORMA
 void _declspec(naked) __cdecl _SSE_VectorMA( const Vector &start, float scale, const Vector &direction, Vector &dest )
 {
@ -886,4 +1108,6 @@ vec_t DotProduct (const vec_t *a, const vec_t *c)
 		ret
 	}
 }
-*/
+*/
+
+#endif // COMPILER_MSVC64