Update to current-ish.

2025-09-20 12:36:05 +08:00 · 2016-08-23 21:38:05 -04:00
parent badee7a0de
commit 737599c056
49 changed files with 1912 additions and 2586 deletions
--- a/mathlib/sse.cpp
+++ b/mathlib/sse.cpp
@ -35,18 +35,18 @@ static const uint32 _sincos_inv_masks[] = { (uint32)~0x0, (uint32)0x0 };

 	#define _PS_CONST(Name, Val) \
 		static const __declspec(align(16)) float _ps_##Name[4] = { Val, Val, Val, Val }
-#elif defined _LINUX || defined __APPLE__
+#elif POSIX
 	#define _PS_EXTERN_CONST(Name, Val) \
-		const __attribute__((aligned(16))) float _ps_##Name[4] = { Val, Val, Val, Val }
+		const float _ps_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val }

 	#define _PS_EXTERN_CONST_TYPE(Name, Type, Val) \
-		const __attribute__((aligned(16))) Type _ps_##Name[4] = { Val, Val, Val, Val }; \
+		const Type _ps_##Name[4]  __attribute__((aligned(16))) = { Val, Val, Val, Val }; \

 	#define _EPI32_CONST(Name, Val) \
-		static const __attribute__((aligned(16))) int32 _epi32_##Name[4] = { Val, Val, Val, Val }
+		static const int32 _epi32_##Name[4]  __attribute__((aligned(16))) = { Val, Val, Val, Val }

 	#define _PS_CONST(Name, Val) \
-		static const __attribute__((aligned(16))) float _ps_##Name[4] = { Val, Val, Val, Val }
+		static const float _ps_##Name[4]  __attribute__((aligned(16))) = { Val, Val, Val, Val }
 #endif

 _PS_EXTERN_CONST(am_0, 0.0f);
@ -90,14 +90,8 @@ float _SSE_Sqrt(float x)
 		sqrtss		xmm0, x
 		movss		root, xmm0
 	}
-#elif defined _LINUX || defined __APPLE__
-	__asm__ __volatile__(
-		"movss %1,%%xmm2\n"
-		"sqrtss %%xmm2,%%xmm1\n"
-		"movss %%xmm1,%0"
-       	: "=m" (root)
-		: "m" (x)
-	);
+#elif POSIX
+	_mm_store_ss( &root, _mm_sqrt_ss( _mm_load_ss( &x ) ) );
 #endif
 	return root;
 }
@ -120,14 +114,21 @@ float _SSE_RSqrtAccurate(float x)
 	return (0.5f * rroot) * (3.f - (x * rroot) * rroot);
 }
 #else
+
+#ifdef POSIX
+const __m128  f3  = _mm_set_ss(3.0f);  // 3.0f in the low lane; the "3" term of the refinement step in the POSIX _SSE_RSqrtAccurate
+const __m128  f05 = _mm_set_ss(0.5f);  // 0.5f in the low lane; the "0.5" factor of the refinement step in the POSIX _SSE_RSqrtAccurate
+#endif
+
 // Intel / Kipps SSE RSqrt.  Significantly faster than above.
 float _SSE_RSqrtAccurate(float a)
 {
+
+#ifdef _WIN32
 	float x;
 	float half = 0.5f;
 	float three = 3.f;

-#ifdef _WIN32
 	__asm
 	{
 		movss   xmm3, a;
@ -143,26 +144,25 @@ float _SSE_RSqrtAccurate(float a)

 		movss   x,    xmm1;
 	}
-#elif defined _LINUX || defined __APPLE__
-	__asm__ __volatile__(
-		"movss   %1, %%xmm3 \n\t"
-        "movss   %2, %%xmm1 \n\t"
-        "movss   %3, %%xmm2 \n\t"
-        "rsqrtss %%xmm3, %%xmm0 \n\t"
-        "mulss   %%xmm0, %%xmm3 \n\t"
-        "mulss   %%xmm0, %%xmm1 \n\t"
-        "mulss   %%xmm0, %%xmm3 \n\t"
-        "subss   %%xmm3, %%xmm2 \n\t"
-        "mulss   %%xmm2, %%xmm1 \n\t"
-        "movss   %%xmm1, %0 \n\t"
-		: "=m" (x)
-		: "m" (a), "m" (half), "m" (three)
-);
+
+	return x;
+#elif POSIX	
+	__m128  xx = _mm_load_ss( &a );
+    __m128  xr = _mm_rsqrt_ss( xx );
+    __m128  xt;
+	// Refine the rsqrtss estimate in the low lane: xr = xr * 0.5 * (3 - a * xr * xr).
+    xt = _mm_mul_ss( xr, xr );
+    xt = _mm_mul_ss( xt, xx );
+    xt = _mm_sub_ss( f3, xt );
+    xt = _mm_mul_ss( xt, f05 );
+    xr = _mm_mul_ss( xr, xt );
+	// The by-value parameter a is reused as storage for the result.
+    _mm_store_ss( &a, xr );
+    return a;
 #else
 	#error "Not Implemented"
 #endif

-	return x;
 }
 #endif

@ -179,14 +179,8 @@ float _SSE_RSqrtFast(float x)
 		rsqrtss	xmm0, x
 		movss	rroot, xmm0
 	}
-#elif defined _LINUX || defined __APPLE__
-	 __asm__ __volatile__(
-		"rsqrtss %1, %%xmm0 \n\t"
-		"movss %%xmm0, %0 \n\t"
-		: "=m" (x)
-		: "m" (rroot)
-		: "%xmm0"
-	);
+#elif POSIX
+	__asm__ __volatile__( "rsqrtss %1, %0" : "=x" (rroot) : "x" (x) );	// AT&T order is "src, dst": read x (%1), write rroot (%0)
 #else
 #error
 #endif
@ -202,11 +196,14 @@ float FASTCALL _SSE_VectorNormalize (Vector& vec)
 	// sice vec only has 3 floats, we can't "movaps" directly into it.
 #ifdef _WIN32
 	__declspec(align(16)) float result[4];
-#elif defined _LINUX || defined __APPLE__
-	__attribute__((aligned(16))) float result[4];
+#elif POSIX
+	 float result[4] __attribute__((aligned(16)));
 #endif

 	float *v = &vec[0];
+#ifdef _WIN32
+	float *r = &result[0];
+#endif

 	float	radius = 0.f;
 	// Blah, get rid of these comparisons ... in reality, if you have all 3 as zero, it shouldn't 
@ -214,7 +211,6 @@ float FASTCALL _SSE_VectorNormalize (Vector& vec)
 	if ( v[0] || v[1] || v[2] )
 	{
 #ifdef _WIN32
-	float *r = &result[0];
 	_asm
 		{
 			mov			eax, v
@ -239,7 +235,7 @@ float FASTCALL _SSE_VectorNormalize (Vector& vec)
 			mulps		xmm4, xmm1			// r4 = vx * 1/radius, vy * 1/radius, vz * 1/radius, X
 			movaps		[edx], xmm4			// v = vx * 1/radius, vy * 1/radius, vz * 1/radius, X
 		}
-#elif defined _LINUX || defined __APPLE__
+#elif POSIX
 		__asm__ __volatile__(
 #ifdef ALIGNED_VECTOR
            "movaps          %2, %%xmm4 \n\t"
@ -262,6 +258,7 @@ float FASTCALL _SSE_VectorNormalize (Vector& vec)
            "movaps          %%xmm4, %1 \n\t"
            : "=m" (radius), "=m" (result)
            : "m" (*v)
+            : "xmm1", "xmm2", "xmm3", "xmm4"
 		);
 #else
 	#error "Not Implemented"
@ -303,12 +300,13 @@ float _SSE_InvRSquared(const float* v)
 		shufps		xmm2, xmm2, 1		// x2 = vy * vy, X, X, X
 		addss		xmm1, xmm2			// x1 = (vx * vx) + (vy * vy), X, X, X
 		addss		xmm1, xmm3			// x1 = (vx * vx) + (vy * vy) + (vz * vz), X, X, X
-		maxss		xmm1, xmm5			// x1 = MAX( 1.0, x1 )
-		rcpss		xmm0, xmm1			// x0 = 1 / MAX( 1.0, x1 )
+		maxss		xmm1, xmm5			// x1 = max( 1.0, x1 )
+		rcpss		xmm0, xmm1			// x0 = 1 / max( 1.0, x1 )
 		movss		inv_r2, xmm0		// inv_r2 = x0
 	}
-#elif defined _LINUX || defined __APPLE__
+#elif POSIX
 		__asm__ __volatile__(
+		"movss			 %0, %%xmm5 \n\t"
 #ifdef ALIGNED_VECTOR
 		"movaps          %1, %%xmm4 \n\t"
 #else
@ -324,8 +322,9 @@ float _SSE_InvRSquared(const float* v)
 		"maxss           %%xmm5, %%xmm1 \n\t"
        "rcpss           %%xmm1, %%xmm0 \n\t"
 		"movss           %%xmm0, %0 \n\t" 
-        : "=m" (inv_r2)
-        : "m" (*v), "m" (inv_r2)
+        : "+m" (inv_r2)
+        : "m" (*v)
+        : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
 		);
 #else
 	#error "Not Implemented"
@ -334,6 +333,48 @@ float _SSE_InvRSquared(const float* v)
 	return inv_r2;
 }

+
+#ifdef POSIX
+// #define _PS_CONST(Name, Val) static const ALIGN16 float _ps_##Name[4] ALIGN16_POST = { Val, Val, Val, Val }
+#define _PS_CONST_TYPE(Name, Type, Val) static const ALIGN16 Type _ps_##Name[4] ALIGN16_POST = { Val, Val, Val, Val }
+// Sign-bit mask and its complement; the POSIX sin/cos code ANDs them with floats to extract the sign or take the absolute value.
+_PS_CONST_TYPE(sign_mask, int, (int)0x80000000);	// cast needed: 0x80000000 is unsigned, and unsigned -> int in a braced initializer is a narrowing conversion
+_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);
+
+// Defines a four-element int table named _pi32_<Name>, every element equal to Val.
+#define _PI32_CONST(Name, Val)  static const ALIGN16 int _pi32_##Name[4]  ALIGN16_POST = { Val, Val, Val, Val }
+
+_PI32_CONST(1, 1);
+_PI32_CONST(inv1, ~1);
+_PI32_CONST(2, 2);
+_PI32_CONST(4, 4);
+_PI32_CONST(0x7f, 0x7f);
+_PS_CONST(1  , 1.0f);
+_PS_CONST(0p5, 0.5f);
+// Negated DP1..DP3 range-reduction terms, then the sine (sincof) and cosine (coscof) polynomial coefficients.
+_PS_CONST(minus_cephes_DP1, -0.78515625);
+_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
+_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
+_PS_CONST(sincof_p0, -1.9515295891E-4);
+_PS_CONST(sincof_p1,  8.3321608736E-3);
+_PS_CONST(sincof_p2, -1.6666654611E-1);
+_PS_CONST(coscof_p0,  2.443315711809948E-005);
+_PS_CONST(coscof_p1, -1.388731625493765E-003);
+_PS_CONST(coscof_p2,  4.166664568298827E-002);
+_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
+// Overlays one __m128 with two __m64 halves so a pair of MMX values can be read back as an SSE register.
+typedef union xmm_mm_union {
+	__m128 xmm;
+	__m64 mm[2];
+} xmm_mm_union;
+
+#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; }
+
+typedef __m128 v4sf;  // vector of 4 float (sse1)
+typedef __m64 v2si;   // vector of 2 int (mmx)
+
+#endif
+
 void _SSE_SinCos(float x, float* s, float* c)
 {
 #ifdef _WIN32
@ -421,8 +462,121 @@ void _SSE_SinCos(float x, float* s, float* c)
 		movss	[eax], xmm0
 		movss	[edx], xmm4
 	}
-#elif defined _LINUX || defined __APPLE__
-//	#warning "_SSE_sincos NOT implemented!"
+#elif POSIX
+	// POSIX path: evaluates sine and cosine of x together with SSE/MMX intrinsics; only the low lane is stored to *s and *c.
+	Assert( "Needs testing, verify impl!\n" );
+	// NOTE(review): the Assert argument is a string literal (never null), so it presumably never fires -- confirm against the Assert macro.
+	v4sf  xx = _mm_load_ss( &x );
+	
+	v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
+	v2si mm0, mm1, mm2, mm3, mm4, mm5;
+	sign_bit_sin = xx;
+	/* take the absolute value */
+	xx = _mm_and_ps(xx, *(v4sf*)_ps_inv_sign_mask);
+	/* extract the sign bit (upper one) */
+	sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);
+	
+	/* scale by 4/Pi */
+	y = _mm_mul_ps(xx, *(v4sf*)_ps_cephes_FOPI);
+	
+	/* store the integer part of y in mm2:mm3 */
+	xmm3 = _mm_movehl_ps(xmm3, y);
+	mm2 = _mm_cvttps_pi32(y);
+	mm3 = _mm_cvttps_pi32(xmm3);
+	
+	/* j=(j+1) & (~1) (see the cephes sources) */
+	mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
+	mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
+	mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
+	mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
+	
+	y = _mm_cvtpi32x2_ps(mm2, mm3);
+	/* keep a copy of j in mm4:mm5 for the cosine sign computed further down */
+	mm4 = mm2;
+	mm5 = mm3;
+	
+	/* get the swap sign flag for the sine */
+	mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
+	mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
+	mm0 = _mm_slli_pi32(mm0, 29);
+	mm1 = _mm_slli_pi32(mm1, 29);
+	v4sf swap_sign_bit_sin;
+	COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);
+	
+	/* get the polynomial selection mask for the sine */
+	
+	mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
+	mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
+	mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+	mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+	v4sf poly_mask;
+	COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+	
+	/* The magic pass: "Extended precision modular arithmetic" 
+	 x = ((x - y * DP1) - y * DP2) - y * DP3; (the tables hold the negated DPn, hence the adds below) */
+	xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
+	xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
+	xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
+	xmm1 = _mm_mul_ps(y, xmm1);
+	xmm2 = _mm_mul_ps(y, xmm2);
+	xmm3 = _mm_mul_ps(y, xmm3);
+	xx = _mm_add_ps(xx, xmm1);
+	xx = _mm_add_ps(xx, xmm2);
+	xx = _mm_add_ps(xx, xmm3);
+	
+	/* get the sign flag for the cosine */
+	mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
+	mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
+	mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
+	mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
+	mm4 = _mm_slli_pi32(mm4, 29);
+	mm5 = _mm_slli_pi32(mm5, 29);
+	v4sf sign_bit_cos;
+	COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
+	_mm_empty(); /* good-bye mmx */
+	
+	sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
+	
+	
+	/* Evaluate the first polynomial, built from the coscof coefficients (0 <= x <= Pi/4) */
+	v4sf z = _mm_mul_ps(xx,xx);
+	y = *(v4sf*)_ps_coscof_p0;
+	
+	y = _mm_mul_ps(y, z);
+	y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
+	y = _mm_mul_ps(y, z);
+	y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
+	y = _mm_mul_ps(y, z);
+	y = _mm_mul_ps(y, z);
+	v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
+	y = _mm_sub_ps(y, tmp);
+	y = _mm_add_ps(y, *(v4sf*)_ps_1);
+	
+	/* Evaluate the second polynomial, built from the sincof coefficients (range as written upstream: Pi/4 <= x <= 0) */
+	
+	v4sf y2 = *(v4sf*)_ps_sincof_p0;
+	y2 = _mm_mul_ps(y2, z);
+	y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
+	y2 = _mm_mul_ps(y2, z);
+	y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
+	y2 = _mm_mul_ps(y2, z);
+	y2 = _mm_mul_ps(y2, xx);
+	y2 = _mm_add_ps(y2, xx);
+	
+	/* select the correct result from the two polynomials */  
+	xmm3 = poly_mask;
+	v4sf ysin2 = _mm_and_ps(xmm3, y2);
+	v4sf ysin1 = _mm_andnot_ps(xmm3, y);
+	y2 = _mm_sub_ps(y2,ysin2);
+	y = _mm_sub_ps(y, ysin1);
+	/* xmm1 = the masked selection (sine); xmm2 = whatever was left unselected in y and y2 (cosine) */
+	xmm1 = _mm_add_ps(ysin1,ysin2);
+	xmm2 = _mm_add_ps(y,y2);
+	
+	/* update the sign */
+	_mm_store_ss( s, _mm_xor_ps(xmm1, sign_bit_sin) );
+	_mm_store_ss( c, _mm_xor_ps(xmm2, sign_bit_cos) );
+
 #else
 	#error "Not Implemented"
 #endif
@ -479,8 +633,102 @@ float _SSE_cos( float x )
 		movss   x,    xmm0

 	}
-#elif defined _LINUX || defined __APPLE__
-//	#warning "_SSE_cos NOT implemented!"
+#elif POSIX
+	// POSIX path: evaluates the cosine of x with SSE/MMX intrinsics and writes the low-lane result back into x.
+	Assert( "Needs testing, verify impl!\n" );
+	// NOTE(review): the Assert argument is a string literal (never null), so it presumably never fires -- confirm against the Assert macro.
+	v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
+	v2si mm0, mm1, mm2, mm3;
+	/* take the absolute value */
+	v4sf  xx = _mm_load_ss( &x );
+
+	xx = _mm_and_ps(xx, *(v4sf*)_ps_inv_sign_mask);
+		
+	/* scale by 4/Pi */
+	y = _mm_mul_ps(xx, *(v4sf*)_ps_cephes_FOPI);
+	
+	/* store the integer part of y in mm2:mm3 */
+	xmm2 = _mm_movehl_ps(xmm2, y);
+	mm2 = _mm_cvttps_pi32(y);
+	mm3 = _mm_cvttps_pi32(xmm2);
+	
+	/* j=(j+1) & (~1) (see the cephes sources) */
+	mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
+	mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
+	mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
+	mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
+	
+	y = _mm_cvtpi32x2_ps(mm2, mm3);
+	
+	
+	mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
+	mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);
+	
+	/* get the swap sign flag in mm0:mm1 and the 
+	 polynomial selection mask in mm2:mm3 */
+	
+	mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
+	mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
+	mm0 = _mm_slli_pi32(mm0, 29);
+	mm1 = _mm_slli_pi32(mm1, 29);
+	
+	mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
+	mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
+	
+	mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+	mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+	
+	v4sf sign_bit, poly_mask;
+	COPY_MM_TO_XMM(mm0, mm1, sign_bit);
+	COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+	_mm_empty(); /* good-bye mmx */
+
+	/* The magic pass: "Extended precision modular arithmetic" 
+	 x = ((x - y * DP1) - y * DP2) - y * DP3; (the tables hold the negated DPn, hence the adds below) */
+	xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
+	xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
+	xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
+	xmm1 = _mm_mul_ps(y, xmm1);
+	xmm2 = _mm_mul_ps(y, xmm2);
+	xmm3 = _mm_mul_ps(y, xmm3);
+	xx = _mm_add_ps(xx, xmm1);
+	xx = _mm_add_ps(xx, xmm2);
+	xx = _mm_add_ps(xx, xmm3);
+	
+	/* Evaluate the first polynomial, built from the coscof coefficients (0 <= x <= Pi/4) */
+	y = *(v4sf*)_ps_coscof_p0;
+	v4sf z = _mm_mul_ps(xx,xx);
+	
+	y = _mm_mul_ps(y, z);
+	y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
+	y = _mm_mul_ps(y, z);
+	y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
+	y = _mm_mul_ps(y, z);
+	y = _mm_mul_ps(y, z);
+	v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
+	y = _mm_sub_ps(y, tmp);
+	y = _mm_add_ps(y, *(v4sf*)_ps_1);
+	
+	/* Evaluate the second polynomial, built from the sincof coefficients (range as written upstream: Pi/4 <= x <= 0) */
+	
+	v4sf y2 = *(v4sf*)_ps_sincof_p0;
+	y2 = _mm_mul_ps(y2, z);
+	y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
+	y2 = _mm_mul_ps(y2, z);
+	y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
+	y2 = _mm_mul_ps(y2, z);
+	y2 = _mm_mul_ps(y2, xx);
+	y2 = _mm_add_ps(y2, xx);
+	
+	/* select the correct result from the two polynomials */  
+	xmm3 = poly_mask;
+	y2 = _mm_and_ps(xmm3, y2); // keep y2 in the lanes where poly_mask is set
+	y = _mm_andnot_ps(xmm3, y);
+	y = _mm_add_ps(y,y2);
+	/* update the sign */
+
+	_mm_store_ss( &x, _mm_xor_ps(y, sign_bit) );
+
 #else
 	#error "Not Implemented"
 #endif
@ -491,6 +739,7 @@ float _SSE_cos( float x )
 //-----------------------------------------------------------------------------
 // SSE2 implementations of optimized routines:
 //-----------------------------------------------------------------------------
+#ifdef PLATFORM_WINDOWS_PC32
 void _SSE2_SinCos(float x, float* s, float* c)  // any x
 {
 #ifdef _WIN32
@ -569,13 +818,16 @@ void _SSE2_SinCos(float x, float* s, float* c)  // any x
 		movss	[eax], xmm0
 		movss	[edx], xmm6
 	}
-#elif defined _LINUX || defined __APPLE__
-//	#warning "_SSE2_SinCos NOT implemented!"
+#elif POSIX
+	#warning "_SSE2_SinCos NOT implemented!"
+	Assert( 0 );
 #else
 	#error "Not Implemented"
 #endif
 }
+#endif // PLATFORM_WINDOWS_PC32

+#ifdef PLATFORM_WINDOWS_PC32
 float _SSE2_cos(float x)  
 {
 #ifdef _WIN32
@ -624,15 +876,18 @@ float _SSE2_cos(float x)
 		mulss	xmm0, xmm1
 		movss   x,    xmm0
 	}
-#elif defined _LINUX || defined __APPLE__
-//	#warning "_SSE2_cos NOT implemented!"
+#elif POSIX
+	#warning "_SSE2_cos NOT implemented!"
+	Assert( 0 );
 #else
 	#error "Not Implemented"
 #endif

 	return x;
 }
+#endif // PLATFORM_WINDOWS_PC32

+#if 0
 // SSE Version of VectorTransform
 void VectorTransformSSE(const float *in1, const matrix3x4_t& in2, float *out1)
 {
@ -681,8 +936,8 @@ void VectorTransformSSE(const float *in1, const matrix3x4_t& in2, float *out1)
 		addss xmm0, [ecx+12]
 		movss [edx+8], xmm0;
 	}
-#elif defined _LINUX || defined __APPLE__
-//	#warning "VectorTransformSSE C implementation only"
+#elif POSIX
+	#warning "VectorTransformSSE C implementation only"
 		out1[0] = DotProduct(in1, in2[0]) + in2[0][3];
 		out1[1] = DotProduct(in1, in2[1]) + in2[1][3];
 		out1[2] = DotProduct(in1, in2[2]) + in2[2][3];
@ -690,7 +945,9 @@ void VectorTransformSSE(const float *in1, const matrix3x4_t& in2, float *out1)
 	#error "Not Implemented"
 #endif
 }
+#endif

+#if 0
 void VectorRotateSSE( const float *in1, const matrix3x4_t& in2, float *out1 )
 {
 	Assert( s_bMathlibInitialized );
@ -735,8 +992,8 @@ void VectorRotateSSE( const float *in1, const matrix3x4_t& in2, float *out1 )
 		addss xmm0, xmm2;
 		movss [edx+8], xmm0;
 	}
-#elif defined _LINUX || defined __APPLE__
-//	#warning "VectorRotateSSE C implementation only"
+#elif POSIX
+	#warning "VectorRotateSSE C implementation only"
 		out1[0] = DotProduct( in1, in2[0] );
 		out1[1] = DotProduct( in1, in2[1] );
 		out1[2] = DotProduct( in1, in2[2] );
@ -744,6 +1001,7 @@ void VectorRotateSSE( const float *in1, const matrix3x4_t& in2, float *out1 )
 	#error "Not Implemented"
 #endif
 }
+#endif

 #ifdef _WIN32
 void _declspec(naked) _SSE_VectorMA( const float *start, float scale, const float *direction, float *dest )