diff --git a/lib/linux64/mathlib.a b/lib/linux64/mathlib.a
index 5f3e4e2a..a8b9de60 100644
Binary files a/lib/linux64/mathlib.a and b/lib/linux64/mathlib.a differ
diff --git a/lib/public/win64/mathlib.lib b/lib/public/win64/mathlib.lib
index b5a658dd..fe1e014c 100644
Binary files a/lib/public/win64/mathlib.lib and b/lib/public/win64/mathlib.lib differ
diff --git a/mathlib/mathlib_base.cpp b/mathlib/mathlib_base.cpp
index 06e791ba..fb519a31 100644
--- a/mathlib/mathlib_base.cpp
+++ b/mathlib/mathlib_base.cpp
@@ -80,8 +80,6 @@ float VectorNormalize (Vector& vec)
 	return radius;
 }
 
-
-
 // TODO: Add fast C VectorNormalizeFast.
 // Perhaps use approximate rsqrt trick, if the accuracy isn't too bad.
 void FASTCALL _VectorNormalizeFast (Vector& vec)
@@ -427,6 +425,33 @@ void MatrixSetColumn( const Vector &in, int column, matrix3x4_t& out )
 	out[2][column] = in.z;
 }
 
+void MatrixScaleBy ( const float flScale, matrix3x4_t &out )
+{
+	out[0][0] *= flScale;
+	out[1][0] *= flScale;
+	out[2][0] *= flScale;
+	out[0][1] *= flScale;
+	out[1][1] *= flScale;
+	out[2][1] *= flScale;
+	out[0][2] *= flScale;
+	out[1][2] *= flScale;
+	out[2][2] *= flScale;
+}
+
+void MatrixScaleByZero ( matrix3x4_t &out )
+{
+	out[0][0] = 0.0f;
+	out[1][0] = 0.0f;
+	out[2][0] = 0.0f;
+	out[0][1] = 0.0f;
+	out[1][1] = 0.0f;
+	out[2][1] = 0.0f;
+	out[0][2] = 0.0f;
+	out[1][2] = 0.0f;
+	out[2][2] = 0.0f;
+}
+
+
 
 int VectorCompare (const float *v1, const float *v2)
 {
@@ -566,53 +591,128 @@ void ConcatRotations (const float in1[3][3], const float in2[3][3], float out[3]
 				in1[2][2] * in2[2][2];
 }
 
+void ConcatTransforms_Aligned( const matrix3x4_t &m0, const matrix3x4_t &m1, matrix3x4_t &out )
+{
+	Assert( (((size_t)&m0) % 16) == 0 );
+	Assert( (((size_t)&m1) % 16) == 0 );
+	Assert( (((size_t)&out) % 16) == 0 );
+
+	fltx4 lastMask = *(fltx4 *)(&g_SIMD_ComponentMask[3]);
+	fltx4 rowA0 = LoadAlignedSIMD( m0.m_flMatVal[0] );
+	fltx4 rowA1 = LoadAlignedSIMD( m0.m_flMatVal[1] );
+	fltx4 rowA2 = LoadAlignedSIMD( m0.m_flMatVal[2] );
+
+	fltx4 rowB0 = LoadAlignedSIMD( m1.m_flMatVal[0] );
+	fltx4 rowB1 = LoadAlignedSIMD( m1.m_flMatVal[1] );
+	fltx4 rowB2 = LoadAlignedSIMD( m1.m_flMatVal[2] );
+
+	// now we have the rows of m0 and the columns of m1
+	// first output row
+	fltx4 A0 = SplatXSIMD(rowA0);
+	fltx4 A1 = SplatYSIMD(rowA0);
+	fltx4 A2 = SplatZSIMD(rowA0);
+	fltx4 mul00 = MulSIMD( A0, rowB0 );
+	fltx4 mul01 = MulSIMD( A1, rowB1 );
+	fltx4 mul02 = MulSIMD( A2, rowB2 );
+	fltx4 out0 = AddSIMD( mul00, AddSIMD(mul01,mul02) );
+
+	// second output row
+	A0 = SplatXSIMD(rowA1);
+	A1 = SplatYSIMD(rowA1);
+	A2 = SplatZSIMD(rowA1);
+	fltx4 mul10 = MulSIMD( A0, rowB0 );
+	fltx4 mul11 = MulSIMD( A1, rowB1 );
+	fltx4 mul12 = MulSIMD( A2, rowB2 );
+	fltx4 out1 = AddSIMD( mul10, AddSIMD(mul11,mul12) );
+
+	// third output row
+	A0 = SplatXSIMD(rowA2);
+	A1 = SplatYSIMD(rowA2);
+	A2 = SplatZSIMD(rowA2);
+	fltx4 mul20 = MulSIMD( A0, rowB0 );
+	fltx4 mul21 = MulSIMD( A1, rowB1 );
+	fltx4 mul22 = MulSIMD( A2, rowB2 );
+	fltx4 out2 = AddSIMD( mul20, AddSIMD(mul21,mul22) );
+
+	// add in translation vector
+	A0 = AndSIMD(rowA0,lastMask);
+	A1 = AndSIMD(rowA1,lastMask);
+	A2 = AndSIMD(rowA2,lastMask);
+	out0 = AddSIMD(out0, A0);
+	out1 = AddSIMD(out1, A1);
+	out2 = AddSIMD(out2, A2);
+
+	StoreAlignedSIMD( out.m_flMatVal[0], out0 );
+	StoreAlignedSIMD( out.m_flMatVal[1], out1 );
+	StoreAlignedSIMD( out.m_flMatVal[2], out2 );
+}
 
 /*
 ================
 R_ConcatTransforms
 ================
 */
+
 void ConcatTransforms (const matrix3x4_t& in1, const matrix3x4_t& in2, matrix3x4_t& out)
 {
-	Assert( s_bMathlibInitialized );
-	if ( &in1 == &out )
+#if 0
+	// test for ones that'll be 2x faster
+	if ( (((size_t)&in1) % 16) == 0 && (((size_t)&in2) % 16) == 0 && (((size_t)&out) % 16) == 0 )
 	{
-		matrix3x4_t in1b;
-		MatrixCopy( in1, in1b );
-		ConcatTransforms( in1b, in2, out );
+		ConcatTransforms_Aligned( in1, in2, out );
 		return;
 	}
-	if ( &in2 == &out )
-	{
-		matrix3x4_t in2b;
-		MatrixCopy( in2, in2b );
-		ConcatTransforms( in1, in2b, out );
-		return;
-	}
-	out[0][0] = in1[0][0] * in2[0][0] + in1[0][1] * in2[1][0] +
-				in1[0][2] * in2[2][0];
-	out[0][1] = in1[0][0] * in2[0][1] + in1[0][1] * in2[1][1] +
-				in1[0][2] * in2[2][1];
-	out[0][2] = in1[0][0] * in2[0][2] + in1[0][1] * in2[1][2] +
-				in1[0][2] * in2[2][2];
-	out[0][3] = in1[0][0] * in2[0][3] + in1[0][1] * in2[1][3] +
-				in1[0][2] * in2[2][3] + in1[0][3];
-	out[1][0] = in1[1][0] * in2[0][0] + in1[1][1] * in2[1][0] +
-				in1[1][2] * in2[2][0];
-	out[1][1] = in1[1][0] * in2[0][1] + in1[1][1] * in2[1][1] +
-				in1[1][2] * in2[2][1];
-	out[1][2] = in1[1][0] * in2[0][2] + in1[1][1] * in2[1][2] +
-				in1[1][2] * in2[2][2];
-	out[1][3] = in1[1][0] * in2[0][3] + in1[1][1] * in2[1][3] +
-				in1[1][2] * in2[2][3] + in1[1][3];
-	out[2][0] = in1[2][0] * in2[0][0] + in1[2][1] * in2[1][0] +
-				in1[2][2] * in2[2][0];
-	out[2][1] = in1[2][0] * in2[0][1] + in1[2][1] * in2[1][1] +
-				in1[2][2] * in2[2][1];
-	out[2][2] = in1[2][0] * in2[0][2] + in1[2][1] * in2[1][2] +
-				in1[2][2] * in2[2][2];
-	out[2][3] = in1[2][0] * in2[0][3] + in1[2][1] * in2[1][3] +
-				in1[2][2] * in2[2][3] + in1[2][3];
+#endif
+
+	fltx4 lastMask = *(fltx4 *)(&g_SIMD_ComponentMask[3]);
+	fltx4 rowA0 = LoadUnalignedSIMD( in1.m_flMatVal[0] );
+	fltx4 rowA1 = LoadUnalignedSIMD( in1.m_flMatVal[1] );
+	fltx4 rowA2 = LoadUnalignedSIMD( in1.m_flMatVal[2] );
+
+	fltx4 rowB0 = LoadUnalignedSIMD( in2.m_flMatVal[0] );
+	fltx4 rowB1 = LoadUnalignedSIMD( in2.m_flMatVal[1] );
+	fltx4 rowB2 = LoadUnalignedSIMD( in2.m_flMatVal[2] );
+
+	// now we have the rows of m0 and the columns of m1
+	// first output row
+	fltx4 A0 = SplatXSIMD(rowA0);
+	fltx4 A1 = SplatYSIMD(rowA0);
+	fltx4 A2 = SplatZSIMD(rowA0);
+	fltx4 mul00 = MulSIMD( A0, rowB0 );
+	fltx4 mul01 = MulSIMD( A1, rowB1 );
+	fltx4 mul02 = MulSIMD( A2, rowB2 );
+	fltx4 out0 = AddSIMD( mul00, AddSIMD(mul01,mul02) );
+
+	// second output row
+	A0 = SplatXSIMD(rowA1);
+	A1 = SplatYSIMD(rowA1);
+	A2 = SplatZSIMD(rowA1);
+	fltx4 mul10 = MulSIMD( A0, rowB0 );
+	fltx4 mul11 = MulSIMD( A1, rowB1 );
+	fltx4 mul12 = MulSIMD( A2, rowB2 );
+	fltx4 out1 = AddSIMD( mul10, AddSIMD(mul11,mul12) );
+
+	// third output row
+	A0 = SplatXSIMD(rowA2);
+	A1 = SplatYSIMD(rowA2);
+	A2 = SplatZSIMD(rowA2);
+	fltx4 mul20 = MulSIMD( A0, rowB0 );
+	fltx4 mul21 = MulSIMD( A1, rowB1 );
+	fltx4 mul22 = MulSIMD( A2, rowB2 );
+	fltx4 out2 = AddSIMD( mul20, AddSIMD(mul21,mul22) );
+
+	// add in translation vector
+	A0 = AndSIMD(rowA0,lastMask);
+	A1 = AndSIMD(rowA1,lastMask);
+	A2 = AndSIMD(rowA2,lastMask);
+	out0 = AddSIMD(out0, A0);
+	out1 = AddSIMD(out1, A1);
+	out2 = AddSIMD(out2, A2);
+
+	// write to output
+	StoreUnalignedSIMD( out.m_flMatVal[0], out0 );
+	StoreUnalignedSIMD( out.m_flMatVal[1], out1 );
+	StoreUnalignedSIMD( out.m_flMatVal[2], out2 );
 }
 
 
@@ -1359,7 +1459,9 @@ float Bias( float x, float biasAmt )
 	{
 		lastExponent = log( biasAmt ) * -1.4427f; // (-1.4427 = 1 / log(0.5))
 	}
-	return pow( x, lastExponent );
+	float fRet = pow( x, lastExponent );
+	Assert ( !IS_NAN( fRet ) );
+	return fRet;
 }
 
 
@@ -1375,7 +1477,9 @@ float Gain( float x, float biasAmt )
 
 float SmoothCurve( float x )
 {
-	return (1 - cos( x * M_PI )) * 0.5f;
+	// Actual smooth curve. Visualization:
+	// http://www.wolframalpha.com/input/?i=plot%5B+0.5+*+%281+-+cos%5B2+*+pi+*+x%5D%29+for+x+%3D+%280%2C+1%29+%5D
+	return 0.5f * (1 - cos( 2.0f * M_PI * x ) );
 }
 
 
@@ -2408,9 +2512,7 @@ void Hermite_SplineBasis( float t, float basis[4] )
 //-----------------------------------------------------------------------------
 
 // BUG: the VectorSubtract()'s calls go away if the global optimizer is enabled
-#ifdef _MSC_VER
 #pragma optimize( "g", off )
-#endif
 
 void Hermite_Spline( const Vector &p0, const Vector &p1, const Vector &p2, float t, Vector& output )
 {
@@ -2420,9 +2522,7 @@ void Hermite_Spline( const Vector &p0, const Vector &p1, const Vector &p2, float
 	Hermite_Spline( p1, p2, e10, e21, t, output );
 }
 
-#ifdef _MSC_VER
 #pragma optimize( "", on )
-#endif
 
 float Hermite_Spline( float p0, float p1, float p2,	float t )
 {
@@ -3188,18 +3288,15 @@ bool CalcLineToLineIntersectionSegment(
    return true;
 }
 
-#ifdef _MSC_VER
 #pragma optimize( "", off )
-#endif
 
 #ifndef EXCEPTION_EXECUTE_HANDLER
 #define EXCEPTION_EXECUTE_HANDLER       1
 #endif
 
-#ifdef _MSC_VER
 #pragma optimize( "", on )
-#endif
 
+static bool s_b3DNowEnabled = false;
 static bool s_bMMXEnabled = false;
 static bool s_bSSEEnabled = false;
 static bool s_bSSE2Enabled = false;
@@ -3213,7 +3310,7 @@ void MathLib_Init( float gamma, float texGamma, float brightness, int overbright
 
 #if !defined( _X360 )
 	// Grab the processor information:
-	const CPUInformation& pi = GetCPUInformation();
+	const CPUInformation& pi = *GetCPUInformation();
 
 	// Select the default generic routines.
 	pfSqrt = _sqrtf;
@@ -3235,38 +3332,54 @@ void MathLib_Init( float gamma, float texGamma, float brightness, int overbright
 		s_bMMXEnabled = false;
 	}
 
-	// GAMMACASE: Since the sse.cpp doesn't have any x64 code rn
-	// we can't use the sse stuff here
-#ifndef COMPILER_MSVC64
+	// SSE Generally performs better than 3DNow when present, so this is placed 
+	// first to allow SSE to override these settings.
+#if !defined( OSX ) && !defined( PLATFORM_WINDOWS_PC64 ) && !defined(LINUX)
+	if ( bAllow3DNow && pi.m_b3DNow )
+	{
+		s_b3DNowEnabled = true;
+
+		// Select the 3DNow specific routines if available;
+		pfVectorNormalize = _3DNow_VectorNormalize;
+		pfVectorNormalizeFast = _3DNow_VectorNormalizeFast;
+		pfInvRSquared = _3DNow_InvRSquared;
+		pfSqrt = _3DNow_Sqrt;
+		pfRSqrt = _3DNow_RSqrt;
+		pfRSqrtFast = _3DNow_RSqrt;
+	}
+	else
+#endif
+	{
+		s_b3DNowEnabled = false;
+	}
+
 	if ( bAllowSSE && pi.m_bSSE )
 	{
 		s_bSSEEnabled = true;
 
+#ifndef PLATFORM_WINDOWS_PC64
+		// These are not yet available.
 		// Select the SSE specific routines if available
 		pfVectorNormalizeFast = _SSE_VectorNormalizeFast;
 		pfInvRSquared = _SSE_InvRSquared;
 		pfSqrt = _SSE_Sqrt;
 		pfRSqrt = _SSE_RSqrtAccurate;
 		pfRSqrtFast = _SSE_RSqrtFast;
-#ifdef _WIN32
+#endif
+#ifdef PLATFORM_WINDOWS_PC32
 		pfFastSinCos = _SSE_SinCos;
 		pfFastCos = _SSE_cos;
 #endif
-
 	}
 	else
 	{
 		s_bSSEEnabled = false;
 	}
-#else
-	s_bSSEEnabled = false;
-#endif
 
-#ifndef COMPILER_MSVC64
 	if ( bAllowSSE2 && pi.m_bSSE2 )
 	{
 		s_bSSE2Enabled = true;
-#ifdef _WIN32
+#ifdef PLATFORM_WINDOWS_PC32
 		pfFastSinCos = _SSE2_SinCos;
 		pfFastCos = _SSE2_cos;
 #endif
@@ -3275,10 +3388,7 @@ void MathLib_Init( float gamma, float texGamma, float brightness, int overbright
 	{
 		s_bSSE2Enabled = false;
 	}
-#else
-	s_bSSE2Enabled = false;
-#endif
-#endif
+#endif // !_X360
 
 	s_bMathlibInitialized = true;
 
@@ -3286,6 +3396,12 @@ void MathLib_Init( float gamma, float texGamma, float brightness, int overbright
 	BuildGammaTable( gamma, texGamma, brightness, overbright );
 }
 
+bool MathLib_3DNowEnabled( void )
+{
+	Assert( s_bMathlibInitialized );
+	return s_b3DNowEnabled;
+}
+
 bool MathLib_MMXEnabled( void )
 {
 	Assert( s_bMathlibInitialized );
@@ -3304,6 +3420,20 @@ bool MathLib_SSE2Enabled( void )
 	return s_bSSE2Enabled;
 }
 
+float Approach( float target, float value, float speed )
+{
+	float delta = target - value;
+
+	if ( delta > speed )
+		value += speed;
+	else if ( delta < -speed )
+		value -= speed;
+	else 
+		value = target;
+
+	return value;
+}
+
 // BUGBUG: Why doesn't this call angle diff?!?!?
 float ApproachAngle( float target, float value, float speed )
 {
@@ -3990,8 +4120,8 @@ void HSVtoRGB( const Vector &hsv, Vector &rgb )
 		hue = 0.0F;
 	}
 	hue /= 60.0F;
-	int     i = static_cast<int>(hue);   // integer part
-	float32 f = hue - i;                 // fractional part
+	int     i = hue;        // integer part
+	float32 f = hue - i;    // fractional part
 	float32 p = hsv.z * (1.0F - hsv.y);
 	float32 q = hsv.z * (1.0F - hsv.y * f);
 	float32 t = hsv.z * (1.0F - hsv.y * (1.0F - f));
diff --git a/public/mathlib/mathlib.h b/public/mathlib/mathlib.h
index 59bf5e23..bdd5d9a6 100644
--- a/public/mathlib/mathlib.h
+++ b/public/mathlib/mathlib.h
@@ -14,9 +14,7 @@
 #include "tier0/dbg.h"
 
 #include "mathlib/math_pfns.h"
-#ifndef ALIGN8_POST
-#define ALIGN8_POST
-#endif
+
 // plane_t structure
 // !!! if this is changed, it must be changed in asm code too !!!
 // FIXME: does the asm code even exist anymore?
@@ -151,6 +149,9 @@ struct matrix3x4_t
 	inline void ConcatRotations( const matrix3x4_t &other );
 	inline void ConcatTransforms( const matrix3x4_t &other );
 
+	inline void ScaleBy( const float value );
+	inline void ScaleByZero();
+
 	inline void Multiply( const matrix3x4_t &other );
 	inline void Transpose();
 
@@ -161,6 +162,9 @@ struct matrix3x4_t
 	inline float RowDotProduct( int row, const Vector &in ) const;
 	inline float ColumnDotProduct( MatrixAxisType_t column, const Vector &in ) const;
 
+	inline Vector GetTranslation() const;
+	inline void SetTranslation( const Vector &in );
+
 	inline Vector GetColumn( MatrixAxisType_t column ) const;
 	inline void SetColumn( const Vector &in, MatrixAxisType_t column );
 
@@ -205,7 +209,11 @@ struct matrix3x4_t
 		}
 	}
 
-	matrix3x4_t operator *( const matrix3x4_t &other ) { Multiply( other ); return *this; }
+	matrix3x4_t &operator *=( const matrix3x4_t &other ) { Multiply( other ); return *this; }
+	matrix3x4_t &operator *=( float value ) { ScaleBy( value ); return *this; }
+
+	matrix3x4_t operator *( const matrix3x4_t &other ) const { matrix3x4_t temp( *this ); temp.Multiply( other ); return temp; }
+	matrix3x4_t operator *( float value ) const { matrix3x4_t temp( *this ); temp.ScaleBy( value ); return temp; }
 
 	float *operator[]( int i )				{ Assert(( i >= 0 ) && ( i < 3 )); return m_flMatVal[i]; }
 	const float *operator[]( int i ) const	{ Assert(( i >= 0 ) && ( i < 3 )); return m_flMatVal[i]; }
@@ -378,7 +386,7 @@ void inline SinCos( float radians, float *sine, float *cosine )
 {
 #if defined( _X360 )
 	XMScalarSinCos( sine, cosine, radians );
-#elif defined( COMPILER_MSVC32 )
+#elif defined( PLATFORM_WINDOWS_PC32 )
 	_asm
 	{
 		fld		DWORD PTR [radians]
@@ -390,15 +398,15 @@ void inline SinCos( float radians, float *sine, float *cosine )
 		fstp DWORD PTR [edx]
 		fstp DWORD PTR [eax]
 	}
-#elif defined( GNUC )
+#elif defined( PLATFORM_WINDOWS_PC64 )
+	*sine = sin( radians );
+	*cosine = cos( radians );
+#elif defined( POSIX )
 	double __cosr, __sinr;
- 	__asm __volatile__ ("fsincos" : "=t" (__cosr), "=u" (__sinr) : "0" (radians));
+	__asm ("fsincos" : "=t" (__cosr), "=u" (__sinr) : "0" (radians));
 
   	*sine = __sinr;
   	*cosine = __cosr;
-#else
-	*sine = sinf(radians);
-	*cosine = cosf(radians);
 #endif
 }
 
@@ -505,6 +513,19 @@ bool MatricesAreEqual( const matrix3x4_t &src1, const matrix3x4_t &src2, float f
 void MatrixGetColumn( const matrix3x4_t &in, int column, Vector &out );
 void MatrixSetColumn( const Vector &in, int column, matrix3x4_t &out );
 
+inline void MatrixGetTranslation( const matrix3x4_t &in, Vector &out )
+{
+	MatrixGetColumn ( in, 3, out );
+}
+
+inline void MatrixSetTranslation( const Vector &in, matrix3x4_t &out )
+{
+	MatrixSetColumn ( in, 3, out );
+}
+
+void MatrixScaleBy ( const float flScale, matrix3x4_t &out );
+void MatrixScaleByZero ( matrix3x4_t &out );
+
 //void DecomposeRotation( const matrix3x4_t &mat, float *out );
 void ConcatRotations (const matrix3x4_t &in1, const matrix3x4_t &in2, matrix3x4_t &out);
 void ConcatTransforms (const matrix3x4_t &in1, const matrix3x4_t &in2, matrix3x4_t &out);
@@ -531,10 +552,6 @@ void QuaternionInvert( const Quaternion &p, Quaternion &q );
 float QuaternionNormalize( Quaternion &q );
 void QuaternionAdd( const Quaternion &p, const Quaternion &q, Quaternion &qt );
 void QuaternionMult( const Quaternion &p, const Quaternion &q, Quaternion &qt );
-void QuaternionExp( const Quaternion &p, Quaternion &q );
-void QuaternionLn( const Quaternion &p, Quaternion &q );
-void QuaternionAverageExponential( Quaternion &q, int nCount, const Quaternion *pQuaternions, const float *pflWeights = NULL );
-void QuaternionLookAt( const Vector &vecForward, const Vector &referenceUp, Quaternion &q );
 void QuaternionMatrix( const Quaternion &q, matrix3x4_t &matrix );
 void QuaternionMatrix( const Quaternion &q, const Vector &pos, matrix3x4_t &matrix );
 void QuaternionAngles( const Quaternion &q, QAngle &angles );
@@ -571,16 +588,16 @@ inline float anglemod(float a)
 inline float RemapVal( float val, float A, float B, float C, float D)
 {
 	if ( A == B )
-		return fsel( val - B , D , C );
+		return val >= B ? D : C;
 	return C + (D - C) * (val - A) / (B - A);
 }
 
 inline float RemapValClamped( float val, float A, float B, float C, float D)
 {
 	if ( A == B )
-		return fsel( val - B , D , C );
+		return val >= B ? D : C;
 	float cVal = (val - A) / (B - A);
-	cVal = clamp<float>( cVal, 0.0f, 1.0f );
+	cVal = clamp( cVal, 0.0f, 1.0f );
 
 	return C + (D - C) * cVal;
 }
@@ -671,7 +688,7 @@ template<> FORCEINLINE QAngleByValue Lerp<QAngleByValue>( float flPercent, const
 #endif // VECTOR_NO_SLOW_OPERATIONS
 
 
-// Swap two of anything.
+/// Same as swap(), but won't cause problems with std::swap
 template <class T> 
 FORCEINLINE void V_swap( T& x, T& y )
 {
@@ -686,15 +703,15 @@ template <class T> FORCEINLINE T AVG(T a, T b)
 }
 
 // number of elements in an array of static size
-#define NELEMS(x) ((sizeof(x))/sizeof(x[0]))
+#define NELEMS(x) ARRAYSIZE(x)
 
 // XYZ macro, for printf type functions - ex printf("%f %f %f",XYZ(myvector));
 #define XYZ(v) (v).x,(v).y,(v).z
 
+
 inline float Sign( float x )
 {
-	return fsel( x, 1.0f, -1.0f ); // x >= 0 ? 1.0f : -1.0f
-	//return (x <0.0f) ? -1.0f : 1.0f;
+	return (x <0.0f) ? -1.0f : 1.0f;
 }
 
 //
@@ -726,16 +743,6 @@ inline int ClampArrayBounds( int n, unsigned maxindex )
 }
 
 
-
-// Turn a number "inside out". 
-// See Recording Animation in Binary Order for Progressive Temporal Refinement
-// by Paul Heckbert from "Graphics Gems".
-//
-// If you want to iterate something from 0 to n, you can use this to iterate non-sequentially, in
-// such a way that you will start with widely separated values and then refine the gaps between
-// them, as you would for progressive refinement. This works with non-power of two ranges.
-int InsideOut( int nTotal, int nCounter );
-
 #define BOX_ON_PLANE_SIDE(emins, emaxs, p)	\
 	(((p)->type < 3)?						\
 	(										\
@@ -796,9 +803,7 @@ inline void PositionMatrix( const Vector &position, matrix3x4_t &mat )
 
 inline void MatrixPosition( const matrix3x4_t &matrix, Vector &position )
 {
-	position[0] = matrix[0][3];
-	position[1] = matrix[1][3];
-	position[2] = matrix[2][3];
+	MatrixGetColumn( matrix, 3, position );
 }
 
 inline void VectorRotate( const Vector& in1, const matrix3x4_t &in2, Vector &out)
@@ -937,18 +942,6 @@ inline int FASTCALL BoxOnPlaneSide2 (const Vector& emins, const Vector& emaxs, c
 void ClearBounds (Vector& mins, Vector& maxs);
 void AddPointToBounds (const Vector& v, Vector& mins, Vector& maxs);
 
-//-----------------------------------------------------------------------------
-// Ensures that the min and max bounds values are valid. 
-// (ClearBounds() sets min > max, which is clearly invalid.)
-//-----------------------------------------------------------------------------
-bool AreBoundsValid( const Vector &vMin, const Vector &vMax );
-
-//-----------------------------------------------------------------------------
-// Returns true if the provided point is in the AABB defined by vMin
-// at the lower corner and vMax at the upper corner.
-//-----------------------------------------------------------------------------
-bool IsPointInBounds( const Vector &vPoint, const Vector &vMin, const Vector &vMax );
-
 //
 // COLORSPACE/GAMMA CONVERSION STUFF
 //
@@ -1157,7 +1150,9 @@ inline float SimpleSplineRemapValClamped( float val, float A, float B, float C,
 
 FORCEINLINE int RoundFloatToInt(float f)
 {
-#if defined( _X360 )
+#if defined(__i386__) || defined(_M_IX86) || defined( PLATFORM_WINDOWS_PC64 ) || defined(__x86_64__)
+	return _mm_cvtss_si32(_mm_load_ss(&f));
+#elif defined( _X360 )
 #ifdef Assert
 	Assert( IsFPUControlWordSet() );
 #endif
@@ -1168,67 +1163,18 @@ FORCEINLINE int RoundFloatToInt(float f)
 	};
 	flResult = __fctiw( f );
 	return pResult[1];
-#else // !X360
-	int nResult;
-#if defined( COMPILER_MSVC32 )
-	__asm
-	{
-		fld f
-		fistp nResult
-	}
-#elif GNUC
-	__asm __volatile__ (
-		"fistpl %0;": "=m" (nResult): "t" (f) : "st"
-	);
 #else
-	nResult = static_cast<int>(f);
-#endif
-	return nResult;
+#error Unknown architecture
 #endif
 }
 
 FORCEINLINE unsigned char RoundFloatToByte(float f)
 {
-#if defined( _X360 )
+	int nResult = RoundFloatToInt(f);
 #ifdef Assert
-	Assert( IsFPUControlWordSet() );
-#endif
-	union
-	{
-		double flResult;
-		int pIntResult[2];
-		unsigned char pResult[8];
-	};
-	flResult = __fctiw( f );
-#ifdef Assert
-	Assert( pIntResult[1] >= 0 && pIntResult[1] <= 255 );
-#endif
-	return pResult[7];
-
-#else // !X360
-	
-	int nResult;
-
-#if defined( COMPILER_MSVC32 )
-	__asm
-	{
-		fld f
-		fistp nResult
-	}
-#elif GNUC
-	__asm __volatile__ (
-		"fistpl %0;": "=m" (nResult): "t" (f) : "st"
-	);
-#else
-	nResult = static_cast<unsigned int> (f) & 0xff;
-#endif
-
-#ifdef Assert
-	Assert( nResult >= 0 && nResult <= 255 );
-#endif 
-	return nResult;
-
+	Assert( (nResult & ~0xFF) == 0 );
 #endif
+	return (unsigned char) nResult;
 }
 
 FORCEINLINE unsigned long RoundFloatToUnsignedLong(float f)
@@ -1248,25 +1194,41 @@ FORCEINLINE unsigned long RoundFloatToUnsignedLong(float f)
 	return pResult[1];
 #else  // !X360
 	
-#if defined( COMPILER_MSVC32 )
-	unsigned char nResult[8];
-	__asm
+#if defined( PLATFORM_WINDOWS_PC64 )
+	uint nRet = ( uint ) f;
+	if ( nRet & 1 )
 	{
-		fld f
-		fistp       qword ptr nResult
+		if ( ( f - floor( f ) >= 0.5 ) )
+		{
+			nRet++;
+		}
 	}
-	return *((unsigned long*)nResult);
-#elif defined( COMPILER_GCC )
+	else
+	{
+		if ( ( f - floor( f ) > 0.5 ) )
+		{
+			nRet++;
+		}
+	}
+	return nRet;
+#else // PLATFORM_WINDOWS_PC64
 	unsigned char nResult[8];
-	__asm __volatile__ (
-		"fistpl %0;": "=m" (nResult): "t" (f) : "st"
-	);
-	return *((unsigned long*)nResult);
-#else
-	return static_cast<unsigned long>(f);
-#endif
 
-#endif
+	#if defined( _WIN32 )
+		__asm
+		{
+			fld f
+			fistp       qword ptr nResult
+		}
+	#elif POSIX
+		__asm __volatile__ (
+			"fistpl %0;": "=m" (nResult): "t" (f) : "st"
+		);
+	#endif
+
+		return *((unsigned long*)nResult);
+#endif // PLATFORM_WINDOWS_PC64
+#endif // !X360
 }
 
 FORCEINLINE bool IsIntegralValue( float flValue, float flTolerance = 0.001f )
@@ -1286,76 +1248,54 @@ FORCEINLINE int Float2Int( float a )
 	flResult = __fctiwz( a );
 	return pResult[1];
 #else  // !X360
-	
-	int RetVal;
-
-#if defined( COMPILER_MSVC32 )
-	int CtrlwdHolder;
-	int CtrlwdSetter;
-	__asm 
-	{
-		fld    a					// push 'a' onto the FP stack
-		fnstcw CtrlwdHolder		// store FPU control word
-		movzx  eax, CtrlwdHolder	// move and zero extend word into eax
-		and    eax, 0xFFFFF3FF	// set all bits except rounding bits to 1
-		or     eax, 0x00000C00	// set rounding mode bits to round towards zero
-		mov    CtrlwdSetter, eax	// Prepare to set the rounding mode -- prepare to enter plaid!
-		fldcw  CtrlwdSetter		// Entering plaid!
-		fistp  RetVal				// Store and converted (to int) result
-		fldcw  CtrlwdHolder		// Restore control word
-	}
-#else
-	RetVal = static_cast<int>( a );
-#endif
-
-	return RetVal;
+	// Rely on compiler to generate CVTTSS2SI on x86
+	return (int) a;
 #endif
 }
 
 // Over 15x faster than: (int)floor(value)
 inline int Floor2Int( float a )
 {
-   int RetVal;
-
-#if defined( _X360 )
-	RetVal = (int)floor( a );
-#elif defined( COMPILER_MSVC32 )
-   int CtrlwdHolder;
-   int CtrlwdSetter;
-   __asm 
-   {
-      fld    a					// push 'a' onto the FP stack
-      fnstcw CtrlwdHolder		// store FPU control word
-      movzx  eax, CtrlwdHolder	// move and zero extend word into eax
-      and    eax, 0xFFFFF3FF	// set all bits except rounding bits to 1
-      or     eax, 0x00000400	// set rounding mode bits to round down
-      mov    CtrlwdSetter, eax	// Prepare to set the rounding mode -- prepare to enter plaid!
-      fldcw  CtrlwdSetter		// Entering plaid!
-      fistp  RetVal				// Store floored and converted (to int) result
-      fldcw  CtrlwdHolder		// Restore control word
-   }
+	int RetVal;
+#if defined( __i386__ )
+	// Convert to int and back, compare, subtract one if too big
+	__m128 a128 = _mm_set_ss(a);
+	RetVal = _mm_cvtss_si32(a128);
+    __m128 rounded128 = _mm_cvt_si2ss(_mm_setzero_ps(), RetVal);
+	RetVal -= _mm_comigt_ss( rounded128, a128 );
 #else
 	RetVal = static_cast<int>( floor(a) );
 #endif
-
 	return RetVal;
 }
 
 //-----------------------------------------------------------------------------
 // Fast color conversion from float to unsigned char
 //-----------------------------------------------------------------------------
-FORCEINLINE unsigned char FastFToC( float c )
+FORCEINLINE unsigned int FastFToC( float c )
 {
-	volatile float dc;
-
-	// ieee trick
-	dc = c * 255.0f + (float)(1 << 23);
-	
-	// return the lsb
-#if defined( _X360 )
-	return ((unsigned char*)&dc)[3];
+#if defined( __i386__ )
+	// IEEE float bit manipulation works for values between [0, 1<<23)
+	union { float f; int i; } convert = { c*255.0f + (float)(1<<23) };
+	return convert.i & 255;
 #else
-	return *(unsigned char*)&dc;
+	// consoles CPUs suffer from load-hit-store penalty
+	return Float2Int( c * 255.0f );
+#endif
+}
+
+//-----------------------------------------------------------------------------
+// Fast conversion from float to integer with magnitude less than 2**22
+//-----------------------------------------------------------------------------
+FORCEINLINE int FastFloatToSmallInt( float c )
+{
+#if defined( __i386__ )
+	// IEEE float bit manipulation works for values between [-1<<22, 1<<22)
+	union { float f; int i; } convert = { c + (float)(3<<22) };
+	return (convert.i & ((1<<23)-1)) - (1<<22);
+#else
+	// consoles CPUs suffer from load-hit-store penalty
+	return Float2Int( c );
 #endif
 }
 
@@ -1367,35 +1307,22 @@ FORCEINLINE unsigned char FastFToC( float c )
 inline float ClampToMsec( float in )
 {
 	int msec = Floor2Int( in * 1000.0f + 0.5f );
-	return msec / 1000.0f;
+	return 0.001f * msec;
 }
 
 // Over 15x faster than: (int)ceil(value)
 inline int Ceil2Int( float a )
 {
    int RetVal;
-
-#if defined( _X360 )
-	RetVal = (int)ceil( a );
-#elif defined( COMPILER_MSVC32 )
-   int CtrlwdHolder;
-   int CtrlwdSetter;
-   __asm 
-   {
-      fld    a					// push 'a' onto the FP stack
-      fnstcw CtrlwdHolder		// store FPU control word
-      movzx  eax, CtrlwdHolder	// move and zero extend word into eax
-      and    eax, 0xFFFFF3FF	// set all bits except rounding bits to 1
-      or     eax, 0x00000800	// set rounding mode bits to round down
-      mov    CtrlwdSetter, eax	// Prepare to set the rounding mode -- prepare to enter plaid!
-      fldcw  CtrlwdSetter		// Entering plaid!
-      fistp  RetVal				// Store floored and converted (to int) result
-      fldcw  CtrlwdHolder		// Restore control word
-   }
+#if defined( __i386__ )
+   // Convert to int and back, compare, add one if too small
+   __m128 a128 = _mm_load_ss(&a);
+   RetVal = _mm_cvtss_si32(a128);
+   __m128 rounded128 = _mm_cvt_si2ss(_mm_setzero_ps(), RetVal);
+   RetVal += _mm_comilt_ss( rounded128, a128 );
 #else
-	RetVal = static_cast<int>( ceil(a) );
+   RetVal = static_cast<int>( ceil(a) );
 #endif
-
 	return RetVal;
 }
 
@@ -1736,13 +1663,6 @@ void Parabolic_Spline_NormalizeX(
 	float t, 
 	Vector& output );
 
-// Evaluate the cubic Bernstein basis for the input parametric coordinate.
-// Output is the coefficient for that basis polynomial.
-float CubicBasis0( float t );
-float CubicBasis1( float t );
-float CubicBasis2( float t );
-float CubicBasis3( float t );
-
 // quintic interpolating polynomial from Perlin.
 // 0->0, 1->1, smooth-in between with smooth tangents
 FORCEINLINE float QuinticInterpolatingPolynomial(float t)
@@ -1763,6 +1683,7 @@ void GetInterpolationData( float const *pKnotPositions,
 						   float *pValueA, 
 						   float *pValueB,
 						   float *pInterpolationValue);
+
 float RangeCompressor( float flValue, float flMin, float flMax, float flBase );
 
 // Get the minimum distance from vOrigin to the bounding box defined by [mins,maxs]
@@ -1806,11 +1727,12 @@ float CalcDistanceSqrToLineSegment2D( Vector2D const &P, Vector2D const &vLineA,
 
 // Init the mathlib
 void MathLib_Init( float gamma = 2.2f, float texGamma = 2.2f, float brightness = 0.0f, int overbright = 2.0f, bool bAllow3DNow = true, bool bAllowSSE = true, bool bAllowSSE2 = true, bool bAllowMMX = true );
+bool MathLib_3DNowEnabled( void );
 bool MathLib_MMXEnabled( void );
 bool MathLib_SSEEnabled( void );
 bool MathLib_SSE2Enabled( void );
 
-inline float Approach( float target, float value, float speed );
+float Approach( float target, float value, float speed );
 float ApproachAngle( float target, float value, float speed );
 float AngleDiff( float destAngle, float srcAngle );
 float AngleDistance( float next, float cur );
@@ -1825,23 +1747,10 @@ bool AnglesAreEqual( float a, float b, float tolerance = 0.0f );
 void RotationDeltaAxisAngle( const QAngle &srcAngles, const QAngle &destAngles, Vector &deltaAxis, float &deltaAngle );
 void RotationDelta( const QAngle &srcAngles, const QAngle &destAngles, QAngle *out );
 
-//-----------------------------------------------------------------------------
-// Clips a line segment such that only the portion in the positive half-space
-// of the plane remains.  If the segment is entirely clipped, the vectors
-// are set to vec3_invalid (all components are FLT_MAX).
-//
-// flBias is added to the dot product with the normal.  A positive bias 
-// results in a more inclusive positive half-space, while a negative bias
-// results in a more exclusive positive half-space.
-//-----------------------------------------------------------------------------
-void ClipLineSegmentToPlane( const Vector &vNormal, const Vector &vPlanePoint, Vector *p1, Vector *p2, float flBias = 0.0f );
-
 void ComputeTrianglePlane( const Vector& v1, const Vector& v2, const Vector& v3, Vector& normal, float& intercept );
 int PolyFromPlane( Vector *outVerts, const Vector& normal, float dist, float fHalfScale = 9000.0f );
 int ClipPolyToPlane( Vector *inVerts, int vertCount, Vector *outVerts, const Vector& normal, float dist, float fOnPlaneEpsilon = 0.1f );
 int ClipPolyToPlane_Precise( double *inVerts, int vertCount, double *outVerts, const double *normal, double dist, double fOnPlaneEpsilon = 0.1 );
-float TetrahedronVolume( const Vector &p0, const Vector &p1, const Vector &p2, const Vector &p3 );
-float TriangleArea( const Vector &p0, const Vector &p1, const Vector &p2 );
 
 //-----------------------------------------------------------------------------
 // Computes a reasonable tangent space for a triangle
@@ -2001,12 +1910,7 @@ FORCEINLINE float * UnpackNormal_SHORT2( const unsigned int *pPackedNormal, floa
 
 	pNormal[0] = ( iX - 16384.0f ) / 16384.0f;
 	pNormal[1] = ( iY - 16384.0f ) / 16384.0f;
-	float mag = ( pNormal[0]*pNormal[0] + pNormal[1]*pNormal[1] );
-	if ( mag > 1.0f )
-	{
-		mag = 1.0f;
-	}
-	pNormal[2] = zSign*sqrtf( 1.0f - mag );
+	pNormal[2] = zSign*sqrtf( 1.0f - ( pNormal[0]*pNormal[0] + pNormal[1]*pNormal[1] ) );
 	if ( bIsTangent )
 	{
 		pNormal[3] = tSign;
@@ -2272,33 +2176,6 @@ inline bool AlmostEqual( const Vector &a, const Vector &b, int maxUlps = 10)
 		AlmostEqual( a.z, b.z, maxUlps );
 }
 
-inline float Approach( float target, float value, float speed )
-{
-	float delta = target - value;
-
-#if defined(_X360) || defined( PS3 ) // use conditional move for speed on 360
-
-	return fsel( delta-speed,	// delta >= speed ?
-				 value + speed,	// if delta == speed, then value + speed == value + delta == target  
-				 fsel( (-speed) - delta, // delta <= -speed
-						value - speed,
-						target )
-				);  // delta < speed && delta > -speed
-
-#else
-
-	if ( delta > speed )
-		value += speed;
-	else if ( delta < -speed )
-		value -= speed;
-	else 
-		value = target;
-		
-	return value;
-
-#endif
-}
-
 inline void matrix3x4_t::Init( const Vector &xAxis, const Vector &yAxis, const Vector &zAxis, const Vector &vecOrigin )
 {
 	MatrixInitialize( *this, vecOrigin, xAxis, yAxis, zAxis );
@@ -2373,6 +2250,16 @@ inline void matrix3x4_t::ConcatTransforms( const matrix3x4_t &other )
 	::ConcatTransforms( *this, other, *this );
 }
 
+inline void matrix3x4_t::ScaleBy( const float value )
+{
+	MatrixScaleBy( value, *this );
+}
+
+inline void matrix3x4_t::ScaleByZero()
+{
+	MatrixScaleByZero( *this );
+}
+
 inline void matrix3x4_t::Multiply( const matrix3x4_t &other )
 {
 	MatrixMultiply( *this, other, *this );
@@ -2414,6 +2301,18 @@ inline float matrix3x4_t::ColumnDotProduct( MatrixAxisType_t column, const Vecto
 	return MatrixColumnDotProduct( *this, column, in );
 }
 
+inline Vector matrix3x4_t::GetTranslation() const
+{
+	Vector out;
+	MatrixGetTranslation( *this, out );
+	return out;
+}
+
+inline void matrix3x4_t::SetTranslation( const Vector &in )
+{
+	MatrixSetTranslation( in, *this );
+}
+
 inline Vector matrix3x4_t::GetColumn( MatrixAxisType_t column ) const
 {
 	Vector out;