diff --git a/lib/linux64/mathlib.a b/lib/linux64/mathlib.a index 5f3e4e2a..a8b9de60 100644 Binary files a/lib/linux64/mathlib.a and b/lib/linux64/mathlib.a differ diff --git a/lib/public/win64/mathlib.lib b/lib/public/win64/mathlib.lib index b5a658dd..fe1e014c 100644 Binary files a/lib/public/win64/mathlib.lib and b/lib/public/win64/mathlib.lib differ diff --git a/mathlib/mathlib_base.cpp b/mathlib/mathlib_base.cpp index 06e791ba..fb519a31 100644 --- a/mathlib/mathlib_base.cpp +++ b/mathlib/mathlib_base.cpp @@ -80,8 +80,6 @@ float VectorNormalize (Vector& vec) return radius; } - - // TODO: Add fast C VectorNormalizeFast. // Perhaps use approximate rsqrt trick, if the accuracy isn't too bad. void FASTCALL _VectorNormalizeFast (Vector& vec) @@ -427,6 +425,33 @@ void MatrixSetColumn( const Vector &in, int column, matrix3x4_t& out ) out[2][column] = in.z; } +void MatrixScaleBy ( const float flScale, matrix3x4_t &out ) +{ + out[0][0] *= flScale; + out[1][0] *= flScale; + out[2][0] *= flScale; + out[0][1] *= flScale; + out[1][1] *= flScale; + out[2][1] *= flScale; + out[0][2] *= flScale; + out[1][2] *= flScale; + out[2][2] *= flScale; +} + +void MatrixScaleByZero ( matrix3x4_t &out ) +{ + out[0][0] = 0.0f; + out[1][0] = 0.0f; + out[2][0] = 0.0f; + out[0][1] = 0.0f; + out[1][1] = 0.0f; + out[2][1] = 0.0f; + out[0][2] = 0.0f; + out[1][2] = 0.0f; + out[2][2] = 0.0f; +} + + int VectorCompare (const float *v1, const float *v2) { @@ -566,53 +591,128 @@ void ConcatRotations (const float in1[3][3], const float in2[3][3], float out[3] in1[2][2] * in2[2][2]; } +void ConcatTransforms_Aligned( const matrix3x4_t &m0, const matrix3x4_t &m1, matrix3x4_t &out ) +{ + Assert( (((size_t)&m0) % 16) == 0 ); + Assert( (((size_t)&m1) % 16) == 0 ); + Assert( (((size_t)&out) % 16) == 0 ); + + fltx4 lastMask = *(fltx4 *)(&g_SIMD_ComponentMask[3]); + fltx4 rowA0 = LoadAlignedSIMD( m0.m_flMatVal[0] ); + fltx4 rowA1 = LoadAlignedSIMD( m0.m_flMatVal[1] ); + fltx4 rowA2 = LoadAlignedSIMD( m0.m_flMatVal[2] ); + + fltx4 rowB0 = LoadAlignedSIMD( m1.m_flMatVal[0] ); + fltx4 rowB1 = LoadAlignedSIMD( m1.m_flMatVal[1] ); + fltx4 rowB2 = LoadAlignedSIMD( m1.m_flMatVal[2] ); + + // now we have the rows of m0 and the columns of m1 + // first output row + fltx4 A0 = SplatXSIMD(rowA0); + fltx4 A1 = SplatYSIMD(rowA0); + fltx4 A2 = SplatZSIMD(rowA0); + fltx4 mul00 = MulSIMD( A0, rowB0 ); + fltx4 mul01 = MulSIMD( A1, rowB1 ); + fltx4 mul02 = MulSIMD( A2, rowB2 ); + fltx4 out0 = AddSIMD( mul00, AddSIMD(mul01,mul02) ); + + // second output row + A0 = SplatXSIMD(rowA1); + A1 = SplatYSIMD(rowA1); + A2 = SplatZSIMD(rowA1); + fltx4 mul10 = MulSIMD( A0, rowB0 ); + fltx4 mul11 = MulSIMD( A1, rowB1 ); + fltx4 mul12 = MulSIMD( A2, rowB2 ); + fltx4 out1 = AddSIMD( mul10, AddSIMD(mul11,mul12) ); + + // third output row + A0 = SplatXSIMD(rowA2); + A1 = SplatYSIMD(rowA2); + A2 = SplatZSIMD(rowA2); + fltx4 mul20 = MulSIMD( A0, rowB0 ); + fltx4 mul21 = MulSIMD( A1, rowB1 ); + fltx4 mul22 = MulSIMD( A2, rowB2 ); + fltx4 out2 = AddSIMD( mul20, AddSIMD(mul21,mul22) ); + + // add in translation vector + A0 = AndSIMD(rowA0,lastMask); + A1 = AndSIMD(rowA1,lastMask); + A2 = AndSIMD(rowA2,lastMask); + out0 = AddSIMD(out0, A0); + out1 = AddSIMD(out1, A1); + out2 = AddSIMD(out2, A2); + + StoreAlignedSIMD( out.m_flMatVal[0], out0 ); + StoreAlignedSIMD( out.m_flMatVal[1], out1 ); + StoreAlignedSIMD( out.m_flMatVal[2], out2 ); +} /* ================ R_ConcatTransforms ================ */ + void ConcatTransforms (const matrix3x4_t& in1, const matrix3x4_t& in2, matrix3x4_t& out) { - Assert( s_bMathlibInitialized ); - if ( &in1 == &out ) +#if 0 + // test for ones that'll be 2x faster + if ( (((size_t)&in1) % 16) == 0 && (((size_t)&in2) % 16) == 0 && (((size_t)&out) % 16) == 0 ) { - matrix3x4_t in1b; - MatrixCopy( in1, in1b ); - ConcatTransforms( in1b, in2, out ); + ConcatTransforms_Aligned( in1, in2, out ); return; } - if ( &in2 == &out ) - { - matrix3x4_t in2b; - MatrixCopy( in2, in2b ); - ConcatTransforms( in1, in2b, out ); - return; - } - out[0][0] = in1[0][0] * in2[0][0] + in1[0][1] * in2[1][0] + - in1[0][2] * in2[2][0]; - out[0][1] = in1[0][0] * in2[0][1] + in1[0][1] * in2[1][1] + - in1[0][2] * in2[2][1]; - out[0][2] = in1[0][0] * in2[0][2] + in1[0][1] * in2[1][2] + - in1[0][2] * in2[2][2]; - out[0][3] = in1[0][0] * in2[0][3] + in1[0][1] * in2[1][3] + - in1[0][2] * in2[2][3] + in1[0][3]; - out[1][0] = in1[1][0] * in2[0][0] + in1[1][1] * in2[1][0] + - in1[1][2] * in2[2][0]; - out[1][1] = in1[1][0] * in2[0][1] + in1[1][1] * in2[1][1] + - in1[1][2] * in2[2][1]; - out[1][2] = in1[1][0] * in2[0][2] + in1[1][1] * in2[1][2] + - in1[1][2] * in2[2][2]; - out[1][3] = in1[1][0] * in2[0][3] + in1[1][1] * in2[1][3] + - in1[1][2] * in2[2][3] + in1[1][3]; - out[2][0] = in1[2][0] * in2[0][0] + in1[2][1] * in2[1][0] + - in1[2][2] * in2[2][0]; - out[2][1] = in1[2][0] * in2[0][1] + in1[2][1] * in2[1][1] + - in1[2][2] * in2[2][1]; - out[2][2] = in1[2][0] * in2[0][2] + in1[2][1] * in2[1][2] + - in1[2][2] * in2[2][2]; - out[2][3] = in1[2][0] * in2[0][3] + in1[2][1] * in2[1][3] + - in1[2][2] * in2[2][3] + in1[2][3]; +#endif + + fltx4 lastMask = *(fltx4 *)(&g_SIMD_ComponentMask[3]); + fltx4 rowA0 = LoadUnalignedSIMD( in1.m_flMatVal[0] ); + fltx4 rowA1 = LoadUnalignedSIMD( in1.m_flMatVal[1] ); + fltx4 rowA2 = LoadUnalignedSIMD( in1.m_flMatVal[2] ); + + fltx4 rowB0 = LoadUnalignedSIMD( in2.m_flMatVal[0] ); + fltx4 rowB1 = LoadUnalignedSIMD( in2.m_flMatVal[1] ); + fltx4 rowB2 = LoadUnalignedSIMD( in2.m_flMatVal[2] ); + + // now we have the rows of m0 and the columns of m1 + // first output row + fltx4 A0 = SplatXSIMD(rowA0); + fltx4 A1 = SplatYSIMD(rowA0); + fltx4 A2 = SplatZSIMD(rowA0); + fltx4 mul00 = MulSIMD( A0, rowB0 ); + fltx4 mul01 = MulSIMD( A1, rowB1 ); + fltx4 mul02 = MulSIMD( A2, rowB2 ); + fltx4 out0 = AddSIMD( mul00, AddSIMD(mul01,mul02) ); + + // second output row + A0 = SplatXSIMD(rowA1); + A1 = SplatYSIMD(rowA1); + A2 = SplatZSIMD(rowA1); + fltx4 mul10 = MulSIMD( A0, rowB0 ); + fltx4 mul11 = MulSIMD( A1, rowB1 ); + fltx4 mul12 = MulSIMD( A2, rowB2 ); + fltx4 out1 = AddSIMD( mul10, AddSIMD(mul11,mul12) ); + + // third output row + A0 = SplatXSIMD(rowA2); + A1 = SplatYSIMD(rowA2); + A2 = SplatZSIMD(rowA2); + fltx4 mul20 = MulSIMD( A0, rowB0 ); + fltx4 mul21 = MulSIMD( A1, rowB1 ); + fltx4 mul22 = MulSIMD( A2, rowB2 ); + fltx4 out2 = AddSIMD( mul20, AddSIMD(mul21,mul22) ); + + // add in translation vector + A0 = AndSIMD(rowA0,lastMask); + A1 = AndSIMD(rowA1,lastMask); + A2 = AndSIMD(rowA2,lastMask); + out0 = AddSIMD(out0, A0); + out1 = AddSIMD(out1, A1); + out2 = AddSIMD(out2, A2); + + // write to output + StoreUnalignedSIMD( out.m_flMatVal[0], out0 ); + StoreUnalignedSIMD( out.m_flMatVal[1], out1 ); + StoreUnalignedSIMD( out.m_flMatVal[2], out2 ); } @@ -1359,7 +1459,9 @@ float Bias( float x, float biasAmt ) { lastExponent = log( biasAmt ) * -1.4427f; // (-1.4427 = 1 / log(0.5)) } - return pow( x, lastExponent ); + float fRet = pow( x, lastExponent ); + Assert ( !IS_NAN( fRet ) ); + return fRet; } @@ -1375,7 +1477,9 @@ float Gain( float x, float biasAmt ) float SmoothCurve( float x ) { - return (1 - cos( x * M_PI )) * 0.5f; + // Actual smooth curve. Visualization: + // http://www.wolframalpha.com/input/?i=plot%5B+0.5+*+%281+-+cos%5B2+*+pi+*+x%5D%29+for+x+%3D+%280%2C+1%29+%5D + return 0.5f * (1 - cos( 2.0f * M_PI * x ) ); } @@ -2408,9 +2512,7 @@ void Hermite_SplineBasis( float t, float basis[4] ) //----------------------------------------------------------------------------- // BUG: the VectorSubtract()'s calls go away if the global optimizer is enabled -#ifdef _MSC_VER #pragma optimize( "g", off ) -#endif void Hermite_Spline( const Vector &p0, const Vector &p1, const Vector &p2, float t, Vector& output ) { @@ -2420,9 +2522,7 @@ void Hermite_Spline( const Vector &p0, const Vector &p1, const Vector &p2, float Hermite_Spline( p1, p2, e10, e21, t, output ); } -#ifdef _MSC_VER #pragma optimize( "", on ) -#endif float Hermite_Spline( float p0, float p1, float p2, float t ) { @@ -3188,18 +3288,15 @@ bool CalcLineToLineIntersectionSegment( return true; } -#ifdef _MSC_VER #pragma optimize( "", off ) -#endif #ifndef EXCEPTION_EXECUTE_HANDLER #define EXCEPTION_EXECUTE_HANDLER 1 #endif -#ifdef _MSC_VER #pragma optimize( "", on ) -#endif +static bool s_b3DNowEnabled = false; static bool s_bMMXEnabled = false; static bool s_bSSEEnabled = false; static bool s_bSSE2Enabled = false; @@ -3213,7 +3310,7 @@ void MathLib_Init( float gamma, float texGamma, float brightness, int overbright #if !defined( _X360 ) // Grab the processor information: - const CPUInformation& pi = GetCPUInformation(); + const CPUInformation& pi = *GetCPUInformation(); // Select the default generic routines. pfSqrt = _sqrtf; @@ -3235,38 +3332,54 @@ void MathLib_Init( float gamma, float texGamma, float brightness, int overbright s_bMMXEnabled = false; } - // GAMMACASE: Since the sse.cpp doesn't have any x64 code rn - // we can't use the sse stuff here -#ifndef COMPILER_MSVC64 + // SSE Generally performs better than 3DNow when present, so this is placed + // first to allow SSE to override these settings. +#if !defined( OSX ) && !defined( PLATFORM_WINDOWS_PC64 ) && !defined(LINUX) + if ( bAllow3DNow && pi.m_b3DNow ) + { + s_b3DNowEnabled = true; + + // Select the 3DNow specific routines if available; + pfVectorNormalize = _3DNow_VectorNormalize; + pfVectorNormalizeFast = _3DNow_VectorNormalizeFast; + pfInvRSquared = _3DNow_InvRSquared; + pfSqrt = _3DNow_Sqrt; + pfRSqrt = _3DNow_RSqrt; + pfRSqrtFast = _3DNow_RSqrt; + } + else +#endif + { + s_b3DNowEnabled = false; + } + if ( bAllowSSE && pi.m_bSSE ) { s_bSSEEnabled = true; +#ifndef PLATFORM_WINDOWS_PC64 + // These are not yet available. // Select the SSE specific routines if available pfVectorNormalizeFast = _SSE_VectorNormalizeFast; pfInvRSquared = _SSE_InvRSquared; pfSqrt = _SSE_Sqrt; pfRSqrt = _SSE_RSqrtAccurate; pfRSqrtFast = _SSE_RSqrtFast; -#ifdef _WIN32 +#endif +#ifdef PLATFORM_WINDOWS_PC32 pfFastSinCos = _SSE_SinCos; pfFastCos = _SSE_cos; #endif - } else { s_bSSEEnabled = false; } -#else - s_bSSEEnabled = false; -#endif -#ifndef COMPILER_MSVC64 if ( bAllowSSE2 && pi.m_bSSE2 ) { s_bSSE2Enabled = true; -#ifdef _WIN32 +#ifdef PLATFORM_WINDOWS_PC32 pfFastSinCos = _SSE2_SinCos; pfFastCos = _SSE2_cos; #endif @@ -3275,10 +3388,7 @@ void MathLib_Init( float gamma, float texGamma, float brightness, int overbright { s_bSSE2Enabled = false; } -#else - s_bSSE2Enabled = false; -#endif -#endif +#endif // !_X360 s_bMathlibInitialized = true; @@ -3286,6 +3396,12 @@ void MathLib_Init( float gamma, float texGamma, float brightness, int overbright BuildGammaTable( gamma, texGamma, brightness, overbright ); } +bool MathLib_3DNowEnabled( void ) +{ + Assert( s_bMathlibInitialized ); + return s_b3DNowEnabled; +} + bool MathLib_MMXEnabled( void ) { Assert( s_bMathlibInitialized ); @@ -3304,6 +3420,20 @@ bool MathLib_SSE2Enabled( void ) return s_bSSE2Enabled; } +float Approach( float target, float value, float speed ) +{ + float delta = target - value; + + if ( delta > speed ) + value += speed; + else if ( delta < -speed ) + value -= speed; + else + value = target; + + return value; +} + // BUGBUG: Why doesn't this call angle diff?!?!? float ApproachAngle( float target, float value, float speed ) { @@ -3990,8 +4120,8 @@ void HSVtoRGB( const Vector &hsv, Vector &rgb ) hue = 0.0F; } hue /= 60.0F; - int i = static_cast(hue); // integer part - float32 f = hue - i; // fractional part + int i = hue; // integer part + float32 f = hue - i; // fractional part float32 p = hsv.z * (1.0F - hsv.y); float32 q = hsv.z * (1.0F - hsv.y * f); float32 t = hsv.z * (1.0F - hsv.y * (1.0F - f)); diff --git a/public/mathlib/mathlib.h b/public/mathlib/mathlib.h index 59bf5e23..bdd5d9a6 100644 --- a/public/mathlib/mathlib.h +++ b/public/mathlib/mathlib.h @@ -14,9 +14,7 @@ #include "tier0/dbg.h" #include "mathlib/math_pfns.h" -#ifndef ALIGN8_POST -#define ALIGN8_POST -#endif + // plane_t structure // !!! if this is changed, it must be changed in asm code too !!! // FIXME: does the asm code even exist anymore? @@ -151,6 +149,9 @@ struct matrix3x4_t inline void ConcatRotations( const matrix3x4_t &other ); inline void ConcatTransforms( const matrix3x4_t &other ); + inline void ScaleBy( const float value ); + inline void ScaleByZero(); + inline void Multiply( const matrix3x4_t &other ); inline void Transpose(); @@ -161,6 +162,9 @@ struct matrix3x4_t inline float RowDotProduct( int row, const Vector &in ) const; inline float ColumnDotProduct( MatrixAxisType_t column, const Vector &in ) const; + inline Vector GetTranslation() const; + inline void SetTranslation( const Vector &in ); + inline Vector GetColumn( MatrixAxisType_t column ) const; inline void SetColumn( const Vector &in, MatrixAxisType_t column ); @@ -205,7 +209,11 @@ struct matrix3x4_t } } - matrix3x4_t operator *( const matrix3x4_t &other ) { Multiply( other ); return *this; } + matrix3x4_t &operator *=( const matrix3x4_t &other ) { Multiply( other ); return *this; } + matrix3x4_t &operator *=( float value ) { ScaleBy( value ); return *this; } + + matrix3x4_t operator *( const matrix3x4_t &other ) const { matrix3x4_t temp( *this ); temp.Multiply( other ); return temp; } + matrix3x4_t operator *( float value ) const { matrix3x4_t temp( *this ); temp.ScaleBy( value ); return temp; } float *operator[]( int i ) { Assert(( i >= 0 ) && ( i < 3 )); return m_flMatVal[i]; } const float *operator[]( int i ) const { Assert(( i >= 0 ) && ( i < 3 )); return m_flMatVal[i]; } @@ -378,7 +386,7 @@ void inline SinCos( float radians, float *sine, float *cosine ) { #if defined( _X360 ) XMScalarSinCos( sine, cosine, radians ); -#elif defined( COMPILER_MSVC32 ) +#elif defined( PLATFORM_WINDOWS_PC32 ) _asm { fld DWORD PTR [radians] @@ -390,15 +398,15 @@ void inline SinCos( float radians, float *sine, float *cosine ) fstp DWORD PTR [edx] fstp DWORD PTR [eax] } -#elif defined( GNUC ) +#elif defined( PLATFORM_WINDOWS_PC64 ) + *sine = sin( radians ); + *cosine = cos( radians ); +#elif defined( POSIX ) double __cosr, __sinr; - __asm __volatile__ ("fsincos" : "=t" (__cosr), "=u" (__sinr) : "0" (radians)); + __asm ("fsincos" : "=t" (__cosr), "=u" (__sinr) : "0" (radians)); *sine = __sinr; *cosine = __cosr; -#else - *sine = sinf(radians); - *cosine = cosf(radians); #endif } @@ -505,6 +513,19 @@ bool MatricesAreEqual( const matrix3x4_t &src1, const matrix3x4_t &src2, float f void MatrixGetColumn( const matrix3x4_t &in, int column, Vector &out ); void MatrixSetColumn( const Vector &in, int column, matrix3x4_t &out ); +inline void MatrixGetTranslation( const matrix3x4_t &in, Vector &out ) +{ + MatrixGetColumn ( in, 3, out ); +} + +inline void MatrixSetTranslation( const Vector &in, matrix3x4_t &out ) +{ + MatrixSetColumn ( in, 3, out ); +} + +void MatrixScaleBy ( const float flScale, matrix3x4_t &out ); +void MatrixScaleByZero ( matrix3x4_t &out ); + //void DecomposeRotation( const matrix3x4_t &mat, float *out ); void ConcatRotations (const matrix3x4_t &in1, const matrix3x4_t &in2, matrix3x4_t &out); void ConcatTransforms (const matrix3x4_t &in1, const matrix3x4_t &in2, matrix3x4_t &out); @@ -531,10 +552,6 @@ void QuaternionInvert( const Quaternion &p, Quaternion &q ); float QuaternionNormalize( Quaternion &q ); void QuaternionAdd( const Quaternion &p, const Quaternion &q, Quaternion &qt ); void QuaternionMult( const Quaternion &p, const Quaternion &q, Quaternion &qt ); -void QuaternionExp( const Quaternion &p, Quaternion &q ); -void QuaternionLn( const Quaternion &p, Quaternion &q ); -void QuaternionAverageExponential( Quaternion &q, int nCount, const Quaternion *pQuaternions, const float *pflWeights = NULL ); -void QuaternionLookAt( const Vector &vecForward, const Vector &referenceUp, Quaternion &q ); void QuaternionMatrix( const Quaternion &q, matrix3x4_t &matrix ); void QuaternionMatrix( const Quaternion &q, const Vector &pos, matrix3x4_t &matrix ); void QuaternionAngles( const Quaternion &q, QAngle &angles ); @@ -571,16 +588,16 @@ inline float anglemod(float a) inline float RemapVal( float val, float A, float B, float C, float D) { if ( A == B ) - return fsel( val - B , D , C ); + return val >= B ? D : C; return C + (D - C) * (val - A) / (B - A); } inline float RemapValClamped( float val, float A, float B, float C, float D) { if ( A == B ) - return fsel( val - B , D , C ); + return val >= B ? D : C; float cVal = (val - A) / (B - A); - cVal = clamp( cVal, 0.0f, 1.0f ); + cVal = clamp( cVal, 0.0f, 1.0f ); return C + (D - C) * cVal; } @@ -671,7 +688,7 @@ template<> FORCEINLINE QAngleByValue Lerp( float flPercent, const #endif // VECTOR_NO_SLOW_OPERATIONS -// Swap two of anything. +/// Same as swap(), but won't cause problems with std::swap template FORCEINLINE void V_swap( T& x, T& y ) { @@ -686,15 +703,15 @@ template FORCEINLINE T AVG(T a, T b) } // number of elements in an array of static size -#define NELEMS(x) ((sizeof(x))/sizeof(x[0])) +#define NELEMS(x) ARRAYSIZE(x) // XYZ macro, for printf type functions - ex printf("%f %f %f",XYZ(myvector)); #define XYZ(v) (v).x,(v).y,(v).z + inline float Sign( float x ) { - return fsel( x, 1.0f, -1.0f ); // x >= 0 ? 1.0f : -1.0f - //return (x <0.0f) ? -1.0f : 1.0f; + return (x <0.0f) ? -1.0f : 1.0f; } // @@ -726,16 +743,6 @@ inline int ClampArrayBounds( int n, unsigned maxindex ) } - -// Turn a number "inside out". -// See Recording Animation in Binary Order for Progressive Temporal Refinement -// by Paul Heckbert from "Graphics Gems". -// -// If you want to iterate something from 0 to n, you can use this to iterate non-sequentially, in -// such a way that you will start with widely separated values and then refine the gaps between -// them, as you would for progressive refinement. This works with non-power of two ranges. -int InsideOut( int nTotal, int nCounter ); - #define BOX_ON_PLANE_SIDE(emins, emaxs, p) \ (((p)->type < 3)? \ ( \ @@ -796,9 +803,7 @@ inline void PositionMatrix( const Vector &position, matrix3x4_t &mat ) inline void MatrixPosition( const matrix3x4_t &matrix, Vector &position ) { - position[0] = matrix[0][3]; - position[1] = matrix[1][3]; - position[2] = matrix[2][3]; + MatrixGetColumn( matrix, 3, position ); } inline void VectorRotate( const Vector& in1, const matrix3x4_t &in2, Vector &out) @@ -937,18 +942,6 @@ inline int FASTCALL BoxOnPlaneSide2 (const Vector& emins, const Vector& emaxs, c void ClearBounds (Vector& mins, Vector& maxs); void AddPointToBounds (const Vector& v, Vector& mins, Vector& maxs); -//----------------------------------------------------------------------------- -// Ensures that the min and max bounds values are valid. -// (ClearBounds() sets min > max, which is clearly invalid.) -//----------------------------------------------------------------------------- -bool AreBoundsValid( const Vector &vMin, const Vector &vMax ); - -//----------------------------------------------------------------------------- -// Returns true if the provided point is in the AABB defined by vMin -// at the lower corner and vMax at the upper corner. -//----------------------------------------------------------------------------- -bool IsPointInBounds( const Vector &vPoint, const Vector &vMin, const Vector &vMax ); - // // COLORSPACE/GAMMA CONVERSION STUFF // @@ -1157,7 +1150,9 @@ inline float SimpleSplineRemapValClamped( float val, float A, float B, float C, FORCEINLINE int RoundFloatToInt(float f) { -#if defined( _X360 ) +#if defined(__i386__) || defined(_M_IX86) || defined( PLATFORM_WINDOWS_PC64 ) || defined(__x86_64__) + return _mm_cvtss_si32(_mm_load_ss(&f)); +#elif defined( _X360 ) #ifdef Assert Assert( IsFPUControlWordSet() ); #endif @@ -1168,67 +1163,18 @@ FORCEINLINE int RoundFloatToInt(float f) }; flResult = __fctiw( f ); return pResult[1]; -#else // !X360 - int nResult; -#if defined( COMPILER_MSVC32 ) - __asm - { - fld f - fistp nResult - } -#elif GNUC - __asm __volatile__ ( - "fistpl %0;": "=m" (nResult): "t" (f) : "st" - ); #else - nResult = static_cast(f); -#endif - return nResult; +#error Unknown architecture #endif } FORCEINLINE unsigned char RoundFloatToByte(float f) { -#if defined( _X360 ) + int nResult = RoundFloatToInt(f); #ifdef Assert - Assert( IsFPUControlWordSet() ); -#endif - union - { - double flResult; - int pIntResult[2]; - unsigned char pResult[8]; - }; - flResult = __fctiw( f ); -#ifdef Assert - Assert( pIntResult[1] >= 0 && pIntResult[1] <= 255 ); -#endif - return pResult[7]; - -#else // !X360 - - int nResult; - -#if defined( COMPILER_MSVC32 ) - __asm - { - fld f - fistp nResult - } -#elif GNUC - __asm __volatile__ ( - "fistpl %0;": "=m" (nResult): "t" (f) : "st" - ); -#else - nResult = static_cast (f) & 0xff; -#endif - -#ifdef Assert - Assert( nResult >= 0 && nResult <= 255 ); -#endif - return nResult; - + Assert( (nResult & ~0xFF) == 0 ); #endif + return (unsigned char) nResult; } FORCEINLINE unsigned long RoundFloatToUnsignedLong(float f) @@ -1248,25 +1194,41 @@ FORCEINLINE unsigned long RoundFloatToUnsignedLong(float f) return pResult[1]; #else // !X360 -#if defined( COMPILER_MSVC32 ) - unsigned char nResult[8]; - __asm +#if defined( PLATFORM_WINDOWS_PC64 ) + uint nRet = ( uint ) f; + if ( nRet & 1 ) { - fld f - fistp qword ptr nResult + if ( ( f - floor( f ) >= 0.5 ) ) + { + nRet++; + } } - return *((unsigned long*)nResult); -#elif defined( COMPILER_GCC ) + else + { + if ( ( f - floor( f ) > 0.5 ) ) + { + nRet++; + } + } + return nRet; +#else // PLATFORM_WINDOWS_PC64 unsigned char nResult[8]; - __asm __volatile__ ( - "fistpl %0;": "=m" (nResult): "t" (f) : "st" - ); - return *((unsigned long*)nResult); -#else - return static_cast(f); -#endif -#endif + #if defined( _WIN32 ) + __asm + { + fld f + fistp qword ptr nResult + } + #elif POSIX + __asm __volatile__ ( + "fistpl %0;": "=m" (nResult): "t" (f) : "st" + ); + #endif + + return *((unsigned long*)nResult); +#endif // PLATFORM_WINDOWS_PC64 +#endif // !X360 } FORCEINLINE bool IsIntegralValue( float flValue, float flTolerance = 0.001f ) @@ -1286,76 +1248,54 @@ FORCEINLINE int Float2Int( float a ) flResult = __fctiwz( a ); return pResult[1]; #else // !X360 - - int RetVal; - -#if defined( COMPILER_MSVC32 ) - int CtrlwdHolder; - int CtrlwdSetter; - __asm - { - fld a // push 'a' onto the FP stack - fnstcw CtrlwdHolder // store FPU control word - movzx eax, CtrlwdHolder // move and zero extend word into eax - and eax, 0xFFFFF3FF // set all bits except rounding bits to 1 - or eax, 0x00000C00 // set rounding mode bits to round towards zero - mov CtrlwdSetter, eax // Prepare to set the rounding mode -- prepare to enter plaid! - fldcw CtrlwdSetter // Entering plaid! - fistp RetVal // Store and converted (to int) result - fldcw CtrlwdHolder // Restore control word - } -#else - RetVal = static_cast( a ); -#endif - - return RetVal; + // Rely on compiler to generate CVTTSS2SI on x86 + return (int) a; #endif } // Over 15x faster than: (int)floor(value) inline int Floor2Int( float a ) { - int RetVal; - -#if defined( _X360 ) - RetVal = (int)floor( a ); -#elif defined( COMPILER_MSVC32 ) - int CtrlwdHolder; - int CtrlwdSetter; - __asm - { - fld a // push 'a' onto the FP stack - fnstcw CtrlwdHolder // store FPU control word - movzx eax, CtrlwdHolder // move and zero extend word into eax - and eax, 0xFFFFF3FF // set all bits except rounding bits to 1 - or eax, 0x00000400 // set rounding mode bits to round down - mov CtrlwdSetter, eax // Prepare to set the rounding mode -- prepare to enter plaid! - fldcw CtrlwdSetter // Entering plaid! - fistp RetVal // Store floored and converted (to int) result - fldcw CtrlwdHolder // Restore control word - } + int RetVal; +#if defined( __i386__ ) + // Convert to int and back, compare, subtract one if too big + __m128 a128 = _mm_set_ss(a); + RetVal = _mm_cvtss_si32(a128); + __m128 rounded128 = _mm_cvt_si2ss(_mm_setzero_ps(), RetVal); + RetVal -= _mm_comigt_ss( rounded128, a128 ); #else RetVal = static_cast( floor(a) ); #endif - return RetVal; } //----------------------------------------------------------------------------- // Fast color conversion from float to unsigned char //----------------------------------------------------------------------------- -FORCEINLINE unsigned char FastFToC( float c ) +FORCEINLINE unsigned int FastFToC( float c ) { - volatile float dc; - - // ieee trick - dc = c * 255.0f + (float)(1 << 23); - - // return the lsb -#if defined( _X360 ) - return ((unsigned char*)&dc)[3]; +#if defined( __i386__ ) + // IEEE float bit manipulation works for values between [0, 1<<23) + union { float f; int i; } convert = { c*255.0f + (float)(1<<23) }; + return convert.i & 255; #else - return *(unsigned char*)&dc; + // consoles CPUs suffer from load-hit-store penalty + return Float2Int( c * 255.0f ); +#endif +} + +//----------------------------------------------------------------------------- +// Fast conversion from float to integer with magnitude less than 2**22 +//----------------------------------------------------------------------------- +FORCEINLINE int FastFloatToSmallInt( float c ) +{ +#if defined( __i386__ ) + // IEEE float bit manipulation works for values between [-1<<22, 1<<22) + union { float f; int i; } convert = { c + (float)(3<<22) }; + return (convert.i & ((1<<23)-1)) - (1<<22); +#else + // consoles CPUs suffer from load-hit-store penalty + return Float2Int( c ); #endif } @@ -1367,35 +1307,22 @@ FORCEINLINE unsigned char FastFToC( float c ) inline float ClampToMsec( float in ) { int msec = Floor2Int( in * 1000.0f + 0.5f ); - return msec / 1000.0f; + return 0.001f * msec; } // Over 15x faster than: (int)ceil(value) inline int Ceil2Int( float a ) { int RetVal; - -#if defined( _X360 ) - RetVal = (int)ceil( a ); -#elif defined( COMPILER_MSVC32 ) - int CtrlwdHolder; - int CtrlwdSetter; - __asm - { - fld a // push 'a' onto the FP stack - fnstcw CtrlwdHolder // store FPU control word - movzx eax, CtrlwdHolder // move and zero extend word into eax - and eax, 0xFFFFF3FF // set all bits except rounding bits to 1 - or eax, 0x00000800 // set rounding mode bits to round down - mov CtrlwdSetter, eax // Prepare to set the rounding mode -- prepare to enter plaid! - fldcw CtrlwdSetter // Entering plaid! - fistp RetVal // Store floored and converted (to int) result - fldcw CtrlwdHolder // Restore control word - } +#if defined( __i386__ ) + // Convert to int and back, compare, add one if too small + __m128 a128 = _mm_load_ss(&a); + RetVal = _mm_cvtss_si32(a128); + __m128 rounded128 = _mm_cvt_si2ss(_mm_setzero_ps(), RetVal); + RetVal += _mm_comilt_ss( rounded128, a128 ); #else - RetVal = static_cast( ceil(a) ); + RetVal = static_cast( ceil(a) ); #endif - return RetVal; } @@ -1736,13 +1663,6 @@ void Parabolic_Spline_NormalizeX( float t, Vector& output ); -// Evaluate the cubic Bernstein basis for the input parametric coordinate. -// Output is the coefficient for that basis polynomial. -float CubicBasis0( float t ); -float CubicBasis1( float t ); -float CubicBasis2( float t ); -float CubicBasis3( float t ); - // quintic interpolating polynomial from Perlin. // 0->0, 1->1, smooth-in between with smooth tangents FORCEINLINE float QuinticInterpolatingPolynomial(float t) @@ -1763,6 +1683,7 @@ void GetInterpolationData( float const *pKnotPositions, float *pValueA, float *pValueB, float *pInterpolationValue); + float RangeCompressor( float flValue, float flMin, float flMax, float flBase ); // Get the minimum distance from vOrigin to the bounding box defined by [mins,maxs] @@ -1806,11 +1727,12 @@ float CalcDistanceSqrToLineSegment2D( Vector2D const &P, Vector2D const &vLineA, // Init the mathlib void MathLib_Init( float gamma = 2.2f, float texGamma = 2.2f, float brightness = 0.0f, int overbright = 2.0f, bool bAllow3DNow = true, bool bAllowSSE = true, bool bAllowSSE2 = true, bool bAllowMMX = true ); +bool MathLib_3DNowEnabled( void ); bool MathLib_MMXEnabled( void ); bool MathLib_SSEEnabled( void ); bool MathLib_SSE2Enabled( void ); -inline float Approach( float target, float value, float speed ); +float Approach( float target, float value, float speed ); float ApproachAngle( float target, float value, float speed ); float AngleDiff( float destAngle, float srcAngle ); float AngleDistance( float next, float cur ); @@ -1825,23 +1747,10 @@ bool AnglesAreEqual( float a, float b, float tolerance = 0.0f ); void RotationDeltaAxisAngle( const QAngle &srcAngles, const QAngle &destAngles, Vector &deltaAxis, float &deltaAngle ); void RotationDelta( const QAngle &srcAngles, const QAngle &destAngles, QAngle *out ); -//----------------------------------------------------------------------------- -// Clips a line segment such that only the portion in the positive half-space -// of the plane remains. If the segment is entirely clipped, the vectors -// are set to vec3_invalid (all components are FLT_MAX). -// -// flBias is added to the dot product with the normal. A positive bias -// results in a more inclusive positive half-space, while a negative bias -// results in a more exclusive positive half-space. -//----------------------------------------------------------------------------- -void ClipLineSegmentToPlane( const Vector &vNormal, const Vector &vPlanePoint, Vector *p1, Vector *p2, float flBias = 0.0f ); - void ComputeTrianglePlane( const Vector& v1, const Vector& v2, const Vector& v3, Vector& normal, float& intercept ); int PolyFromPlane( Vector *outVerts, const Vector& normal, float dist, float fHalfScale = 9000.0f ); int ClipPolyToPlane( Vector *inVerts, int vertCount, Vector *outVerts, const Vector& normal, float dist, float fOnPlaneEpsilon = 0.1f ); int ClipPolyToPlane_Precise( double *inVerts, int vertCount, double *outVerts, const double *normal, double dist, double fOnPlaneEpsilon = 0.1 ); -float TetrahedronVolume( const Vector &p0, const Vector &p1, const Vector &p2, const Vector &p3 ); -float TriangleArea( const Vector &p0, const Vector &p1, const Vector &p2 ); //----------------------------------------------------------------------------- // Computes a reasonable tangent space for a triangle @@ -2001,12 +1910,7 @@ FORCEINLINE float * UnpackNormal_SHORT2( const unsigned int *pPackedNormal, floa pNormal[0] = ( iX - 16384.0f ) / 16384.0f; pNormal[1] = ( iY - 16384.0f ) / 16384.0f; - float mag = ( pNormal[0]*pNormal[0] + pNormal[1]*pNormal[1] ); - if ( mag > 1.0f ) - { - mag = 1.0f; - } - pNormal[2] = zSign*sqrtf( 1.0f - mag ); + pNormal[2] = zSign*sqrtf( 1.0f - ( pNormal[0]*pNormal[0] + pNormal[1]*pNormal[1] ) ); if ( bIsTangent ) { pNormal[3] = tSign; @@ -2272,33 +2176,6 @@ inline bool AlmostEqual( const Vector &a, const Vector &b, int maxUlps = 10) AlmostEqual( a.z, b.z, maxUlps ); } -inline float Approach( float target, float value, float speed ) -{ - float delta = target - value; - -#if defined(_X360) || defined( PS3 ) // use conditional move for speed on 360 - - return fsel( delta-speed, // delta >= speed ? - value + speed, // if delta == speed, then value + speed == value + delta == target - fsel( (-speed) - delta, // delta <= -speed - value - speed, - target ) - ); // delta < speed && delta > -speed - -#else - - if ( delta > speed ) - value += speed; - else if ( delta < -speed ) - value -= speed; - else - value = target; - - return value; - -#endif -} - inline void matrix3x4_t::Init( const Vector &xAxis, const Vector &yAxis, const Vector &zAxis, const Vector &vecOrigin ) { MatrixInitialize( *this, vecOrigin, xAxis, yAxis, zAxis ); @@ -2373,6 +2250,16 @@ inline void matrix3x4_t::ConcatTransforms( const matrix3x4_t &other ) ::ConcatTransforms( *this, other, *this ); } +inline void matrix3x4_t::ScaleBy( const float value ) +{ + MatrixScaleBy( value, *this ); +} + +inline void matrix3x4_t::ScaleByZero() +{ + MatrixScaleByZero( *this ); +} + inline void matrix3x4_t::Multiply( const matrix3x4_t &other ) { MatrixMultiply( *this, other, *this ); @@ -2414,6 +2301,18 @@ inline float matrix3x4_t::ColumnDotProduct( MatrixAxisType_t column, const Vecto return MatrixColumnDotProduct( *this, column, in ); } +inline Vector matrix3x4_t::GetTranslation() const +{ + Vector out; + MatrixGetTranslation( *this, out ); + return out; +} + +inline void matrix3x4_t::SetTranslation( const Vector &in ) +{ + MatrixSetTranslation( in, *this ); +} + inline Vector matrix3x4_t::GetColumn( MatrixAxisType_t column ) const { Vector out;