Added most recent version of unmodified HL2 SDK for Orange Box engine

2025-09-19 12:06:07 +08:00 · 2008-09-15 01:07:45 -05:00
commit 055f5cd168
2907 changed files with 1271781 additions and 0 deletions
--- a/mathlib/3dnow.cpp
+++ b/mathlib/3dnow.cpp
@ -0,0 +1,189 @@
+//========= Copyright © 1996-2005, Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 3DNow Math primitives.
+//
+//=====================================================================================//
+
+#include <math.h>
+#include <float.h>	// Needed for FLT_EPSILON
+#include "basetypes.h"
+#include <memory.h>
+#include "tier0/dbg.h"
+#include "mathlib/mathlib.h"
+#include "mathlib/amd3dx.h"
+#include "mathlib/vector.h"
+
+// memdbgon must be the last include file in a .cpp file!!!
+#include "tier0/memdbgon.h"
+
+#pragma warning(disable:4244)   // "conversion from 'const int' to 'float', possible loss of data"
+#pragma warning(disable:4730)	// "mixing _m64 and floating point expressions may result in incorrect code"
+
+//-----------------------------------------------------------------------------
+// 3D Now Implementations of optimized routines:
+//-----------------------------------------------------------------------------
+// Approximate square root of x using 3DNow! instructions.
+// The value is formed as x * (reciprocal-square-root estimate of x); only the
+// initial PFRSQRT estimate is used, no refinement instructions follow it.
+// Both code paths bracket the MMX-register code with femms.
+// NOTE(review): PFRSQRT/PFMUL on the Windows path look like macros from
+// mathlib/amd3dx.h taking (dst, src) - confirm the operand order there.
+float _3DNow_Sqrt(float x)
+{
+	Assert( s_bMathlibInitialized );
+	float	root = 0.f;
+#ifdef _WIN32
+	// mm0 = x, mm1 = rsqrt estimate of x, mm0 = x * mm1, root = low dword of mm0
+	_asm
+	{
+		femms
+		movd		mm0, x
+		PFRSQRT		(mm1,mm0)
+		punpckldq	mm0, mm0
+		PFMUL		(mm0, mm1)
+		movd		root, mm0
+		femms
+	}
+#elif _LINUX
+	// GCC inline-asm counterpart of the block above; operand 0 is root and
+	// operand 1 is x (x is both an input tied to operand 0 and an output).
+	// NOTE(review): presumably equivalent to the Windows path - not verified here.
+ 	__asm __volatile__( "femms" );
+ 	__asm __volatile__
+	(
+		"pfrsqrt    %y0, %y1 \n\t"
+		"punpckldq   %y1, %y1 \n\t"
+		"pfmul      %y1, %y0 \n\t"
+		: "=y" (root), "=y" (x)
+ 		:"0" (x)
+ 	);
+ 	__asm __volatile__( "femms" );
+#else
+#error
+#endif
+
+	return root;
+}
+
+// NJS FIXME: Need to test reciprocal square-root performance and accuracy
+// on AMD's before using the specialized instruction.
+//
+// Returns 1 / sqrt(x), computed as the reciprocal of _3DNow_Sqrt(x) rather
+// than with the dedicated reciprocal-square-root instruction.
+float _3DNow_RSqrt(float x)
+{
+	Assert( s_bMathlibInitialized );
+
+	const float flRoot = _3DNow_Sqrt( x );
+	return 1.f / flRoot;
+}
+
+
+// Normalizes vec in place and returns the value left in radius.
+// When all three components compare equal to zero the vector is left
+// untouched and 0 is returned.
+//
+// Windows path: (x,y) is loaded into mm0 and z into mm1, the squares are
+// summed, PFRSQRT gives the reciprocal-square-root estimate of the sum, the
+// saved copies of the components (mm2, mm3) are scaled by it and stored back,
+// and radius receives sum * estimate, i.e. the approximate original length.
+// NOTE(review): assumes the amd3dx.h macros take (dst, src) - confirm.
+float FASTCALL _3DNow_VectorNormalize (Vector& vec)
+{
+	Assert( s_bMathlibInitialized );
+	float *v = &vec[0];
+	float	radius = 0.f;
+
+	if ( v[0] || v[1] || v[2] )
+	{
+#ifdef _WIN32
+	_asm
+		{
+			mov			eax, v
+			femms
+			movq		mm0, QWORD PTR [eax]
+			movd		mm1, DWORD PTR [eax+8]
+			movq		mm2, mm0
+			movq		mm3, mm1
+			PFMUL		(mm0, mm0)
+			PFMUL		(mm1, mm1)
+			PFACC		(mm0, mm0)
+			PFADD		(mm1, mm0)
+			PFRSQRT		(mm0, mm1)
+			punpckldq	mm1, mm1
+			PFMUL		(mm1, mm0)
+			PFMUL		(mm2, mm0)
+			PFMUL		(mm3, mm0)
+			movq		QWORD PTR [eax], mm2
+			movd		DWORD PTR [eax+8], mm3
+			movd		radius, mm1
+			femms
+		}
+#elif _LINUX	
+		// a and c both hold the 8 bytes at &vec[0] (x,y); b and d both hold the
+		// 4 bytes at &vec[2] (z). c and d are the copies written back to vec.
+		// NOTE(review): presumably equivalent to the Windows path - not verified here.
+		long long a,c;
+    		int b,d;
+    		memcpy(&a,&vec[0],sizeof(a));
+    		memcpy(&b,&vec[2],sizeof(b));
+    		memcpy(&c,&vec[0],sizeof(c));
+    		memcpy(&d,&vec[2],sizeof(d));
+
+      		__asm __volatile__( "femms" );
+        	__asm __volatile__
+        	(
+        		"pfmul           %y3, %y3\n\t"
+        		"pfmul           %y0, %y0 \n\t"
+        		"pfacc           %y3, %y3 \n\t"
+        		"pfadd           %y3, %y0 \n\t"
+        		"pfrsqrt         %y0, %y3 \n\t"
+        		"punpckldq       %y0, %y0 \n\t"
+        		"pfmul           %y3, %y0 \n\t"
+        		"pfmul           %y3, %y2 \n\t"
+        		"pfmul           %y3, %y1 \n\t"
+        		: "=y" (radius), "=y" (c), "=y" (d)
+        		: "y" (a), "0" (b), "1" (c), "2" (d)
+        	);
+      		memcpy(&vec[0],&c,sizeof(c));
+      		memcpy(&vec[2],&d,sizeof(d));		
+        	__asm __volatile__( "femms" );
+
+#else
+#error
+#endif
+	}
+    return radius;
+}
+
+
+// "Fast" normalize entry point. It simply forwards to _3DNow_VectorNormalize
+// and discards the returned value; there is no separate faster code path.
+void FASTCALL _3DNow_VectorNormalizeFast (Vector& vec)
+{
+	_3DNow_VectorNormalize( vec );
+}
+
+
+// JAY: This complains with the latest processor pack
+#pragma warning(disable: 4730)
+
+// Returns an approximation of 1 / max(1, x*x + y*y + z*z) for the three
+// floats at v.
+// Windows path: r2 starts at 1.f and is loaded as the PFMAX floor, so the
+// squared length is clamped to at least 1 before PFRCP takes the reciprocal
+// estimate. v must point to at least three floats: 8 bytes are read at v and
+// 4 more at v+8.
+// NOTE(review): assumes the amd3dx.h macros take (dst, src) - confirm.
+float _3DNow_InvRSquared(const float* v)
+{
+	Assert( s_bMathlibInitialized );
+	float	r2 = 1.f;
+#ifdef _WIN32
+	_asm { // AMD 3DNow only routine
+		mov			eax, v
+		femms
+		movq		mm0, QWORD PTR [eax]
+		movd		mm1, DWORD PTR [eax+8]
+		movd		mm2, [r2]
+		PFMUL		(mm0, mm0)
+		PFMUL		(mm1, mm1)
+		PFACC		(mm0, mm0)
+		PFADD		(mm1, mm0)
+		PFMAX		(mm1, mm2)
+		PFRCP		(mm0, mm1)
+		movd		[r2], mm0
+		femms
+	}
+#elif _LINUX
+		// a and c both hold the 8 bytes at &v[0]; b holds the 4 bytes at &v[2].
+		// NOTE(review): presumably equivalent to the Windows path - not verified here.
+		long long a,c;
+    		int b;
+    		memcpy(&a,&v[0],sizeof(a));
+    		memcpy(&b,&v[2],sizeof(b));
+    		memcpy(&c,&v[0],sizeof(c));
+
+      		__asm __volatile__( "femms" );
+        	__asm __volatile__
+        	(
+			"PFMUL          %y2, %y2 \n\t"
+                        "PFMUL          %y3, %y3 \n\t"
+                        "PFACC          %y2, %y2 \n\t"
+                        "PFADD          %y2, %y3 \n\t"
+                        "PFMAX          %y3, %y4 \n\t"
+                        "PFRCP          %y3, %y2 \n\t"
+                        "movq           %y2, %y0 \n\t"
+        		: "=y" (r2)
+        		: "0" (r2), "y" (a), "y" (b), "y" (c)
+        	);
+        	__asm __volatile__( "femms" );
+#else
+#error
+#endif
+
+	return r2;
+}
--- a/mathlib/3dnow.h
+++ b/mathlib/3dnow.h
@ -0,0 +1,16 @@
+//========= Copyright © 1996-2006, Valve Corporation, All rights reserved. ============//
+//
+// Purpose: Declarations for the 3DNow! math primitives implemented in 3dnow.cpp.
+//
+//=====================================================================================//
+
+#ifndef _3DNOW_H
+#define _3DNOW_H
+
+// NOTE(review): this header includes nothing itself; Vector and FASTCALL must
+// already be declared by whatever includes it.
+
+// Approximate square root of x.
+float _3DNow_Sqrt(float x);
+// 1 / sqrt(x); implemented in 3dnow.cpp as 1.f / _3DNow_Sqrt(x).
+float _3DNow_RSqrt(float x);
+// Normalizes vec in place; returns 0 and leaves vec untouched when it is all zeros.
+float FASTCALL _3DNow_VectorNormalize (Vector& vec);
+// Forwards to _3DNow_VectorNormalize and discards the result.
+void FASTCALL _3DNow_VectorNormalizeFast (Vector& vec);
+// Approximate 1 / max(1, squared length) of the three floats at v.
+float _3DNow_InvRSquared(const float* v);
+
+#endif // _3DNOW_H
--- a/mathlib/IceKey.cpp
+++ b/mathlib/IceKey.cpp
@ -0,0 +1,393 @@
+// Purpose: C++ implementation of the ICE encryption algorithm.
+//			Taken from public domain code, as written by Matthew Kwan - July 1996
+//			http://www.darkside.com.au/ice/
+
+#if !defined(_STATIC_LINKED) || defined(_SHARED_LIB)
+
+#include "mathlib/IceKey.H"
+
+#pragma warning(disable: 4244)
+
+
+	/* Structure of a single round subkey */
+	/* In ice_f(), val[0] and val[1] are XORed into the two expanded halves
+	   and val[2] selects which bits are swapped between them.
+	   IceKey::scheduleBuild() fills all three words. */
+class IceSubkey {
+    public:
+	unsigned long	val[3];
+};
+
+
+	/* The S-boxes */
+	/* ice_sbox[box][i] is filled by ice_sboxes_init() and read with 10-bit
+	   indices in ice_f(). ice_sboxes_initialised is a plain flag tested in
+	   the IceKey constructor so the tables are built on first construction.
+	   NOTE(review): no locking is visible around the flag - confirm that
+	   the first IceKey is never constructed from two threads at once. */
+static unsigned long	ice_sbox[4][1024];
+static int		ice_sboxes_initialised = 0;
+
+
+	/* Modulo values for the S-boxes */
+	/* Indexed [box][row]; passed as the modulus m to gf_exp7() */
+static const int	ice_smod[4][4] = {
+				{333, 313, 505, 369},
+				{379, 375, 319, 391},
+				{361, 445, 451, 397},
+				{397, 425, 395, 505}};
+
+	/* XOR values for the S-boxes */
+	/* Indexed [box][row]; XORed with the column before exponentiation */
+static const int	ice_sxor[4][4] = {
+				{0x83, 0x85, 0x9b, 0xcd},
+				{0xcc, 0xa7, 0xad, 0x41},
+				{0x4b, 0x2e, 0xd4, 0x33},
+				{0xea, 0xcb, 0x2e, 0x04}};
+
+	/* Permutation values for the P-box */
+	/* Entry k is the output bit set by ice_perm32() when input bit k is set */
+static const unsigned long	ice_pbox[32] = {
+		0x00000001, 0x00000080, 0x00000400, 0x00002000,
+		0x00080000, 0x00200000, 0x01000000, 0x40000000,
+		0x00000008, 0x00000020, 0x00000100, 0x00004000,
+		0x00010000, 0x00800000, 0x04000000, 0x20000000,
+		0x00000004, 0x00000010, 0x00000200, 0x00008000,
+		0x00020000, 0x00400000, 0x08000000, 0x10000000,
+		0x00000002, 0x00000040, 0x00000800, 0x00001000,
+		0x00040000, 0x00100000, 0x02000000, 0x80000000};
+
+	/* The key rotation schedule */
+	/* IceKey::set() passes entries 0-7 for the leading 8 rounds of each key
+	   unit and entries 8-15 for the matching trailing 8 rounds */
+static const int	ice_keyrot[16] = {
+				0, 1, 2, 3, 2, 1, 3, 0,
+				1, 3, 2, 0, 3, 1, 0, 2};
+
+
+/*
+ * Multiply a by b in an 8-bit Galois field, reducing by the polynomial m.
+ * This is shift-and-add multiplication with XOR standing in for addition:
+ * whenever the shifted multiplicand reaches bit 8 it is XORed with m.
+ */
+
+static unsigned int
+gf_mult (
+	unsigned int	a,
+	unsigned int	b,
+	unsigned int	m
+) {
+	unsigned int	product = 0;
+
+	for (; b != 0; b >>= 1) {
+	    if (b & 1)
+		product ^= a;
+
+	    a <<= 1;
+	    if (a >= 256)
+		a ^= m;
+	}
+
+	return (product);
+}
+
+
+/*
+ * Galois Field exponentiation.
+ * Returns b raised to the 7th power modulo m, built from four gf_mult()
+ * calls (b^2, b^3, b^6, b^7). A zero base short-circuits to 0.
+ */
+
+static unsigned long
+gf_exp7 (
+	unsigned int	b,
+	unsigned int	m
+) {
+	if (b == 0)
+	    return (0);
+
+	const unsigned int	squared = gf_mult (b, b, m);
+	const unsigned int	cubed = gf_mult (b, squared, m);
+	const unsigned int	sixth = gf_mult (cubed, cubed, m);
+
+	return (gf_mult (b, sixth, m));
+}
+
+
+/*
+ * Carry out the ICE 32-bit P-box permutation.
+ * For every set bit k of x, the mask ice_pbox[k] is ORed into the result.
+ * Scanning stops as soon as no set bits remain in x.
+ */
+
+static unsigned long
+ice_perm32 (
+	unsigned long	x
+) {
+	unsigned long	permuted = 0;
+
+	for (int bit = 0; x != 0; bit++, x >>= 1) {
+	    if (x & 1)
+		permuted |= ice_pbox[bit];
+	}
+
+	return (permuted);
+}
+
+
+/*
+ * Initialise the ICE S-boxes.
+ * This only has to be done once.
+ * For each of the 1024 indices, the column is bits 1-8 and the row is made
+ * from bit 0 and bit 9. Box k stores the P-box permutation of
+ * gf_exp7 (col ^ ice_sxor[k][row], ice_smod[k][row]) shifted into byte 3-k.
+ */
+
+static void
+ice_sboxes_init (void)
+{
+	for (int i = 0; i < 1024; i++) {
+	    const int	col = (i >> 1) & 0xff;
+	    const int	row = (i & 0x1) | ((i & 0x200) >> 8);
+
+	    for (int box = 0; box < 4; box++) {
+		const int		shift = 24 - 8 * box;
+		const unsigned long	x =
+			gf_exp7 (col ^ ice_sxor[box][row], ice_smod[box][row]) << shift;
+
+		ice_sbox[box][i] = ice_perm32 (x);
+	    }
+	}
+}
+
+
+/*
+ * Create a new ICE key.
+ * The shared S-boxes are built on the first construction. For n < 1 the
+ * key uses a size of 1 with 8 rounds; otherwise the size is n with 16 rounds
+ * per unit. The key schedule is allocated here but not filled; see set().
+ */
+
+IceKey::IceKey (int n)
+{
+	if (ice_sboxes_initialised == 0) {
+	    ice_sboxes_init ();
+	    ice_sboxes_initialised = 1;
+	}
+
+	const bool	reduced = (n < 1);
+
+	_size = reduced ? 1 : n;
+	_rounds = reduced ? 8 : n * 16;
+
+	_keysched = new IceSubkey[_rounds];
+}
+
+
+/*
+ * Destroy an ICE key.
+ * The key schedule is overwritten with zeros before it is freed so that key
+ * material does not linger in released heap memory. The stores go through a
+ * volatile-qualified pointer because plain stores to memory that is about to
+ * be deleted are dead stores the optimizer is allowed to remove.
+ */
+
+IceKey::~IceKey ()
+{
+	for (int round = 0; round < _rounds; round++) {
+	    volatile unsigned long	*words = _keysched[round].val;
+
+	    for (int w = 0; w < 3; w++)
+		words[w] = 0;
+	}
+
+	_rounds = _size = 0;
+
+	delete[] _keysched;
+}
+
+
+/*
+ * The single round ICE f function.
+ * p is expanded into two 20-bit halves, the bits selected by sk->val[2] are
+ * swapped between the halves (the salt), each half is XORed with a subkey
+ * word, and the four resulting 10-bit pieces index the S-boxes.
+ */
+
+static unsigned long
+ice_f (
+	unsigned long	p,
+	const IceSubkey		*sk
+) {
+					/* Left half expansion */
+	const unsigned long	tl =
+		((p >> 16) & 0x3ff) | (((p >> 14) | (p << 18)) & 0xffc00);
+
+					/* Right half expansion */
+	const unsigned long	tr = (p & 0x3ff) | ((p << 2) & 0xffc00);
+
+					/* Salt: bits set in val[2] trade places between tl and tr */
+	const unsigned long	swapped = sk->val[2] & (tl ^ tr);
+
+					/* Apply the salt, then XOR with the subkey */
+	const unsigned long	al = tl ^ swapped ^ sk->val[0];
+	const unsigned long	ar = tr ^ swapped ^ sk->val[1];
+
+					/* S-box lookup and permutation */
+	return (ice_sbox[0][al >> 10] | ice_sbox[1][al & 0x3ff]
+		| ice_sbox[2][ar >> 10] | ice_sbox[3][ar & 0x3ff]);
+}
+
+
+/*
+ * Encrypt a block of 8 bytes of data with the given ICE key.
+ * ptext supplies 8 input bytes and ctext receives 8 output bytes. Each
+ * 4-byte half is handled most significant byte first. All input bytes are
+ * read before any output byte is written.
+ */
+
+void
+IceKey::encrypt (
+	const unsigned char	*ptext,
+	unsigned char		*ctext
+) const
+{
+	unsigned long	left = 0;
+	unsigned long	right = 0;
+	int		n;
+
+					/* Load both halves, high byte first */
+	for (n = 0; n < 4; n++) {
+	    left = (left << 8) | ptext[n];
+	    right = (right << 8) | ptext[n + 4];
+	}
+
+					/* Two rounds per pass, walking the schedule forwards */
+	for (n = 0; n < _rounds; n += 2) {
+	    left ^= ice_f (right, &_keysched[n]);
+	    right ^= ice_f (left, &_keysched[n + 1]);
+	}
+
+					/* Store right half first, then left, low byte last */
+	for (n = 3; n >= 0; n--) {
+	    ctext[n] = right & 0xff;
+	    ctext[n + 4] = left & 0xff;
+
+	    right >>= 8;
+	    left >>= 8;
+	}
+}
+
+
+/*
+ * Decrypt a block of 8 bytes of data with the given ICE key.
+ * ctext supplies 8 input bytes and ptext receives 8 output bytes. This
+ * mirrors encrypt() but walks the key schedule from the last round down.
+ */
+
+void
+IceKey::decrypt (
+	const unsigned char	*ctext,
+	unsigned char		*ptext
+) const
+{
+	unsigned long	left = 0;
+	unsigned long	right = 0;
+	int		n;
+
+					/* Load both halves, high byte first */
+	for (n = 0; n < 4; n++) {
+	    left = (left << 8) | ctext[n];
+	    right = (right << 8) | ctext[n + 4];
+	}
+
+					/* Two rounds per pass, walking the schedule backwards */
+	for (n = _rounds - 1; n > 0; n -= 2) {
+	    left ^= ice_f (right, &_keysched[n]);
+	    right ^= ice_f (left, &_keysched[n - 1]);
+	}
+
+					/* Store right half first, then left, low byte last */
+	for (n = 3; n >= 0; n--) {
+	    ptext[n] = right & 0xff;
+	    ptext[n + 4] = left & 0xff;
+
+	    right >>= 8;
+	    left >>= 8;
+	}
+}
+
+
+/*
+ * Set 8 rounds [n, n+7] of the key schedule of an ICE key.
+ * kb holds four 16-bit key words and is modified in place: every bit taken
+ * from a word is replaced by its complement shifted in at the top.
+ * keyrot points at 8 rotation values, one per round, choosing which key
+ * word is consumed first. Each subkey word receives 20 bits in total.
+ */
+
+void
+IceKey::scheduleBuild (
+	unsigned short	*kb,
+	int		n,
+	const int	*keyrot
+) {
+	for (int round = 0; round < 8; round++) {
+	    const int	kr = keyrot[round];
+	    IceSubkey	*isk = &_keysched[n + round];
+
+	    isk->val[0] = 0;
+	    isk->val[1] = 0;
+	    isk->val[2] = 0;
+
+	    for (int pass = 0; pass < 15; pass++) {
+		unsigned long	&sk = isk->val[pass % 3];
+
+		for (int k = 0; k < 4; k++) {
+		    unsigned short	&word = kb[(kr + k) & 3];
+		    const int		bit = word & 1;
+
+		    sk = (sk << 1) | bit;
+		    word = (word >> 1) | ((bit ^ 1) << 15);
+		}
+	    }
+	}
+}
+
+
+/*
+ * Set the key schedule of an ICE key.
+ * key must supply 8 bytes per key unit (keySize() bytes in total). Each
+ * unit is packed into four 16-bit words, high byte first, stored in reverse
+ * word order. An 8-round key builds a single group of 8 rounds; otherwise
+ * every unit builds one group counted from the front of the schedule and
+ * one counted from the back.
+ */
+
+void
+IceKey::set (
+	const unsigned char	*key
+) {
+	unsigned short	kb[4];
+
+	if (_rounds == 8) {
+	    for (int w = 0; w < 4; w++)
+		kb[3 - w] = (key[w*2] << 8) | key[w*2 + 1];
+
+	    scheduleBuild (kb, 0, ice_keyrot);
+	    return;
+	}
+
+	for (int unit = 0; unit < _size; unit++) {
+	    const unsigned char	*chunk = key + unit*8;
+
+	    for (int w = 0; w < 4; w++)
+		kb[3 - w] = (chunk[w*2] << 8) | chunk[w*2 + 1];
+
+	    scheduleBuild (kb, unit*8, ice_keyrot);
+	    scheduleBuild (kb, _rounds - 8 - unit*8, &ice_keyrot[8]);
+	}
+}
+
+
+/*
+ * Return the key size, in bytes.
+ * _size counts 8-byte key units, matching the 8 key bytes that set()
+ * consumes per unit.
+ */
+
+int
+IceKey::keySize () const
+{
+	return (_size * 8);
+}
+
+
+/*
+ * Return the block size, in bytes.
+ * Always 8: encrypt() and decrypt() each process exactly 8 bytes per call.
+ */
+
+int
+IceKey::blockSize () const
+{
+	return (8);
+}
+
+#endif // !_STATIC_LINKED || _SHARED_LIB
--- a/mathlib/anorms.cpp
+++ b/mathlib/anorms.cpp
@ -0,0 +1,181 @@
+//========= Copyright © 1996-2005, Valve Corporation, All rights reserved. ============//
+//
+// Purpose: Defines g_anorms, the table of NUMVERTEXNORMALS precomputed vertex normals.
+//
+//=============================================================================//
+#if !defined(_STATIC_LINKED) || defined(_SHARED_LIB)
+
+
+#include "mathlib/vector.h"
+#include "mathlib/anorms.h"
+
+// memdbgon must be the last include file in a .cpp file!!!
+#include "tier0/memdbgon.h"
+
+Vector g_anorms[NUMVERTEXNORMALS] =
+{
+	Vector(-0.525731, 0.000000, 0.850651), 
+	Vector(-0.442863, 0.238856, 0.864188), 
+	Vector(-0.295242, 0.000000, 0.955423), 
+	Vector(-0.309017, 0.500000, 0.809017), 
+	Vector(-0.162460, 0.262866, 0.951056), 
+	Vector(0.000000, 0.000000, 1.000000), 
+	Vector(0.000000, 0.850651, 0.525731), 
+	Vector(-0.147621, 0.716567, 0.681718), 
+	Vector(0.147621, 0.716567, 0.681718), 
+	Vector(0.000000, 0.525731, 0.850651), 
+	Vector(0.309017, 0.500000, 0.809017), 
+	Vector(0.525731, 0.000000, 0.850651), 
+	Vector(0.295242, 0.000000, 0.955423), 
+	Vector(0.442863, 0.238856, 0.864188), 
+	Vector(0.162460, 0.262866, 0.951056), 
+	Vector(-0.681718, 0.147621, 0.716567), 
+	Vector(-0.809017, 0.309017, 0.500000), 
+	Vector(-0.587785, 0.425325, 0.688191), 
+	Vector(-0.850651, 0.525731, 0.000000), 
+	Vector(-0.864188, 0.442863, 0.238856), 
+	Vector(-0.716567, 0.681718, 0.147621), 
+	Vector(-0.688191, 0.587785, 0.425325), 
+	Vector(-0.500000, 0.809017, 0.309017), 
+	Vector(-0.238856, 0.864188, 0.442863), 
+	Vector(-0.425325, 0.688191, 0.587785), 
+	Vector(-0.716567, 0.681718, -0.147621), 
+	Vector(-0.500000, 0.809017, -0.309017), 
+	Vector(-0.525731, 0.850651, 0.000000), 
+	Vector(0.000000, 0.850651, -0.525731), 
+	Vector(-0.238856, 0.864188, -0.442863), 
+	Vector(0.000000, 0.955423, -0.295242), 
+	Vector(-0.262866, 0.951056, -0.162460), 
+	Vector(0.000000, 1.000000, 0.000000), 
+	Vector(0.000000, 0.955423, 0.295242), 
+	Vector(-0.262866, 0.951056, 0.162460), 
+	Vector(0.238856, 0.864188, 0.442863), 
+	Vector(0.262866, 0.951056, 0.162460), 
+	Vector(0.500000, 0.809017, 0.309017), 
+	Vector(0.238856, 0.864188, -0.442863), 
+	Vector(0.262866, 0.951056, -0.162460), 
+	Vector(0.500000, 0.809017, -0.309017), 
+	Vector(0.850651, 0.525731, 0.000000), 
+	Vector(0.716567, 0.681718, 0.147621), 
+	Vector(0.716567, 0.681718, -0.147621), 
+	Vector(0.525731, 0.850651, 0.000000), 
+	Vector(0.425325, 0.688191, 0.587785), 
+	Vector(0.864188, 0.442863, 0.238856), 
+	Vector(0.688191, 0.587785, 0.425325), 
+	Vector(0.809017, 0.309017, 0.500000), 
+	Vector(0.681718, 0.147621, 0.716567), 
+	Vector(0.587785, 0.425325, 0.688191), 
+	Vector(0.955423, 0.295242, 0.000000), 
+	Vector(1.000000, 0.000000, 0.000000), 
+	Vector(0.951056, 0.162460, 0.262866), 
+	Vector(0.850651, -0.525731, 0.000000), 
+	Vector(0.955423, -0.295242, 0.000000), 
+	Vector(0.864188, -0.442863, 0.238856), 
+	Vector(0.951056, -0.162460, 0.262866), 
+	Vector(0.809017, -0.309017, 0.500000), 
+	Vector(0.681718, -0.147621, 0.716567), 
+	Vector(0.850651, 0.000000, 0.525731), 
+	Vector(0.864188, 0.442863, -0.238856), 
+	Vector(0.809017, 0.309017, -0.500000), 
+	Vector(0.951056, 0.162460, -0.262866), 
+	Vector(0.525731, 0.000000, -0.850651), 
+	Vector(0.681718, 0.147621, -0.716567), 
+	Vector(0.681718, -0.147621, -0.716567), 
+	Vector(0.850651, 0.000000, -0.525731), 
+	Vector(0.809017, -0.309017, -0.500000), 
+	Vector(0.864188, -0.442863, -0.238856), 
+	Vector(0.951056, -0.162460, -0.262866), 
+	Vector(0.147621, 0.716567, -0.681718), 
+	Vector(0.309017, 0.500000, -0.809017), 
+	Vector(0.425325, 0.688191, -0.587785), 
+	Vector(0.442863, 0.238856, -0.864188), 
+	Vector(0.587785, 0.425325, -0.688191), 
+	Vector(0.688191, 0.587785, -0.425325), 
+	Vector(-0.147621, 0.716567, -0.681718), 
+	Vector(-0.309017, 0.500000, -0.809017), 
+	Vector(0.000000, 0.525731, -0.850651), 
+	Vector(-0.525731, 0.000000, -0.850651), 
+	Vector(-0.442863, 0.238856, -0.864188), 
+	Vector(-0.295242, 0.000000, -0.955423), 
+	Vector(-0.162460, 0.262866, -0.951056), 
+	Vector(0.000000, 0.000000, -1.000000), 
+	Vector(0.295242, 0.000000, -0.955423), 
+	Vector(0.162460, 0.262866, -0.951056), 
+	Vector(-0.442863, -0.238856, -0.864188), 
+	Vector(-0.309017, -0.500000, -0.809017), 
+	Vector(-0.162460, -0.262866, -0.951056), 
+	Vector(0.000000, -0.850651, -0.525731), 
+	Vector(-0.147621, -0.716567, -0.681718), 
+	Vector(0.147621, -0.716567, -0.681718), 
+	Vector(0.000000, -0.525731, -0.850651), 
+	Vector(0.309017, -0.500000, -0.809017), 
+	Vector(0.442863, -0.238856, -0.864188), 
+	Vector(0.162460, -0.262866, -0.951056), 
+	Vector(0.238856, -0.864188, -0.442863), 
+	Vector(0.500000, -0.809017, -0.309017), 
+	Vector(0.425325, -0.688191, -0.587785), 
+	Vector(0.716567, -0.681718, -0.147621), 
+	Vector(0.688191, -0.587785, -0.425325), 
+	Vector(0.587785, -0.425325, -0.688191), 
+	Vector(0.000000, -0.955423, -0.295242), 
+	Vector(0.000000, -1.000000, 0.000000), 
+	Vector(0.262866, -0.951056, -0.162460), 
+	Vector(0.000000, -0.850651, 0.525731), 
+	Vector(0.000000, -0.955423, 0.295242), 
+	Vector(0.238856, -0.864188, 0.442863), 
+	Vector(0.262866, -0.951056, 0.162460), 
+	Vector(0.500000, -0.809017, 0.309017), 
+	Vector(0.716567, -0.681718, 0.147621), 
+	Vector(0.525731, -0.850651, 0.000000), 
+	Vector(-0.238856, -0.864188, -0.442863), 
+	Vector(-0.500000, -0.809017, -0.309017), 
+	Vector(-0.262866, -0.951056, -0.162460), 
+	Vector(-0.850651, -0.525731, 0.000000), 
+	Vector(-0.716567, -0.681718, -0.147621), 
+	Vector(-0.716567, -0.681718, 0.147621), 
+	Vector(-0.525731, -0.850651, 0.000000), 
+	Vector(-0.500000, -0.809017, 0.309017), 
+	Vector(-0.238856, -0.864188, 0.442863), 
+	Vector(-0.262866, -0.951056, 0.162460), 
+	Vector(-0.864188, -0.442863, 0.238856), 
+	Vector(-0.809017, -0.309017, 0.500000), 
+	Vector(-0.688191, -0.587785, 0.425325), 
+	Vector(-0.681718, -0.147621, 0.716567), 
+	Vector(-0.442863, -0.238856, 0.864188), 
+	Vector(-0.587785, -0.425325, 0.688191), 
+	Vector(-0.309017, -0.500000, 0.809017), 
+	Vector(-0.147621, -0.716567, 0.681718), 
+	Vector(-0.425325, -0.688191, 0.587785), 
+	Vector(-0.162460, -0.262866, 0.951056), 
+	Vector(0.442863, -0.238856, 0.864188), 
+	Vector(0.162460, -0.262866, 0.951056), 
+	Vector(0.309017, -0.500000, 0.809017), 
+	Vector(0.147621, -0.716567, 0.681718), 
+	Vector(0.000000, -0.525731, 0.850651), 
+	Vector(0.425325, -0.688191, 0.587785), 
+	Vector(0.587785, -0.425325, 0.688191), 
+	Vector(0.688191, -0.587785, 0.425325), 
+	Vector(-0.955423, 0.295242, 0.000000), 
+	Vector(-0.951056, 0.162460, 0.262866), 
+	Vector(-1.000000, 0.000000, 0.000000), 
+	Vector(-0.850651, 0.000000, 0.525731), 
+	Vector(-0.955423, -0.295242, 0.000000), 
+	Vector(-0.951056, -0.162460, 0.262866), 
+	Vector(-0.864188, 0.442863, -0.238856), 
+	Vector(-0.951056, 0.162460, -0.262866), 
+	Vector(-0.809017, 0.309017, -0.500000), 
+	Vector(-0.864188, -0.442863, -0.238856), 
+	Vector(-0.951056, -0.162460, -0.262866), 
+	Vector(-0.809017, -0.309017, -0.500000), 
+	Vector(-0.681718, 0.147621, -0.716567), 
+	Vector(-0.681718, -0.147621, -0.716567), 
+	Vector(-0.850651, 0.000000, -0.525731), 
+	Vector(-0.688191, 0.587785, -0.425325), 
+	Vector(-0.587785, 0.425325, -0.688191), 
+	Vector(-0.425325, 0.688191, -0.587785), 
+	Vector(-0.425325, -0.688191, -0.587785), 
+	Vector(-0.587785, -0.425325, -0.688191), 
+	Vector(-0.688191, -0.587785, -0.425325)
+};
+
+#endif // !_STATIC_LINKED || _SHARED_LIB
--- a/mathlib/bumpvects.cpp
+++ b/mathlib/bumpvects.cpp
@ -0,0 +1,69 @@
+//========= Copyright © 1996-2005, Valve Corporation, All rights reserved. ============//
+//
+// Purpose: GetBumpNormals() - rotates the local bump basis into world space for a face.
+//
+// $Workfile:     $
+// $Date:         $
+//
+//-----------------------------------------------------------------------------
+// $Log: $
+//
+// $NoKeywords: $
+//=============================================================================//
+
+#if !defined(_STATIC_LINKED) || defined(_SHARED_LIB)
+
+
+#ifdef QUIVER
+#include "r_local.h"
+#endif
+#include "mathlib/bumpvects.h"
+#include "mathlib/vector.h"
+#include <assert.h>
+
+// memdbgon must be the last include file in a .cpp file!!!
+#include "tier0/memdbgon.h"
+
+// z is coming out of the face.
+
+// Fills bumpNormals with the three g_localBumpBasis vectors rotated into
+// world space for one face.
+//   sVect, tVect - the face's s and t vectors (presumably its texture axes);
+//                  their cross product decides handedness, and sVect also
+//                  orients the basis
+//   flatNormal   - compared against sVect x tVect to detect a left-handed basis
+//   phongNormal  - the normal the basis is built around
+//   bumpNormals  - output array of NUM_BUMP_VECTS vectors
+void GetBumpNormals( const Vector& sVect, const Vector& tVect, const Vector& flatNormal, 
+					 const Vector& phongNormal, Vector bumpNormals[NUM_BUMP_VECTS] )
+{
+	assert( NUM_BUMP_VECTS == 3 );
+
+	// Left handed when sVect x tVect points away from the flat normal.
+	Vector sCrossT;
+	CrossProduct( sVect, tVect, sCrossT );
+	const bool bLeftHanded = ( DotProduct( flatNormal, sCrossT ) < 0.0f );
+
+	// Rows 1 and 0 are derived from the phong normal and sVect; row 2 is the
+	// phong normal itself.
+	matrix3x4_t basis;
+	CrossProduct( phongNormal.Base(), sVect.Base(), basis[1] );
+	VectorNormalize( basis[1] );
+	CrossProduct( basis[1], phongNormal.Base(), basis[0] );
+	VectorNormalize( basis[0] );
+	VectorCopy( phongNormal.Base(), basis[2] );
+
+	if( bLeftHanded )
+	{
+		VectorNegate( basis[1] );
+	}
+
+	// Rotate each local bump basis vector into world space.
+	for( int iBump = 0; iBump < 3; ++iBump )
+	{
+		VectorIRotate( g_localBumpBasis[iBump], basis, bumpNormals[iBump] );
+	}
+}
+
+#endif // !_STATIC_LINKED || _SHARED_LIB
--- a/mathlib/color_conversion.cpp
+++ b/mathlib/color_conversion.cpp
@ -0,0 +1,637 @@
+//========= Copyright (c) 1996-2005, Valve Corporation, All rights reserved. ============//
+//
+// Purpose: Color conversion routines.
+//
+//=====================================================================================//
+
+#include <math.h>
+#include <float.h>	// Needed for FLT_EPSILON
+#include "basetypes.h"
+#include <memory.h>
+#include "tier0/dbg.h"
+#include "mathlib/mathlib.h"
+#include "mathlib/vector.h"
+
+// memdbgon must be the last include file in a .cpp file!!!
+#include "tier0/memdbgon.h"
+
+//-----------------------------------------------------------------------------
+// Gamma conversion support
+//-----------------------------------------------------------------------------
+// All of the tables below are (re)filled by BuildGammaTable().
+
+// Indexed by a 0..255 value; holds 255 * (i/255)^(texGamma / gamma), clamped to 0..255.
+static byte		texgammatable[256];	// palette is sent through this to convert to screen gamma
+
+static float	texturetolinear[256];	// texture (0..255) to linear (0..1)
+static int		lineartotexture[1024];	// linear (0..1) to texture (0..255)
+static int		lineartoscreen[1024];	// linear (0..1) to gamma corrected vertex light (0..255)
+
+// build a lightmap texture to combine with surface texture, adjust for src*dst+dst*src, ramp reprogramming, etc
+// Both tables are indexed by linear * 1024 and are not static, so they are visible to other translation units.
+float			lineartovertex[4096];	// linear (0..4) to screen corrected vertex space (0..1?)
+unsigned char	lineartolightmap[4096];	// linear (0..4) to screen corrected texture value (0..255)
+
+// Indexed by value * 255; built with the fixed 2.2 exponent of the *FullRange helpers.
+static float	g_Mathlib_GammaToLinear[256];	// gamma (0..1) to linear (0..1)
+static float	g_Mathlib_LinearToGamma[256];	// linear (0..1) to gamma (0..1)
+
+// This is aligned to 16-byte boundaries so that we can load it
+// onto SIMD registers easily if needed (used by SSE version of lightmaps)
+// TODO: move this into the one DLL that actually uses it, instead of statically
+// linking it everywhere via mathlib.
+ALIGN128 float	power2_n[256] = 			// 2**(index - 128) / 255
+{ 
+	1.152445441982634800E-041, 2.304890883965269600E-041, 4.609781767930539200E-041, 9.219563535861078400E-041, 
+	1.843912707172215700E-040, 3.687825414344431300E-040, 7.375650828688862700E-040, 1.475130165737772500E-039,
+	2.950260331475545100E-039, 5.900520662951090200E-039, 1.180104132590218000E-038, 2.360208265180436100E-038, 
+	4.720416530360872100E-038, 9.440833060721744200E-038, 1.888166612144348800E-037, 3.776333224288697700E-037, 
+	7.552666448577395400E-037, 1.510533289715479100E-036, 3.021066579430958200E-036, 6.042133158861916300E-036, 
+	1.208426631772383300E-035, 2.416853263544766500E-035, 4.833706527089533100E-035, 9.667413054179066100E-035, 
+	1.933482610835813200E-034, 3.866965221671626400E-034, 7.733930443343252900E-034, 1.546786088668650600E-033, 
+	3.093572177337301200E-033, 6.187144354674602300E-033, 1.237428870934920500E-032, 2.474857741869840900E-032, 
+	4.949715483739681800E-032, 9.899430967479363700E-032, 1.979886193495872700E-031, 3.959772386991745500E-031, 
+	7.919544773983491000E-031, 1.583908954796698200E-030, 3.167817909593396400E-030, 6.335635819186792800E-030, 
+	1.267127163837358600E-029, 2.534254327674717100E-029, 5.068508655349434200E-029, 1.013701731069886800E-028, 
+	2.027403462139773700E-028, 4.054806924279547400E-028, 8.109613848559094700E-028, 1.621922769711818900E-027, 
+	3.243845539423637900E-027, 6.487691078847275800E-027, 1.297538215769455200E-026, 2.595076431538910300E-026, 
+	5.190152863077820600E-026, 1.038030572615564100E-025, 2.076061145231128300E-025, 4.152122290462256500E-025, 
+	8.304244580924513000E-025, 1.660848916184902600E-024, 3.321697832369805200E-024, 6.643395664739610400E-024, 
+	1.328679132947922100E-023, 2.657358265895844200E-023, 5.314716531791688300E-023, 1.062943306358337700E-022, 
+	2.125886612716675300E-022, 4.251773225433350700E-022, 8.503546450866701300E-022, 1.700709290173340300E-021, 
+	3.401418580346680500E-021, 6.802837160693361100E-021, 1.360567432138672200E-020, 2.721134864277344400E-020, 
+	5.442269728554688800E-020, 1.088453945710937800E-019, 2.176907891421875500E-019, 4.353815782843751100E-019, 
+	8.707631565687502200E-019, 1.741526313137500400E-018, 3.483052626275000900E-018, 6.966105252550001700E-018, 
+	1.393221050510000300E-017, 2.786442101020000700E-017, 5.572884202040001400E-017, 1.114576840408000300E-016, 
+	2.229153680816000600E-016, 4.458307361632001100E-016, 8.916614723264002200E-016, 1.783322944652800400E-015, 
+	3.566645889305600900E-015, 7.133291778611201800E-015, 1.426658355722240400E-014, 2.853316711444480700E-014, 
+	5.706633422888961400E-014, 1.141326684577792300E-013, 2.282653369155584600E-013, 4.565306738311169100E-013, 
+	9.130613476622338300E-013, 1.826122695324467700E-012, 3.652245390648935300E-012, 7.304490781297870600E-012, 
+	1.460898156259574100E-011, 2.921796312519148200E-011, 5.843592625038296500E-011, 1.168718525007659300E-010, 
+	2.337437050015318600E-010, 4.674874100030637200E-010, 9.349748200061274400E-010, 1.869949640012254900E-009, 
+	3.739899280024509800E-009, 7.479798560049019500E-009, 1.495959712009803900E-008, 2.991919424019607800E-008, 
+	5.983838848039215600E-008, 1.196767769607843100E-007, 2.393535539215686200E-007, 4.787071078431372500E-007, 
+	9.574142156862745000E-007, 1.914828431372549000E-006, 3.829656862745098000E-006, 7.659313725490196000E-006, 
+	1.531862745098039200E-005, 3.063725490196078400E-005, 6.127450980392156800E-005, 1.225490196078431400E-004, 
+	2.450980392156862700E-004, 4.901960784313725400E-004, 9.803921568627450800E-004, 1.960784313725490200E-003, 
+	3.921568627450980300E-003, 7.843137254901960700E-003, 1.568627450980392100E-002, 3.137254901960784300E-002, 
+	6.274509803921568500E-002, 1.254901960784313700E-001, 2.509803921568627400E-001, 5.019607843137254800E-001, 
+	1.003921568627451000E+000, 2.007843137254901900E+000, 4.015686274509803900E+000, 8.031372549019607700E+000, 
+	1.606274509803921500E+001, 3.212549019607843100E+001, 6.425098039215686200E+001, 1.285019607843137200E+002, 
+	2.570039215686274500E+002, 5.140078431372548900E+002, 1.028015686274509800E+003, 2.056031372549019600E+003, 
+	4.112062745098039200E+003, 8.224125490196078300E+003, 1.644825098039215700E+004, 3.289650196078431300E+004, 
+	6.579300392156862700E+004, 1.315860078431372500E+005, 2.631720156862745100E+005, 5.263440313725490100E+005, 
+	1.052688062745098000E+006, 2.105376125490196000E+006, 4.210752250980392100E+006, 8.421504501960784200E+006, 
+	1.684300900392156800E+007, 3.368601800784313700E+007, 6.737203601568627400E+007, 1.347440720313725500E+008, 
+	2.694881440627450900E+008, 5.389762881254901900E+008, 1.077952576250980400E+009, 2.155905152501960800E+009, 
+	4.311810305003921500E+009, 8.623620610007843000E+009, 1.724724122001568600E+010, 3.449448244003137200E+010, 
+	6.898896488006274400E+010, 1.379779297601254900E+011, 2.759558595202509800E+011, 5.519117190405019500E+011, 
+	1.103823438081003900E+012, 2.207646876162007800E+012, 4.415293752324015600E+012, 8.830587504648031200E+012, 
+	1.766117500929606200E+013, 3.532235001859212500E+013, 7.064470003718425000E+013, 1.412894000743685000E+014, 
+	2.825788001487370000E+014, 5.651576002974740000E+014, 1.130315200594948000E+015, 2.260630401189896000E+015, 
+	4.521260802379792000E+015, 9.042521604759584000E+015, 1.808504320951916800E+016, 3.617008641903833600E+016, 
+	7.234017283807667200E+016, 1.446803456761533400E+017, 2.893606913523066900E+017, 5.787213827046133800E+017, 
+	1.157442765409226800E+018, 2.314885530818453500E+018, 4.629771061636907000E+018, 9.259542123273814000E+018, 
+	1.851908424654762800E+019, 3.703816849309525600E+019, 7.407633698619051200E+019, 1.481526739723810200E+020, 
+	2.963053479447620500E+020, 5.926106958895241000E+020, 1.185221391779048200E+021, 2.370442783558096400E+021, 
+	4.740885567116192800E+021, 9.481771134232385600E+021, 1.896354226846477100E+022, 3.792708453692954200E+022, 
+	7.585416907385908400E+022, 1.517083381477181700E+023, 3.034166762954363400E+023, 6.068333525908726800E+023, 
+	1.213666705181745400E+024, 2.427333410363490700E+024, 4.854666820726981400E+024, 9.709333641453962800E+024, 
+	1.941866728290792600E+025, 3.883733456581585100E+025, 7.767466913163170200E+025, 1.553493382632634000E+026, 
+	3.106986765265268100E+026, 6.213973530530536200E+026, 1.242794706106107200E+027, 2.485589412212214500E+027, 
+	4.971178824424429000E+027, 9.942357648848857900E+027, 1.988471529769771600E+028, 3.976943059539543200E+028, 
+	7.953886119079086300E+028, 1.590777223815817300E+029, 3.181554447631634500E+029, 6.363108895263269100E+029, 
+	1.272621779052653800E+030, 2.545243558105307600E+030, 5.090487116210615300E+030, 1.018097423242123100E+031, 
+	2.036194846484246100E+031, 4.072389692968492200E+031, 8.144779385936984400E+031, 1.628955877187396900E+032, 
+	3.257911754374793800E+032, 6.515823508749587500E+032, 1.303164701749917500E+033, 2.606329403499835000E+033, 
+	5.212658806999670000E+033, 1.042531761399934000E+034, 2.085063522799868000E+034, 4.170127045599736000E+034, 
+	8.340254091199472000E+034, 1.668050818239894400E+035, 3.336101636479788800E+035, 6.672203272959577600E+035 
+};
+
+// You can use this to double check the exponent table and assert that 
+// the precomputation is correct: every power2_n[i] must equal
+// 2^(i - 128) / 255 as computed in single precision. When that reference
+// value evaluates to zero, the table entry only has to be below 1.16E-041.
+// Only compiled when asserts are enabled.
+#ifdef DBGFLAG_ASSERT
+static void CheckExponentTable()
+{
+	for( int i = 0; i < 256; i++ )
+	{
+		// The previous version also computed diff / testAgainst into locals that
+		// were never read; that divided by zero in exactly the testAgainst == 0
+		// case handled below, so those locals are gone.
+		float testAgainst = pow( 2.0f, i - 128 ) / 255.0f;
+		Assert( testAgainst == 0 ? 
+				power2_n[i] < 1.16E-041 :
+				power2_n[i] == testAgainst );
+	}
+}
+#endif
+
+// Rebuilds every gamma/linear lookup table in this file.
+//   gamma      - display gamma. It is capped at 3.0 for texgammatable and
+//                lineartoscreen, but used uncapped for lineartovertex and
+//                lineartolightmap (see the last loop).
+//   texGamma   - gamma of texture data; exponent for texturetolinear and
+//                (inverted) lineartotexture.
+//   brightness - picks the "shift up" knee g3 for lineartoscreen; values above
+//                1.0 additionally scale the linear input.
+//   overbright - 2 halves and 4 quarters the lineartovertex / lineartolightmap
+//                output; any other value leaves it unscaled.
+// NOTE(review): gamma == 0 or texGamma == 0 leads to a division by zero below;
+// nothing here guards against it -- confirm callers never pass zero.
+void BuildGammaTable( float gamma, float texGamma, float brightness, int overbright )
+{
+	int		i, inf;
+	float	g1, g3;
+
+	// Con_Printf("BuildGammaTable %.1f %.1f %.1f\n", g, v_lightgamma.GetFloat(), v_texgamma.GetFloat() );
+
+	// g is the capped display gamma, turned into the exponent 1/gamma below.
+	float g = gamma;
+	if (g > 3.0) 
+	{
+		g = 3.0;
+	}
+
+	g = 1.0 / g;
+	// g1 = texGamma / gamma: takes texture-gamma data straight to screen gamma.
+	g1 = texGamma * g; 
+
+	// g3 is the linear input level that gets mapped to 0.125 by the "shift up"
+	// step further down. With g3 == 0.125 that step is the identity.
+	if (brightness <= 0.0) 
+	{
+		g3 = 0.125;
+	}
+	else if (brightness > 1.0) 
+	{
+		g3 = 0.05;
+	}
+	else 
+	{
+		g3 = 0.125 - (brightness * brightness) * 0.075;
+	}
+
+	// texgammatable: 0..255 in, 0..255 out, through exponent g1.
+	for (i=0 ; i<256 ; i++)
+	{
+		inf = 255 * pow ( i/255.f, g1 ); 
+		if (inf < 0)
+			inf = 0;
+		if (inf > 255)
+			inf = 255;
+		texgammatable[i] = inf;
+	}
+
+	// lineartoscreen: linear 0..1 (in 1/1023 steps) to screen gamma 0..255,
+	// including the brightness scale and the piecewise-linear shift around g3.
+	for (i=0 ; i<1024 ; i++)
+	{
+		float f;
+
+		f = i / 1023.0;
+
+		// scale up
+		if (brightness > 1.0)
+			f = f * brightness;
+
+		// shift up
+		if (f <= g3)
+			f = (f / g3) * 0.125;
+		else 
+			f = 0.125 + ((f - g3) / (1.0 - g3)) * 0.875;
+
+		// convert linear space to desired gamma space
+		inf = 255 * pow ( f, g ); 
+
+		if (inf < 0)
+			inf = 0;
+		if (inf > 255)
+			inf = 255;
+		lineartoscreen[i] = inf;
+	}
+
+	/*
+	for (i=0 ; i<1024 ; i++)
+	{
+		// convert from screen gamma space to linear space
+		lineargammatable[i] = 1023 * pow ( i/1023.0, v_gamma.GetFloat() );
+		// convert from linear gamma space to screen space
+		screengammatable[i] = 1023 * pow ( i/1023.0, 1.0 / v_gamma.GetFloat() );
+	}
+	*/
+
+	for (i=0 ; i<256 ; i++)
+	{
+		// convert from nonlinear texture space (0..255) to linear space (0..1)
+		texturetolinear[i] =  pow( i / 255.f, texGamma );
+
+		// convert from linear space (0..1) to nonlinear (sRGB) space (0..1)
+		g_Mathlib_LinearToGamma[i] =  LinearToGammaFullRange( i / 255.f );
+
+		// convert from sRGB gamma space (0..1) to linear space (0..1)
+		g_Mathlib_GammaToLinear[i] =  GammaToLinearFullRange( i / 255.f );
+	}
+
+	for (i=0 ; i<1024 ; i++)
+	{
+		// convert from linear space (0..1) to nonlinear texture space (0..255)
+		// (truncated to int on store; no explicit clamp here)
+		lineartotexture[i] =  pow( i / 1023.0, 1.0 / texGamma ) * 255;
+	}
+
+#if 0
+	for (i=0 ; i<256 ; i++)
+	{
+		float f;
+
+		// convert from nonlinear lightmap space (0..255) to linear space (0..4)
+		// f =  (i / 255.0) * sqrt( 4 );
+		f =  i * (2.0 / 255.0);
+		f = f * f;
+
+		texlighttolinear[i] = f;
+	}
+#endif
+
+	{
+		float f;
+		float overbrightFactor = 1.0f;
+
+		// Can't do overbright without texcombine
+		// UNDONE: Add GAMMA ramp to rectify this
+		if ( overbright == 2 )
+		{
+			overbrightFactor = 0.5;
+		}
+		else if ( overbright == 4 )
+		{
+			overbrightFactor = 0.25;
+		}
+
+		for (i=0 ; i<4096 ; i++)
+		{
+			// convert from linear 0..4 (x1024) to screen corrected vertex space (0..1?)
+			// NOTE(review): this uses the raw 'gamma' argument, not the 3.0-capped
+			// value the tables above use -- confirm that is intentional.
+			f = pow ( i/1024.0, 1.0 / gamma );
+
+			// Vertex table is only clamped from above, to 1.
+			lineartovertex[i] = f * overbrightFactor;
+			if (lineartovertex[i] > 1)
+				lineartovertex[i] = 1;
+
+			// Lightmap table is the same value scaled to 0..255, rounded and clamped.
+			int nLightmap = RoundFloatToInt( f * 255 * overbrightFactor );
+			nLightmap = clamp( nLightmap, 0, 255 );
+			lineartolightmap[i] = (unsigned char)nLightmap;
+		}
+	}
+}
+
+// Table-free gamma -> linear conversion using a plain 2.2 power curve.
+// The input is not clamped.
+float GammaToLinearFullRange( float gamma )
+{
+	const float flExponent = 2.2f;
+	return pow( gamma, flExponent );
+}
+
+// Table-free linear -> gamma conversion using a plain 1/2.2 power curve.
+// The input is not clamped.
+float LinearToGammaFullRange( float linear )
+{
+	const float flInvExponent = 1.0f / 2.2f;
+	return pow( linear, flInvExponent );
+}
+
+// Table-driven gamma -> linear conversion.
+// Returns 0 for negative input and 1 for any input of 0.95 or more; everything
+// in between is looked up in g_Mathlib_GammaToLinear at the nearest 1/255 step.
+float GammaToLinear( float gamma )
+{
+	Assert( s_bMathlibInitialized );
+
+	if ( gamma < 0.0f )
+		return 0.0f;
+
+	// Use GammaToLinearFullRange maybe if you need values above this cutoff.
+	// X360TEMP: the "Assert( gamma <= 1.0f )" that used to live here is disabled.
+	if ( gamma >= 0.95f )
+		return 1.0f;
+
+	const int nTableSlot = RoundFloatToInt( gamma * 255.0f );
+	Assert( nTableSlot >= 0 && nTableSlot < 256 );
+	return g_Mathlib_GammaToLinear[nTableSlot];
+}
+
+// Table-driven linear -> gamma conversion.
+// Returns 0 for negative input; input above 1 asserts and returns 1. Everything
+// else is looked up in g_Mathlib_LinearToGamma at the nearest 1/255 step.
+float LinearToGamma( float linear )
+{
+	Assert( s_bMathlibInitialized );
+
+	if ( linear < 0.0f )
+		return 0.0f;
+
+	if ( linear > 1.0f )
+	{
+		// Use LinearToGammaFullRange maybe if you trip this.
+		Assert( 0 );
+		return 1.0f;
+	}
+
+	const int nTableSlot = RoundFloatToInt( linear * 255.0f );
+	Assert( nTableSlot >= 0 && nTableSlot < 256 );
+	return g_Mathlib_LinearToGamma[nTableSlot];
+}
+
+//-----------------------------------------------------------------------------
+// Helper functions to convert between sRGB and 360 gamma space
+//-----------------------------------------------------------------------------
+// sRGB gamma value -> linear. The input is clamped to 0..1 first; values at or
+// below 0.04045 use the linear segment (x / 12.92), the rest use the 2.4 power curve.
+float SrgbGammaToLinear( float flSrgbGammaValue )
+{
+	const float flClamped = clamp( flSrgbGammaValue, 0.0f, 1.0f );
+
+	if ( flClamped <= 0.04045f )
+	{
+		return flClamped / 12.92f;
+	}
+
+	return pow( ( flClamped + 0.055f ) / 1.055f, 2.4f );
+}
+
+// Linear value -> sRGB gamma. The input is clamped to 0..1 first; values at or
+// below 0.0031308 use the linear segment (x * 12.92), the rest use
+// 1.055 * x^(1/2.4) - 0.055.
+float SrgbLinearToGamma( float flLinearValue )
+{
+	const float flClamped = clamp( flLinearValue, 0.0f, 1.0f );
+
+	if ( flClamped <= 0.0031308f )
+	{
+		return flClamped * 12.92f;
+	}
+
+	// The "- 0.055f" belongs to this branch only, exactly as the original
+	// conditional expression parsed.
+	return ( 1.055f * pow( flClamped, ( 1.0f / 2.4f ) ) ) - 0.055f;
+}
+
+// 360 gamma value (0..1) -> linear (0..1).
+// The curve is piecewise linear with breakpoints at 64/255, 96/255 and 192/255
+// of the gamma input; each segment is evaluated on a 0..1023 scale and then
+// divided back down to 0..1.
+float X360GammaToLinear( float fl360GammaValue )
+{
+	const float flGamma = clamp( fl360GammaValue, 0.0f, 1.0f );
+	float flLinearValue;	// on a 0..1023 scale until the final rescale
+
+	if ( flGamma < ( 64.0f / 255.0f ) )
+	{
+		flLinearValue = flGamma * 255.0f;
+	}
+	else if ( flGamma < ( 96.0f / 255.0f ) )
+	{
+		flLinearValue = flGamma * ( 255.0f * 2.0f ) - 64.0f;
+		flLinearValue += floor( flLinearValue * ( 1.0f / 512.0f ) );
+	}
+	else if ( flGamma < ( 192.0f / 255.0f ) )
+	{
+		flLinearValue = flGamma * ( 255.0f * 4.0f ) - 256.0f;
+		flLinearValue += floor( flLinearValue * ( 1.0f / 256.0f ) );
+	}
+	else
+	{
+		flLinearValue = flGamma * ( 255.0f * 8.0f ) - 1024.0f;
+		flLinearValue += floor( flLinearValue * ( 1.0f / 128.0f ) );
+	}
+
+	flLinearValue *= 1.0f / 1023.0f;
+	flLinearValue = clamp( flLinearValue, 0.0f, 1.0f );
+	return flLinearValue;
+}
+
+// Linear value (0..1) -> 360 gamma value (0..1).
+// Piecewise linear with breakpoints at 64/1023, 128/1023 and 512/1023 of the
+// linear input; the slope halves in each successive segment.
+float X360LinearToGamma( float flLinearValue )
+{
+	const float flLinear = clamp( flLinearValue, 0.0f, 1.0f );
+	float fl360GammaValue;
+
+	if ( flLinear < ( 64.0f / 1023.0f ) )
+	{
+		fl360GammaValue = flLinear * ( 1023.0f * ( 1.0f / 255.0f ) );
+	}
+	else if ( flLinear < ( 128.0f / 1023.0f ) )
+	{
+		fl360GammaValue = flLinear * ( ( 1023.0f / 2.0f ) * ( 1.0f / 255.0f ) ) + ( 32.0f / 255.0f );
+	}
+	else if ( flLinear < ( 512.0f / 1023.0f ) )
+	{
+		fl360GammaValue = flLinear * ( ( 1023.0f / 4.0f ) * ( 1.0f / 255.0f ) ) + ( 64.0f / 255.0f );
+	}
+	else
+	{
+		// The top segment overshoots slightly: 1.0 -> 1.0034313725490196078431372549016
+		fl360GammaValue = flLinear * ( ( 1023.0f /8.0f ) * ( 1.0f / 255.0f ) ) + ( 128.0f /255.0f );
+		if ( fl360GammaValue > 1.0f )
+		{
+			fl360GammaValue = 1.0f;
+		}
+	}
+
+	fl360GammaValue = clamp( fl360GammaValue, 0.0f, 1.0f );
+	return fl360GammaValue;
+}
+
+// sRGB gamma -> 360 gamma, by going through linear space:
+// SrgbGammaToLinear() followed by X360LinearToGamma().
+float SrgbGammaTo360Gamma( float flSrgbGammaValue )
+{
+	return X360LinearToGamma( SrgbGammaToLinear( flSrgbGammaValue ) );
+}
+
+// Convert a texture value to a linear 0..1 value via the texturetolinear table.
+// Values below 0 return 0 and values above 255 return 1.
+float TextureToLinear( int c )
+{
+	Assert( s_bMathlibInitialized );
+
+	if ( c < 0 )
+	{
+		return 0.0f;
+	}
+	if ( c > 255 )
+	{
+		return 1.0f;
+	}
+	return texturetolinear[c];
+}
+
+// Convert a linear 0..1 value to a texture-space value (0..255) via the
+// lineartotexture table. Input outside 0..1 is clamped to the ends of the
+// table; NaN is treated as 0.
+int LinearToTexture( float f )
+{
+	Assert( s_bMathlibInitialized );
+
+	// Clamp while still in floating point. Converting an out-of-range float
+	// (huge, infinite or NaN) to int is undefined, and on x86 a huge positive
+	// input used to come out negative and land on entry 0 instead of 1023.
+	const float flScaled = f * 1023;	// assume 0..1 range
+	int i;
+	if ( !( flScaled > 0.0f ) )		// also true for NaN
+	{
+		i = 0;
+	}
+	else if ( flScaled >= 1023.0f )
+	{
+		i = 1023;
+	}
+	else
+	{
+		i = (int)flScaled;
+	}
+
+	return lineartotexture[i];
+}
+
+
+// converts 0..1 linear value to screen gamma (0..255) via the lineartoscreen
+// table. Input outside 0..1 is clamped to the ends of the table; NaN is
+// treated as 0.
+int LinearToScreenGamma( float f )
+{
+	Assert( s_bMathlibInitialized );
+
+	// Clamp while still in floating point. Converting an out-of-range float
+	// (huge, infinite or NaN) to int is undefined, and on x86 a huge positive
+	// input used to come out negative and land on entry 0 instead of 1023.
+	const float flScaled = f * 1023;	// assume 0..1 range
+	int i;
+	if ( !( flScaled > 0.0f ) )		// also true for NaN
+	{
+		i = 0;
+	}
+	else if ( flScaled >= 1023.0f )
+	{
+		i = 1023;
+	}
+	else
+	{
+		i = (int)flScaled;
+	}
+
+	return lineartoscreen[i];
+}
+
+// Expands a compressed-exponent color into a float vector: each channel is
+// TexLightToLinear( channel, exponent ) multiplied by 255.
+void ColorRGBExp32ToVector( const ColorRGBExp32& in, Vector& out )
+{
+	Assert( s_bMathlibInitialized );
+
+	// FIXME: Why is there a factor of 255 built into this?
+	const float flScale = 255.0f;
+
+	out.x = flScale * TexLightToLinear( in.r, in.exponent );
+	out.y = flScale * TexLightToLinear( in.g, in.exponent );
+	out.z = flScale * TexLightToLinear( in.b, in.exponent );
+}
+
+#if 0
+// Reference (loop-based) exponent search; compiled out by the enclosing #if 0.
+// Returns the power p such that in * 2^-p has been brought into 128..255 by
+// repeated halving / doubling, or 0 when in is exactly zero.
+// assumes that the desired mantissa range is 128..255
+// NOTE(review): a negative or infinite input would never leave these loops --
+// callers presumably only pass finite, non-negative values.
+static int VectorToColorRGBExp32_CalcExponent( float in )
+{
+	int power = 0;
+	
+	if( in != 0.0f )
+	{
+		// Too large: halve until it fits, counting up.
+		while( in > 255.0f )
+		{
+			power += 1;
+			in *= 0.5f;
+		}
+		
+		// Too small: double until it reaches 128, counting down.
+		while( in < 128.0f )
+		{
+			power -= 1;
+			in *= 2.0f;
+		}
+	}
+
+	return power;
+}
+
+// Reference implementation of the float-vector -> compressed-exponent color
+// conversion; compiled out by the enclosing #if 0.
+// The largest channel chooses a shared exponent, then all three channels are
+// scaled by 2^-exponent, clamped to 255 and truncated to bytes.
+void VectorToColorRGBExp32( const Vector& vin, ColorRGBExp32 &c )
+{
+	Vector v = vin;
+	Assert( s_bMathlibInitialized );
+	Assert( v.x >= 0.0f && v.y >= 0.0f && v.z >= 0.0f );
+	int i;		
+	float max = v[0];				
+	for( i = 1; i < 3; i++ )
+	{
+		// Get the maximum value.
+		if( v[i] > max )
+		{
+			max = v[i];
+		}
+	}
+				
+	// figure out the exponent for this luxel.
+	int exponent = VectorToColorRGBExp32_CalcExponent( max );
+				
+	// make the exponent fits into a signed byte.
+	if( exponent < -128 )
+	{
+		exponent = -128;
+	}
+	else if( exponent > 127 )
+	{
+		exponent = 127;
+	}
+				
+	// undone: optimize with a table
+	float scalar = pow( 2.0f, -exponent );
+	// convert to mantissa x 2^exponent format
+	for( i = 0; i < 3; i++ )
+	{
+		v[i] *= scalar;
+		// clamp
+		if( v[i] > 255.0f )
+		{
+			v[i] = 255.0f;
+		}
+	}
+	// Truncating casts: the fractional part of each mantissa is dropped.
+	c.r = ( unsigned char )v[0];
+	c.g = ( unsigned char )v[1];
+	c.b = ( unsigned char )v[2];
+	c.exponent = ( signed char )exponent;
+}
+
+#else
+
+// given a floating point number f, return an exponent e such that
+// f' = f * 2^-e is on [128..256), i.e. f == f' * 2^e.
+// Uses IEEE 754 representation to directly extract this information
+// from the float.
+// Returns 0 for an input of exactly zero. For any other input the result is
+// simply (biased exponent field - 134), so it ranges over -134..121; values
+// whose exponent field is 0 (denormals) are not normalized first.
+inline static int VectorToColorRGBExp32_CalcExponent( const float *pin )
+{
+	// The thing we will take advantage of here is that the exponent component
+	// is stored in the float itself, and because we want to map to 128..255, we
+	// want an "ideal" exponent of 2^7. So, we compute the difference between the
+	// input exponent and 7 to work out the normalizing exponent. Thus if you pass in 
+	// 32 (represented in IEEE 754 as 1.0 * 2^5), this function will return -2
+	// (because 32 * 2^2 = 128, so 32 == 128 * 2^-2)
+	if (*pin == 0.0f)
+		return 0;
+
+	// Copy the bits out with memcpy rather than reading the float through an
+	// unsigned int pointer, which violates the aliasing rules.
+	unsigned int fbits;
+	memcpy( &fbits, pin, sizeof( fbits ) );
+	
+	// the exponent component is bits 23..30, and biased by +127
+	const int biasedSeven = 7 + 127;
+
+	const int expComponent = (int)( ( fbits & 0x7F800000 ) >> 23 );
+	// difference from seven: negative if the input was below 128, positive if 256 or more
+	return expComponent - biasedSeven;
+}
+
+
+
+/// Slightly faster version of the function to turn a float-vector color into 
+/// a compressed-exponent notation 32bit color. However, still not SIMD optimized.
+/// The largest channel picks a shared exponent; every channel is then scaled
+/// by 2^-exponent and truncated to a byte.
+/// PS3 developer: note there is a movement of a float onto an int here, which is
+/// bad on the base registers -- consider doing this as Altivec code, or better yet
+/// moving it onto the cell.
+/// \warning: Assumes an IEEE 754 single-precision float representation! Those of you
+/// porting to an 8080 are out of luck.
+/// \param vin  color to compress; all channels are asserted to be non-negative
+/// \param c    receives the r/g/b mantissas and the shared exponent
+void VectorToColorRGBExp32( const Vector& vin, ColorRGBExp32 &c )
+{
+	Assert( s_bMathlibInitialized );
+	Assert( vin.x >= 0.0f && vin.y >= 0.0f && vin.z >= 0.0f );
+
+	// work out which of the channels is the largest ( we will use that to map the exponent )
+	// this is a sluggish branch-based decision tree -- most architectures will offer a [max]
+	// assembly opcode to do this faster.
+	const float *pMax;
+	if (vin.x > vin.y)
+	{
+		pMax = (vin.x > vin.z) ? &vin.x : &vin.z;
+	}
+	else
+	{
+		pMax = (vin.y > vin.z) ? &vin.y : &vin.z;
+	}
+
+	// now work out the exponent for this luxel. 
+	signed int exponent = VectorToColorRGBExp32_CalcExponent( pMax );
+
+	// make sure the exponent fits into a signed byte AND that the scale factor
+	// built below stays a finite float. CalcExponent can return as little as
+	// -134 for extremely small inputs (below roughly 2^-120); (127 - exponent)
+	// would then overflow the 8-bit exponent field and produce a garbage
+	// scalar. Pinning at -127 keeps 2^127 as the largest scale; such tiny
+	// colors simply end up with mantissas below 128.
+	if ( exponent < -127 )
+	{
+		exponent = -127;
+	}
+	Assert(exponent >= -127 && exponent <= 127);
+
+	// promote the exponent back onto a scalar that we'll use to normalize all the numbers
+	// (scalar == 2^-exponent; bits are copied with memcpy to stay within the aliasing rules)
+	float scalar;
+	{
+		unsigned int fbits = (unsigned int)(127 - exponent) << 23;
+		memcpy( &scalar, &fbits, sizeof( scalar ) );
+	}
+
+	// we should never need to clamp: the largest mantissa is on [128..256), so
+	// everything truncates to at most 255.
+	Assert(vin.x * scalar < 256.0f && 
+		   vin.y * scalar < 256.0f && 
+		   vin.z * scalar < 256.0f);
+
+	// This awful construction is necessary to prevent VC2005 from using the 
+	// fldcw/fnstcw control words around every float-to-unsigned-char operation
+	// (which is what a direct "( unsigned char )(vin.x * scalar)" would cost).
+	{
+		int red = (vin.x * scalar);
+		int green = (vin.y * scalar);
+		int blue = (vin.z * scalar);
+
+		c.r = red;
+		c.g = green;
+		c.b = blue;
+	}
+
+	c.exponent = ( signed char )exponent;
+}
+
+#endif
--- a/mathlib/datagen.pl
+++ b/mathlib/datagen.pl
@ -0,0 +1,63 @@
+#! perl
+use strict;
+use warnings;
+use Text::Wrap;
+
+# generate output data for noise generators
+
+# Constant seed, so every run draws the same sequence from rand().
+srand(31456);
+
+print <<END
+//========= Copyright <20> 1996-2006, Valve Corporation, All rights reserved. ============//
+//
+// Purpose: static data for noise() primitives.
+//
+// \$Workfile:     \$
+// \$NoKeywords: \$
+//=============================================================================//
+//
+//    **** DO NOT EDIT THIS FILE. GENERATED BY DATAGEN.PL ****
+//
+
+END
+;
+
+my @perm_a=0..255;
+
+# Text::Wrap reads the lower-case $columns; the previous spelling
+# ($Text::Wrap::Columns) set an unrelated variable and the requested width
+# was silently ignored.
+$Text::Wrap::columns=78;
+$Text::Wrap::break=",";
+$Text::Wrap::separator=",\n";
+
+# Four integer permutation tables. Each one is a fresh shuffle of the same
+# array, so every table is a permutation of 0..255.
+foreach my $name (qw(perm_a perm_b perm_c perm_d))
+  {
+	fisher_yates_shuffle(\@perm_a);
+	print "static int ",$name,"[]={\n",wrap('    ','    ',join(",",@perm_a)),"\n};\n\n";
+  }
+
+# Rescale the values to 0..1 and keep them as "%f" strings for printing.
+foreach my $value (@perm_a)
+  {
+	$value = sprintf("%f",(1.0/255.0)*$value);
+  }
+
+# Three float tables, again one reshuffle per table.
+foreach my $name (qw(impulse_xcoords impulse_ycoords impulse_zcoords))
+  {
+	fisher_yates_shuffle(\@perm_a);
+	print "static float ",$name,"[]={\n",wrap('    ','    ',join(",",@perm_a)),"\n};\n\n";
+  }
+
+
+
+# fisher_yates_shuffle( \@array ) : generate a random permutation
+# of @array in place. Arrays with fewer than two elements are left
+# untouched (the old countdown never terminated for an empty array).
+sub fisher_yates_shuffle {
+    my $array = shift;
+    # Walk from the last index down to 1, swapping each slot with a randomly
+    # chosen slot at or below it.
+    for (my $i = $#$array; $i > 0; --$i) {
+        my $j = int rand ($i+1);
+        next if $i == $j;
+        @$array[$i,$j] = @$array[$j,$i];
+    }
+}
--- a/mathlib/halton.cpp
+++ b/mathlib/halton.cpp
@ -0,0 +1,30 @@
+//========= Copyright (c) 1996-2005, Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+//=====================================================================================//
+
+#include <halton.h>
+
+// Sets up a generator for the given base b. The base is stored both as an int
+// and as a float, and the internal seed starts at 1.
+HaltonSequenceGenerator_t::HaltonSequenceGenerator_t(int b)
+{
+	seed = 1;
+	base = b;
+	fbase = static_cast<float>( b );
+}
+
+// Returns element number 'elem' of the sequence: the base-'base' digits of
+// elem, mirrored around the radix point (least significant digit first,
+// weighted by 1/base, 1/base^2, ...). Element 0 is 0.
+//
+// The previous version ignored 'elem' and always expanded the member 'seed',
+// so the value returned did not depend on the argument.
+// NOTE(review): any caller that passes an expression derived from 'seed' will
+// now get the element for the value it passed rather than for the current
+// seed -- check for an off-by-one shift in the generated sequence.
+float HaltonSequenceGenerator_t::GetElement(int elem)
+{
+	int remaining=elem;
+	float ret=0.0;
+	float base_inv=1.0/fbase;
+	while(remaining)
+	{
+		// peel off the lowest digit and give it the next fractional weight
+		int dig=remaining % base;
+		ret+=((float) dig)*base_inv;
+		base_inv/=fbase;
+		remaining/=base;
+	}
+	return ret;
+}
--- a/mathlib/imagequant.cpp
+++ b/mathlib/imagequant.cpp
@ -0,0 +1,96 @@
+//========= Copyright (c) 1996-2005, Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+// $NoKeywords: $
+//
+//=============================================================================//
+#include <quantize.h>
+#include <minmax.h>
+
+#define N_EXTRAVALUES 1
+#define N_DIMENSIONS (3+N_EXTRAVALUES)
+
+#define PIXEL(x,y,c) Image[4*((x)+((Width*(y))))+c]
+
+static uint8 Weights[]={5,7,4,8};
+static int ExtraValueXForms[3*N_EXTRAVALUES]={
+	76,151,28,
+};
+
+  
+
+#define MAX_QUANTIZE_IMAGE_WIDTH 4096
+
+// Quantizes a 4-bytes-per-pixel image to a palette and writes one palette
+// index per pixel.
+//   Image       - source pixels, read through the PIXEL macro
+//   Width/Height- image size in pixels
+//   flags       - QUANTFLAGS_NODITHER turns error diffusion off
+//   ncolors     - passed through to Quantize()
+//   out_pixels  - receives Width*Height palette indices
+//   out_palette - receives 256 RGB triples (768 bytes); entries with no
+//                 quantizer node keep the 0x55 fill
+//   firstcolor  - passed through to Quantize()
+void ColorQuantize(uint8 const *Image,
+				   int Width,
+				   int Height,
+				   int flags, int ncolors,
+				   uint8 *out_pixels,
+				   uint8 *out_palette,
+				   int firstcolor)
+{
+	// Error diffusion accumulators, indexed [column][channel][row parity].
+	// One extra column absorbs the x+1 writes of the last pixel in a row.
+	// Sized from Width on the heap: the previous fixed
+	// [MAX_QUANTIZE_IMAGE_WIDTH+1] stack array was written past its end for
+	// any image wider than that limit.
+	const int nErrorColumns=max(Width,0)+1;
+	int (*Error)[3][2]=new int[nErrorColumns][3][2];
+
+	struct Sample *s=AllocSamples(Width*Height,N_DIMENSIONS);
+	int x,y,c;
+	for(y=0;y<Height;y++)
+		for(x=0;x<Width;x++)
+		{
+			for(c=0;c<3;c++)
+				NthSample(s,y*Width+x,N_DIMENSIONS)->Value[c]=PIXEL(x,y,c);
+			// now, let's generate extra values to quantize on
+			for(int i=0;i<N_EXTRAVALUES;i++)
+			{
+				// weighted sum of the three channels, scaled back down by 256
+				int val1=0;
+				for(c=0;c<3;c++)
+					val1+=PIXEL(x,y,c)*ExtraValueXForms[i*3+c];
+				val1>>=8;
+				// Extra value i lives in dimension 3+i (this used to rely on
+				// the channel loop leaving c at 3, which only worked for one
+				// extra value).
+				NthSample(s,y*Width+x,N_DIMENSIONS)->Value[3+i]=(uint8)
+					(min(255,max(0,val1)));
+			}
+		}
+	struct QuantizedValue *q=Quantize(s,Width*Height,N_DIMENSIONS,
+									  ncolors,Weights,firstcolor);
+	// NOTE(review): s came from AllocSamples(); releasing it with delete[]
+	// assumes that is how AllocSamples allocates -- confirm.
+	delete[] s;
+	memset(out_palette,0x55,768);
+	for(int p=0;p<256;p++)
+	{
+		struct QuantizedValue *v=FindQNode(q,p);
+		if (v)
+			for(int c=0;c<3;c++)
+				out_palette[p*3+c]=v->Mean[c];
+	}
+	memset(Error,0,nErrorColumns*sizeof(Error[0]));
+	for(y=0;y<Height;y++)
+	{
+		// Row parity picks which accumulator is consumed on this row and
+		// which one collects error for the next row.
+		int ErrorUse=y & 1;
+		int ErrorUpdate=ErrorUse^1;
+		for(x=0;x<Width;x++)
+		{
+			uint8 samp[3];
+			for(c=0;c<3;c++)
+			{
+				int tryc=PIXEL(x,y,c);
+				if (! (flags & QUANTFLAGS_NODITHER))
+				{
+					tryc+=Error[x][c][ErrorUse];
+					Error[x][c][ErrorUse]=0;
+				}
+				samp[c]=(uint8) min(255,max(0,tryc));
+			}
+			struct QuantizedValue *f=FindMatch(samp,3,Weights,q);
+			out_pixels[Width*y+x]=(uint8) (f->value);
+			if (! (flags & QUANTFLAGS_NODITHER))
+				for(int i=0;i<3;i++)
+				{
+					// 3/8 of the error goes right, 3/8 goes down, the rest
+					// goes down-right.
+					// NOTE(review): the "down" term is assigned, not added,
+					// so it replaces the down-right term left by the previous
+					// pixel -- confirm this is intended.
+					int newerr=samp[i]-f->Mean[i];
+					int orthog_error=(newerr*3)/8;
+					Error[x+1][i][ErrorUse]+=orthog_error;
+					Error[x][i][ErrorUpdate]=orthog_error;
+					Error[x+1][i][ErrorUpdate]=newerr-2*orthog_error;
+				}
+		}
+	}
+	if (q) FreeQuantization(q);
+	delete[] Error;
+}
+
--- a/mathlib/lightdesc.cpp
+++ b/mathlib/lightdesc.cpp
@ -0,0 +1,312 @@
+//========= Copyright (c) 1996-2005, Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+//=====================================================================================//
+
+#include <ssemath.h>
+#include <lightdesc.h>
+#include "mathlib.h"
+
+// Refreshes every cached value that is derived from the light's primary
+// parameters: the attenuation flags, the spot cone cosines and their
+// reciprocal spread, the stand-in position of a directional light, and the
+// squared range.
+void LightDesc_t::RecalculateDerivedValues(void)
+{
+	// One flag per non-zero attenuation coefficient.
+	m_Flags=0;
+	if (m_Attenuation0)
+	{
+		m_Flags|=LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION0;
+	}
+	if (m_Attenuation1)
+	{
+		m_Flags|=LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION1;
+	}
+	if (m_Attenuation2)
+	{
+		m_Flags|=LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION2;
+	}
+
+	if (m_Type==MATERIAL_LIGHT_SPOT)
+	{
+		m_ThetaDot=cos(m_Theta);
+		m_PhiDot=cos(m_Phi);
+
+		// note - this quantity is very sensitive to round off error. the sse
+		// reciprocal approximation won't cut it here.
+		// A vanishing (or negative) spread gets a hard falloff instead of a
+		// divide by zero.
+		float coneSpread=m_ThetaDot-m_PhiDot;
+		if (coneSpread>1.0e-10)
+			OneOver_ThetaDot_Minus_PhiDot=1.0/coneSpread;
+		else
+			OneOver_ThetaDot_Minus_PhiDot=1.0;
+	}
+	else if (m_Type==MATERIAL_LIGHT_DIRECTIONAL)
+	{
+		// set position to be real far away in the right direction
+		m_Position=m_Direction;
+		m_Position *= 2.0e6;
+	}
+
+	m_RangeSquared=m_Range*m_Range;
+}
+
+// Adds this light's contribution to 'color' for four points at once, using
+// only m_Direction and the surface normals; 'pos' is not read and no distance
+// attenuation is applied.
+// With DoHalfLambert the dot product is remapped with *0.5 + 0.5; otherwise
+// it is clamped at zero.
+void LightDesc_t::ComputeLightAtPointsForDirectional(
+	const FourVectors &pos, const FourVectors &normal,
+	FourVectors &color, bool DoHalfLambert ) const
+{
+	FourVectors lightDir;
+	lightDir.DuplicateVector(m_Direction);
+	// m_Direction is used as stored; it is not renormalized here.
+
+	const fltx4 nDotL=lightDir*normal;
+
+	fltx4 strength;
+	if (DoHalfLambert)
+	{
+		strength=AddSIMD(MulSIMD(nDotL,Four_PointFives),Four_PointFives);
+	}
+	else
+	{
+		strength=MaxSIMD(Four_Zeros,nDotL);
+	}
+
+	color.x=AddSIMD(color.x,MulSIMD(strength,ReplicateX4(m_Color.x)));
+	color.y=AddSIMD(color.y,MulSIMD(strength,ReplicateX4(m_Color.y)));
+	color.z=AddSIMD(color.z,MulSIMD(strength,ReplicateX4(m_Color.z)));
+}
+
+
+// Adds this light's contribution to 'color' for four points at once.
+//   pos           - the four sample positions
+//   normal        - the four surface normals
+//   color         - accumulated into (not overwritten)
+//   DoHalfLambert - remap the N.L term with *0.5 + 0.5 instead of clamping at 0
+// Directional lights are forwarded to ComputeLightAtPointsForDirectional().
+// Point and spot lights get distance attenuation, the optional range cutoff,
+// and (for spots) the cone falloff. Any other light type contributes nothing.
+void LightDesc_t::ComputeLightAtPoints( const FourVectors &pos, const FourVectors &normal,
+										FourVectors &color, bool DoHalfLambert ) const
+{
+	FourVectors delta;
+	Assert((m_Type==MATERIAL_LIGHT_POINT) || (m_Type==MATERIAL_LIGHT_SPOT) || (m_Type==MATERIAL_LIGHT_DIRECTIONAL));
+	switch (m_Type)
+	{
+		case MATERIAL_LIGHT_POINT:
+		case MATERIAL_LIGHT_SPOT:
+			// vector from each sample point to the light
+			delta.DuplicateVector(m_Position);
+			delta-=pos;
+			break;
+				
+		case MATERIAL_LIGHT_DIRECTIONAL:
+			ComputeLightAtPointsForDirectional( pos, normal, color, DoHalfLambert );
+			return;
+
+		default:
+			// Without this, a light of any other type fell through with
+			// 'delta' uninitialized once the Assert above is compiled out.
+			return;
+	}
+
+	fltx4 dist2 = delta*delta;
+
+	// never treat a point as closer than one unit
+	dist2=MaxSIMD( Four_Ones, dist2 );
+
+	// falloff = 1 / ( a0 + a1*dist + a2*dist^2 ), built from whichever terms are enabled
+	fltx4 falloff;
+
+	if( m_Flags & LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION0 )
+	{
+		falloff = ReplicateX4(m_Attenuation0);
+	}
+	else
+		falloff= Four_Epsilons;
+
+	if( m_Flags & LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION1 )
+	{
+		falloff=AddSIMD(falloff,MulSIMD(ReplicateX4(m_Attenuation1),SqrtEstSIMD(dist2)));
+	}
+
+	if( m_Flags & LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION2 )
+	{
+		falloff=AddSIMD(falloff,MulSIMD(ReplicateX4(m_Attenuation2),dist2));
+	}
+
+	falloff=ReciprocalEstSIMD(falloff);
+	// Cull out light beyond this radius
+	// now, zero out elements for which dist2 was > range^2. !!speed!! lights should store dist^2 in sse format
+	if (m_Range != 0.f)
+	{
+		fltx4 RangeSquared=ReplicateX4(m_RangeSquared); // !!speed!!
+		falloff=AndSIMD(falloff,CmpLtSIMD(dist2,RangeSquared));
+	}
+
+	delta.VectorNormalizeFast();
+	// N.L is computed once and reused by both shading modes
+	fltx4 strength=delta*normal;
+	if (DoHalfLambert)
+	{
+		strength=AddSIMD(MulSIMD(strength,Four_PointFives),Four_PointFives);
+	}
+	else
+		strength=MaxSIMD(Four_Zeros,strength);
+		
+	switch(m_Type)
+	{
+		case MATERIAL_LIGHT_POINT:
+			// no cone term for point lights
+			break;
+				
+		case MATERIAL_LIGHT_SPOT:
+		{
+			fltx4 dot2=SubSIMD(Four_Zeros,delta*m_Direction); // dot position with spot light dir for cone falloff
+
+
+			// 0 at the outer (phi) cone, 1 at the inner (theta) cone, capped at 1
+			fltx4 cone_falloff_scale=MulSIMD(ReplicateX4(OneOver_ThetaDot_Minus_PhiDot),
+												 SubSIMD(dot2,ReplicateX4(m_PhiDot)));
+			cone_falloff_scale=MinSIMD(cone_falloff_scale,Four_Ones);
+			
+			if ((m_Falloff!=0.0) && (m_Falloff!=1.0))
+			{
+				// !!speed!! could compute integer exponent needed by powsimd and store in light
+				cone_falloff_scale=PowSIMD(cone_falloff_scale,m_Falloff);
+			}
+			strength=MulSIMD(cone_falloff_scale,strength);
+
+			// now, zero out lighting where dot2<phidot. This will mask out any invalid results
+			// from pow function, etc
+			fltx4 OutsideMask=CmpGtSIMD(dot2,ReplicateX4(m_PhiDot)); // outside light cone?
+			strength=AndSIMD(OutsideMask,strength);
+		}
+		break;
+			
+
+	}
+	strength=MulSIMD(strength,falloff);
+	color.x=AddSIMD(color.x,MulSIMD(strength,ReplicateX4(m_Color.x)));
+	color.y=AddSIMD(color.y,MulSIMD(strength,ReplicateX4(m_Color.y)));
+	color.z=AddSIMD(color.z,MulSIMD(strength,ReplicateX4(m_Color.z)));
+}
+
+
+
+//-----------------------------------------------------------------------------
+// Purpose: Accumulates this light's contribution at four points into 'color'
+//          with no surface-normal (incidence) term: each lane receives
+//          m_Color * distance falloff, additionally scaled by the cone falloff
+//          for spot lights.
+// Input  : pos   - the four positions being lit
+//          color - running total; this light's contribution is added to x/y/z
+// Note   : Directional lights add nothing here.
+//-----------------------------------------------------------------------------
+void LightDesc_t::ComputeNonincidenceLightAtPoints( const FourVectors &pos, FourVectors &color ) const
+{
+	Assert((m_Type==MATERIAL_LIGHT_POINT) || (m_Type==MATERIAL_LIGHT_SPOT) || (m_Type==MATERIAL_LIGHT_DIRECTIONAL));
+
+	FourVectors delta;
+	switch (m_Type)
+	{
+		case MATERIAL_LIGHT_POINT:
+		case MATERIAL_LIGHT_SPOT:
+			// vector from each point to the light
+			delta.DuplicateVector(m_Position);
+			delta-=pos;
+			break;
+
+		case MATERIAL_LIGHT_DIRECTIONAL:
+			return;
+
+		default:
+			// Unsupported light type. If the Assert above does not stop us, bail out
+			// rather than run the math below on an uninitialized delta.
+			return;
+	}
+
+	// squared distance to the light, clamped to be at least 1
+	fltx4 dist2 = delta*delta;
+	dist2=MaxSIMD( Four_Ones, dist2 );
+
+	// build the attenuation denominator a0 + a1*d + a2*d^2 from the terms flagged as present
+	fltx4 falloff;
+	if( m_Flags & LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION0 )
+	{
+		falloff = ReplicateX4(m_Attenuation0);
+	}
+	else
+	{
+		// NOTE(review): presumably a small non-zero seed so the reciprocal below stays finite -- confirm
+		falloff = Four_Epsilons;
+	}
+
+	if( m_Flags & LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION1 )
+	{
+		falloff=AddSIMD(falloff,MulSIMD(ReplicateX4(m_Attenuation1),SqrtEstSIMD(dist2)));
+	}
+
+	if( m_Flags & LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION2 )
+	{
+		falloff=AddSIMD(falloff,MulSIMD(ReplicateX4(m_Attenuation2),dist2));
+	}
+
+	falloff=ReciprocalEstSIMD(falloff);
+
+	// Cull out light beyond this radius:
+	// zero out elements for which dist2 was >= range^2. !!speed!! lights should store dist^2 in sse format
+	if (m_Range != 0.f)
+	{
+		fltx4 RangeSquared=ReplicateX4(m_RangeSquared); // !!speed!!
+		falloff=AndSIMD(falloff,CmpLtSIMD(dist2,RangeSquared));
+	}
+
+	// no incidence term: start from full strength
+	fltx4 strength = Four_Ones;
+
+	if ( m_Type == MATERIAL_LIGHT_SPOT )
+	{
+		// Only the cone test needs the unit direction to the light, so point
+		// lights skip the normalize entirely.
+		delta.VectorNormalizeFast();
+
+		fltx4 dot2=SubSIMD(Four_Zeros,delta*m_Direction); // dot position with spot light dir for cone falloff
+
+		fltx4 cone_falloff_scale=MulSIMD(ReplicateX4(OneOver_ThetaDot_Minus_PhiDot),
+											 SubSIMD(dot2,ReplicateX4(m_PhiDot)));
+		cone_falloff_scale=MinSIMD(cone_falloff_scale,Four_Ones);
+
+		if ((m_Falloff!=0.0) && (m_Falloff!=1.0))
+		{
+			// !!speed!! could compute integer exponent needed by powsimd and store in light
+			cone_falloff_scale=PowSIMD(cone_falloff_scale,m_Falloff);
+		}
+		strength=MulSIMD(cone_falloff_scale,strength);
+
+		// now, zero out lighting where dot2<=phidot. This will mask out any invalid results
+		// from pow function, etc
+		fltx4 InsideMask=CmpGtSIMD(dot2,ReplicateX4(m_PhiDot)); // set for lanes inside the light cone
+		strength=AndSIMD(InsideMask,strength);
+	}
+
+	strength=MulSIMD(strength,falloff);
+	color.x=AddSIMD(color.x,MulSIMD(strength,ReplicateX4(m_Color.x)));
+	color.y=AddSIMD(color.y,MulSIMD(strength,ReplicateX4(m_Color.y)));
+	color.z=AddSIMD(color.z,MulSIMD(strength,ReplicateX4(m_Color.z)));
+}
+
+
+
+//-----------------------------------------------------------------------------
+// Purpose: Stores hand-entered quadratic/linear/constant attenuation
+//          coefficients and rescales m_Color to match.
+// Input  : fQuadraticAttn, fLinearAttn, fConstantAttn - coefficients; any value
+//          below EQUAL_EPSILON is treated as "term not present" and zeroed.
+//-----------------------------------------------------------------------------
+void LightDesc_t::SetupOldStyleAttenuation( float fQuadraticAttn, float fLinearAttn, float fConstantAttn )
+{
+	// flush negligible (or negative) coefficients to zero
+	if ( fConstantAttn < EQUAL_EPSILON )
+	{
+		fConstantAttn = 0;
+	}
+	if ( fLinearAttn < EQUAL_EPSILON )
+	{
+		fLinearAttn = 0;
+	}
+	if ( fQuadraticAttn < EQUAL_EPSILON )
+	{
+		fQuadraticAttn = 0;
+	}
+
+	// with every term dropped, fall back to a constant term of 1
+	const bool bAllTermsDropped = ( fConstantAttn < EQUAL_EPSILON ) &&
+								  ( fLinearAttn < EQUAL_EPSILON ) &&
+								  ( fQuadraticAttn < EQUAL_EPSILON );
+	if ( bAllTermsDropped )
+	{
+		fConstantAttn = 1;
+	}
+
+	m_Attenuation0=fConstantAttn;
+	m_Attenuation1=fLinearAttn;
+	m_Attenuation2=fQuadraticAttn;
+
+	// the attenuation polynomial evaluated at distance 100 becomes the color scale
+	const float fScaleFactor = fQuadraticAttn * 10000 + fLinearAttn * 100 + fConstantAttn;
+	if ( fScaleFactor > 0 )
+	{
+		m_Color *= fScaleFactor;
+	}
+}
+
+//-----------------------------------------------------------------------------
+// Purpose: Derives the three attenuation coefficients from a "50% distance"
+//          and a "0% distance".
+// Input  : fFiftyPercentDistance - distance stored as the 50% point
+//          fZeroPercentDistance  - distance stored as the 0% point; if it is
+//                                  smaller than the 50% distance, twice the 50%
+//                                  distance is used instead (with a warning)
+//-----------------------------------------------------------------------------
+void LightDesc_t::SetupNewStyleAttenuation( float fFiftyPercentDistance, 
+											float fZeroPercentDistance )
+{
+	float fHalfDist=fFiftyPercentDistance;
+	float fZeroDist=fZeroPercentDistance;
+
+	if (fHalfDist>fZeroDist)
+	{
+		// !!warning in lib code???!!!
+		Warning("light has _fifty_percent_distance of %f but no zero_percent_distance\n",fHalfDist);
+		fZeroDist=2.0*fHalfDist;
+	}
+
+	// NOTE(review): presumably fits quad*d^2 + lin*d + constant through
+	// (0,1), (fHalfDist,2) and (fZeroDist,256) -- confirm against SolveInverseQuadraticMonotonic.
+	// The initial values are what gets used if the solver leaves them untouched.
+	float fQuad=0,fLin=1,fConst=0;
+	bool bSolved=SolveInverseQuadraticMonotonic(0,1.0,fHalfDist,2.0,fZeroDist,256.0,fQuad,fLin,fConst);
+	if (!bSolved)
+	{
+		Warning("can't solve quadratic for light %f %f\n",fHalfDist,fZeroDist);
+	}
+
+	// rescale so that the polynomial evaluates to exactly 2 at the 50% distance
+	float fValueAtHalf=fConst+fHalfDist*(fLin+fHalfDist*fQuad);
+	float fRescale=2.0/fValueAtHalf;
+	fQuad*=fRescale;
+	fLin*=fRescale;
+	fConst*=fRescale;
+
+	m_Attenuation0=fConst;
+	m_Attenuation1=fLin;
+	m_Attenuation2=fQuad;
+}
+
--- a/mathlib/mathlib-2005.vcproj
+++ b/mathlib/mathlib-2005.vcproj
@ -0,0 +1,403 @@
+<?xml version="1.0" encoding="Windows-1252"?>
+<VisualStudioProject
+	ProjectType="Visual C++"
+	Version="8.00"
+	Name="mathlib"
+	ProjectGUID="{884C66F2-7F84-4570-AE6C-B634C1113D69}"
+	>
+	<Platforms>
+		<Platform
+			Name="Win32"
+		/>
+	</Platforms>
+	<ToolFiles>
+	</ToolFiles>
+	<Configurations>
+		<Configuration
+			Name="Debug|Win32"
+			OutputDirectory=".\Debug"
+			IntermediateDirectory=".\Debug"
+			ConfigurationType="4"
+			CharacterSet="2"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+				CommandLine=""
+				ExcludedFromBuild="false"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				UseUnicodeResponseFiles="false"
+				Optimization="0"
+				AdditionalIncludeDirectories="..\common;..\public;..\public\tier0;..\public\tier1;..\public\mathlib"
+				PreprocessorDefinitions="WIN32;_WIN32;_DEBUG;DEBUG;_LIB;_CRT_SECURE_NO_DEPRECATE;_CRT_NONSTDC_NO_DEPRECATE"
+				StringPooling="true"
+				MinimalRebuild="true"
+				ExceptionHandling="0"
+				BasicRuntimeChecks="3"
+				RuntimeLibrary="1"
+				BufferSecurityCheck="false"
+				FloatingPointModel="2"
+				TreatWChar_tAsBuiltInType="true"
+				ForceConformanceInForLoopScope="true"
+				RuntimeTypeInfo="true"
+				OpenMP="false"
+				UsePrecompiledHeader="0"
+				ExpandAttributedSource="false"
+				AssemblerOutput="0"
+				AssemblerListingLocation="$(IntDir)/"
+				ObjectFile="$(IntDir)/"
+				ProgramDataBaseFileName="$(IntDir)/"
+				GenerateXMLDocumentationFiles="false"
+				BrowseInformation="0"
+				BrowseInformationFile="$(IntDir)/"
+				WarningLevel="4"
+				Detect64BitPortabilityProblems="true"
+				DebugInformationFormat="4"
+				CompileAs="2"
+				ErrorReporting="1"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+				CommandLine=""
+				ExcludedFromBuild="false"
+			/>
+			<Tool
+				Name="VCLibrarianTool"
+				UseUnicodeResponseFiles="false"
+				OutputFile="..\lib\public\mathlib.lib"
+				SuppressStartupBanner="true"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+				SuppressStartupBanner="true"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+				SuppressStartupBanner="true"
+				OutputFile="$(OutDir)/mathlib.bsc"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+				ExcludedFromBuild="false"
+			/>
+		</Configuration>
+		<Configuration
+			Name="Release|Win32"
+			OutputDirectory=".\Release"
+			IntermediateDirectory=".\Release"
+			ConfigurationType="4"
+			CharacterSet="2"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+				CommandLine=""
+				ExcludedFromBuild="false"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				UseUnicodeResponseFiles="false"
+				Optimization="2"
+				InlineFunctionExpansion="2"
+				EnableIntrinsicFunctions="true"
+				FavorSizeOrSpeed="1"
+				AdditionalIncludeDirectories="..\common;..\public;..\public\tier0;..\public\tier1;..\public\mathlib"
+				PreprocessorDefinitions="WIN32;_WIN32;NDEBUG;_LIB;_CRT_SECURE_NO_DEPRECATE;_CRT_NONSTDC_NO_DEPRECATE"
+				StringPooling="true"
+				ExceptionHandling="0"
+				RuntimeLibrary="0"
+				BufferSecurityCheck="false"
+				EnableFunctionLevelLinking="true"
+				FloatingPointModel="2"
+				TreatWChar_tAsBuiltInType="true"
+				ForceConformanceInForLoopScope="true"
+				RuntimeTypeInfo="true"
+				OpenMP="false"
+				UsePrecompiledHeader="0"
+				ExpandAttributedSource="false"
+				AssemblerOutput="0"
+				AssemblerListingLocation="$(IntDir)/"
+				ObjectFile="$(IntDir)/"
+				ProgramDataBaseFileName="$(IntDir)/"
+				GenerateXMLDocumentationFiles="false"
+				BrowseInformation="0"
+				BrowseInformationFile="$(IntDir)/"
+				WarningLevel="4"
+				Detect64BitPortabilityProblems="true"
+				DebugInformationFormat="1"
+				CompileAs="2"
+				ErrorReporting="1"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+				CommandLine=""
+				ExcludedFromBuild="false"
+			/>
+			<Tool
+				Name="VCLibrarianTool"
+				UseUnicodeResponseFiles="false"
+				OutputFile="..\lib\public\mathlib.lib"
+				SuppressStartupBanner="true"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+				SuppressStartupBanner="true"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+				SuppressStartupBanner="true"
+				OutputFile="$(OutDir)/mathlib.bsc"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+				ExcludedFromBuild="false"
+			/>
+		</Configuration>
+	</Configurations>
+	<References>
+	</References>
+	<Files>
+		<Filter
+			Name="Source Files"
+			>
+			<File
+				RelativePath=".\3dnow.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\anorms.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\bumpvects.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\color_conversion.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\halton.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\IceKey.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\imagequant.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\lightdesc.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\mathlib_base.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\polyhedron.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\powsse.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\quantize.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\randsse.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\simdvectormatrix.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\sparse_convolution_noise.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\sse.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\sseconst.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\ssenoise.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\vector.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\vmatrix.cpp"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="Public Header Files"
+			>
+			<File
+				RelativePath="..\public\mathlib\amd3dx.h"
+				>
+			</File>
+			<File
+				RelativePath="..\public\mathlib\anorms.h"
+				>
+			</File>
+			<File
+				RelativePath="..\public\mathlib\bumpvects.h"
+				>
+			</File>
+			<File
+				RelativePath="..\public\mathlib\compressed_3d_unitvec.h"
+				>
+			</File>
+			<File
+				RelativePath="..\public\mathlib\compressed_light_cube.h"
+				>
+			</File>
+			<File
+				RelativePath="..\public\mathlib\compressed_vector.h"
+				>
+			</File>
+			<File
+				RelativePath="..\public\mathlib\halton.h"
+				>
+			</File>
+			<File
+				RelativePath="..\public\mathlib\IceKey.H"
+				>
+			</File>
+			<File
+				RelativePath="..\public\mathlib\lightdesc.h"
+				>
+			</File>
+			<File
+				RelativePath="..\public\mathlib\math_pfns.h"
+				>
+			</File>
+			<File
+				RelativePath="..\public\mathlib\mathlib.h"
+				>
+			</File>
+			<File
+				RelativePath="..\public\mathlib\noise.h"
+				>
+			</File>
+			<File
+				RelativePath="..\public\mathlib\polyhedron.h"
+				>
+			</File>
+			<File
+				RelativePath="..\public\mathlib\quantize.h"
+				>
+			</File>
+			<File
+				RelativePath="..\public\mathlib\simdvectormatrix.h"
+				>
+			</File>
+			<File
+				RelativePath="..\public\mathlib\ssemath.h"
+				>
+			</File>
+			<File
+				RelativePath="..\public\mathlib\ssequaternion.h"
+				>
+			</File>
+			<File
+				RelativePath="..\public\mathlib\vector.h"
+				>
+			</File>
+			<File
+				RelativePath="..\public\mathlib\vector2d.h"
+				>
+			</File>
+			<File
+				RelativePath="..\public\mathlib\vector4d.h"
+				>
+			</File>
+			<File
+				RelativePath="..\public\mathlib\vmatrix.h"
+				>
+			</File>
+			<File
+				RelativePath="..\public\mathlib\vplane.h"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="Header Files"
+			>
+			<File
+				RelativePath=".\3dnow.h"
+				>
+			</File>
+			<File
+				RelativePath=".\noisedata.h"
+				>
+			</File>
+			<File
+				RelativePath=".\sse.h"
+				>
+			</File>
+		</Filter>
+	</Files>
+	<Globals>
+	</Globals>
+</VisualStudioProject>
--- a/mathlib/mathlib_base.cpp
+++ b/mathlib/mathlib_base.cpp
--- a/mathlib/noisedata.h
+++ b/mathlib/noisedata.h
@ -0,0 +1,180 @@
+//========= Copyright <20> 1996-2006, Valve Corporation, All rights reserved. ============//
+//
+// Purpose: static data for noise() primitives.
+//
+// $Workfile:     $
+// $NoKeywords: $
+//=============================================================================//
+//
+//    **** DO NOT EDIT THIS FILE. GENERATED BY DATAGEN.PL ****
+//
+
+static int perm_a[]={
+    66,147,106,213,89,115,239,25,171,175,9,114,141,226,118,128,41,208,4,56,
+   180,248,43,82,246,219,94,245,133,131,222,103,160,130,168,145,238,38,23,6,
+   236,67,99,2,70,232,80,209,1,3,68,65,102,210,13,73,55,252,187,170,22,36,
+   52,181,117,163,46,79,166,224,148,75,113,95,156,185,220,164,51,142,161,35,
+   206,251,45,136,197,190,132,32,218,127,63,27,137,93,242,20,189,108,183,
+   122,139,191,249,253,87,98,69,0,144,64,24,214,97,116,158,42,107,15,53,212,
+   83,111,152,240,74,237,62,77,205,149,26,151,178,204,91,176,234,49,154,203,
+   33,221,125,134,165,124,86,39,37,60,150,157,179,109,110,44,159,153,5,100,
+   10,207,40,186,96,215,143,162,230,184,101,54,174,247,76,59,241,223,192,84,
+   104,78,169,146,138,30,48,85,233,19,29,92,126,17,199,250,31,81,188,225,28,
+   112,88,11,182,173,211,129,194,172,14,120,200,167,135,12,177,227,229,155,
+   201,61,105,195,193,244,235,58,8,196,123,254,16,18,50,121,71,243,90,57,
+   202,119,255,47,7,198,228,21,217,216,231,140,72,34
+};
+
+static int perm_b[]={
+    123,108,201,64,40,75,24,221,137,110,191,142,9,69,230,83,7,247,51,54,115,
+    133,180,248,109,116,62,99,251,55,89,253,65,106,228,167,131,132,58,143,
+    97,102,163,202,149,234,12,117,174,94,121,74,32,113,20,60,159,182,204,29,
+    244,118,3,178,255,38,6,114,36,93,30,134,213,90,245,209,88,232,162,125,
+    84,166,70,136,208,231,27,71,157,80,76,0,170,225,203,176,33,161,196,128,
+    252,236,246,2,138,1,250,197,77,243,218,242,19,164,68,212,14,237,144,63,
+    46,103,177,188,85,223,8,160,222,4,216,219,35,15,44,23,126,127,100,226,
+    235,37,168,101,49,22,11,73,61,135,111,183,72,96,185,239,82,18,50,155,
+    186,153,17,233,146,156,107,5,254,10,192,198,148,207,104,13,124,48,95,
+    129,120,206,199,81,249,91,150,210,119,240,122,194,92,34,28,205,175,227,
+    179,220,140,152,79,26,195,47,66,173,169,241,53,184,187,145,112,238,214,
+    147,98,171,229,200,151,25,67,78,189,217,130,224,57,172,59,41,43,16,105,
+    158,165,21,45,56,141,139,215,190,86,42,52,39,87,181,31,154,193,211
+};
+
+static int perm_c[]={
+    97,65,96,25,122,26,219,85,148,251,102,0,140,130,136,213,138,60,236,52,
+    178,131,115,183,144,78,147,168,39,45,169,70,57,146,67,142,252,216,28,54,
+    86,222,194,200,48,5,205,125,214,56,181,255,196,155,37,218,153,208,66,
+    242,73,248,206,61,62,246,177,2,197,107,162,152,89,41,6,160,94,8,201,38,
+    235,228,165,93,111,239,74,231,121,47,166,221,157,64,77,244,29,105,150,
+    123,190,191,225,118,133,42,10,84,185,159,124,132,240,180,44,1,9,19,99,
+    254,12,207,186,71,234,184,11,20,16,193,139,175,98,59,113,27,170,230,91,
+    187,46,156,249,108,195,171,114,14,188,82,192,233,24,32,241,87,164,90,43,
+    163,245,92,40,215,55,226,15,3,112,158,250,172,22,227,137,35,128,145,247,
+    161,119,80,217,189,81,7,63,202,120,223,83,179,4,106,199,229,95,53,50,33,
+    182,72,143,23,243,75,18,173,141,167,198,204,58,174,237,17,129,238,127,
+    31,101,176,36,30,110,209,34,203,135,232,68,149,49,134,126,212,79,76,117,
+    104,210,211,224,253,100,220,109,116,88,13,151,154,69,21,51,103
+};
+
+static int perm_d[]={
+    94,234,145,235,151,166,187,238,4,5,128,115,87,107,229,175,190,108,218,
+    32,17,220,97,90,122,121,71,109,64,227,225,75,81,19,27,162,3,89,139,69,
+    92,26,48,215,116,191,114,2,104,157,66,39,1,127,96,124,30,0,82,233,219,
+    42,131,173,35,201,182,144,14,98,148,244,160,159,179,91,31,68,119,154,
+    205,113,149,167,44,60,18,228,251,245,43,10,80,15,129,67,181,174,6,45,
+    194,237,213,52,99,232,211,212,164,217,57,153,156,102,134,20,249,132,55,
+    204,65,33,231,85,61,37,163,193,189,170,226,63,168,236,165,224,242,195,
+    41,200,40,70,112,100,36,172,130,74,137,252,243,135,230,161,207,16,146,
+    198,118,150,24,29,250,188,25,209,103,23,105,47,7,46,133,83,184,50,79,
+    110,120,53,253,206,214,9,240,101,147,152,183,254,59,126,216,197,171,51,
+    208,248,202,58,176,28,72,177,185,141,12,11,56,222,86,178,155,223,88,111,
+    73,142,210,138,239,221,199,192,84,93,241,125,76,77,255,95,8,78,247,186,
+    123,196,13,140,180,143,54,106,136,34,62,169,38,117,22,21,49,203,158,246
+};
+
+static float impulse_xcoords[]={
+    0.788235,0.541176,0.972549,0.082353,0.352941,0.811765,0.286275,0.752941,
+    0.203922,0.705882,0.537255,0.886275,0.580392,0.137255,0.800000,0.533333,
+    0.117647,0.447059,0.129412,0.925490,0.086275,0.478431,0.666667,0.568627,
+    0.678431,0.313725,0.321569,0.349020,0.988235,0.419608,0.898039,0.219608,
+    0.243137,0.623529,0.501961,0.772549,0.952941,0.517647,0.949020,0.701961,
+    0.454902,0.505882,0.564706,0.960784,0.207843,0.007843,0.831373,0.184314,
+    0.576471,0.462745,0.572549,0.247059,0.262745,0.694118,0.615686,0.121569,
+    0.384314,0.749020,0.145098,0.717647,0.415686,0.607843,0.105882,0.101961,
+    0.200000,0.807843,0.521569,0.780392,0.466667,0.552941,0.996078,0.627451,
+    0.992157,0.529412,0.407843,0.011765,0.709804,0.458824,0.058824,0.819608,
+    0.176471,0.317647,0.392157,0.223529,0.156863,0.490196,0.325490,0.074510,
+    0.239216,0.164706,0.890196,0.603922,0.921569,0.839216,0.854902,0.098039,
+    0.686275,0.843137,0.152941,0.372549,0.062745,0.474510,0.486275,0.227451,
+    0.400000,0.298039,0.309804,0.274510,0.054902,0.815686,0.647059,0.635294,
+    0.662745,0.976471,0.094118,0.509804,0.650980,0.211765,0.180392,0.003922,
+    0.827451,0.278431,0.023529,0.525490,0.450980,0.725490,0.690196,0.941176,
+    0.639216,0.560784,0.196078,0.364706,0.043137,0.494118,0.796078,0.113725,
+    0.760784,0.729412,0.258824,0.290196,0.584314,0.674510,0.823529,0.905882,
+    0.917647,0.070588,0.862745,0.345098,0.913725,0.937255,0.031373,0.215686,
+    0.768627,0.333333,0.411765,0.423529,0.945098,0.721569,0.039216,0.792157,
+    0.956863,0.266667,0.254902,0.047059,0.294118,0.658824,0.250980,1.000000,
+    0.984314,0.756863,0.027451,0.305882,0.835294,0.513725,0.360784,0.776471,
+    0.611765,0.192157,0.866667,0.858824,0.592157,0.803922,0.141176,0.435294,
+    0.588235,0.619608,0.341176,0.109804,0.356863,0.270588,0.737255,0.847059,
+    0.050980,0.764706,0.019608,0.870588,0.933333,0.784314,0.549020,0.337255,
+    0.631373,0.929412,0.231373,0.427451,0.078431,0.498039,0.968627,0.654902,
+    0.125490,0.698039,0.015686,0.878431,0.713725,0.368627,0.431373,0.874510,
+    0.403922,0.556863,0.443137,0.964706,0.909804,0.301961,0.035294,0.850980,
+    0.882353,0.741176,0.380392,0.133333,0.470588,0.643137,0.282353,0.396078,
+    0.980392,0.168627,0.149020,0.235294,0.670588,0.596078,0.733333,0.160784,
+    0.376471,0.682353,0.545098,0.482353,0.745098,0.894118,0.188235,0.329412,
+    0.439216,0.901961,0.000000,0.600000,0.388235,0.172549,0.090196,0.066667
+};
+
+static float impulse_ycoords[]={
+    0.827451,0.337255,0.941176,0.886275,0.878431,0.239216,0.400000,0.164706,
+    0.490196,0.411765,0.964706,0.349020,0.803922,0.317647,0.647059,0.431373,
+    0.933333,0.156863,0.094118,0.219608,0.039216,0.521569,0.498039,0.705882,
+    0.717647,0.047059,0.631373,0.517647,0.984314,0.847059,0.482353,0.439216,
+    0.250980,0.862745,0.690196,0.913725,0.270588,0.070588,0.027451,0.694118,
+    0.811765,0.000000,0.494118,0.823529,0.800000,0.600000,0.003922,0.443137,
+    0.639216,0.376471,0.031373,0.035294,0.552941,0.215686,0.305882,0.133333,
+    0.564706,0.176471,0.211765,0.874510,0.360784,0.654902,0.223529,0.807843,
+    0.372549,0.137255,0.321569,0.015686,0.007843,0.262745,0.125490,0.078431,
+    0.396078,0.976471,0.929412,1.000000,0.937255,0.509804,0.188235,0.850980,
+    0.831373,0.392157,0.741176,0.541176,0.592157,0.286275,0.345098,0.572549,
+    0.537255,0.725490,0.839216,0.184314,0.772549,0.149020,0.505882,0.423529,
+    0.780392,0.011765,0.890196,0.086275,0.427451,0.023529,0.788235,0.050980,
+    0.760784,0.603922,0.066667,0.643137,0.623529,0.960784,0.172549,0.333333,
+    0.082353,0.290196,0.992157,0.709804,0.894118,0.596078,0.243137,0.752941,
+    0.486275,0.670588,0.949020,0.784314,0.145098,0.560784,0.513725,0.180392,
+    0.580392,0.996078,0.380392,0.556863,0.407843,0.945098,0.117647,0.058824,
+    0.678431,0.129412,0.192157,0.105882,0.968627,0.545098,0.462745,0.227451,
+    0.019608,0.866667,0.674510,0.207843,0.627451,0.819608,0.921569,0.356863,
+    0.447059,0.533333,0.435294,0.341176,0.054902,0.529412,0.235294,0.764706,
+    0.615686,0.043137,0.745098,0.266667,0.501961,0.619608,0.776471,0.450980,
+    0.309804,0.325490,0.200000,0.635294,0.247059,0.698039,0.721569,0.168627,
+    0.854902,0.141176,0.611765,0.525490,0.415686,0.298039,0.254902,0.858824,
+    0.568627,0.329412,0.062745,0.843137,0.588235,0.733333,0.607843,0.478431,
+    0.576471,0.662745,0.470588,0.666667,0.980392,0.113725,0.898039,0.203922,
+    0.294118,0.152941,0.098039,0.909804,0.796078,0.768627,0.713725,0.196078,
+    0.368627,0.419608,0.352941,0.090196,0.749020,0.121569,0.882353,0.278431,
+    0.388235,0.917647,0.701961,0.729412,0.835294,0.258824,0.301961,0.101961,
+    0.792157,0.474510,0.686275,0.658824,0.364706,0.682353,0.458824,0.815686,
+    0.282353,0.160784,0.870588,0.988235,0.756863,0.549020,0.274510,0.384314,
+    0.650980,0.737255,0.901961,0.956863,0.972549,0.584314,0.925490,0.403922,
+    0.074510,0.454902,0.952941,0.109804,0.313725,0.905882,0.231373,0.466667
+};
+
+static float impulse_zcoords[]={
+    0.082353,0.643137,0.415686,0.929412,0.568627,0.509804,0.537255,0.815686,
+    0.698039,0.941176,0.776471,0.752941,0.737255,0.525490,0.498039,0.423529,
+    0.792157,0.125490,0.619608,0.164706,0.368627,0.870588,0.137255,0.372549,
+    0.466667,0.486275,0.501961,0.513725,0.709804,0.576471,0.203922,0.258824,
+    0.152941,0.556863,0.223529,0.047059,0.235294,0.474510,0.764706,0.552941,
+    0.847059,0.145098,0.176471,0.937255,0.654902,0.894118,0.729412,0.054902,
+    0.666667,0.749020,0.262745,0.560784,0.431373,0.286275,0.352941,0.239216,
+    0.156863,0.839216,0.427451,0.949020,0.384314,0.227451,0.180392,0.074510,
+    0.172549,0.356863,0.066667,0.517647,0.447059,0.184314,0.062745,0.670588,
+    0.603922,0.219608,0.270588,0.976471,0.505882,0.627451,0.819608,0.854902,
+    0.843137,0.019608,0.713725,0.035294,0.925490,0.349020,0.866667,0.701961,
+    0.909804,0.811765,0.717647,0.141176,0.917647,0.023529,0.098039,0.803922,
+    0.733333,0.658824,0.827451,0.133333,0.858824,0.800000,0.635294,1.000000,
+    0.078431,0.450980,0.835294,0.321569,0.360784,0.529412,0.725490,0.572549,
+    0.639216,0.341176,0.533333,0.094118,0.149020,0.545098,0.101961,0.901961,
+    0.278431,0.694118,0.521569,0.490196,0.454902,0.329412,0.274510,0.027451,
+    0.745098,0.933333,0.443137,0.168627,0.192157,0.988235,0.070588,0.972549,
+    0.768627,0.400000,0.470588,0.207843,0.215686,0.388235,0.439216,0.780392,
+    0.482353,0.121569,0.964706,0.086275,0.890196,0.337255,0.109804,0.305882,
+    0.113725,0.435294,0.721569,0.772549,0.807843,0.741176,0.254902,0.596078,
+    0.494118,0.317647,0.419608,0.000000,0.188235,0.031373,0.376471,0.380392,
+    0.611765,0.945098,0.411765,0.313725,0.874510,0.588235,0.678431,0.160784,
+    0.007843,0.090196,0.850980,0.788235,0.705882,0.266667,0.309804,0.541176,
+    0.231373,0.129412,0.294118,0.243137,0.913725,0.996078,0.117647,0.478431,
+    0.290196,0.549020,0.682353,0.784314,0.396078,0.831373,0.984314,0.584314,
+    0.039216,0.250980,0.600000,0.392157,0.298039,0.050980,0.364706,0.105882,
+    0.623529,0.886275,0.980392,0.325490,0.247059,0.690196,0.674510,0.960784,
+    0.647059,0.211765,0.882353,0.686275,0.823529,0.058824,0.956863,0.043137,
+    0.345098,0.301961,0.592157,0.862745,0.607843,0.458824,0.282353,0.003922,
+    0.580392,0.760784,0.564706,0.011765,0.968627,0.905882,0.756863,0.952941,
+    0.662745,0.015686,0.898039,0.196078,0.333333,0.992157,0.650980,0.407843,
+    0.796078,0.615686,0.878431,0.921569,0.631373,0.200000,0.403922,0.462745
+};
+
--- a/mathlib/polyhedron.cpp
+++ b/mathlib/polyhedron.cpp
--- a/mathlib/powsse.cpp
+++ b/mathlib/powsse.cpp
@ -0,0 +1,39 @@
+//========= Copyright <20> 1996-2005, Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+//=====================================================================================//
+
+#include "mathlib/ssemath.h"
+
+// Raises each lane of x to a fixed-point power. 'exponent' carries two
+// fractional bits (units of 1/4): bit 0 is the .25 part, bit 1 the .5 part and
+// the remaining bits the integer part. Negative exponents return the estimated
+// reciprocal of the positive power. Roots and reciprocal use the *Est* (estimate)
+// SIMD routines.
+fltx4 Pow_FixedPoint_Exponent_SIMD( const fltx4 & x, int exponent)
+{
+	const int nMagnitude=abs(exponent);
+	fltx4 result=Four_Ones;									// x^0=1.0
+
+	// fractional part: combine x^.25 and x^.5 as requested
+	const int nQuarterBits=nMagnitude & 3;
+	if (nQuarterBits)
+	{
+		fltx4 root=SqrtEstSIMD(x);							// x^.5
+		if (nQuarterBits & 1)
+			result=SqrtEstSIMD(root);						// x^.25
+		if (nQuarterBits & 2)
+			result=MulSIMD(result,root);
+	}
+
+	// integer part: square-and-multiply, power walks x,x^2,x^4,x^8,...
+	fltx4 power=x;
+	for (int nWhole=nMagnitude>>2; nWhole; )
+	{
+		if (nWhole & 1)
+			result=MulSIMD(result,power);
+		nWhole>>=1;
+		if (nWhole)
+			power=MulSIMD(power,power);
+	}
+
+	// pow(x,-b)=1/pow(x,b)
+	return (exponent<0) ? ReciprocalEstSIMD(result) : result;
+}
+
--- a/mathlib/quantize.cpp
+++ b/mathlib/quantize.cpp
@ -0,0 +1,679 @@
+//========= Copyright <20> 1996-2005, Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+// $NoKeywords: $
+//
+//=============================================================================//
+#ifndef STDIO_H
+#include <stdio.h>
+#endif
+
+#ifndef STRING_H
+#include <string.h>
+#endif
+
+#ifndef QUANTIZE_H
+#include <quantize.h>
+#endif
+
+#include <stdlib.h>
+#include <minmax.h>
+
+#include <math.h>
+
+static int current_ndims;	// number of dimensions per sample; set by SetNDims()
+static struct QuantizedValue *current_root;	// root of the tree that FindWorst() searches
+static int current_ssize;	// byte stride between consecutive samples; set by SetNDims()
+
+static uint8 *current_weights;	// per-dimension weights applied to the error terms in UpdateStats()
+
+double SquaredError;	// NOTE(review): not referenced in the code visible here -- confirm usage
+
+#define SPLIT_THEN_SORT 1	// non-zero selects the "split then sort" subdivision code paths
+
+#define SQ(x) ((x)*(x))	// square of x; note the argument is evaluated twice
+
+// Creates an empty tree node: no samples, no children, and per-dimension
+// arrays sized for current_ndims. Only Sums is zero-filled; the other arrays
+// are left uninitialized.
+static struct QuantizedValue *AllocQValue(void)
+{
+	struct QuantizedValue *node=new QuantizedValue;
+
+	// scalar bookkeeping
+	node->NSamples=0;
+	node->Samples=0;
+	node->NQuant=0;
+	node->sortdim=-1;
+	node->Children[0]=0;
+	node->Children[1]=0;
+
+	// per-dimension storage
+	const int nDims=current_ndims;
+	node->ErrorMeasure=new double[nDims];
+	node->Mean=new uint8[nDims];
+	node->Mins=new uint8[nDims];
+	node->Maxs=new uint8[nDims];
+	node->Sums=new int [nDims];
+	memset(node->Sums,0,sizeof(int)*nDims);
+
+	return node;
+}
+
+// Recursively releases a quantization tree: the per-dimension arrays of each
+// node, both child subtrees, and the node itself. Passing a null pointer is a
+// no-op.
+// NOTE(review): t->Samples is not freed here; presumably the sample buffer is
+// owned by the caller -- confirm.
+void FreeQuantization(struct QuantizedValue *t)
+{
+	if (!t)
+		return;
+
+	delete[] t->ErrorMeasure;
+	delete[] t->Mean;
+	delete[] t->Mins;
+	delete[] t->Maxs;
+	FreeQuantization(t->Children[0]);
+	FreeQuantization(t->Children[1]);
+	delete[] t->Sums;
+
+	// The node is created with scalar new (see AllocQValue), so it must be
+	// released with scalar delete; delete[] here would be a new/delete[] mismatch.
+	delete t;
+}
+
+// qsort comparator: orders samples by ascending QNum.
+static int QNumSort(void const *a, void const *b)
+{
+	struct Sample const *lhs=(struct Sample const *) a;
+	struct Sample const *rhs=(struct Sample const *) b;
+	int32 const lq=lhs->QNum;
+	int32 const rq=rhs->QNum;
+
+	if (lq>rq)
+		return 1;
+	if (lq<rq)
+		return -1;
+	return 0;
+}
+
+#if SPLIT_THEN_SORT
+#else
+// dimension that samplesort() compares on
+static int current_sort_dim;
+
+// qsort comparator: orders samples by ascending value along current_sort_dim.
+static int samplesort(void const *a, void const *b)
+{
+	struct Sample const *lhs=(struct Sample const *) a;
+	struct Sample const *rhs=(struct Sample const *) b;
+	uint8 const lv=lhs->Value[current_sort_dim];
+	uint8 const rv=rhs->Value[current_sort_dim];
+
+	if (lv>rv)
+		return 1;
+	if (lv<rv)
+		return -1;
+	return 0;
+}
+#endif
+
+// qsort comparator used for duplicate removal: compares the first
+// current_ndims value bytes of two samples as one long byte string.
+static int sortlong(void const *a, void const *b)
+{
+	struct Sample const *lhs=(struct Sample const *) a;
+	struct Sample const *rhs=(struct Sample const *) b;
+	return memcmp(lhs->Value,rhs->Value,current_ndims);
+}
+
+
+  
+// Sample following s, stepping by the current sample stride (current_ssize bytes).
+// Arguments are parenthesized so expression arguments are cast/advanced as a whole.
+#define NEXTSAMPLE(s) ( (struct Sample *) (((uint8 *) (s))+current_ssize))
+// i'th sample of array s at the current dimensionality.
+#define SAMPLE(s,i) NthSample((s),(i),current_ndims)
+
+// Selects the dimensionality used by the routines in this file and derives the
+// matching per-sample byte stride.
+static void SetNDims(int n)
+{
+	current_ndims=n;
+	// NOTE(review): the (n-1) presumably accounts for struct Sample already
+	// containing one Value byte -- confirm against the struct declaration
+	current_ssize=sizeof(struct Sample)+(n-1);
+}
+
+// Sorts the samples and removes duplicates in place.
+//   s        - array of nsamples samples, each with ndims value bytes
+//   nsamples - number of samples in s
+//   ndims    - number of dimensions per sample (also becomes the current
+//              dimensionality via SetNDims)
+// Returns the number of distinct samples, which are left packed at the start
+// of s. Each dropped duplicate increments the Count of the sample it matched.
+// An empty (or negative-sized) input returns 0.
+int CompressSamples(struct Sample *s, int nsamples, int ndims)
+{
+	SetNDims(ndims);
+
+	// Nothing to compress. Without this guard the pre-decrement loop below
+	// would start at -1 and run far past the end of the array.
+	if (nsamples<=0)
+		return 0;
+
+	qsort(s,nsamples,current_ssize,sortlong);
+	// now, they are all sorted by treating all dimensions as a large number.
+	// we may now remove duplicates.
+	struct Sample *lastdst=s;				// last sample kept; the first one is always kept
+	struct Sample *dst=NEXTSAMPLE(s);		// where the next kept sample goes
+	struct Sample *src=NEXTSAMPLE(s);		// next sample to examine
+	int noutput=1;
+	while(--nsamples)		// while some remain
+	{
+		if (memcmp(src->Value,lastdst->Value,current_ndims))
+		{
+			// differs from the last kept sample: keep it. Until the first
+			// duplicate is dropped dst and src are the same sample, and
+			// memcpy must not be handed overlapping regions.
+			if (dst!=src)
+				memcpy(dst,src,current_ssize);
+			lastdst=dst;
+			dst=NEXTSAMPLE(dst);
+			noutput++;
+		}
+		else
+			lastdst->Count++;
+		src=NEXTSAMPLE(src);
+	}
+	return noutput;
+}
+
+// Debug dump: prints the count and the value bytes (in hex) of each of the
+// nsamples samples in s. Also sets the current dimensionality to ndims.
+void PrintSamples(struct Sample const *s, int nsamples, int ndims)
+{
+	SetNDims(ndims);
+
+	for(int index=0;nsamples--;index++)
+	{
+		printf("sample #%d, count=%d, values=\n { ",index,s->Count);
+		int d=0;
+		while(d<ndims)
+		{
+			printf("%02x,",s->Value[d]);
+			d++;
+		}
+		printf("}\n");
+		s=NEXTSAMPLE(s);
+	}
+}
+
+// Debug dump: prints node p and, recursively, its two children. Each node
+// takes three lines (summary + means, errors, mins/maxs) indented by idlevel
+// spaces; children are indented two more. A null p prints nothing.
+void PrintQTree(struct QuantizedValue const *p,int idlevel)
+{
+	if (!p)
+		return;
+
+	int pad,dim;
+
+	// line 1: node summary and per-dimension means
+	for(pad=idlevel;pad>0;pad--)
+		printf(" ");
+	printf("node=%p NSamples=%d value=%d Mean={",p,p->NSamples,p->value);
+	for(dim=0;dim<current_ndims;dim++)
+		printf("%x,",p->Mean[dim]);
+	printf("}\n");
+
+	// line 2: per-dimension error measures
+	for(pad=idlevel;pad>0;pad--)
+		printf(" ");
+	printf("Errors={");
+	for(dim=0;dim<current_ndims;dim++)
+		printf("%f,",p->ErrorMeasure[dim]);
+	printf("}\n");
+
+	// line 3: per-dimension bounds
+	for(pad=idlevel;pad>0;pad--)
+		printf(" ");
+	printf("Mins={");
+	for(dim=0;dim<current_ndims;dim++)
+		printf("%d,",p->Mins[dim]);
+	printf("} Maxs={");
+	for(dim=0;dim<current_ndims;dim++)
+		printf("%d,",p->Maxs[dim]);
+	printf("}\n");
+
+	const int childlevel=idlevel+2;
+	PrintQTree(p->Children[0],childlevel);
+	PrintQTree(p->Children[1],childlevel);
+}
+
+// Recomputes the statistics of node v from its samples: the count-weighted
+// mean per dimension, the weighted squared error per dimension
+// (ErrorMeasure), their sum (TotalError) and the total sample count
+// (TotSamples). Mean is left untouched when the total count is zero.
+static void UpdateStats(struct QuantizedValue *v)
+{
+	int32 WeightedSum[MAXDIMS];
+	double SumSqErr[MAXDIMS];
+	double MaxSqErr[MAXDIMS];
+	int idx,dim;
+
+	// pass 1: count-weighted sum per dimension, and the total count
+	memset(WeightedSum,0,sizeof(WeightedSum));
+	int nTotal=0;
+	for(idx=0;idx<v->NSamples;idx++)
+	{
+		struct Sample *smp=SAMPLE(v->Samples,idx);
+		nTotal+=smp->Count;
+		for(dim=0;dim<current_ndims;dim++)
+		{
+			uint8 val=smp->Value[dim];
+			WeightedSum[dim]+=val*smp->Count;
+		}
+	}
+
+	// turn the sums into means and clear the error accumulators
+	for(dim=0;dim<current_ndims;dim++)
+	{
+		if (nTotal)
+			v->Mean[dim]=(uint8) (WeightedSum[dim]/nTotal);
+		SumSqErr[dim]=MaxSqErr[dim]=0.;
+	}
+
+	// pass 2: accumulate squared deviation from the mean (charles uses abs not sq())
+	// and remember the largest single deviation per dimension
+	for(idx=0;idx<v->NSamples;idx++)
+	{
+		struct Sample *smp=SAMPLE(v->Samples,idx);
+		double weight=smp->Count;
+		for(dim=0;dim<current_ndims;dim++)
+		{
+			double sqdiff=SQ(smp->Value[dim]-v->Mean[dim]);
+			SumSqErr[dim]+=weight*sqdiff;
+			if (sqdiff>MaxSqErr[dim])
+				MaxSqErr[dim]=sqdiff;
+		}
+	}
+
+	v->TotalError=0.;
+	double ErrorScale=1.; // /sqrt((double) (nTotal));
+	for(dim=0;dim<current_ndims;dim++)
+	{
+		v->ErrorMeasure[dim]=(ErrorScale*SumSqErr[dim]*current_weights[dim]);
+		v->TotalError+=v->ErrorMeasure[dim];
+#if SPLIT_THEN_SORT
+		// after TotalError is taken, bias the per-dimension measure by the worst deviation
+		v->ErrorMeasure[dim]*=MaxSqErr[dim];
+#endif
+	}
+	v->TotSamples=nTotal;
+}
+
+static int ErrorDim;	// dimension with the largest ErrorMeasure in ErrorNode; written by UpdateWorst()
+static double ErrorVal;	// largest leaf TotalError seen so far; reset to -1 by FindWorst()
+static struct QuantizedValue *ErrorNode;	// leaf that produced ErrorVal; written by UpdateWorst()
+
+// Walks the subtree under q looking for the leaf with the largest TotalError.
+// When a leaf beats the current ErrorVal, it is recorded in ErrorVal/ErrorNode
+// and ErrorDim is set to the dimension with that leaf's largest ErrorMeasure.
+static void UpdateWorst(struct QuantizedValue *q)
+{
+	if (q->Children[0])
+	{
+		// interior node: only leaves are candidates
+		UpdateWorst(q->Children[0]);
+		UpdateWorst(q->Children[1]);
+		return;
+	}
+
+	if (!(q->TotalError>ErrorVal))
+		return;
+
+	ErrorNode=q;
+	ErrorVal=q->TotalError;
+
+	// first dimension holding the maximum ErrorMeasure
+	int worst=0;
+	for(int d=1;d<current_ndims;d++)
+	{
+		if (q->ErrorMeasure[d]>q->ErrorMeasure[worst])
+			worst=d;
+	}
+	ErrorDim=worst;
+}
+
+// Searches the tree at current_root for the leaf with the largest TotalError,
+// leaving the result in ErrorNode/ErrorDim/ErrorVal. Returns 1 if a leaf with
+// an error greater than zero was found, 0 otherwise.
+static int FindWorst(void)
+{
+	ErrorVal=-1.;		// start below any value that would count as found
+	UpdateWorst(current_root);
+	if (ErrorVal>0)
+		return 1;
+	return 0;
+}
+
+
+
+// Splits leaf n into two children that share n's sample array: Children[0]
+// receives the first NAdded samples and Children[1] the remainder. Both
+// children get fresh statistics via UpdateStats().
+//   n        - node to split; its Samples array is re-sorted in place
+//   whichdim - dimension along which the split is made
+// Two strategies are compiled in, selected by SPLIT_THEN_SORT.
+static void SubdivideNode(struct QuantizedValue *n, int whichdim)
+{
+	int NAdded=0;
+	int i;
+
+#if SPLIT_THEN_SORT
+	// we will try the "split then sort" method. This works by finding the
+	// means for all samples above and below the mean along the given axis.
+	// samples are then split into two groups, with the selection based upon
+	// which of the n-dimensional means the sample is closest to.
+	double LocalMean[MAXDIMS][2];
+	int totsamps[2];
+	for(i=0;i<current_ndims;i++)
+		LocalMean[i][0]=LocalMean[i][1]=0.;
+	totsamps[0]=totsamps[1]=0;
+	uint8 minv=255;
+	uint8 maxv=0;
+	// NOTE(review): minS/maxS stay null when n->NSamples is 0 (and one of them
+	// when every value is 255 or 0); the fallback below dereferences both.
+	struct Sample *minS=0,*maxS=0;
+	for(i=0;i<n->NSamples;i++)
+	{
+		uint8 v;
+		int whichside=1;
+		struct Sample *sl;
+		sl=SAMPLE(n->Samples,i);
+		v=sl->Value[whichdim];
+		if (v<minv) { minv=v; minS=sl; }
+		if (v>maxv) { maxv=v; maxS=sl; }
+		// side 0 = strictly below the node mean on whichdim, side 1 = the rest
+		if (v<n->Mean[whichdim])
+			whichside=0;
+		totsamps[whichside]+=sl->Count;
+		for(int d=0;d<current_ndims;d++)
+			LocalMean[d][whichside]+=
+				sl->Count*sl->Value[d];
+	}
+
+	if (totsamps[0] && totsamps[1])
+		for(i=0;i<current_ndims;i++)
+		{
+			LocalMean[i][0]/=totsamps[0];
+			LocalMean[i][1]/=totsamps[1];
+		}
+	else
+	{
+		// it is possible that the clustering failed to split the samples.
+		// this can happen with a heavily biased sample (i.e. all black
+		// with a few stars). If this happens, we will cluster around the
+		// extrema instead. LocalMean[i][0] will be the point with the lowest
+		// value on the dimension and LocalMean[i][1] the one with the highest
+		// value.
+		for(int i=0;i<current_ndims;i++)
+		{
+			LocalMean[i][0]=minS->Value[i];
+			LocalMean[i][1]=maxS->Value[i];
+		}
+	}
+
+	// now, we have 2 n-dimensional means. We will label each sample
+	// for which one it is nearer to by using the QNum field.
+	// (QNum becomes 1 when the sample is strictly nearer to LocalMean[..][0],
+	// and 0 otherwise.)
+	for(i=0;i<n->NSamples;i++)
+	{
+		double dist[2];
+		dist[0]=dist[1]=0.;
+		struct Sample *s=SAMPLE(n->Samples,i);
+		for(int d=0;d<current_ndims;d++)
+			for(int w=0;w<2;w++)
+				dist[w]+=current_weights[d]*SQ(LocalMean[d][w]-s->Value[d]);
+		s->QNum=(dist[0]<dist[1]);
+    }
+
+
+	// hey ho! we have now labelled each one with a candidate bin. Let's
+	// sort the array by moving the 0-labelled ones to the head of the array.
+	// (presumably QNumSort orders by ascending QNum - defined outside this view)
+	n->sortdim=-1;
+	qsort(n->Samples,n->NSamples,current_ssize,QNumSort);
+	// NAdded = number of leading samples whose QNum is 0
+	for(i=0;i<n->NSamples;i++,NAdded++)
+		if (SAMPLE(n->Samples,i)->QNum)
+			break;
+  
+#else
+	// re-sort only when the array is not already ordered along whichdim
+	if (whichdim != n->sortdim)
+	{
+		current_sort_dim=whichdim;
+		qsort(n->Samples,n->NSamples,current_ssize,samplesort);
+		n->sortdim=whichdim;
+	}
+	// now, the samples are sorted along the proper dimension.  we need
+	// to find the place to cut in order to split the node.  this is
+	// complicated by the fact that each sample entry can represent many
+	// samples. What we will do is start at the beginning of the array,
+	// adding samples to the first node, until either the number added
+	// is >=TotSamples/2, or there is only one left.
+	int TotAdded=0;
+	for(;;)
+	{
+		if (NAdded==n->NSamples-1)
+			break;
+		if (TotAdded>=n->TotSamples/2)
+			break;
+		TotAdded+=SAMPLE(n->Samples,NAdded)->Count;
+		NAdded++;
+	}
+#endif
+	// first child: samples [0, NAdded)
+	struct QuantizedValue *a=AllocQValue();
+	a->sortdim=n->sortdim;
+	a->Samples=n->Samples;
+	a->NSamples=NAdded;
+	n->Children[0]=a;
+	UpdateStats(a);
+	// second child: samples [NAdded, NSamples)
+	a=AllocQValue();
+	a->Samples=SAMPLE(n->Samples,NAdded);
+	a->NSamples=n->NSamples-NAdded;
+	a->sortdim=n->sortdim;
+	n->Children[1]=a;
+	UpdateStats(a);
+}
+
+// next palette index handed out by Label() when updatecolor is set
+static int colorid=0;
+
+// Post-order walk that fills in Mins/Maxs for every node of the tree under q.
+// A leaf's box collapses to its Mean; an interior node's box is the union of
+// its children's boxes. When updatecolor is non-zero each leaf also receives
+// the next colorid as its value, and every sample of the leaf is tagged with
+// that value (QNum) and a pointer back to the leaf (qptr).
+static void Label(struct QuantizedValue *q, int updatecolor)
+{
+	if (!q)
+		return;
+
+	Label(q->Children[0],updatecolor);
+	Label(q->Children[1],updatecolor);
+
+	if (q->Children[0])
+	{
+		// interior node: bounding box of the two child boxes
+		for(int d=0;d<current_ndims;d++)
+		{
+			q->Mins[d]=min(q->Children[0]->Mins[d],q->Children[1]->Mins[d]);
+			q->Maxs[d]=max(q->Children[0]->Maxs[d],q->Children[1]->Maxs[d]);
+		}
+		return;
+	}
+
+	// leaf node
+	if (updatecolor)
+	{
+		q->value=colorid++;
+		for(int n=0;n<q->NSamples;n++)
+		{
+			struct Sample *member=SAMPLE(q->Samples,n);
+			member->QNum=q->value;
+			member->qptr=q;
+		}
+	}
+	for(int d=0;d<current_ndims;d++)
+	{
+		q->Mins[d]=q->Mean[d];
+		q->Maxs[d]=q->Mean[d];
+	}
+}    
+
+// Depth-first search for the leaf whose value equals code.
+// Returns the leaf, or 0 when no leaf under q carries that value.
+struct QuantizedValue *FindQNode(struct QuantizedValue const *q, int32 code)
+{
+	if (q->Children[0])
+	{
+		// interior node: try the first subtree, then the second
+		struct QuantizedValue *hit=FindQNode(q->Children[0],code);
+		return hit ? hit : FindQNode(q->Children[1],code);
+	}
+
+	// leaf node: it either is the one we want or it is not
+	return (code==q->value) ? (struct QuantizedValue *) q : 0;
+}
+
+
+// Debug consistency check: verifies that the Mins/Maxs box of q (and of every
+// node below it) lies inside the box [min,max] and inside the box of each of
+// its ancestors. Prints "error1"/"error2" for every violated bound.
+void CheckInRange(struct QuantizedValue *q, uint8 *max, uint8 *min)
+{
+	if (!q)
+		return;
+
+	if (q->Children[0])
+	{
+		// non-leaf node: children must fit in this node's box ...
+		for(int c=0;c<2;c++)
+			CheckInRange(q->Children[c],q->Maxs, q->Mins);
+		// ... and in the box that was handed down to us
+		for(int c=0;c<2;c++)
+			CheckInRange(q->Children[c],max, min);
+	}
+
+	for (int d=0;d<current_ndims;d++)
+	{
+		if (q->Maxs[d]>max[d])
+			printf("error1\n");
+		if (q->Mins[d]<min[d])
+			printf("error2\n");
+	}
+}
+
+// Builds a quantization tree over the given samples.
+//   s, nsamples - sample array (re-ordered in place while splitting)
+//   ndims       - number of components per sample
+//   nvalues     - number of leaves wanted; splitting stops early when no
+//                 leaf with a positive error is left
+//   weights     - per-dimension error weights
+//   firstvalue  - value assigned to the first leaf; later leaves count up
+// Returns the root of the tree (also stored in current_root).
+struct QuantizedValue *Quantize(struct Sample *s, int nsamples, int ndims,
+								int nvalues, uint8 *weights, int firstvalue)
+{
+	SetNDims(ndims);
+	current_weights=weights;
+
+	// start with one node covering every sample
+	struct QuantizedValue *root=AllocQValue();
+	current_root=root;
+	root->Samples=s;
+	root->NSamples=nsamples;
+	UpdateStats(root);
+
+	// each split adds one leaf, so nvalues-1 splits are needed;
+	// if <n unique ones, stop now
+	for(--nvalues; nvalues && FindWorst(); --nvalues)
+		SubdivideNode(ErrorNode,ErrorDim);
+
+	colorid=firstvalue;
+	Label(current_root,1);
+	return current_root;
+}
+
+double MinimumError(struct QuantizedValue const *q, uint8 const *sample,
+					int ndims, uint8 const *weights)
+{
+	double err=0;
+	for(int i=0;i<ndims;i++)
+	{
+		int val1;
+		int val2=sample[i];
+		if ((q->Mins[i]<=val2) && (q->Maxs[i]>=val2)) val1=val2;
+		else
+		{
+			val1=(val2<=q->Mins[i])?q->Mins[i]:q->Maxs[i];
+		}
+		err+=weights[i]*SQ(val1-val2);
+	}
+	return err;
+}
+
+double MaximumError(struct QuantizedValue const *q, uint8 const *sample,
+					int ndims, uint8 const *weights)
+{
+	double err=0;
+	for(int i=0;i<ndims;i++)
+	{
+		int val2=sample[i];
+		int val1=(abs(val2-q->Mins[i])>abs(val2-q->Maxs[i]))?
+			q->Mins[i]:
+			q->Maxs[i];
+		err+=weights[i]*SQ(val2-val1);
+	}
+	return err;
+}
+
+				     
+
+// heap (priority queue) routines used for nearest-neighbor searches.
+// The heap stores pointers to doubles and orders them by the pointed-to
+// value, smallest first. It is 1-based: slots 1..heap_n are occupied and
+// slot 0 is never used, so at most MAXQUANT-1 entries fit.
+struct FHeap {
+	int heap_n;					// number of entries currently in the heap
+	double *heap[MAXQUANT];		// entries; heap[1] is the smallest
+};
+
+// Empties the heap. The slots themselves are not cleared; only the entry
+// count is reset.
+void InitHeap(struct FHeap *h)
+{
+	h->heap_n = 0;
+}
+
+
+// Sifts the entry in slot k towards the root until its parent holds a
+// strictly smaller value, restoring the min-heap order after an insertion.
+void UpHeap(int k, struct FHeap *h)
+{
+	double *moving=h->heap[k];
+	double key=*moving;
+
+	for(int parent=k/2; (k>1) && (key <= *(h->heap[parent])); parent=k/2)
+	{
+		// pull the parent down into the hole and continue one level up
+		h->heap[k]=h->heap[parent];
+		k=parent;
+	}
+	h->heap[k]=moving;
+}
+
+// Adds elem to the heap: it is appended in the next free slot and then
+// sifted up into place. No capacity check is made.
+void HeapInsert(struct FHeap *h,double *elem)
+{
+	int slot=++h->heap_n;
+	h->heap[slot]=elem;
+	UpHeap(slot,h);
+}
+
+// Sifts the entry in slot k towards the leaves until it is strictly smaller
+// than both of its children, restoring the min-heap order after a removal.
+void DownHeap(int k, struct FHeap *h)
+{
+	double *sinking=h->heap[k];
+
+	while(k<=h->heap_n/2)
+	{
+		// choose the smaller child (the right one on ties)
+		int child=2*k;
+		if ((child<h->heap_n) && (*(h->heap[child]) >= *(h->heap[child+1])))
+			child++;
+
+		if (*sinking < *(h->heap[child]))
+			break;						// found the resting place
+
+		h->heap[k]=h->heap[child];
+		k=child;
+	}
+	h->heap[k]=sinking;
+}
+
+// Removes and returns the entry with the smallest value, or 0 when the heap
+// is empty. The last entry is moved to the root and sifted down.
+void *RemoveHeapItem(struct FHeap *h)
+{
+	if (h->heap_n==0)
+		return 0;
+
+	void *smallest=h->heap[1];
+	h->heap[1]=h->heap[h->heap_n];
+	h->heap_n--;
+	DownHeap(1,h);
+	return smallest;
+}
+
+// now, nearest neighbor finder. Use a heap to traverse the tree, stopping
+// when there are no nodes with a minimum error < the current error.
+
+// Single shared work queue used by FindMatch(); it is re-initialised at the
+// start of every search.
+struct FHeap TheQueue;
+
+// PUSHNODE(a): computes the lower-bound error of node a for the current
+// sample and queues the node when that bound beats the best error so far.
+// It expands in place and relies on the caller's locals named sample, ndims,
+// weights and besterror. The argument is evaluated more than once.
+#define PUSHNODE(a) { \
+  (a)->MinError=MinimumError(a,sample,ndims,weights); \
+  if ((a)->MinError < besterror) HeapInsert(&TheQueue,&(a)->MinError); \
+ }
+
+// Finds the leaf under q whose box is nearest to sample (weighted squared
+// error) using a best-first traversal driven by TheQueue.
+// On success the winning error is added to SquaredError and the sample is
+// accumulated into the leaf (NQuant, Sums). Returns the leaf, or 0 when
+// nothing was found.
+// NOTE(review): the queue holds pointers to MinError fields which are cast
+// straight back to QuantizedValue pointers - this presumably relies on
+// MinError being the first member of the struct; confirm in the header.
+struct QuantizedValue *FindMatch(uint8 const *sample, int ndims,
+								 uint8 *weights, struct QuantizedValue *q)
+{
+	InitHeap(&TheQueue);
+	struct QuantizedValue *bestmatch=0;
+	double besterror=1.0e63;
+	PUSHNODE(q);
+
+	struct QuantizedValue *node;
+	while((node=(struct QuantizedValue *) RemoveHeapItem(&TheQueue))!=0)
+	{
+		// the queue is ordered by MinError, so nothing later can do better
+		if (node->MinError>besterror)
+			break;
+
+		if (node->Children[0])
+		{
+			// it's a parent node. put the children on the queue
+			struct QuantizedValue *kid0=node->Children[0];
+			struct QuantizedValue *kid1=node->Children[1];
+			PUSHNODE(kid0);
+			PUSHNODE(kid1);
+			continue;
+		}
+
+		// it's a leaf node: keep it when it improves on the best so far
+		if (node->MinError < besterror)
+		{
+			bestmatch=node;
+			besterror=node->MinError;
+		}
+	}
+
+	if (bestmatch)
+	{
+		SquaredError+=besterror;
+		bestmatch->NQuant++;
+		for(int d=0;d<ndims;d++)
+			bestmatch->Sums[d]+=sample[d];
+	}
+	return bestmatch;
+}
+
+// Replaces the Mean of every leaf under q by the average of the samples that
+// FindMatch() accumulated into it (Sums / NQuant) and then clears those
+// accumulators. Leaves that received no samples (NQuant == 0) are left
+// unchanged.
+static void RecalcMeans(struct QuantizedValue *q)
+{
+	if (!q)
+		return;
+
+	if (q->Children[0])
+	{
+		// not a leaf, invoke recursively on BOTH subtrees; visiting
+		// Children[0] twice would leave every leaf under Children[1] stale
+		RecalcMeans(q->Children[0]);
+		RecalcMeans(q->Children[1]);
+		return;
+	}
+
+	// it's a leaf. Set the means
+	if (q->NQuant)
+	{
+		for(int i=0;i<current_ndims;i++)
+		{
+			q->Mean[i]=(uint8) (q->Sums[i]/q->NQuant);
+			q->Sums[i]=0;
+		}
+		q->NQuant=0;
+	}
+}
+		      
+// Refines an existing quantizer tree after samples have been matched against
+// it: leaf means are recomputed from the accumulated sums, then the Mins/Maxs
+// boxes of the whole tree are rebuilt. Leaf values are not renumbered.
+void OptimizeQuantizer(struct QuantizedValue *q, int ndims)
+{
+	SetNDims(ndims);
+	RecalcMeans(q);		// reset q values
+	Label(q,0);			// update max/mins
+}
+
+
+// Pre-order walk that re-runs UpdateStats() on q and on every node below it.
+static void RecalcStats(struct QuantizedValue *q)
+{
+	if (!q)
+		return;
+
+	UpdateStats(q);
+	for(int c=0;c<2;c++)
+		RecalcStats(q->Children[c]);
+}
+
+// Recomputes the statistics of every node of the tree from its samples and
+// then rebuilds the Mins/Maxs boxes. Leaf values are not renumbered.
+void RecalculateValues(struct QuantizedValue *q, int ndims)
+{
+	SetNDims(ndims);
+	RecalcStats(q);		// means and error measures from the samples
+	Label(q,0);			// update max/mins only
+}
--- a/mathlib/randsse.cpp
+++ b/mathlib/randsse.cpp
@ -0,0 +1,109 @@
+//========= Copyright (c) 1996-2006, Valve Corporation, All rights reserved. ============//
+//
+// Purpose: generates 4 random numbers in the range 0..1 quickly, using SIMD
+//
+//=====================================================================================//
+
+#include <math.h>
+#include <float.h>	// Needed for FLT_EPSILON
+#include "basetypes.h"
+#include <memory.h>
+#include "tier0/dbg.h"
+#include "mathlib/mathlib.h"
+#include "mathlib/vector.h"
+#include "mathlib/ssemath.h"
+
+// memdbgon must be the last include file in a .cpp file!!!
+#include "tier0/memdbgon.h"
+
+// see knuth volume 3 for insight.
+
+// One independent stream of SIMD random numbers. The state is a table of 55
+// four-wide values in [0,1) plus two cursors that walk backwards through it;
+// every call adds the two values under the cursors modulo 1.0, stores the
+// result back through the K cursor and returns it.
+class SIMDRandStreamContext
+{
+	fltx4 m_RandY[55];
+
+	fltx4 *m_pRand_J, *m_pRand_K;
+
+	// Moves a table cursor one slot backwards, wrapping from the first slot
+	// to the last. The comparison is done before the decrement so that no
+	// pointer below the start of m_RandY is ever formed.
+	inline void StepBack( fltx4 *&pCursor )
+	{
+		if ( pCursor == m_RandY )
+			pCursor = m_RandY+54;
+		else
+			--pCursor;
+	}
+
+public:
+	// Fills the table from a simple multiplicative generator started at
+	// seed and resets both cursors. Each lane receives the top 16 bits of
+	// the running seed scaled into [0,1).
+	void Seed( uint32 seed )
+	{
+		m_pRand_J=m_RandY+23; m_pRand_K=m_RandY+54;
+		for(int i=0;i<55;i++)
+		{
+			for(int j=0;j<4;j++)
+			{
+				SubFloat( m_RandY[i], j) = (seed>>16)/65536.0;
+				seed=(seed+1)*3141592621u;
+			}
+		}
+	}
+
+	// Returns the next four random values, each in the range 0..1.
+	inline fltx4 RandSIMD( void )
+	{
+		// ret= rand[k]+rand[j]
+		fltx4 retval=AddSIMD( *m_pRand_K, *m_pRand_J );
+		
+		// if ( ret>=1.0) ret-=1.0
+		fltx4 overflow_mask=CmpGeSIMD( retval, Four_Ones );
+		retval=SubSIMD( retval, AndSIMD( Four_Ones, overflow_mask ) );
+		
+		*m_pRand_K = retval;
+		
+		// update pointers w/ wrap-around
+		StepBack( m_pRand_J );
+		StepBack( m_pRand_K );
+		
+		return retval;
+	}
+};
+
+// Size of the fixed pool of random streams handed out by GetSIMDRandContext().
+#define MAX_SIMULTANEOUS_RANDOM_STREAMS 32
+
+// The pool of streams; index 0 doubles as the stream used by RandSIMD( void ).
+static SIMDRandStreamContext s_SIMDRandContexts[MAX_SIMULTANEOUS_RANDOM_STREAMS];
+
+// Per-stream ownership flag: non-zero while the stream is checked out.
+static volatile int s_nRandContextsInUse[MAX_SIMULTANEOUS_RANDOM_STREAMS];
+
+// Seeds every stream of the pool; stream i is seeded with seed+i so that the
+// streams produce different sequences.
+void SeedRandSIMD(uint32 seed)
+{
+	int nStream = 0;
+	while ( nStream < MAX_SIMULTANEOUS_RANDOM_STREAMS )
+	{
+		s_SIMDRandContexts[nStream].Seed( seed+nStream );
+		nStream++;
+	}
+}
+
+// Returns the next four random values of the given stream. nContextIndex is
+// used as a direct index into the pool and is not range checked.
+fltx4 RandSIMD( int nContextIndex )
+{
+	SIMDRandStreamContext &stream = s_SIMDRandContexts[nContextIndex];
+	return stream.RandSIMD();
+}
+
+// Checks out a free random stream and returns its index. When every stream
+// is taken the function asserts, sleeps and scans again, so it only returns
+// once a stream has been claimed.
+int GetSIMDRandContext( void )
+{
+	while ( true )
+	{
+		for(int nSlot=0; nSlot < NELEMS( s_SIMDRandContexts ); nSlot++)
+		{
+			// cheap unlocked peek first, then try to take it atomically
+			if ( ! s_nRandContextsInUse[nSlot] &&
+				 ThreadInterlockedAssignIf( &( s_nRandContextsInUse[nSlot]), 1, 0 ) )
+			{
+				return nSlot;								// done!
+			}
+		}
+		Assert(0);											// why don't we have enough buffers?
+		ThreadSleep();
+	}
+}
+
+// Returns a stream obtained from GetSIMDRandContext() to the pool by clearing
+// its in-use flag. nContext is not range checked.
+void ReleaseSIMDRandContext( int nContext )
+{
+	volatile int &inUse = s_nRandContextsInUse[ nContext ];
+	inUse = 0;
+}
+
+
+// Convenience overload: draws from stream 0 of the pool without checking it
+// out first.
+fltx4 RandSIMD( void )
+{
+	SIMDRandStreamContext &defaultStream = s_SIMDRandContexts[0];
+	return defaultStream.RandSIMD();
+}
--- a/mathlib/simdvectormatrix.cpp
+++ b/mathlib/simdvectormatrix.cpp
@ -0,0 +1,112 @@
+//====== Copyright (c) 1996-2006, Valve Corporation, All rights reserved. =======//
+//
+// Purpose: Provide a class (SSE/SIMD only) holding a 2d matrix of class FourVectors,
+// for high speed processing in tools.
+//
+// $NoKeywords: $
+//
+//=============================================================================//
+
+
+
+#include "basetypes.h"
+#include "mathlib/mathlib.h"
+#include "mathlib/simdvectormatrix.h"
+#include "mathlib/ssemath.h"
+#include "tier0/dbg.h"
+
+// Resizes the matrix to srcwidth x srcheight and fills it from an interleaved
+// float image with four floats per pixel. Only the first three floats of each
+// pixel are copied; groups of four horizontally adjacent pixels are written
+// as three runs of four floats (one run per channel). When the width is not
+// a multiple of four, the last group of a row is padded by repeating the
+// final pixel.
+void CSIMDVectorMatrix::CreateFromRGBA_FloatImageData(int srcwidth, int srcheight,
+													  float const *srcdata )
+{
+	Assert( srcwidth && srcheight && srcdata );
+	SetSize( srcwidth, srcheight );
+
+	const int nFullGroups=(srcwidth >> 2);
+	const int nLeftoverPixels=(srcwidth & 3);
+
+	FourVectors *pRowOut=m_pData;
+	for(int row=0; row<srcheight; row++, pRowOut += m_nPaddedWidth, srcdata += 4 * srcwidth)
+	{
+		float const *in=srcdata;
+		float *out=reinterpret_cast<float *>( pRowOut );
+
+		// copy full input blocks: 16 floats in, 12 floats out
+		for(int grp=0; grp<nFullGroups; grp++, in += 16)
+		{
+			for(int chan=0; chan<3; chan++, out += 4)
+			{
+				for(int px=0; px<4; px++)
+					out[px]=in[4*px+chan];
+			}
+		}
+
+		// now, copy trailing data and pad with copies of the last pixel
+		if ( nLeftoverPixels )
+		{
+			for(int chan=0; chan<3; chan++)
+			{
+				for(int px=0; px<4; px++)
+				{
+					int srcPx=min( px, nLeftoverPixels-1 );
+					out[4*chan+px]= in[chan+4*srcPx];
+				}
+			}
+		}
+	}
+}
+
+// Raises the x, y and z components of every element to the given power.
+// The exponent is converted to an integer number of quarters (4*power,
+// truncated) before being handed to Pow_FixedPoint_Exponent_SIMD.
+void CSIMDVectorMatrix::RaiseToPower( float power )
+{
+	int nRemaining=NVectors();
+	if ( !nRemaining )
+		return;
+
+	const int nExponentTimes4=(int) ( 4.0*power );
+	for( FourVectors *pCur=m_pData; nRemaining; --nRemaining, ++pCur )
+	{
+		pCur->x=Pow_FixedPoint_Exponent_SIMD( pCur->x, nExponentTimes4 );
+		pCur->y=Pow_FixedPoint_Exponent_SIMD( pCur->y, nExponentTimes4 );
+		pCur->z=Pow_FixedPoint_Exponent_SIMD( pCur->z, nExponentTimes4 );
+	}
+}
+
+// Element-wise addition of src into this matrix. Both matrices are expected
+// to have the same width and height (asserted).
+CSIMDVectorMatrix & CSIMDVectorMatrix::operator+=( CSIMDVectorMatrix const &src )
+{
+	Assert( m_nWidth == src.m_nWidth );
+	Assert( m_nHeight == src.m_nHeight );
+
+	int nRemaining=NVectors();
+	if ( !nRemaining )
+		return *this;
+
+	FourVectors *pFrom=src.m_pData;
+	FourVectors *pTo=m_pData;
+	for( ; nRemaining; --nRemaining, ++pTo, ++pFrom )		// !! speed !! inline more iters
+		*pTo += *pFrom;
+
+	return *this;
+}
+
+// Multiplies every element of the matrix by src via FourVectors::VProduct,
+// after replicating src into a FourVectors with DuplicateVector.
+CSIMDVectorMatrix & CSIMDVectorMatrix::operator*=( Vector const &src )
+{
+	int nRemaining=NVectors();
+	if ( !nRemaining )
+		return *this;
+
+	FourVectors replicated;
+	replicated.DuplicateVector( src );
+
+	for( FourVectors *pCur=m_pData; nRemaining; --nRemaining, ++pCur )	// !! speed !! inline more iters
+		pCur->VProduct( replicated );
+
+	return *this;
+}
+
--- a/mathlib/sparse_convolution_noise.cpp
+++ b/mathlib/sparse_convolution_noise.cpp
@ -0,0 +1,218 @@
+//========= Copyright (c) 1996-2006, Valve Corporation, All rights reserved. ============//
+//
+// Purpose: noise() primitives.
+//
+//=====================================================================================//
+
+#include <math.h>
+#include "basetypes.h"
+#include <memory.h>
+#include "tier0/dbg.h"
+#include "mathlib/mathlib.h"
+#include "mathlib/vector.h"
+#include "mathlib/noise.h"
+
+// memdbgon must be the last include file in a .cpp file!!!
+#include "tier0/memdbgon.h"
+
+// generate high quality noise based upon "sparse convolution". Higher quality than perlin noise,
+// and no directional artifacts.
+
+#include "noisedata.h"
+
+// Number of impulses CellNoise() evaluates in every lattice cell.
+#define N_IMPULSES_PER_CELL 5
+// NOTE(review): NORMALIZING_FACTOR is not referenced by the code in this
+// file as far as visible here; the commented-out expression below looks like
+// an earlier definition - confirm before removing.
+#define NORMALIZING_FACTOR 1.0
+
+//(0.5/N_IMPULSES_PER_CELL)
+
+// Maps a coordinate to the index of its lattice cell: the floor of x,
+// wrapped into the range 0..255.
+static inline int LatticeCoord(float x)
+{
+	int cell = (int) floor(x);
+	return cell & 0xff;
+}
+
+// Hashes a lattice cell (ix,iy,iz) and an impulse number idx into an index
+// by chaining the four permutation tables. Every table lookup is wrapped
+// into 0..255, including the first one: SparseConvolutionNoise() passes
+// neighbouring cells (coordinate -1 .. +1 around a value in 0..255), so ix
+// can arrive here as -1 or 256 and must not index perm_a directly.
+static inline int Hash4D(int ix, int iy, int iz, int idx)
+{
+	int ret=perm_a[ix & 0xff];
+	ret=perm_b[(ret+iy) & 0xff];
+	ret=perm_c[(ret+iz) & 0xff];
+	ret=perm_d[(ret+idx) & 0xff];
+	return ret;
+}
+
+// square of x; the argument is evaluated twice
+#define SQ(x) ((x)*(x))
+
+// Sums the contribution of the N_IMPULSES_PER_CELL impulses of lattice cell
+// (ix,iy,iz) at the point (xfrac,yfrac,zfrac), given relative to that cell.
+// An impulse contributes pNoiseShapeFunction( 1-distance ) when it is less
+// than one unit away from the point and nothing otherwise.
+static float CellNoise( int ix, int iy, int iz, float xfrac, float yfrac, float zfrac,
+						float (*pNoiseShapeFunction)(float) )
+{
+	float total=0;
+	for(int impulse=0;impulse<N_IMPULSES_PER_CELL;impulse++)
+	{
+		// position of this impulse inside the cell
+		int slot=Hash4D( ix, iy, iz, impulse );
+
+		float dist=SQ(impulse_xcoords[slot]-xfrac)+
+			SQ(impulse_ycoords[slot]-yfrac)+
+			SQ(impulse_zcoords[slot]-zfrac);
+		dist = sqrt( dist );
+
+		if (!(dist < 1.0 ))
+			continue;					// out of reach of this impulse
+
+		total += (*pNoiseShapeFunction)( 1-dist );
+	}
+	return total;
+}
+
+
+// Sparse convolution noise at pnt using QuinticInterpolatingPolynomial as
+// the impulse shape function.
+float SparseConvolutionNoise( Vector const &pnt )
+{
+	float (*pShape)(float) = QuinticInterpolatingPolynomial;
+	return SparseConvolutionNoise( pnt, pShape );
+}
+
+// Sums n_octaves layers of SparseConvolutionNoise(). Each successive layer
+// samples the point at twice the frequency and contributes half the
+// amplitude; the sum is divided by the total amplitude.
+// Returns 0 when n_octaves is not positive (the normalisation would
+// otherwise divide by zero and produce NaN).
+float FractalNoise( Vector const &pnt, int n_octaves)
+{
+	if ( n_octaves <= 0 )
+		return 0;
+
+	float scale=1.0;
+	float iscale=1.0;
+	float ret=0;
+	float sumscale=0;
+	for(int o=0;o<n_octaves;o++)
+	{
+		Vector p1=pnt;
+		p1 *= scale;
+		ret+=iscale * SparseConvolutionNoise( p1 );
+		sumscale += iscale;
+		scale *= 2.0;
+		iscale *= 0.5;
+	}
+	return ret * ( 1.0/sumscale );
+}
+
+// Like FractalNoise(), but every layer contributes the absolute value of the
+// noise after it has been re-centred around zero: |2*(noise-0.5)|.
+// Returns 0 when n_octaves is not positive (the normalisation would
+// otherwise divide by zero and produce NaN).
+float Turbulence( Vector const &pnt, int n_octaves)
+{
+	if ( n_octaves <= 0 )
+		return 0;
+
+	float scale=1.0;
+	float iscale=1.0;
+	float ret=0;
+	float sumscale=0;
+	for(int o=0;o<n_octaves;o++)
+	{
+		Vector p1=pnt;
+		p1 *= scale;
+		ret+=iscale * fabs ( 2.0*( SparseConvolutionNoise( p1 )-.5 ) );
+		sumscale += iscale;
+		scale *= 2.0;
+		iscale *= 0.5;
+	}
+	return ret * ( 1.0/sumscale );
+}
+
+#ifdef MEASURE_RANGE
+// smallest / largest raw sum seen so far, for tuning the remap range below
+float fmin1=10000000.0;
+float fmax1=-1000000.0;
+#endif
+
+// Sparse convolution noise at pnt with a caller supplied impulse shape
+// function. The impulses of the cell containing pnt and of its 26 neighbours
+// are summed and the raw sum is remapped (clamped) into 0..1.
+float SparseConvolutionNoise(Vector const &pnt, float (*pNoiseShapeFunction)(float) )
+{
+	// compute integer lattice point
+	const int cellX=LatticeCoord(pnt.x);
+	const int cellY=LatticeCoord(pnt.y);
+	const int cellZ=LatticeCoord(pnt.z);
+
+	// compute offsets within unit cube
+	const float xfrac=pnt.x-floor(pnt.x);
+	const float yfrac=pnt.y-floor(pnt.y);
+	const float zfrac=pnt.z-floor(pnt.z);
+
+	// visit the 3x3x3 block of cells around the point; the offset is
+	// re-expressed relative to each visited cell
+	float sum_out=0.;
+	for(int dx=-1; dx<=1; dx++)
+	{
+		for(int dy=-1; dy<=1; dy++)
+		{
+			for(int dz=-1; dz<=1; dz++)
+			{
+				sum_out += CellNoise( cellX+dx, cellY+dy, cellZ+dz,
+									  xfrac-dx, yfrac-dy, zfrac-dz,
+									  pNoiseShapeFunction );
+			}
+		}
+	}
+#ifdef MEASURE_RANGE
+	fmin1=min(sum_out,fmin1);
+	fmax1=max(sum_out,fmax1);
+#endif
+	return RemapValClamped( sum_out, .544487, 9.219176, 0.0, 1.0 );
+}
+
+
+// Improved Perlin Noise
+// The following code is the c-ification of Ken Perlin's new noise algorithm
+// "JAVA REFERENCE IMPLEMENTATION OF IMPROVED NOISE - COPYRIGHT 2002 KEN PERLIN"
+// as available here: http://mrl.nyu.edu/~perlin/noise/
+
+// Dot product of (x,y,z) with one of the gradient directions of improved
+// Perlin noise, selected by the low 4 bits of hash. Each direction is a sum
+// of two of the three coordinates, each with its own sign.
+float NoiseGradient(int hash, float x, float y, float z)
+{
+	const int low4 = hash & 15;
+
+	// first term: x for codes 0..7, y for 8..15
+	float first = (low4 < 8) ? x : y;
+
+	// second term: y for codes 0..3, x for 12 and 14, z for everything else
+	float second;
+	if (low4 < 4)
+		second = y;
+	else if (low4 == 12 || low4 == 14)
+		second = x;
+	else
+		second = z;
+
+	// bit 0 flips the first term, bit 1 the second
+	if (low4 & 1)
+		first = -first;
+	if (low4 & 2)
+		second = -second;
+
+	return first + second;
+}
+
+// Looks up entry (i mod 256) of the 256-entry permutation table used by
+// ImprovedPerlinNoise(). Any integer is accepted; only its low 8 bits are
+// used.
+int NoiseHashIndex( int i )
+{
+	static const int s_permutation[] = 
+	{
+		151,160,137,91,90,15,
+			131,13,201,95,96,53,194,233,7,225,140,36,103,30,69,142,8,99,37,240,21,10,23,
+			190, 6,148,247,120,234,75,0,26,197,62,94,252,219,203,117,35,11,32,57,177,33,
+			88,237,149,56,87,174,20,125,136,171,168, 68,175,74,165,71,134,139,48,27,166,
+			77,146,158,231,83,111,229,122,60,211,133,230,220,105,92,41,55,46,245,40,244,
+			102,143,54, 65,25,63,161, 1,216,80,73,209,76,132,187,208, 89,18,169,200,196,
+			135,130,116,188,159,86,164,100,109,198,173,186, 3,64,52,217,226,250,124,123,
+			5,202,38,147,118,126,255,82,85,212,207,206,59,227,47,16,58,17,182,189,28,42,
+			223,183,170,213,119,248,152, 2,44,154,163, 70,221,153,101,155,167, 43,172,9,
+			129,22,39,253, 19,98,108,110,79,113,224,232,178,185, 112,104,218,246,97,228,
+			251,34,242,193,238,210,144,12,191,179,162,241, 81,51,145,235,249,14,239,107,
+			49,192,214, 31,181,199,106,157,184, 84,204,176,115,121,50,45,127, 4,150,254,
+			138,236,205,93,222,114,67,29,24,72,243,141,128,195,78,66,215,61,156,180
+	};
+
+	const int wrapped = i & 0xff;
+	return s_permutation[ wrapped ];
+}
+
+// Improved Perlin noise at pnt: the gradients at the eight corners of the
+// unit cube containing the point are evaluated and blended with quintic
+// fade curves along x, then y, then z.
+float ImprovedPerlinNoise( Vector const &pnt )
+{
+	// lattice cell containing the point
+	const float cellX = floor(pnt.x);
+	const float cellY = floor(pnt.y);
+	const float cellZ = floor(pnt.z);
+
+	const int X = (int)cellX & 255;
+	const int Y = (int)cellY & 255;
+	const int Z = (int)cellZ & 255;
+
+	// position of the point inside that cell
+	const float x = pnt.x - cellX;
+	const float y = pnt.y - cellY;
+	const float z = pnt.z - cellZ;
+
+	// fade curves for each of x, y, z
+	const float u = QuinticInterpolatingPolynomial(x);
+	const float v = QuinticInterpolatingPolynomial(y);
+	const float w = QuinticInterpolatingPolynomial(z);
+
+	// hash coordinates of the cube corners
+	const int A  = NoiseHashIndex( X ) + Y;
+	const int AA = NoiseHashIndex( A ) + Z;
+	const int AB = NoiseHashIndex( A + 1 ) + Z;
+	const int B  = NoiseHashIndex( X + 1 ) + Y;
+	const int BA = NoiseHashIndex( B ) + Z;
+	const int BB = NoiseHashIndex( B + 1 ) + Z;
+
+	// gradients of the four corners on the near (z) and far (z-1) faces;
+	// bit 0 of k selects the x-1 side, bit 1 the y-1 side
+	const int cornerHash[4] = { AA, BA, AB, BB };
+	float nearFace[4];
+	float farFace[4];
+	for(int k=0;k<4;k++)
+	{
+		const float dx = (k & 1) ? x-1 : x;
+		const float dy = (k & 2) ? y-1 : y;
+		nearFace[k] = NoiseGradient(NoiseHashIndex(cornerHash[k]  ), dx, dy, z   );
+		farFace[k]  = NoiseGradient(NoiseHashIndex(cornerHash[k]+1), dx, dy, z-1 );
+	}
+
+	// and add blended results from 8 corners of cube
+	float nearLowY  = Lerp( u, nearFace[0], nearFace[1] );
+	float nearHighY = Lerp( u, nearFace[2], nearFace[3] );
+	float farLowY   = Lerp( u, farFace[0], farFace[1] );
+	float farHighY  = Lerp( u, farFace[2], farFace[3] );
+	float nearBlend = Lerp( v, nearLowY, nearHighY );
+	float farBlend  = Lerp( v, farLowY, farHighY );
+
+	return Lerp( w, nearBlend,farBlend );
+}
--- a/mathlib/sse.cpp
+++ b/mathlib/sse.cpp
@ -0,0 +1,845 @@
+//========= Copyright (c) 1996-2005, Valve Corporation, All rights reserved. ============//
+//
+// Purpose: SSE Math primitives.
+//
+//=====================================================================================//
+
+#include <math.h>
+#include <float.h>	// Needed for FLT_EPSILON
+#include "basetypes.h"
+#include <memory.h>
+#include "tier0/dbg.h"
+#include "mathlib/mathlib.h"
+#include "mathlib/vector.h"
+#include "sse.h"
+
+// memdbgon must be the last include file in a .cpp file!!!
+#include "tier0/memdbgon.h"
+
+// Two-entry select masks (all bits clear / all bits set) and their inverse.
+// _SSE_SinCos indexes _sincos_inv_masks with the low bit of an integer;
+// _sincos_masks is not referenced in the code visible here.
+static const uint32 _sincos_masks[]	  = { (uint32)0x0,  (uint32)~0x0 };
+static const uint32 _sincos_inv_masks[] = { (uint32)~0x0, (uint32)0x0 };
+
+//-----------------------------------------------------------------------------
+// Macros and constants required by some of the SSE assembly:
+//-----------------------------------------------------------------------------
+
+// Helpers that declare 16-byte aligned, four-wide constants with every lane
+// set to Val:
+//   _PS_EXTERN_CONST(Name, Val)            - float _ps_Name[4], external linkage
+//   _PS_EXTERN_CONST_TYPE(Name, Type, Val) - Type  _ps_Name[4], external linkage
+//                                            (this one already ends in ';')
+//   _EPI32_CONST(Name, Val)                - static 32-bit int _epi32_Name[4]
+//   _PS_CONST(Name, Val)                   - static float _ps_Name[4]
+// The two branches differ only in the compiler specific alignment syntax.
+// No definitions are provided when neither _WIN32 nor _LINUX is set.
+#ifdef _WIN32
+	#define _PS_EXTERN_CONST(Name, Val) \
+		const __declspec(align(16)) float _ps_##Name[4] = { Val, Val, Val, Val }
+
+	#define _PS_EXTERN_CONST_TYPE(Name, Type, Val) \
+		const __declspec(align(16)) Type _ps_##Name[4] = { Val, Val, Val, Val }; \
+
+	#define _EPI32_CONST(Name, Val) \
+		static const __declspec(align(16)) __int32 _epi32_##Name[4] = { Val, Val, Val, Val }
+
+	#define _PS_CONST(Name, Val) \
+		static const __declspec(align(16)) float _ps_##Name[4] = { Val, Val, Val, Val }
+#elif _LINUX
+	#define _PS_EXTERN_CONST(Name, Val) \
+		const __attribute__((aligned(16))) float _ps_##Name[4] = { Val, Val, Val, Val }
+
+	#define _PS_EXTERN_CONST_TYPE(Name, Type, Val) \
+		const __attribute__((aligned(16))) Type _ps_##Name[4] = { Val, Val, Val, Val }; \
+
+	#define _EPI32_CONST(Name, Val) \
+		static const __attribute__((aligned(16))) int32 _epi32_##Name[4] = { Val, Val, Val, Val }
+
+	#define _PS_CONST(Name, Val) \
+		static const __attribute__((aligned(16))) float _ps_##Name[4] = { Val, Val, Val, Val }
+#endif
+
+// Four-wide float constants; the name spells the value
+// (p = decimal point, m = minus, o = over).
+_PS_EXTERN_CONST(am_0, 0.0f);
+_PS_EXTERN_CONST(am_1, 1.0f);
+_PS_EXTERN_CONST(am_m1, -1.0f);
+_PS_EXTERN_CONST(am_0p5, 0.5f);
+_PS_EXTERN_CONST(am_1p5, 1.5f);
+_PS_EXTERN_CONST(am_pi, (float)M_PI);
+_PS_EXTERN_CONST(am_pi_o_2, (float)(M_PI / 2.0));
+_PS_EXTERN_CONST(am_2_o_pi, (float)(2.0 / M_PI));
+_PS_EXTERN_CONST(am_pi_o_4, (float)(M_PI / 4.0));
+_PS_EXTERN_CONST(am_4_o_pi, (float)(4.0 / M_PI));
+// Four-wide integer bit masks, stored under the _ps_ prefix.
+_PS_EXTERN_CONST_TYPE(am_sign_mask, int32, 0x80000000);
+_PS_EXTERN_CONST_TYPE(am_inv_sign_mask, int32, ~0x80000000);
+_PS_EXTERN_CONST_TYPE(am_min_norm_pos,int32, 0x00800000);
+_PS_EXTERN_CONST_TYPE(am_mant_mask, int32, 0x7f800000);
+_PS_EXTERN_CONST_TYPE(am_inv_mant_mask, int32, ~0x7f800000);
+
+_EPI32_CONST(1, 1);
+_EPI32_CONST(2, 2);
+
+// Polynomial coefficients used by _SSE_SinCos.
+_PS_CONST(sincos_p0, 0.15707963267948963959e1f);
+_PS_CONST(sincos_p1, -0.64596409750621907082e0f);
+_PS_CONST(sincos_p2, 0.7969262624561800806e-1f);
+_PS_CONST(sincos_p3, -0.468175413106023168e-2f);
+
+// Prototype only; no definition is visible in this part of the file.
+#ifdef PFN_VECTORMA
+void  __cdecl _SSE_VectorMA( const float *start, float scale, const float *direction, float *dest );
+#endif
+
+//-----------------------------------------------------------------------------
+// SSE implementations of optimized routines:
+//-----------------------------------------------------------------------------
+// Returns the square root of x, computed with the scalar SSE sqrtss
+// instruction. When neither _WIN32 nor _LINUX is set no code is emitted and
+// the initial value 0 is returned.
+// NOTE(review): the _LINUX asm writes xmm1 and xmm2 but declares no clobber
+// list - confirm the compiler is not keeping live values in them.
+float _SSE_Sqrt(float x)
+{
+	Assert( s_bMathlibInitialized );
+	float	root = 0.f;
+#ifdef _WIN32
+	_asm
+	{
+		sqrtss		xmm0, x
+		movss		root, xmm0
+	}
+#elif _LINUX
+	__asm__ __volatile__(
+		"movss %1,%%xmm2\n"
+		"sqrtss %%xmm2,%%xmm1\n"
+		"movss %%xmm1,%0"
+       	: "=m" (root)
+		: "m" (x)
+	);
+#endif
+	return root;
+}
+
+// Single iteration NewtonRaphson reciprocal square root:
+// 0.5 * rsqrtps * (3 - x * rsqrtps(x) * rsqrtps(x)) 	
+// Very low error, and fine to use in place of 1.f / sqrtf(x).	
+#if 0
+// Disabled variant: takes the rsqrtss estimate in assembly and performs the
+// refinement step in C.
+float _SSE_RSqrtAccurate(float x)
+{
+	Assert( s_bMathlibInitialized );
+
+	float rroot;
+	_asm
+	{
+		rsqrtss	xmm0, x
+		movss	rroot, xmm0
+	}
+
+	return (0.5f * rroot) * (3.f - (x * rroot) * rroot);
+}
+#else
+// Intel / Kipps SSE RSqrt.  Significantly faster than above.
+// Computes (0.5 * r) * (3 - a * r * r) entirely in SSE registers, where r is
+// the rsqrtss estimate of 1/sqrt(a).
+// NOTE(review): the _LINUX asm uses xmm0-xmm3 without a clobber list.
+float _SSE_RSqrtAccurate(float a)
+{
+	float x;
+	float half = 0.5f;
+	float three = 3.f;
+
+#ifdef _WIN32
+	__asm
+	{
+		movss   xmm3, a;
+		movss   xmm1, half;
+		movss   xmm2, three;
+		rsqrtss xmm0, xmm3;
+
+		mulss   xmm3, xmm0;
+		mulss   xmm1, xmm0;
+		mulss   xmm3, xmm0;
+		subss   xmm2, xmm3;
+		mulss   xmm1, xmm2;
+
+		movss   x,    xmm1;
+	}
+#elif _LINUX
+	__asm__ __volatile__(
+		"movss   %1, %%xmm3 \n\t"
+        "movss   %2, %%xmm1 \n\t"
+        "movss   %3, %%xmm2 \n\t"
+        "rsqrtss %%xmm3, %%xmm0 \n\t"
+        "mulss   %%xmm0, %%xmm3 \n\t"
+        "mulss   %%xmm0, %%xmm1 \n\t"
+        "mulss   %%xmm0, %%xmm3 \n\t"
+        "subss   %%xmm3, %%xmm2 \n\t"
+        "mulss   %%xmm2, %%xmm1 \n\t"
+        "movss   %%xmm1, %0 \n\t"
+		: "=m" (x)
+		: "m" (a), "m" (half), "m" (three)
+);
+#else
+	#error "Not Implemented"
+#endif
+
+	return x;
+}
+#endif
+
+// Simple SSE rsqrt.  Usually accurate to around 6 (relative) decimal places 
+// or so, so ok for closed transforms.  (ie, computing lighting normals)
+// Returns the raw rsqrtss estimate of 1/sqrt(x) without any refinement step.
+float _SSE_RSqrtFast(float x)
+{
+	Assert( s_bMathlibInitialized );
+
+	float rroot;
+#ifdef _WIN32
+	_asm
+	{
+		rsqrtss	xmm0, x
+		movss	rroot, xmm0
+	}
+#elif _LINUX
+	// x is the input (%1) and rroot the output (%0); the two must not be
+	// swapped or the estimate is taken of an uninitialised value and the
+	// result never reaches rroot
+	 __asm__ __volatile__(
+		"rsqrtss %1, %%xmm0 \n\t"
+		"movss %%xmm0, %0 \n\t"
+		: "=m" (rroot)
+		: "m" (x)
+		: "%xmm0"
+	);
+#else
+#error
+#endif
+
+	return rroot;
+}
+
+// Normalizes vec in place and returns its original length. A vector whose
+// three components are all zero is left untouched and 0 is returned.
+// The length comes from sqrtss but the scale factor from the rcpss estimate,
+// so the result is only approximately unit length.
+// NOTE(review): the load from vec is four floats wide although only three
+// are used, and the _LINUX asm uses xmm1-xmm4 without a clobber list.
+float FASTCALL _SSE_VectorNormalize (Vector& vec)
+{
+	Assert( s_bMathlibInitialized );
+
+	// NOTE: This is necessary to prevent an memory overwrite...
+	// since vec only has 3 floats, we can't "movaps" directly into it.
+#ifdef _WIN32
+	__declspec(align(16)) float result[4];
+#elif _LINUX
+	__attribute__((aligned(16))) float result[4];
+#endif
+
+	float *v = &vec[0];
+	float *r = &result[0];
+
+	float	radius = 0.f;
+	// Blah, get rid of these comparisons ... in reality, if you have all 3 as zero, it shouldn't 
+	// be much of a performance win, considering you will very likely miss 3 branch predicts in a row.
+	if ( v[0] || v[1] || v[2] )
+	{
+#ifdef _WIN32
+	_asm
+		{
+			mov			eax, v
+			mov			edx, r
+#ifdef ALIGNED_VECTOR
+			movaps		xmm4, [eax]			// r4 = vx, vy, vz, X
+			movaps		xmm1, xmm4			// r1 = r4
+#else
+			movups		xmm4, [eax]			// r4 = vx, vy, vz, X
+			movaps		xmm1, xmm4			// r1 = r4
+#endif
+			mulps		xmm1, xmm4			// r1 = vx * vx, vy * vy, vz * vz, X
+			movhlps		xmm3, xmm1			// r3 = vz * vz, X, X, X
+			movaps		xmm2, xmm1			// r2 = r1
+			shufps		xmm2, xmm2, 1		// r2 = vy * vy, X, X, X
+			addss		xmm1, xmm2			// r1 = (vx * vx) + (vy * vy), X, X, X
+			addss		xmm1, xmm3			// r1 = (vx * vx) + (vy * vy) + (vz * vz), X, X, X
+			sqrtss		xmm1, xmm1			// r1 = sqrt((vx * vx) + (vy * vy) + (vz * vz)), X, X, X
+			movss		radius, xmm1		// radius = sqrt((vx * vx) + (vy * vy) + (vz * vz))
+			rcpss		xmm1, xmm1			// r1 = 1/radius, X, X, X
+			shufps		xmm1, xmm1, 0		// r1 = 1/radius, 1/radius, 1/radius, X
+			mulps		xmm4, xmm1			// r4 = vx * 1/radius, vy * 1/radius, vz * 1/radius, X
+			movaps		[edx], xmm4			// v = vx * 1/radius, vy * 1/radius, vz * 1/radius, X
+		}
+#elif _LINUX
+		// same instruction sequence as above in AT&T syntax:
+		// %0 = radius, %1 = result, %2 = the vector
+		__asm__ __volatile__(
+#ifdef ALIGNED_VECTOR
+            "movaps          %2, %%xmm4 \n\t"
+            "movaps          %%xmm4, %%xmm1 \n\t"
+#else
+            "movups          %2, %%xmm4 \n\t"
+            "movaps          %%xmm4, %%xmm1 \n\t"
+#endif
+            "mulps           %%xmm4, %%xmm1 \n\t"
+            "movhlps         %%xmm1, %%xmm3 \n\t"
+            "movaps          %%xmm1, %%xmm2 \n\t"
+            "shufps          $1, %%xmm2, %%xmm2 \n\t"
+            "addss           %%xmm2, %%xmm1 \n\t"
+            "addss           %%xmm3, %%xmm1 \n\t"
+            "sqrtss          %%xmm1, %%xmm1 \n\t"
+            "movss           %%xmm1, %0 \n\t"
+            "rcpss           %%xmm1, %%xmm1 \n\t"
+            "shufps          $0, %%xmm1, %%xmm1 \n\t"
+            "mulps           %%xmm1, %%xmm4 \n\t"
+            "movaps          %%xmm4, %1 \n\t"
+            : "=m" (radius), "=m" (result)
+            : "m" (*v)
+ 		);
+#else
+	#error "Not Implemented"
+#endif
+		// copy back only the three real components
+		vec.x = result[0];
+		vec.y = result[1];
+		vec.z = result[2];
+
+	}
+
+	return radius;
+}
+
+// Scales vec in place by 1/sqrt(|vec|^2 + FLT_EPSILON) using
+// _SSE_RSqrtAccurate. The epsilon keeps the argument non-zero for a
+// zero-length vector.
+void FASTCALL _SSE_VectorNormalizeFast (Vector& vec)
+{
+	const float flLengthSqr = FLT_EPSILON + vec.x * vec.x + vec.y * vec.y + vec.z * vec.z;
+	const float flInvLength = _SSE_RSqrtAccurate( flLengthSqr );
+
+	vec.x *= flInvLength;
+	vec.y *= flInvLength;
+	vec.z *= flInvLength;
+}
+
+// Returns an estimate (rcpss) of 1 / max( 1.0, vx*vx + vy*vy + vz*vz ) for
+// the three floats at v, i.e. the inverse squared length clamped so that it
+// never exceeds 1.
+// NOTE(review): the load is four floats wide although only three are used -
+// confirm v always has a readable fourth float behind it.
+float _SSE_InvRSquared(const float* v)
+{
+	float	inv_r2 = 1.f;
+#ifdef _WIN32
+	_asm { // Intel SSE only routine
+		mov			eax, v
+		movss		xmm5, inv_r2		// x5 = 1.0, 0, 0, 0
+#ifdef ALIGNED_VECTOR
+		movaps		xmm4, [eax]			// x4 = vx, vy, vz, X
+#else
+		movups		xmm4, [eax]			// x4 = vx, vy, vz, X
+#endif
+		movaps		xmm1, xmm4			// x1 = x4
+		mulps		xmm1, xmm4			// x1 = vx * vx, vy * vy, vz * vz, X
+		movhlps		xmm3, xmm1			// x3 = vz * vz, X, X, X
+		movaps		xmm2, xmm1			// x2 = x1
+		shufps		xmm2, xmm2, 1		// x2 = vy * vy, X, X, X
+		addss		xmm1, xmm2			// x1 = (vx * vx) + (vy * vy), X, X, X
+		addss		xmm1, xmm3			// x1 = (vx * vx) + (vy * vy) + (vz * vz), X, X, X
+		maxss		xmm1, xmm5			// x1 = max( 1.0, x1 )
+		rcpss		xmm0, xmm1			// x0 = 1 / max( 1.0, x1 )
+		movss		inv_r2, xmm0		// inv_r2 = x0
+	}
+#elif _LINUX
+		// %0 = inv_r2 (result), %1 = the vector, %2 = inv_r2 (still 1.0 on
+		// entry). xmm5 must be loaded with that 1.0 before the maxss below,
+		// exactly as the _WIN32 path does, otherwise the clamp compares
+		// against whatever happened to be in the register.
+		__asm__ __volatile__(
+		"movss           %2, %%xmm5 \n\t"
+#ifdef ALIGNED_VECTOR
+		"movaps          %1, %%xmm4 \n\t"
+#else
+		"movups          %1, %%xmm4 \n\t"
+#endif
+        "movaps          %%xmm4, %%xmm1 \n\t"
+        "mulps           %%xmm4, %%xmm1 \n\t"
+		"movhlps         %%xmm1, %%xmm3 \n\t"
+		"movaps          %%xmm1, %%xmm2 \n\t"
+        "shufps          $1, %%xmm2, %%xmm2 \n\t"
+        "addss           %%xmm2, %%xmm1 \n\t"
+        "addss           %%xmm3, %%xmm1 \n\t"
+		"maxss           %%xmm5, %%xmm1 \n\t"
+        "rcpss           %%xmm1, %%xmm0 \n\t"
+		"movss           %%xmm0, %0 \n\t" 
+        : "=m" (inv_r2)
+        : "m" (*v), "m" (inv_r2)
+        : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5"
+ 		);
+#else
+	#error "Not Implemented"
+#endif
+
+	return inv_r2;
+}
+
+// Computes sine and cosine of x (radians) and stores them through s and c.
+// Only the _WIN32 build has an implementation: x is scaled by 2/pi, split
+// into an integer quadrant and a fraction, and a polynomial with the
+// _ps_sincos_p0..p3 coefficients is evaluated for both results, with signs
+// derived from the quadrant and from the sign of x.
+// On _LINUX the body is empty apart from a compile-time warning, so *s and
+// *c are NOT written there.
+void _SSE_SinCos(float x, float* s, float* c)
+{
+#ifdef _WIN32
+	// scratch slots: integer register values are passed to SSE registers
+	// through memory; t12 is later reused to save esi
+	float t4, t8, t12;
+
+	__asm
+	{
+		movss	xmm0, x
+		movss	t12, xmm0
+		movss	xmm1, _ps_am_inv_sign_mask
+		mov		eax, t12
+		mulss	xmm0, _ps_am_2_o_pi
+		andps	xmm0, xmm1
+		and		eax, 0x80000000
+
+		cvttss2si	edx, xmm0
+		mov		ecx, edx
+		mov		t12, esi
+		mov		esi, edx
+		add		edx, 0x1	
+		shl		ecx, (31 - 1)
+		shl		edx, (31 - 1)
+
+		movss	xmm4, _ps_am_1
+		cvtsi2ss	xmm3, esi
+		mov		t8, eax
+		and		esi, 0x1
+
+		subss	xmm0, xmm3
+		movss	xmm3, _sincos_inv_masks[esi * 4]
+		minss	xmm0, xmm4
+
+		subss	xmm4, xmm0
+
+		movss	xmm6, xmm4
+		andps	xmm4, xmm3
+		and		ecx, 0x80000000
+		movss	xmm2, xmm3
+		andnps	xmm3, xmm0
+		and		edx, 0x80000000
+		movss	xmm7, t8
+		andps	xmm0, xmm2
+		mov		t8, ecx
+		mov		t4, edx
+		orps	xmm4, xmm3
+
+		mov		eax, s     //mov eax, [esp + 4 + 16]
+		mov		edx, c //mov edx, [esp + 4 + 16 + 4]
+
+		andnps	xmm2, xmm6
+		orps	xmm0, xmm2
+
+		movss	xmm2, t8
+		movss	xmm1, xmm0
+		movss	xmm5, xmm4
+		xorps	xmm7, xmm2
+		movss	xmm3, _ps_sincos_p3
+		mulss	xmm0, xmm0
+		mulss	xmm4, xmm4
+		movss	xmm2, xmm0
+		movss	xmm6, xmm4
+		orps	xmm1, xmm7
+		movss	xmm7, _ps_sincos_p2
+		mulss	xmm0, xmm3
+		mulss	xmm4, xmm3
+		movss	xmm3, _ps_sincos_p1
+		addss	xmm0, xmm7
+		addss	xmm4, xmm7
+		movss	xmm7, _ps_sincos_p0
+		mulss	xmm0, xmm2
+		mulss	xmm4, xmm6
+		addss	xmm0, xmm3
+		addss	xmm4, xmm3
+		movss	xmm3, t4
+		mulss	xmm0, xmm2
+		mulss	xmm4, xmm6
+		orps	xmm5, xmm3
+		mov		esi, t12
+		addss	xmm0, xmm7
+		addss	xmm4, xmm7
+		mulss	xmm0, xmm1
+		mulss	xmm4, xmm5
+
+		// use full stores since caller might reload with full loads
+		movss	[eax], xmm0
+		movss	[edx], xmm4
+	}
+#elif _LINUX
+	#warning "_SSE_sincos NOT implemented!"
+#else
+	#error "Not Implemented"
+#endif
+}
+
+// Returns the cosine of x, computed with scalar SSE instructions.
+//
+// NOTE(review): only the _WIN32 branch computes anything. The _LINUX branch just emits a
+// #warning, so on that platform the function returns x unchanged.
+//
+// Outline of the Win32 assembly. The constants (_ps_am_pi_o_2, _ps_am_2_o_pi, _ps_am_1,
+// _ps_am_inv_sign_mask, _sincos_masks, _ps_sincos_p0.._p3) are defined outside this
+// function; their values are assumed from their names -- TODO confirm.
+//   1. The sign bit of x is masked off, _ps_am_pi_o_2 is added and the sum is multiplied
+//      by _ps_am_2_o_pi, then truncated to an integer quadrant index.
+//   2. Bit 1 of the quadrant is shifted up to bit 31 and kept as the result's sign bit.
+//   3. The fractional part f (clamped to at most _ps_am_1) or _ps_am_1 - f is selected by
+//      a mask from _sincos_masks, indexed by the quadrant's low bit.
+//   4. The selected y goes through ((p3*y2 + p2)*y2 + p1)*y2 + p0 with y2 = y*y and is
+//      multiplied by y with the sign bit applied.
+// The result is written back into the parameter x, which is what the function returns.
+float _SSE_cos( float x )
+{
+#ifdef _WIN32
+	// temp only carries the sign bit pattern from edx into an SSE register
+	float temp;
+	__asm
+	{
+		movss	xmm0, x
+		movss	xmm1, _ps_am_inv_sign_mask
+		andps	xmm0, xmm1
+		addss	xmm0, _ps_am_pi_o_2
+		mulss	xmm0, _ps_am_2_o_pi
+
+		cvttss2si	ecx, xmm0
+		movss	xmm5, _ps_am_1
+		mov		edx, ecx
+		shl		edx, (31 - 1)
+		cvtsi2ss	xmm1, ecx
+		and		edx, 0x80000000
+		and		ecx, 0x1
+
+		subss	xmm0, xmm1
+		movss	xmm6, _sincos_masks[ecx * 4]
+		minss	xmm0, xmm5
+
+		movss	xmm1, _ps_sincos_p3
+		subss	xmm5, xmm0
+
+		andps	xmm5, xmm6
+		movss	xmm7, _ps_sincos_p2
+		andnps	xmm6, xmm0
+		mov		temp, edx
+		orps	xmm5, xmm6
+		movss	xmm0, xmm5
+
+		mulss	xmm5, xmm5
+		movss	xmm4, _ps_sincos_p1
+		movss	xmm2, xmm5
+		mulss	xmm5, xmm1
+		movss	xmm1, _ps_sincos_p0
+		addss	xmm5, xmm7
+		mulss	xmm5, xmm2
+		movss	xmm3, temp
+		addss	xmm5, xmm4
+		mulss	xmm5, xmm2
+		orps	xmm0, xmm3
+		addss	xmm5, xmm1
+		mulss	xmm0, xmm5
+		
+		movss   x,    xmm0
+
+	}
+#elif _LINUX
+	#warning "_SSE_cos NOT implemented!"
+#else
+	#error "Not Implemented"
+#endif
+
+	return x;
+}
+
+//-----------------------------------------------------------------------------
+// SSE2 implementations of optimized routines:
+//-----------------------------------------------------------------------------
+// Computes the sine and cosine of x in one pass and stores them through s and c, using
+// SSE2 integer instructions on xmm registers for the quadrant and sign handling.
+//
+// NOTE(review): only the _WIN32 branch does any work. The _LINUX branch just emits a
+// #warning, so on that platform *s and *c are left unwritten.
+//
+// Outline of the Win32 assembly. The constants (_ps_am_sign_mask, _ps_am_inv_sign_mask,
+// _ps_am_2_o_pi, _ps_am_1, _epi32_1, _epi32_2, _ps_sincos_p0.._p3) are defined outside
+// this function; their values are assumed from their names -- TODO confirm.
+//   1. x is split into its sign bit (xmm7) and magnitude; the magnitude is multiplied by
+//      _ps_am_2_o_pi and truncated to an integer quadrant index (cvttps2dq).
+//   2. A compare of (quadrant & _epi32_1) against zero yields the select mask; quadrant
+//      and quadrant + _epi32_1 are each masked with _epi32_2 and shifted left by 30 to
+//      form the sign bits (the sine's is additionally xor-ed with the sign of x).
+//   3. The fractional part f (clamped to at most _ps_am_1) and _ps_am_1 - f are swapped
+//      between the sine path (xmm0) and the cosine path (xmm6) according to the mask.
+//   4. Both paths evaluate ((p3*y2 + p2)*y2 + p1)*y2 + p0 with y2 = y*y, then multiply by
+//      y with its sign bit applied.
+void _SSE2_SinCos(float x, float* s, float* c)  // any x
+{
+#ifdef _WIN32
+	__asm
+	{
+		movss	xmm0, x
+		movaps	xmm7, xmm0
+		movss	xmm1, _ps_am_inv_sign_mask
+		movss	xmm2, _ps_am_sign_mask
+		movss	xmm3, _ps_am_2_o_pi
+		andps	xmm0, xmm1
+		andps	xmm7, xmm2
+		mulss	xmm0, xmm3
+
+		pxor	xmm3, xmm3
+		movd	xmm5, _epi32_1
+		movss	xmm4, _ps_am_1
+
+		cvttps2dq	xmm2, xmm0
+		pand	xmm5, xmm2
+		movd	xmm1, _epi32_2
+		pcmpeqd	xmm5, xmm3
+		movd	xmm3, _epi32_1
+		cvtdq2ps	xmm6, xmm2
+		paddd	xmm3, xmm2
+		pand	xmm2, xmm1
+		pand	xmm3, xmm1
+		subss	xmm0, xmm6
+		pslld	xmm2, (31 - 1)
+		minss	xmm0, xmm4
+
+		mov		eax, s     // mov eax, [esp + 4 + 16]
+		mov		edx, c	   // mov edx, [esp + 4 + 16 + 4]
+
+		subss	xmm4, xmm0
+		pslld	xmm3, (31 - 1)
+
+		movaps	xmm6, xmm4
+		xorps	xmm2, xmm7
+		movaps	xmm7, xmm5
+		andps	xmm6, xmm7
+		andnps	xmm7, xmm0
+		andps	xmm0, xmm5
+		andnps	xmm5, xmm4
+		movss	xmm4, _ps_sincos_p3
+		orps	xmm6, xmm7
+		orps	xmm0, xmm5
+		movss	xmm5, _ps_sincos_p2
+
+		// from here on xmm0 is the sine path and xmm6 the cosine path
+		movaps	xmm1, xmm0
+		movaps	xmm7, xmm6
+		mulss	xmm0, xmm0
+		mulss	xmm6, xmm6
+		orps	xmm1, xmm2
+		orps	xmm7, xmm3
+		movaps	xmm2, xmm0
+		movaps	xmm3, xmm6
+		mulss	xmm0, xmm4
+		mulss	xmm6, xmm4
+		movss	xmm4, _ps_sincos_p1
+		addss	xmm0, xmm5
+		addss	xmm6, xmm5
+		movss	xmm5, _ps_sincos_p0
+		mulss	xmm0, xmm2
+		mulss	xmm6, xmm3
+		addss	xmm0, xmm4
+		addss	xmm6, xmm4
+		mulss	xmm0, xmm2
+		mulss	xmm6, xmm3
+		addss	xmm0, xmm5
+		addss	xmm6, xmm5
+		mulss	xmm0, xmm1
+		mulss	xmm6, xmm7
+
+		// use full stores since caller might reload with full loads
+		movss	[eax], xmm0
+		movss	[edx], xmm6
+	}
+#elif _LINUX
+	#warning "_SSE2_SinCos NOT implemented!"
+#else
+	#error "Not Implemented"
+#endif
+}
+
+// Returns the cosine of x, using SSE2 integer instructions on xmm registers for the
+// quadrant and sign handling.
+//
+// NOTE(review): only the _WIN32 branch computes anything. The _LINUX branch just emits a
+// #warning, so on that platform the function returns x unchanged.
+//
+// Outline of the Win32 assembly. The constants (_ps_am_inv_sign_mask, _ps_am_pi_o_2,
+// _ps_am_2_o_pi, _ps_am_1, _epi32_1, _epi32_2, _ps_sincos_p0.._p3) are defined outside
+// this function; their values are assumed from their names -- TODO confirm.
+//   1. The sign bit of x is masked off, _ps_am_pi_o_2 is added, the sum is multiplied by
+//      _ps_am_2_o_pi and truncated to an integer quadrant index (cvttps2dq).
+//   2. (quadrant & _epi32_1) compared against zero gives the select mask;
+//      (quadrant & _epi32_2) shifted left by 30 gives the result's sign bit.
+//   3. The fractional part f (clamped to at most _ps_am_1) is kept where the mask is set,
+//      otherwise _ps_am_1 - f is used.
+//   4. The selected y goes through ((p3*y2 + p2)*y2 + p1)*y2 + p0 with y2 = y*y and is
+//      multiplied by y with the sign bit applied.
+// The result is written back into the parameter x, which is what the function returns.
+float _SSE2_cos(float x)  
+{
+#ifdef _WIN32
+	__asm
+	{
+		movss	xmm0, x
+		movss	xmm1, _ps_am_inv_sign_mask
+		movss	xmm2, _ps_am_pi_o_2
+		movss	xmm3, _ps_am_2_o_pi
+		andps	xmm0, xmm1
+		addss	xmm0, xmm2
+		mulss	xmm0, xmm3
+
+		pxor	xmm3, xmm3
+		movd	xmm5, _epi32_1
+		movss	xmm4, _ps_am_1
+		cvttps2dq	xmm2, xmm0
+		pand	xmm5, xmm2
+		movd	xmm1, _epi32_2
+		pcmpeqd	xmm5, xmm3
+		cvtdq2ps	xmm6, xmm2
+		pand	xmm2, xmm1
+		pslld	xmm2, (31 - 1)
+
+		subss	xmm0, xmm6
+		movss	xmm3, _ps_sincos_p3
+		minss	xmm0, xmm4
+		subss	xmm4, xmm0
+		andps	xmm0, xmm5
+		andnps	xmm5, xmm4
+		orps	xmm0, xmm5
+
+		movaps	xmm1, xmm0
+		movss	xmm4, _ps_sincos_p2
+		mulss	xmm0, xmm0
+		movss	xmm5, _ps_sincos_p1
+		orps	xmm1, xmm2
+		movaps	xmm7, xmm0
+		mulss	xmm0, xmm3
+		movss	xmm6, _ps_sincos_p0
+		addss	xmm0, xmm4
+		mulss	xmm0, xmm7
+		addss	xmm0, xmm5
+		mulss	xmm0, xmm7
+		addss	xmm0, xmm6
+		mulss	xmm0, xmm1
+		movss   x,    xmm0
+	}
+#elif _LINUX
+	#warning "_SSE2_cos NOT implemented!"
+#else
+	#error "Not Implemented"
+#endif
+
+	return x;
+}
+
+// SSE Version of VectorTransform
+//
+// Transforms the 3-component vector in1 by the 3x4 matrix in2 and writes the result to out1:
+//   out1[i] = in1[0]*in2[i][0] + in1[1]*in2[i][1] + in1[2]*in2[i][2] + in2[i][3]   for i = 0..2
+//
+// in1 and out1 must not be the same pointer (asserted): each output component is stored
+// before the remaining rows have read in1.
+// On _WIN32 this is scalar SSE assembly that steps through the matrix one 16-byte row at a
+// time; on _LINUX it is plain C built on DotProduct (and a #warning is emitted).
+void VectorTransformSSE(const float *in1, const matrix3x4_t& in2, float *out1)
+{
+	Assert( s_bMathlibInitialized );
+	Assert( in1 != out1 );
+
+#ifdef _WIN32
+	__asm
+	{
+		mov eax, in1;
+		mov ecx, in2;
+		mov edx, out1;
+
+		// row 0: dot product with in1, plus the row's fourth element
+		movss xmm0, [eax];
+		mulss xmm0, [ecx];
+		movss xmm1, [eax+4];
+		mulss xmm1, [ecx+4];
+		movss xmm2, [eax+8];
+		mulss xmm2, [ecx+8];
+		addss xmm0, xmm1;
+		addss xmm0, xmm2;
+		addss xmm0, [ecx+12]
+ 		movss [edx], xmm0;
+		add ecx, 16;
+
+		// row 1
+		movss xmm0, [eax];
+		mulss xmm0, [ecx];
+		movss xmm1, [eax+4];
+		mulss xmm1, [ecx+4];
+		movss xmm2, [eax+8];
+		mulss xmm2, [ecx+8];
+		addss xmm0, xmm1;
+		addss xmm0, xmm2;
+		addss xmm0, [ecx+12]
+		movss [edx+4], xmm0;
+		add ecx, 16;
+
+		// row 2
+		movss xmm0, [eax];
+		mulss xmm0, [ecx];
+		movss xmm1, [eax+4];
+		mulss xmm1, [ecx+4];
+		movss xmm2, [eax+8];
+		mulss xmm2, [ecx+8];
+		addss xmm0, xmm1;
+		addss xmm0, xmm2;
+		addss xmm0, [ecx+12]
+		movss [edx+8], xmm0;
+	}
+#elif _LINUX
+	#warning "VectorTransformSSE C implementation only"
+		out1[0] = DotProduct(in1, in2[0]) + in2[0][3];
+		out1[1] = DotProduct(in1, in2[1]) + in2[1][3];
+		out1[2] = DotProduct(in1, in2[2]) + in2[2][3];
+#else
+	#error "Not Implemented"
+#endif
+}
+
+// Rotates the 3-component vector in1 by the upper-left 3x3 part of in2 and writes the
+// result to out1; unlike VectorTransformSSE the fourth element of each row is not added:
+//   out1[i] = in1[0]*in2[i][0] + in1[1]*in2[i][1] + in1[2]*in2[i][2]   for i = 0..2
+//
+// in1 and out1 must not be the same pointer (asserted): each output component is stored
+// before the remaining rows have read in1.
+// On _WIN32 this is scalar SSE assembly that steps through the matrix one 16-byte row at a
+// time; on _LINUX it is plain C built on DotProduct (and a #warning is emitted).
+void VectorRotateSSE( const float *in1, const matrix3x4_t& in2, float *out1 )
+{
+	Assert( s_bMathlibInitialized );
+	Assert( in1 != out1 );
+
+#ifdef _WIN32
+	__asm
+	{
+		mov eax, in1;
+		mov ecx, in2;
+		mov edx, out1;
+
+		// row 0
+		movss xmm0, [eax];
+		mulss xmm0, [ecx];
+		movss xmm1, [eax+4];
+		mulss xmm1, [ecx+4];
+		movss xmm2, [eax+8];
+		mulss xmm2, [ecx+8];
+		addss xmm0, xmm1;
+		addss xmm0, xmm2;
+ 		movss [edx], xmm0;
+		add ecx, 16;
+
+		// row 1
+		movss xmm0, [eax];
+		mulss xmm0, [ecx];
+		movss xmm1, [eax+4];
+		mulss xmm1, [ecx+4];
+		movss xmm2, [eax+8];
+		mulss xmm2, [ecx+8];
+		addss xmm0, xmm1;
+		addss xmm0, xmm2;
+		movss [edx+4], xmm0;
+		add ecx, 16;
+
+		// row 2
+		movss xmm0, [eax];
+		mulss xmm0, [ecx];
+		movss xmm1, [eax+4];
+		mulss xmm1, [ecx+4];
+		movss xmm2, [eax+8];
+		mulss xmm2, [ecx+8];
+		addss xmm0, xmm1;
+		addss xmm0, xmm2;
+		movss [edx+8], xmm0;
+	}
+#elif _LINUX
+	#warning "VectorRotateSSE C implementation only"
+		out1[0] = DotProduct( in1, in2[0] );
+		out1[1] = DotProduct( in1, in2[1] );
+		out1[2] = DotProduct( in1, in2[2] );
+#else
+	#error "Not Implemented"
+#endif
+}
+
+#ifdef _WIN32
+// Intended to compute dest = start + scale * direction on packed floats (Win32 only).
+// The function is naked, so the arguments are read straight off the stack via esp.
+//
+// Known broken -- see the FIXME and the Assert(0) below. Points for whoever revives it:
+//   - The final movaps/movups stores a whole 128-bit register, i.e. four floats, to dest,
+//     and the loads read four floats from start and direction. Presumably the callers pass
+//     three-float vectors, which is the overwrite the FIXME describes -- TODO confirm.
+//   - NOTE(review): no ret instruction appears in this naked body.
+//   - NOTE(review): pshufd on xmm registers is an SSE2 instruction, although the block is
+//     labelled "Intel SSE only".
+void _declspec(naked) _SSE_VectorMA( const float *start, float scale, const float *direction, float *dest )
+{
+	// FIXME: This don't work!! It will overwrite memory in the write to dest
+	Assert(0);
+
+	Assert( s_bMathlibInitialized );
+	_asm {  // Intel SSE only routine
+		mov	eax, DWORD PTR [esp+0x04]	; *start, s0..s2
+		mov ecx, DWORD PTR [esp+0x0c]	; *direction, d0..d2
+		mov edx, DWORD PTR [esp+0x10]	; *dest
+		movss	xmm2, [esp+0x08]		; x2 = scale, 0, 0, 0
+#ifdef ALIGNED_VECTOR
+		movaps	xmm3, [ecx]				; x3 = dir0,dir1,dir2,X
+		pshufd	xmm2, xmm2, 0			; x2 = scale, scale, scale, scale
+		movaps	xmm1, [eax]				; x1 = start1, start2, start3, X
+		mulps	xmm3, xmm2				; x3 *= x2
+		addps	xmm3, xmm1				; x3 += x1
+		movaps	[edx], xmm3				; *dest = x3
+#else
+		movups	xmm3, [ecx]				; x3 = dir0,dir1,dir2,X
+		pshufd	xmm2, xmm2, 0			; x2 = scale, scale, scale, scale
+		movups	xmm1, [eax]				; x1 = start1, start2, start3, X
+		mulps	xmm3, xmm2				; x3 *= x2
+		addps	xmm3, xmm1				; x3 += x1
+		movups	[edx], xmm3				; *dest = x3
+#endif
+	}
+}
+#endif
+
+#ifdef _WIN32
+#ifdef PFN_VECTORMA
+// Vector-reference overload of _SSE_VectorMA, compiled only when PFN_VECTORMA is defined.
+// Intended to compute dest = start + scale * direction; the references are read off the
+// stack as pointers because the function is naked.
+//
+// Known broken -- see the FIXME and the Assert(0) below. Points for whoever revives it:
+//   - The final movaps/movups stores a whole 128-bit register, i.e. four floats, to dest.
+//     Presumably Vector holds three floats, which is the overwrite the FIXME describes
+//     -- TODO confirm.
+//   - NOTE(review): no ret instruction appears in this naked body.
+//   - NOTE(review): pshufd on xmm registers is an SSE2 instruction, although the block is
+//     labelled "Intel SSE only".
+void _declspec(naked) __cdecl _SSE_VectorMA( const Vector &start, float scale, const Vector &direction, Vector &dest )
+{
+	// FIXME: This don't work!! It will overwrite memory in the write to dest
+	Assert(0);
+
+	Assert( s_bMathlibInitialized );
+	_asm 
+	{  
+		// Intel SSE only routine
+		mov	eax, DWORD PTR [esp+0x04]	; *start, s0..s2
+		mov ecx, DWORD PTR [esp+0x0c]	; *direction, d0..d2
+		mov edx, DWORD PTR [esp+0x10]	; *dest
+		movss	xmm2, [esp+0x08]		; x2 = scale, 0, 0, 0
+#ifdef ALIGNED_VECTOR
+		movaps	xmm3, [ecx]				; x3 = dir0,dir1,dir2,X
+		pshufd	xmm2, xmm2, 0			; x2 = scale, scale, scale, scale
+		movaps	xmm1, [eax]				; x1 = start1, start2, start3, X
+		mulps	xmm3, xmm2				; x3 *= x2
+		addps	xmm3, xmm1				; x3 += x1
+		movaps	[edx], xmm3				; *dest = x3
+#else
+		movups	xmm3, [ecx]				; x3 = dir0,dir1,dir2,X
+		pshufd	xmm2, xmm2, 0			; x2 = scale, scale, scale, scale
+		movups	xmm1, [eax]				; x1 = start1, start2, start3, X
+		mulps	xmm3, xmm2				; x3 *= x2
+		addps	xmm3, xmm1				; x3 += x1
+		movups	[edx], xmm3				; *dest = x3
+#endif
+	}
+}
+// NOTE(review): this pointer's signature (one Vector&, returning float) does not match the
+// four-argument void _SSE_VectorMA above, and it is initialised from _VectorMA, which is
+// not defined in this part of the file -- verify before defining PFN_VECTORMA.
+float (__cdecl *pfVectorMA)(Vector& v) = _VectorMA;
+#endif
+#endif
+
+
+// SSE DotProduct -- it's a smidgen faster than the asm DotProduct...
+//   Should be validated too!  :)
+//   NJS: (Nov 1 2002) -NOT- faster.  may time a couple cycles faster in a single function like 
+//   this, but when inlined, and instruction scheduled, the C version is faster.  
+//   Verified this via VTune
+/*
+vec_t DotProduct (const vec_t *a, const vec_t *c)
+{
+	vec_t temp;
+
+	__asm
+	{
+		mov eax, a;
+		mov ecx, c;
+		mov edx, DWORD PTR [temp]
+		movss xmm0, [eax];
+		mulss xmm0, [ecx];
+		movss xmm1, [eax+4];
+		mulss xmm1, [ecx+4];
+		movss xmm2, [eax+8];
+		mulss xmm2, [ecx+8];
+		addss xmm0, xmm1;
+		addss xmm0, xmm2;
+		movss [edx], xmm0;
+		fld DWORD PTR [edx];
+		ret
+	}
+}
+*/
+
--- a/mathlib/sse.h
+++ b/mathlib/sse.h
@ -0,0 +1,23 @@
+//========= Copyright (c) 1996-2006, Valve Corporation, All rights reserved. ============//
+//
+// Purpose: Declarations of the SSE / SSE2 optimized math routines.
+//
+//=====================================================================================//
+
+#ifndef _SSE_H
+#define _SSE_H
+
+// NOTE(review): this header includes nothing itself, so Vector, matrix3x4_t and FASTCALL
+// must already be declared by whoever includes it.
+
+// Scalar square-root helpers (the names suggest sqrt and two 1/sqrt variants of differing
+// accuracy -- see the definitions for the exact behaviour).
+float _SSE_Sqrt(float x);
+float _SSE_RSqrtAccurate(float a);
+float _SSE_RSqrtFast(float x);
+
+// In-place vector normalization; only the first variant returns a value.
+float FASTCALL _SSE_VectorNormalize(Vector& vec);
+void FASTCALL _SSE_VectorNormalizeFast(Vector& vec);
+
+// Approximate 1 / max( 1.0, squared length ) of the x, y, z floats at v.
+float _SSE_InvRSquared(const float* v);
+
+// Sine / cosine. The SinCos variants write the sine through s and the cosine through c.
+void _SSE_SinCos(float x, float* s, float* c);
+float _SSE_cos( float x);
+void _SSE2_SinCos(float x, float* s, float* c);
+float _SSE2_cos(float x); 
+
+// out1 = in2 applied to in1: Transform adds the fourth element of each matrix row,
+// Rotate does not. in1 and out1 must be different pointers.
+void VectorTransformSSE(const float *in1, const matrix3x4_t& in2, float *out1);
+void VectorRotateSSE( const float *in1, const matrix3x4_t& in2, float *out1 );
+
+#endif // _SSE_H
--- a/mathlib/sseconst.cpp
+++ b/mathlib/sseconst.cpp
--- a/mathlib/ssenoise.cpp
+++ b/mathlib/ssenoise.cpp
@ -0,0 +1,105 @@
+//========= Copyright (c) 1996-2006, Valve Corporation, All rights reserved. ============//
+//
+// Purpose: Fast low quality noise suitable for real time use
+//
+//=====================================================================================//
+
+#include <math.h>
+#include <float.h>	// Needed for FLT_EPSILON
+#include "basetypes.h"
+#include <memory.h>
+#include "tier0/dbg.h"
+#include "mathlib/mathlib.h"
+#include "mathlib/vector.h"
+#include "mathlib/ssemath.h"
+
+// memdbgon must be the last include file in a .cpp file!!!
+#include "tier0/memdbgon.h"
+#include "noisedata.h"
+
+
+// NoiseSIMD adds this constant to each coordinate and then reads the float's bit pattern
+// as an integer: the low 8 bits are used as a fraction in 1/256 steps and the next 8 bits
+// as the lattice index. (This relies on the float's spacing being 1/256 once 2^15 has been
+// added, which holds for non-negative coordinates below 2^15.)
+#define MAGIC_NUMBER (1<<15)								// gives 8 bits of fraction
+
+// MAGIC_NUMBER replicated into all four SIMD lanes.
+static fltx4 Four_MagicNumbers = { MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER };
+
+
+// Per-lane bit mask. Despite the MASK255 name below, it keeps the low 16 bits (0xffff)
+// of every lane: 8 index bits plus 8 fraction bits.
+static ALIGN16 int32 idx_mask[4]= {0xffff, 0xffff, 0xffff, 0xffff};
+
+// idx_mask reinterpreted as a fltx4 so it can be passed to AndSIMD.
+#define MASK255 (*((fltx4 *)(& idx_mask )))
+
+// Looks up the noise value stored for an integer lattice point.
+// The three indices are chained through the perm_a, perm_b and perm_c tables (each lookup
+// wrapped to 0..255), and the final table entry selects the value from impulse_xcoords.
+// returns 0..1 (per the original note; the table contents are defined outside this file)
+static inline float GetLatticePointValue( int idx_x, int idx_y, int idx_z )
+{
+	const int nHashX = perm_a[idx_x & 0xff];
+	const int nHashXY = perm_b[( idx_y + nHashX ) & 0xff];
+	const int nHashXYZ = perm_c[( idx_z + nHashXY ) & 0xff];
+
+	return impulse_xcoords[nHashXYZ];
+}
+
+// Evaluates lattice noise at four points at once; lane i of x, y and z holds the
+// coordinates of point i. For every point the eight surrounding lattice values are fetched
+// with GetLatticePointValue and blended by trilinear interpolation, and the blended value
+// is finally remapped as 2 * ( value - 0.5 ).
+fltx4 NoiseSIMD( const fltx4 & x, const fltx4 & y, const fltx4 & z )
+{
+	// use magic to convert to integer index
+	// (after the add and mask, each lane's low 16 bits hold 8 index bits and 8 fraction bits)
+	fltx4 x_idx = AndSIMD( MASK255, AddSIMD( x, Four_MagicNumbers ) );
+	fltx4 y_idx = AndSIMD( MASK255, AddSIMD( y, Four_MagicNumbers ) );
+	fltx4 z_idx = AndSIMD( MASK255, AddSIMD( z, Four_MagicNumbers ) );
+
+	fltx4 lattice000 = Four_Zeros, lattice001 = Four_Zeros, lattice010 = Four_Zeros, lattice011 = Four_Zeros;
+	fltx4 lattice100 = Four_Zeros, lattice101 = Four_Zeros, lattice110 = Four_Zeros, lattice111 = Four_Zeros;
+
+	// FIXME: Converting the input vectors to int indices will cause load-hit-stores (48 bytes)
+	//        Converting the indexed noise values back to vectors will cause more (128 bytes)
+	//        The noise table could store vectors if we chunked it into 2x2x2 blocks.
+	fltx4 xfrac = Four_Zeros, yfrac = Four_Zeros, zfrac = Four_Zeros;
+	// DOPASS(i) handles lane i: the low 8 bits of each index become the interpolation
+	// fraction (in 1/256 steps), the bits above them become the lattice cell, and the
+	// eight corner values of that cell are written into lane i of the latticeXYZ vectors.
+	// NOTE(review): the macro is not #undef'd after its last use.
+#define DOPASS(i)															\
+    {	unsigned int xi = SubInt( x_idx, i );								\
+		unsigned int yi = SubInt( y_idx, i );								\
+		unsigned int zi = SubInt( z_idx, i );								\
+		SubFloat( xfrac, i ) = (xi & 0xff)*(1.0/256.0);						\
+		SubFloat( yfrac, i ) = (yi & 0xff)*(1.0/256.0);						\
+		SubFloat( zfrac, i ) = (zi & 0xff)*(1.0/256.0);						\
+		xi>>=8;																\
+		yi>>=8;																\
+		zi>>=8;																\
+																			\
+		SubFloat( lattice000, i ) = GetLatticePointValue( xi,yi,zi );		\
+		SubFloat( lattice001, i ) = GetLatticePointValue( xi,yi,zi+1 );		\
+		SubFloat( lattice010, i ) = GetLatticePointValue( xi,yi+1,zi );		\
+		SubFloat( lattice011, i ) = GetLatticePointValue( xi,yi+1,zi+1 );	\
+		SubFloat( lattice100, i ) = GetLatticePointValue( xi+1,yi,zi );		\
+		SubFloat( lattice101, i ) = GetLatticePointValue( xi+1,yi,zi+1 );	\
+		SubFloat( lattice110, i ) = GetLatticePointValue( xi+1,yi+1,zi );	\
+		SubFloat( lattice111, i ) = GetLatticePointValue( xi+1,yi+1,zi+1 );	\
+    }
+
+	DOPASS( 0 );
+	DOPASS( 1 );
+	DOPASS( 2 );
+	DOPASS( 3 );
+
+	// now, we have 8 lattice values for each of four points as m128s, and interpolant values for
+	// each axis in m128 form in [xyz]frac. Perform the trilinear interpolation as SIMD ops
+
+	// first, do x interpolation
+	fltx4 l2d00 = AddSIMD( lattice000, MulSIMD( xfrac, SubSIMD( lattice100, lattice000 ) ) );
+	fltx4 l2d01 = AddSIMD( lattice001, MulSIMD( xfrac, SubSIMD( lattice101, lattice001 ) ) );
+	fltx4 l2d10 = AddSIMD( lattice010, MulSIMD( xfrac, SubSIMD( lattice110, lattice010 ) ) );
+	fltx4 l2d11 = AddSIMD( lattice011, MulSIMD( xfrac, SubSIMD( lattice111, lattice011 ) ) );
+
+	// now, do y interpolation
+	fltx4 l1d0 = AddSIMD( l2d00, MulSIMD( yfrac, SubSIMD( l2d10, l2d00 ) ) );
+	fltx4 l1d1 = AddSIMD( l2d01, MulSIMD( yfrac, SubSIMD( l2d11, l2d01 ) ) );
+
+	// final z interpolation
+	fltx4 rslt = AddSIMD( l1d0, MulSIMD( zfrac, SubSIMD( l1d1, l1d0 ) ) );
+
+	// remap as Four_Twos * ( rslt - Four_PointFives ); assuming those constants hold 2.0
+	// and 0.5 as their names say, a 0..1 interpolated value becomes -1..1
+	return MulSIMD( Four_Twos, SubSIMD( rslt, Four_PointFives ) );
+
+
+}
+
+// Convenience overload: evaluates the three-argument NoiseSIMD on the x, y and z members
+// of pos and returns its result unchanged.
+fltx4 NoiseSIMD( FourVectors const &pos )
+{
+	const fltx4 &xCoords = pos.x;
+	const fltx4 &yCoords = pos.y;
+	const fltx4 &zCoords = pos.z;
+
+	return NoiseSIMD( xCoords, yCoords, zCoords );
+}
--- a/mathlib/vector.cpp
+++ b/mathlib/vector.cpp
@ -0,0 +1,12 @@
+//========= Copyright (c) 1996-2005, Valve Corporation, All rights reserved. ============//
+//
+// Purpose: Defines the global zero vector vec3_origin.
+//
+// $NoKeywords: $
+//
+//=============================================================================//
+
+#include "mathlib/vector.h"
+
+// Global Vector constructed with all three components set to 0.
+Vector vec3_origin(0,0,0);
+
--- a/mathlib/vmatrix.cpp
+++ b/mathlib/vmatrix.cpp