// // ai/spatialarray.cpp // // Copyright (C) 1999-2011 Rockstar Games. All Rights Reserved. // #include "ai/spatialarray.h" #include "math/amath.h" #include "system/criticalsection.h" #include "system/memory.h" #include "system/system.h" #include "vector/colors.h" #include "vectormath/classes.h" #if __DEV #include "grcore/debugdraw.h" #endif #define SA_STATS 0 #define Align16(x) (((x)+15)&~15) #if SA_STATS #include "profile/profiler.h" namespace CSpatialArrayStats { PF_PAGE(SpatialArray, "Spatial Array"); PF_GROUP(Update); PF_LINK(SpatialArray, Update); PF_TIMER(Insert, Update); PF_TIMER(Remove, Update); PF_TIMER(Update, Update); PF_TIMER(GetTypeFlags, Update); PF_TIMER(SetTypeFlags, Update); PF_TIMER(FindClosest3, Update); PF_TIMER(FindClosest4, Update); PF_TIMER(FindInCylinderXY, Update); PF_TIMER(FindInSphere, Update); PF_TIMER(FindInSphereOfType, Update); PF_TIMER(FindNearSegment, Update); PF_TIMER(FindBelowZ, Update); } using namespace CSpatialArrayStats; #define SA_PF_START(x) PF_START(x) #define SA_PF_STOP(x) PF_STOP(x) #define SA_PF_FUNC(x) PF_FUNC(x) #else // SA_STATS #define SA_PF_START(x) #define SA_PF_STOP(x) #define SA_PF_FUNC(x) #endif // SA_STATS #if SPATIALARRAY64BIT namespace { u64 sNodePtrToU64(const CSpatialArrayNode* nodePtr) { #if __64BIT return (u64)nodePtr; #else // Could do this to test: // u32 upper = (u32)nodePtr + 1; // u32 lower = (u32)nodePtr; // return ((u64)(upper) << 32) + (u64)lower; return (u64)(u32)nodePtr; #endif } } // anon namespace #endif // SPATIALARRAY64BIT bool CSpatialArray::sm_UseLock = false; #define SPATIALARRAYTHREADLOCK \ sysCriticalSection cs(m_Lock, sm_UseLock); \ if (!sm_UseLock) \ { \ Assertf(CSystem::IsThisThreadId(SYS_THREAD_UPDATE), "CSpatialArray used from another thread than the main thread, this may not be safe."); \ } //------------------------------------------------------------------------------ // CSpatialArray CSpatialArray::CSpatialArray(void *buffer, int maxObj) : m_MaxObj(maxObj) , m_NumObj(0) { // The use of vector selection for addresses wouldn't work right // away on 64 bit pointers. It should still be possible to vectorize // it efficiently with 128 bit vectors, but we would need one high // and one low vector register for four pointers or something. // That work hasn't been done, so generate some errors if this is about // to happen. /FF #if !SPATIALARRAY64BIT #if __64BIT // To not prevent 64 bit applications to build, this is just a run time error // (it's very possible that the application compiles with this class without // actually using it). /FF Errorf("CSpatialArray was not designed for 64 bit pointers, probably won't work correctly right now."); #else // If we are not running in 64 bit mode, I wouldn't expect pointers to be // anything but 32 bits, so if they are, we may as well catch it at compile time. /FF CompileTimeAssert(sizeof(void*) == 4); #endif #endif // !SPATIALARRAY64BIT Assert((maxObj & 3) == 0); float *posXArray = (float*)buffer; float *posYArray = posXArray + maxObj; float *posZArray = posYArray + maxObj; Assert(Align16((size_t)posXArray)); Assert(Align16((size_t)posYArray)); Assert(Align16((size_t)posZArray)); #if SPATIALARRAY64BIT u32 *nodesUpper = (u32*)(posZArray + maxObj); u32 *nodesLower = nodesUpper + maxObj; u32 *typeFlagArray = (u32*)(nodesLower + maxObj); #else CSpatialArrayNodeAddr *nodes = (CSpatialArrayNodeAddr*)(posZArray + maxObj); u32 *typeFlagArray = (u32*)(nodes + maxObj); #endif Assert(((size_t)posXArray & 0xf) == 0); Assert(((size_t)posYArray & 0xf) == 0); Assert(((size_t)posZArray & 0xf) == 0); #if SPATIALARRAY64BIT Assert(((size_t)nodesUpper & 0xf) == 0); Assert(((size_t)nodesLower & 0xf) == 0); #else Assert(((size_t)nodes & 0xf) == 0); #endif Assert(((size_t)typeFlagArray & 0xf) == 0); m_PosXArray = posXArray; m_PosYArray = posYArray; m_PosZArray = posZArray; #if SPATIALARRAY64BIT m_NodeArrayUpper = nodesUpper; m_NodeArrayLower = nodesLower; #else m_NodeArray = nodes; #endif m_TypeFlagArray = typeFlagArray; // For some of the vector operations to work at the end of the array, // we make sure to keep the node pointers to NULL. /FF #if SPATIALARRAY64BIT sysMemSet((void*)nodesUpper, 0, sizeof(u32)*maxObj); sysMemSet((void*)nodesLower, 0, sizeof(u32)*maxObj); #else sysMemSet((void*)nodes, 0, sizeof(CSpatialArrayNodeAddr)*maxObj); #endif Assert(kMaxObjForTempBuffer >= maxObj); Assert(maxObj*sizeof(float) <= 0xffff); } void CSpatialArray::Reset() { SPATIALARRAYTHREADLOCK; m_NumObj = 0; } void CSpatialArray::Insert(CSpatialArrayNode &node, u32 typeFlags, bool forceInsert) { SA_PF_FUNC(Insert); if(forceInsert || Verifyf(node.m_Offs == CSpatialArrayNode::kOffsInvalid, "Tried to insert a spatial array node that's already inserted.")) { SPATIALARRAYTHREADLOCK; const int numObj = m_NumObj; const int maxObj = m_MaxObj; if(Verifyf(numObj < maxObj, "Out of space in spatial array.")) { const unsigned int offs = numObj*sizeof(float); const int newNumObj = numObj + 1; #if SPATIALARRAY64BIT u32 *addedNodePtrUpper = (u32*)((char*)m_NodeArrayUpper + offs); u32 *addedNodePtrLower = (u32*)((char*)m_NodeArrayLower + offs); #else CSpatialArrayNodeAddr *addedNodePtr = (CSpatialArrayNodeAddr*)((char*)m_NodeArray + offs); #endif u32 *typeFlagPtr = (u32*)((char*)m_TypeFlagArray + offs); #if SPATIALARRAY64BIT u64 nodePtr = sNodePtrToU64(&node); *addedNodePtrLower = (u32)nodePtr; *addedNodePtrUpper = (u32)(nodePtr >> 32); #else *addedNodePtr = ptrdiff_t_to_int((ptrdiff_t)&node); // Catch truncation on x64 builds #endif node.m_Offs = (u16)offs; m_NumObj = newNumObj; *typeFlagPtr = typeFlags; } else { node.m_Offs = CSpatialArrayNode::kOffsInvalid; } } } void CSpatialArray::Remove(CSpatialArrayNode &node) { SA_PF_FUNC(Remove); if(Verifyf(node.m_Offs != CSpatialArrayNode::kOffsInvalid, "Removing spatial array node not in array.")) { SPATIALARRAYTHREADLOCK; const int oldNumObj = m_NumObj; const int newNumObj = oldNumObj - 1; const unsigned int removedOffs = node.m_Offs; #if SPATIALARRAY64BIT u32* nodesLower = m_NodeArrayLower; u32* nodesUpper = m_NodeArrayUpper; u32 oldLastNodeLower = nodesLower[newNumObj]; u32 oldLastNodeUpper = nodesUpper[newNumObj]; CSpatialArrayNode* oldLastNodePtr = NodePtrFromUpperLower(oldLastNodeUpper, oldLastNodeLower); u32* removedNodePtrLower = (u32*)((char*)nodesLower + removedOffs); u32* removedNodePtrUpper = (u32*)((char*)nodesUpper + removedOffs); #else CSpatialArrayNodeAddr* nodes = m_NodeArray; CSpatialArrayNodeAddr oldLastNode = nodes[newNumObj]; CSpatialArrayNode* oldLastNodePtr = (CSpatialArrayNode*)oldLastNode; CSpatialArrayNodeAddr *removedNodePtr = (CSpatialArrayNodeAddr*)((char*)nodes + removedOffs); #endif float *posXArray = m_PosXArray; float *posYArray = m_PosYArray; float *posZArray = m_PosZArray; u32 *typeFlagArray = m_TypeFlagArray; const unsigned int oldOffs = oldLastNodePtr->m_Offs; float *posXPtr = (float*)((char*)posXArray + removedOffs); float *posYPtr = (float*)((char*)posYArray + removedOffs); float *posZPtr = (float*)((char*)posZArray + removedOffs); u32 *typeFlagPtr = (u32*)((char*)typeFlagArray + removedOffs); float *posXPtrOld = (float*)((char*)posXArray + oldOffs); float *posYPtrOld = (float*)((char*)posYArray + oldOffs); float *posZPtrOld = (float*)((char*)posZArray + oldOffs); u32 *typeFlagArrayOld = (u32*)((char*)typeFlagArray + oldOffs); *posXPtr = *posXPtrOld; *posYPtr = *posYPtrOld; *posZPtr = *posZPtrOld; *typeFlagPtr = *typeFlagArrayOld; #if SPATIALARRAY64BIT *removedNodePtrLower = oldLastNodeLower; *removedNodePtrUpper = oldLastNodeUpper; #else *removedNodePtr = oldLastNode; #endif oldLastNodePtr->m_Offs = (u16)removedOffs; node.m_Offs = CSpatialArrayNode::kOffsInvalid; m_NumObj = newNumObj; // For some of the vector operations to work properly at the end // of the array, we make sure to clear out the node pointer at // the previous end of the array. /FF #if SPATIALARRAY64BIT nodesUpper[newNumObj] = 0; nodesLower[newNumObj] = 0; #else nodes[newNumObj] = 0; #endif } } void CSpatialArray::Update(CSpatialArrayNode &node, float posX, float posY, float posZ) { SA_PF_FUNC(Update); if(Verifyf(node.m_Offs != CSpatialArrayNode::kOffsInvalid, "Tried to update position of invalid spatial array node.")) { SPATIALARRAYTHREADLOCK; const unsigned int offs = node.m_Offs; float *posXArray = m_PosXArray; float *posYArray = m_PosYArray; float *posZArray = m_PosZArray; float *posXPtr = (float*)((char*)posXArray + offs); float *posYPtr = (float*)((char*)posYArray + offs); float *posZPtr = (float*)((char*)posZArray + offs); *posXPtr = posX; *posYPtr = posY; *posZPtr = posZ; } } void CSpatialArray::UpdateWithTypeFlags(CSpatialArrayNode &node, float posX, float posY, float posZ, u32 flagsToChange, u32 flagValues) { SA_PF_FUNC(Update); if(Verifyf(node.m_Offs != CSpatialArrayNode::kOffsInvalid, "Tried to update position of invalid spatial array node.")) { SPATIALARRAYTHREADLOCK; const unsigned int offs = node.m_Offs; float *posXArray = m_PosXArray; float *posYArray = m_PosYArray; float *posZArray = m_PosZArray; float *posXPtr = (float*)((char*)posXArray + offs); float *posYPtr = (float*)((char*)posYArray + offs); float *posZPtr = (float*)((char*)posZArray + offs); u32 *flagPtr = (u32*)((char*)m_TypeFlagArray + offs); const u32 oldFlags = *flagPtr; const u32 newFlags = (oldFlags & ~flagsToChange) | flagValues; *posXPtr = posX; *posYPtr = posY; *posZPtr = posZ; *flagPtr = newFlags; } } void CSpatialArray::GetPosition(const CSpatialArrayNode &node, Vec3V_Ref posOut) const { // Not sure: // SPATIALARRAYTHREADLOCK; const unsigned int offs = node.m_Offs; const float *posXArray = m_PosXArray; const float *posYArray = m_PosYArray; const float *posZArray = m_PosZArray; const float *posXPtr = (const float*)((char*)posXArray + offs); const float *posYPtr = (const float*)((char*)posYArray + offs); const float *posZPtr = (const float*)((char*)posZArray + offs); posOut.SetXf(*posXPtr); posOut.SetYf(*posYPtr); posOut.SetZf(*posZPtr); } void CSpatialArray::SetTypeFlags(CSpatialArrayNode &node, u32 flagsToChange, u32 flagValues) { SA_PF_FUNC(SetTypeFlags); // Maybe not really accurate. /FF // If this fails, there are values set in flagValues that are not in flagsToChange, // which we are probably better off if the user could avoid, so we don't have to // spend time on masking them here. /FF Assert((flagValues & ~flagsToChange) == 0); if(Verifyf(node.m_Offs != CSpatialArrayNode::kOffsInvalid, "Tried to set type flags of invalid spatial array node.")) { SPATIALARRAYTHREADLOCK; const unsigned int offs = node.m_Offs; u32 *flagPtr = (u32*)((char*)m_TypeFlagArray + offs); const u32 oldFlags = *flagPtr; const u32 newFlags = (oldFlags & ~flagsToChange) | flagValues; *flagPtr = newFlags; } } u32 CSpatialArray::GetTypeFlags(const CSpatialArrayNode &node) const { SA_PF_FUNC(GetTypeFlags); // Maybe not really accurate. /FF u32 r; if(Verifyf(node.m_Offs != CSpatialArrayNode::kOffsInvalid, "Tried to get type flags of invalid spatial array node.")) { SPATIALARRAYTHREADLOCK; const unsigned int offs = node.m_Offs; const u32 *flagPtr = (u32*)((char*)m_TypeFlagArray + offs); r = *flagPtr; } else { r = 0; } return r; } #if SPATIALARRAY64BIT static int sPickFromSortedArrays(const Vec4V *closestND2V, const Vec4V *closestNNodesUpperV, const Vec4V *closestNNodesLowerV, float maxDist, CSpatialArrayNode **found, int numToPick) #else static int sPickFromSortedArrays(const Vec4V *closestND2V, const Vec4V *closestNNodesV, float maxDist, CSpatialArrayNode **found, int numToPick) #endif { const float maxDistSq = square(Min(maxDist, LARGE_FLOAT)); // Make sure we don't square FLT_MAX. /FF int numFound = 0; // We now basically have four sorted arrays of length N in memory, // and we will do comparisons between these arrays to find the closest // three over all. We do the comparisons using u32's, making use of the // fact that positive IEEE754 floating point numbers preserve the numerical // order when interpreted as integeres. That way, we avoid floating point // branches. /FF int k1 = 0, k2 = 1, k3 = 2, k4 = 3; u32 d1 = ((u32*)closestND2V)[k1]; u32 d2 = ((u32*)closestND2V)[k2]; u32 d3 = ((u32*)closestND2V)[k3]; u32 d4 = ((u32*)closestND2V)[k4]; for(int i = 0; i < numToPick; i++) { // Note: should be set in all code paths below. /FF int closestIndex; if(d1 < d2) { if(d1 < d3) { if(d1 < d4) { // d1 smallest closestIndex = k1; k1 += 4; d1 = ((u32*)closestND2V)[k1]; } else { // d4 smallest closestIndex = k4; k4 += 4; d4 = ((u32*)closestND2V)[k4]; } } else { if(d3 < d4) { // d3 smallest closestIndex = k3; k3 += 4; d3 = ((u32*)closestND2V)[k3]; } else { // d4 smallest closestIndex = k4; k4 += 4; d4 = ((u32*)closestND2V)[k4]; } } } else { if(d2 < d3) { if(d2 < d4) { // d2 smallest closestIndex = k2; k2 += 4; d2 = ((u32*)closestND2V)[k2]; } else { // d4 smallest closestIndex = k4; k4 += 4; d4 = ((u32*)closestND2V)[k4]; } } else { if(d3 < d4) { // d3 smallest closestIndex = k3; k3 += 4; d3 = ((u32*)closestND2V)[k3]; } else { // d4 smallest closestIndex = k4; k4 += 4; d4 = ((u32*)closestND2V)[k4]; } } } #if SPATIALARRAY64BIT CSpatialArrayNode* closest = CSpatialArray::NodePtrFromUpperLower( ((u32*)closestNNodesUpperV)[closestIndex], ((u32*)closestNNodesLowerV)[closestIndex] ); #else CSpatialArrayNode* closest = (CSpatialArrayNode*)(((CSpatialArrayNodeAddr*)closestNNodesV)[closestIndex]); #endif if(closest) { const float distSq = ((float*)closestND2V)[closestIndex]; if(distSq <= maxDistSq) { found[numFound++] = closest; } else { break; } } else { break; } } // Note: if it's useful, we could potentially let the code above extract // more close objects by continuing to operate on the arrays. It wouldn't // be perfectly accurate beyond the first three, but they would still be // objects closer than many others. /FF return numFound; } int CSpatialArray::FindClosest3(Vec3V_In centerV, CSpatialArrayNode **found, int ASSERT_ONLY(maxFound), const u32 &typeFlagsToCareAbout, const u32 &typeFlagValues, const CSpatialArrayNode* &excl1, const CSpatialArrayNode* &excl2, float maxDist) const { SA_PF_FUNC(FindClosest3); Assert(maxFound >= 3); // If this fails, there are values set in flagValues that are not in flagsToChange, // which we are probably better off if the user could avoid, so we don't have to // spend time on masking them here. /FF Assert((typeFlagValues & ~typeFlagsToCareAbout) == 0); SPATIALARRAYTHREADLOCK; // Load the type flag stuff into vector registers. Note that we intentionally // pass in these by reference, requiring the user to put them in memory, because // if they were passed in in general purpose registers, we would need to store // them to memory and load them back anyway. Could pass them in in vector // registers, of course, but that's probably not worth the trouble. /FF const Vec4V typeFlagsToCareAboutV = Vec4V(LoadScalar32IntoScalarV(typeFlagsToCareAbout)); const Vec4V typeFlagValuesV = Vec4V(LoadScalar32IntoScalarV(typeFlagValues)); #if SPATIALARRAY64BIT // Get the exclusion pointers into vector registers. This way // of doing it is probably sub-optimal: we should be able to // read straight from excl1/excl2 (references to caller's memory) // into vector registers like we do in the 32 bit case, but to do so // we would have to be really careful to avoid endianness issues. u64 excl1Ptr = sNodePtrToU64(excl1); u64 excl2Ptr = sNodePtrToU64(excl2); ScalarV excl1LowerSV, excl1UpperSV; ScalarV excl2LowerSV, excl2UpperSV; excl1UpperSV.Seti((u32)(excl1Ptr >> 32)); excl2UpperSV.Seti((u32)(excl2Ptr >> 32)); excl1LowerSV.Seti((u32)excl1Ptr); excl2LowerSV.Seti((u32)excl2Ptr); const Vec4V excl1LowerV = Vec4V(excl1LowerSV); const Vec4V excl2LowerV = Vec4V(excl2LowerSV); const Vec4V excl1UpperV = Vec4V(excl1UpperSV); const Vec4V excl2UpperV = Vec4V(excl2UpperSV); #else const Vec4V excl1V = Vec4V(LoadScalar32IntoScalarV(*(u32*)&excl1)); const Vec4V excl2V = Vec4V(LoadScalar32IntoScalarV(*(u32*)&excl2)); #endif const Vec4V* RESTRICT objXPtr = (const Vec4V*)m_PosXArray; const Vec4V* RESTRICT objYPtr = (const Vec4V*)m_PosYArray; const Vec4V* RESTRICT objZPtr = (const Vec4V*)m_PosZArray; #if SPATIALARRAY64BIT const Vec4V* RESTRICT nodesPtrLower = (const Vec4V*)m_NodeArrayLower; const Vec4V* RESTRICT nodesPtrUpper = (const Vec4V*)m_NodeArrayUpper; #else const Vec4V* RESTRICT nodesPtr = (const Vec4V*)m_NodeArray; #endif const Vec4V* RESTRICT typeFlagPtr = (const Vec4V*)m_TypeFlagArray; const Vec4V centerxV(SplatX(centerV)); const Vec4V centeryV(SplatY(centerV)); const Vec4V centerzV(SplatZ(centerV)); const Vec4V zeroV(V_ZERO); const Vec4V maxDistV(V_FLT_MAX); const int numObj = m_NumObj; // These are used to keep track of the three closest objects // for each of the components in the vector registers. /FF #if SPATIALARRAY64BIT Vec4V close1NodesLowerV(V_ZERO); Vec4V close2NodesLowerV(V_ZERO); Vec4V close3NodesLowerV(V_ZERO); Vec4V close1NodesUpperV(V_ZERO); Vec4V close2NodesUpperV(V_ZERO); Vec4V close3NodesUpperV(V_ZERO); #else Vec4V close1NodesV(V_ZERO); Vec4V close2NodesV(V_ZERO); Vec4V close3NodesV(V_ZERO); #endif // These are the squared distances for the objects in // close[1/2/3]NodesV. /FF Vec4V close1D2V(V_FLT_MAX); Vec4V close2D2V(V_FLT_MAX); Vec4V close3D2V(V_FLT_MAX); for(int i = 0; i < numObj; i += 4) { // Load from the arrays to the vector registers. /FF const Vec4V xxV = *objXPtr; const Vec4V yyV = *objYPtr; const Vec4V zzV = *objZPtr; #if SPATIALARRAY64BIT const Vec4V nodesUpperV = *nodesPtrUpper; const Vec4V nodesLowerV = *nodesPtrLower; #else const Vec4V nodesV = *nodesPtr; #endif const Vec4V objTypeFlagsV = *typeFlagPtr; // Compute the squared distance to the center. /FF const Vec4V dxV = Subtract(xxV, centerxV); const Vec4V dyV = Subtract(yyV, centeryV); const Vec4V dzV = Subtract(zzV, centerzV); const Vec4V dx2V = Scale(dxV, dxV); const Vec4V dxy2V = AddScaled(dx2V, dyV, dyV); const Vec4V d2BeforeMaskV = AddScaled(dxy2V, dzV, dzV); const Vec4V objTypeFlagsCaredAboutV = And(objTypeFlagsV, typeFlagsToCareAboutV); // This is needed to deal properly with the end of the array if the number // of objects is not aligned with 4. The node pointers beyond the end will // be NULL, and here we create a mask where 0x0000 means that the node was // within range (pointer not NULL) while 0xffff indicates a value past the // end of the array. /FF #if SPATIALARRAY64BIT const VecBoolV selectNodePtrZeroV = IsEqualInt(Or(nodesLowerV, nodesUpperV), zeroV); // Ptr is NULL only if both halves are 0. #else const VecBoolV selectNodePtrZeroV = IsEqualInt(nodesV, zeroV); #endif // Match the type flags. 0xffff in this mask indicates that // (objTypeFlags & typeFlagsToCareAbout) == typeFlagValues // i.e. the bits we care about have the values we are looking for. /FF const VecBoolV selectTypeFlagMatchV = IsEqualInt(objTypeFlagsCaredAboutV, typeFlagValuesV); #if SPATIALARRAY64BIT // Check for matches on the upper and lower halves of the exclusion addresses. const VecBoolV selectNodePtrExcl1LowerV = IsEqualInt(nodesLowerV, excl1LowerV); const VecBoolV selectNodePtrExcl2LowerV = IsEqualInt(nodesLowerV, excl2LowerV); const VecBoolV selectNodePtrExcl1UpperV = IsEqualInt(nodesUpperV, excl1UpperV); const VecBoolV selectNodePtrExcl2UpperV = IsEqualInt(nodesUpperV, excl2UpperV); // Combine the upper/lower halves together: both halves have to match. const VecBoolV selectNodePtrExcl1V = And(selectNodePtrExcl1LowerV, selectNodePtrExcl1UpperV); const VecBoolV selectNodePtrExcl2V = And(selectNodePtrExcl2LowerV, selectNodePtrExcl2UpperV); #else const VecBoolV selectNodePtrExcl1V = IsEqualInt(nodesV, excl1V); const VecBoolV selectNodePtrExcl2V = IsEqualInt(nodesV, excl2V); #endif // We have a couple of vectors now that are 0xffff on a mismatch, // instead of 0x0000 on a match. We OR them and NOT them so that // we get a mask that's 0xffff when they all match. /FF const VecBoolV selectNodePtrV = InvertBits(Or(Or(selectNodePtrExcl1V, selectNodePtrExcl2V), selectNodePtrZeroV)); // To allow use of an element, we require both that it's not past the end // of the array or otherwise an ineligible node pointer, and that the type flags match. const VecBoolV combinedSelect = And(selectNodePtrV, selectTypeFlagMatchV); // Now, select between the true measured distances and FLT_MAX, depending on whether // these objects fit the acceptance criteria above. If FLT_MAX is selected here, // it won't be closer than objects we have previously found, so the objects being // looked at now won't be chosen. /FF const Vec4V d2V = SelectFT(combinedSelect, maxDistV, d2BeforeMaskV); // Compare the squared distance of these objects vs. the squared distances // of the closest objects found so far. /FF const VecBoolV selectCloserThan1V = IsLessThan(d2V, close1D2V); const VecBoolV selectCloserThan2V = IsLessThan(d2V, close2D2V); const VecBoolV selectCloserThan3V = IsLessThan(d2V, close3D2V); // Compute some temporary vectors for the logic of how to move the elements. // For example, temp2D2V is used for the squared distance of the 2nd closest // object. If we are going to replace that element, it would either be replaced // by the current distance (if the new object is closest than the old 2nd closest, // but not closer than the #1 closest one) or by the old distance for the #1 closest // one (if that's going to get replaced). /FF const Vec4V temp2D2V = SelectFT(selectCloserThan1V, d2V, close1D2V); const Vec4V temp3D2V = SelectFT(selectCloserThan2V, d2V, close2D2V); #if SPATIALARRAY64BIT const Vec4V temp2NodesUpperV = SelectFT(selectCloserThan1V, nodesUpperV, close1NodesUpperV); const Vec4V temp3NodesUpperV = SelectFT(selectCloserThan2V, nodesUpperV, close2NodesUpperV); const Vec4V temp2NodesLowerV = SelectFT(selectCloserThan1V, nodesLowerV, close1NodesLowerV); const Vec4V temp3NodesLowerV = SelectFT(selectCloserThan2V, nodesLowerV, close2NodesLowerV); #else const Vec4V temp2NodesV = SelectFT(selectCloserThan1V, nodesV, close1NodesV); const Vec4V temp3NodesV = SelectFT(selectCloserThan2V, nodesV, close2NodesV); #endif // Finally, compute the new first, second, and third closest objects found // so far. /FF close3D2V = SelectFT(selectCloserThan3V, close3D2V, temp3D2V); close2D2V = SelectFT(selectCloserThan2V, close2D2V, temp2D2V); close1D2V = SelectFT(selectCloserThan1V, close1D2V, d2V); #if SPATIALARRAY64BIT close3NodesUpperV = SelectFT(selectCloserThan3V, close3NodesUpperV, temp3NodesUpperV); close2NodesUpperV = SelectFT(selectCloserThan2V, close2NodesUpperV, temp2NodesUpperV); close1NodesUpperV = SelectFT(selectCloserThan1V, close1NodesUpperV, nodesUpperV); close3NodesLowerV = SelectFT(selectCloserThan3V, close3NodesLowerV, temp3NodesLowerV); close2NodesLowerV = SelectFT(selectCloserThan2V, close2NodesLowerV, temp2NodesLowerV); close1NodesLowerV = SelectFT(selectCloserThan1V, close1NodesLowerV, nodesLowerV); #else close3NodesV = SelectFT(selectCloserThan3V, close3NodesV, temp3NodesV); close2NodesV = SelectFT(selectCloserThan2V, close2NodesV, temp2NodesV); close1NodesV = SelectFT(selectCloserThan1V, close1NodesV, nodesV); #endif // Move on in the arrays. /FF objXPtr++; objYPtr++; objZPtr++; #if SPATIALARRAY64BIT nodesPtrUpper++; nodesPtrLower++; #else nodesPtr++; #endif typeFlagPtr++; } // Store out the squared distances and pointers to memory. /FF Vec4V closest3D2V[3]; closest3D2V[0] = close1D2V; closest3D2V[1] = close2D2V; closest3D2V[2] = close3D2V; #if SPATIALARRAY64BIT Vec4V closest3NodesUpperV[3]; closest3NodesUpperV[0] = close1NodesUpperV; closest3NodesUpperV[1] = close2NodesUpperV; closest3NodesUpperV[2] = close3NodesUpperV; Vec4V closest3NodesLowerV[3]; closest3NodesLowerV[0] = close1NodesLowerV; closest3NodesLowerV[1] = close2NodesLowerV; closest3NodesLowerV[2] = close3NodesLowerV; return sPickFromSortedArrays(closest3D2V, closest3NodesUpperV, closest3NodesLowerV, maxDist, found, 3); #else Vec4V closest3NodesV[3]; closest3NodesV[0] = close1NodesV; closest3NodesV[1] = close2NodesV; closest3NodesV[2] = close3NodesV; return sPickFromSortedArrays(closest3D2V, closest3NodesV, maxDist, found, 3); #endif } int CSpatialArray::FindClosest4(Vec3V_In centerV, CSpatialArrayNode **found, int ASSERT_ONLY(maxFound), const u32 &typeFlagsToCareAbout, const u32 &typeFlagValues, const CSpatialArrayNode* &excl1, const CSpatialArrayNode* &excl2, float maxDist) const { SA_PF_FUNC(FindClosest4); Assert(maxFound >= 4); // If this fails, there are values set in flagValues that are not in flagsToChange, // which we are probably better off if the user could avoid, so we don't have to // spend time on masking them here. /FF Assert((typeFlagValues & ~typeFlagsToCareAbout) == 0); SPATIALARRAYTHREADLOCK; // Load the type flag stuff into vector registers. Note that we intentionally // pass in these by reference, requiring the user to put them in memory, because // if they were passed in in general purpose registers, we would need to store // them to memory and load them back anyway. Could pass them in in vector // registers, of course, but that's probably not worth the trouble. /FF const Vec4V typeFlagsToCareAboutV = Vec4V(LoadScalar32IntoScalarV(typeFlagsToCareAbout)); const Vec4V typeFlagValuesV = Vec4V(LoadScalar32IntoScalarV(typeFlagValues)); #if SPATIALARRAY64BIT // Get the exclusion pointers into vector registers. This way // of doing it is probably sub-optimal: we should be able to // read straight from excl1/excl2 (references to caller's memory) // into vector registers like we do in the 32 bit case, but to do so // we would have to be really careful to avoid endianness issues. u64 excl1Ptr = sNodePtrToU64(excl1); u64 excl2Ptr = sNodePtrToU64(excl2); ScalarV excl1LowerSV, excl1UpperSV; ScalarV excl2LowerSV, excl2UpperSV; excl1UpperSV.Seti((u32)(excl1Ptr >> 32)); excl2UpperSV.Seti((u32)(excl2Ptr >> 32)); excl1LowerSV.Seti((u32)excl1Ptr); excl2LowerSV.Seti((u32)excl2Ptr); const Vec4V excl1LowerV = Vec4V(excl1LowerSV); const Vec4V excl2LowerV = Vec4V(excl2LowerSV); const Vec4V excl1UpperV = Vec4V(excl1UpperSV); const Vec4V excl2UpperV = Vec4V(excl2UpperSV); #else const Vec4V excl1V = Vec4V(LoadScalar32IntoScalarV(*(u32*)&excl1)); const Vec4V excl2V = Vec4V(LoadScalar32IntoScalarV(*(u32*)&excl2)); #endif const Vec4V* RESTRICT objXPtr = (const Vec4V*)m_PosXArray; const Vec4V* RESTRICT objYPtr = (const Vec4V*)m_PosYArray; const Vec4V* RESTRICT objZPtr = (const Vec4V*)m_PosZArray; #if SPATIALARRAY64BIT const Vec4V* RESTRICT nodesPtrLower = (const Vec4V*)m_NodeArrayLower; const Vec4V* RESTRICT nodesPtrUpper = (const Vec4V*)m_NodeArrayUpper; #else const Vec4V* RESTRICT nodesPtr = (const Vec4V*)m_NodeArray; #endif const Vec4V* RESTRICT typeFlagPtr = (const Vec4V*)m_TypeFlagArray; const Vec4V centerxV(SplatX(centerV)); const Vec4V centeryV(SplatY(centerV)); const Vec4V centerzV(SplatZ(centerV)); const Vec4V zeroV(V_ZERO); const Vec4V maxDistV(V_FLT_MAX); const int numObj = m_NumObj; // These are used to keep track of the three closest objects // for each of the components in the vector registers. /FF #if SPATIALARRAY64BIT Vec4V close1NodesLowerV(V_ZERO); Vec4V close2NodesLowerV(V_ZERO); Vec4V close3NodesLowerV(V_ZERO); Vec4V close4NodesLowerV(V_ZERO); Vec4V close1NodesUpperV(V_ZERO); Vec4V close2NodesUpperV(V_ZERO); Vec4V close3NodesUpperV(V_ZERO); Vec4V close4NodesUpperV(V_ZERO); #else Vec4V close1NodesV(V_ZERO); Vec4V close2NodesV(V_ZERO); Vec4V close3NodesV(V_ZERO); Vec4V close4NodesV(V_ZERO); #endif // These are the squared distances for the objects in // close[1/2/3/4]NodesV. /FF Vec4V close1D2V(V_FLT_MAX); Vec4V close2D2V(V_FLT_MAX); Vec4V close3D2V(V_FLT_MAX); Vec4V close4D2V(V_FLT_MAX); for(int i = 0; i < numObj; i += 4) { // Load from the arrays to the vector registers. /FF const Vec4V xxV = *objXPtr; const Vec4V yyV = *objYPtr; const Vec4V zzV = *objZPtr; #if SPATIALARRAY64BIT const Vec4V nodesUpperV = *nodesPtrUpper; const Vec4V nodesLowerV = *nodesPtrLower; #else const Vec4V nodesV = *nodesPtr; #endif const Vec4V objTypeFlagsV = *typeFlagPtr; // Compute the squared distance to the center. /FF const Vec4V dxV = Subtract(xxV, centerxV); const Vec4V dyV = Subtract(yyV, centeryV); const Vec4V dzV = Subtract(zzV, centerzV); const Vec4V dx2V = Scale(dxV, dxV); const Vec4V dxy2V = AddScaled(dx2V, dyV, dyV); const Vec4V d2BeforeMaskV = AddScaled(dxy2V, dzV, dzV); const Vec4V objTypeFlagsCaredAboutV = And(objTypeFlagsV, typeFlagsToCareAboutV); // This is needed to deal properly with the end of the array if the number // of objects is not aligned with 4. The node pointers beyond the end will // be NULL, and here we create a mask where 0x0000 means that the node was // within range (pointer not NULL) while 0xffff indicates a value past the // end of the array. /FF #if SPATIALARRAY64BIT const VecBoolV selectNodePtrZeroV = IsEqualInt(Or(nodesLowerV, nodesUpperV), zeroV); // Ptr is NULL only if both halves are 0. #else const VecBoolV selectNodePtrZeroV = IsEqualInt(nodesV, zeroV); #endif // Match the type flags. 0xffff in this mask indicates that // (objTypeFlags & typeFlagsToCareAbout) == typeFlagValues // i.e. the bits we care about have the values we are looking for. /FF const VecBoolV selectTypeFlagMatchV = IsEqualInt(objTypeFlagsCaredAboutV, typeFlagValuesV); #if SPATIALARRAY64BIT // Check for matches on the upper and lower halves of the exclusion addresses. const VecBoolV selectNodePtrExcl1LowerV = IsEqualInt(nodesLowerV, excl1LowerV); const VecBoolV selectNodePtrExcl2LowerV = IsEqualInt(nodesLowerV, excl2LowerV); const VecBoolV selectNodePtrExcl1UpperV = IsEqualInt(nodesUpperV, excl1UpperV); const VecBoolV selectNodePtrExcl2UpperV = IsEqualInt(nodesUpperV, excl2UpperV); // Combine the upper/lower halves together: both halves have to match. const VecBoolV selectNodePtrExcl1V = And(selectNodePtrExcl1LowerV, selectNodePtrExcl1UpperV); const VecBoolV selectNodePtrExcl2V = And(selectNodePtrExcl2LowerV, selectNodePtrExcl2UpperV); #else const VecBoolV selectNodePtrExcl1V = IsEqualInt(nodesV, excl1V); const VecBoolV selectNodePtrExcl2V = IsEqualInt(nodesV, excl2V); #endif // We have a couple of vectors now that are 0xffff on a mismatch, // instead of 0x0000 on a match. We OR them and NOT them so that // we get a mask that's 0xffff when they all match. /FF const VecBoolV selectNodePtrV = InvertBits(Or(Or(selectNodePtrExcl1V, selectNodePtrExcl2V), selectNodePtrZeroV)); // To allow use of an element, we require both that it's not past the end // of the array or otherwise an ineligible node pointer, and that the type flags match. const VecBoolV combinedSelect = And(selectNodePtrV, selectTypeFlagMatchV); // Now, select between the true measured distances and FLT_MAX, depending on whether // these objects fit the acceptance criteria above. If FLT_MAX is selected here, // it won't be closer than objects we have previously found, so the objects being // looked at now won't be chosen. /FF const Vec4V d2V = SelectFT(combinedSelect, maxDistV, d2BeforeMaskV); // Compare the squared distance of these objects vs. the squared distances // of the closest objects found so far. /FF const VecBoolV selectCloserThan1V = IsLessThan(d2V, close1D2V); const VecBoolV selectCloserThan2V = IsLessThan(d2V, close2D2V); const VecBoolV selectCloserThan3V = IsLessThan(d2V, close3D2V); const VecBoolV selectCloserThan4V = IsLessThan(d2V, close4D2V); // Compute some temporary vectors for the logic of how to move the elements. // For example, temp2D2V is used for the squared distance of the 2nd closest // object. If we are going to replace that element, it would either be replaced // by the current distance (if the new object is closest than the old 2nd closest, // but not closer than the #1 closest one) or by the old distance for the #1 closest // one (if that's going to get replaced). /FF const Vec4V temp2D2V = SelectFT(selectCloserThan1V, d2V, close1D2V); const Vec4V temp3D2V = SelectFT(selectCloserThan2V, d2V, close2D2V); const Vec4V temp4D2V = SelectFT(selectCloserThan3V, d2V, close3D2V); #if SPATIALARRAY64BIT const Vec4V temp2NodesUpperV = SelectFT(selectCloserThan1V, nodesUpperV, close1NodesUpperV); const Vec4V temp3NodesUpperV = SelectFT(selectCloserThan2V, nodesUpperV, close2NodesUpperV); const Vec4V temp4NodesUpperV = SelectFT(selectCloserThan3V, nodesUpperV, close3NodesUpperV); const Vec4V temp2NodesLowerV = SelectFT(selectCloserThan1V, nodesLowerV, close1NodesLowerV); const Vec4V temp3NodesLowerV = SelectFT(selectCloserThan2V, nodesLowerV, close2NodesLowerV); const Vec4V temp4NodesLowerV = SelectFT(selectCloserThan3V, nodesLowerV, close3NodesLowerV); #else const Vec4V temp2NodesV = SelectFT(selectCloserThan1V, nodesV, close1NodesV); const Vec4V temp3NodesV = SelectFT(selectCloserThan2V, nodesV, close2NodesV); const Vec4V temp4NodesV = SelectFT(selectCloserThan3V, nodesV, close3NodesV); #endif // Finally, compute the new first, second, and third closest objects found // so far. /FF close4D2V = SelectFT(selectCloserThan4V, close4D2V, temp4D2V); close3D2V = SelectFT(selectCloserThan3V, close3D2V, temp3D2V); close2D2V = SelectFT(selectCloserThan2V, close2D2V, temp2D2V); close1D2V = SelectFT(selectCloserThan1V, close1D2V, d2V); #if SPATIALARRAY64BIT close4NodesUpperV = SelectFT(selectCloserThan4V, close4NodesUpperV, temp4NodesUpperV); close3NodesUpperV = SelectFT(selectCloserThan3V, close3NodesUpperV, temp3NodesUpperV); close2NodesUpperV = SelectFT(selectCloserThan2V, close2NodesUpperV, temp2NodesUpperV); close1NodesUpperV = SelectFT(selectCloserThan1V, close1NodesUpperV, nodesUpperV); close4NodesLowerV = SelectFT(selectCloserThan4V, close4NodesLowerV, temp4NodesLowerV); close3NodesLowerV = SelectFT(selectCloserThan3V, close3NodesLowerV, temp3NodesLowerV); close2NodesLowerV = SelectFT(selectCloserThan2V, close2NodesLowerV, temp2NodesLowerV); close1NodesLowerV = SelectFT(selectCloserThan1V, close1NodesLowerV, nodesLowerV); #else close4NodesV = SelectFT(selectCloserThan4V, close4NodesV, temp4NodesV); close3NodesV = SelectFT(selectCloserThan3V, close3NodesV, temp3NodesV); close2NodesV = SelectFT(selectCloserThan2V, close2NodesV, temp2NodesV); close1NodesV = SelectFT(selectCloserThan1V, close1NodesV, nodesV); #endif // Move on in the arrays. /FF objXPtr++; objYPtr++; objZPtr++; #if SPATIALARRAY64BIT nodesPtrUpper++; nodesPtrLower++; #else nodesPtr++; #endif typeFlagPtr++; } // Store out the squared distances and pointers to memory. /FF Vec4V closest4D2V[4]; closest4D2V[0] = close1D2V; closest4D2V[1] = close2D2V; closest4D2V[2] = close3D2V; closest4D2V[3] = close4D2V; #if SPATIALARRAY64BIT Vec4V closest4NodesUpperV[4]; closest4NodesUpperV[0] = close1NodesUpperV; closest4NodesUpperV[1] = close2NodesUpperV; closest4NodesUpperV[2] = close3NodesUpperV; closest4NodesUpperV[3] = close4NodesUpperV; Vec4V closest4NodesLowerV[4]; closest4NodesLowerV[0] = close1NodesLowerV; closest4NodesLowerV[1] = close2NodesLowerV; closest4NodesLowerV[2] = close3NodesLowerV; closest4NodesLowerV[3] = close4NodesLowerV; return sPickFromSortedArrays(closest4D2V, closest4NodesUpperV, closest4NodesLowerV, maxDist, found, 4); #else Vec4V closest4NodesV[4]; closest4NodesV[0] = close1NodesV; closest4NodesV[1] = close2NodesV; closest4NodesV[2] = close3NodesV; closest4NodesV[3] = close4NodesV; return sPickFromSortedArrays(closest4D2V, closest4NodesV, maxDist, found, 4); #endif } int CSpatialArray::FindInSphere(Vec3V_In centerV, ScalarV_In radiusV, FindResult *found, int maxFound) const { SA_PF_FUNC(FindInSphere); SPATIALARRAYTHREADLOCK; // TODO: Probably operate on 8 objects instead of 4, to keep vector pipeline busy. // TODO: Maybe use cache prefetch and/or clear instructions. // TODO: Add some protection about assumption of 32 bit pointers, etc. /FF const Vec4V radius2V(Scale(radiusV, radiusV)); const int numObj = m_NumObj; const Vec4V* RESTRICT objXPtr = (const Vec4V*)m_PosXArray; const Vec4V* RESTRICT objYPtr = (const Vec4V*)m_PosYArray; const Vec4V* RESTRICT objZPtr = (const Vec4V*)m_PosZArray; const Vec4V centerxV(SplatX(centerV)); const Vec4V centeryV(SplatY(centerV)); const Vec4V centerzV(SplatZ(centerV)); #if SPATIALARRAY64BIT const static int tempArraySize = (kMaxObjForTempBuffer*sizeof(u32))/sizeof(Vec4V); Vec4V foundArrayBuffUpper[tempArraySize]; Vec4V foundArrayBuffLower[tempArraySize]; u32* RESTRICT foundArrayUpper = (u32*)foundArrayBuffUpper; u32* RESTRICT foundArrayLower = (u32*)foundArrayBuffLower; // Make really sure they got aligned properly. /FF Assertf((((size_t)foundArrayUpper) & 0xf) == 0, "Got address %p, expected 16 byte alignment.", foundArrayUpper); Assertf((((size_t)foundArrayLower) & 0xf) == 0, "Got address %p, expected 16 byte alignment.", foundArrayLower); u32* RESTRICT foundArrayPtrUpper = foundArrayUpper; u32* RESTRICT foundArrayPtrLower = foundArrayLower; #else // Reserve a vector-aligned array on the stack. /FF // Note: I believe this would work too: ALIGNAS(16) CSpatialArrayNode* foundArrayBuff[kMaxObjForTempBuffer] ; const static int tempArraySize = (kMaxObjForTempBuffer*sizeof(CSpatialArrayNodeAddr))/sizeof(Vec4V); Vec4V foundArrayBuff[ tempArraySize ]; CSpatialArrayNodeAddr* RESTRICT foundArray = (CSpatialArrayNodeAddr*)foundArrayBuff; // Make really sure it got aligned properly. /FF Assertf((((size_t)foundArray) & 0xf) == 0, "Got address %p, expected 16 byte alignment.", foundArray); CSpatialArrayNodeAddr* RESTRICT foundArrayPtr = foundArray; #endif Vec4V distanceArray[ kMaxObjForTempBuffer ]; int numfound = 0; Vec4V* RESTRICT distanceArrayPtr = distanceArray; for(int i = 0; i < numObj; i += 4) { const Vec4V xxV = *objXPtr; const Vec4V yyV = *objYPtr; const Vec4V zzV = *objZPtr; #if SPATIALARRAY64BIT const Vec4V nodesUpperV = *(Vec4V*)&m_NodeArrayUpper[i]; const Vec4V nodesLowerV = *(Vec4V*)&m_NodeArrayLower[i]; #else const Vec4V nodesV = *(Vec4V*)&m_NodeArray[i]; #endif const Vec4V dxV = Subtract(xxV, centerxV); const Vec4V dyV = Subtract(yyV, centeryV); const Vec4V dzV = Subtract(zzV, centerzV); const Vec4V dx2V = Scale(dxV, dxV); const Vec4V dxy2V = AddScaled(dx2V, dyV, dyV); const Vec4V d2V = AddScaled(dxy2V, dzV, dzV); const Vec4V selectWithinSphereV(IsLessThan(d2V, radius2V)); #if SPATIALARRAY64BIT const Vec4V nodesWithinSphereUpperV = And(selectWithinSphereV, nodesUpperV); const Vec4V nodesWithinSphereLowerV = And(selectWithinSphereV, nodesLowerV); #else const Vec4V nodesWithinSphereV = And(selectWithinSphereV, nodesV); #endif objXPtr++; objYPtr++; objZPtr++; Vec4V* RESTRICT oldDistancePtr = distanceArrayPtr; #if SPATIALARRAY64BIT u32* RESTRICT oldFoundArrayPtrUpper = foundArrayPtrUpper; foundArrayPtrUpper += 4; u32* RESTRICT oldFoundArrayPtrLower = foundArrayPtrLower; foundArrayPtrLower += 4; *(Vec4V*)oldFoundArrayPtrLower = nodesWithinSphereLowerV; *(Vec4V*)oldFoundArrayPtrUpper = nodesWithinSphereUpperV; #else CSpatialArrayNodeAddr* RESTRICT oldFoundArrayPtr = foundArrayPtr; foundArrayPtr += 4; *(Vec4V*)oldFoundArrayPtr = nodesWithinSphereV; #endif distanceArrayPtr ++; *oldDistancePtr = d2V; } float* floatDistanceArray = reinterpret_cast(distanceArray); for(int i = 0; i < numObj; i++) { #if SPATIALARRAY64BIT CSpatialArrayNode* addr = NodePtrFromUpperLower(foundArrayUpper[i], foundArrayLower[i]); #else CSpatialArrayNode* addr = (CSpatialArrayNode*)foundArray[i]; #endif if(addr) { found[numfound].m_Node = addr; found[numfound].m_DistanceSq = floatDistanceArray[i]; numfound++; if(numfound >= maxFound) { break; } } } return numfound; } int CSpatialArray::FindInCylinderXY(Vec2V_In centerXYV, ScalarV_In radiusV, CSpatialArrayNode **found, int maxFound) const { SA_PF_FUNC(FindInCylinderXY); SPATIALARRAYTHREADLOCK; const Vec4V radius2V(Scale(radiusV, radiusV)); const int numObj = m_NumObj; const Vec4V* RESTRICT objXPtr = (const Vec4V*)m_PosXArray; const Vec4V* RESTRICT objYPtr = (const Vec4V*)m_PosYArray; const Vec4V centerxV(SplatX(centerXYV)); const Vec4V centeryV(SplatY(centerXYV)); #if SPATIALARRAY64BIT const static int tempArraySize = (kMaxObjForTempBuffer*sizeof(u32))/sizeof(Vec4V); Vec4V foundArrayBuffUpper[tempArraySize]; Vec4V foundArrayBuffLower[tempArraySize]; u32* RESTRICT foundArrayUpper = (u32*)foundArrayBuffUpper; u32* RESTRICT foundArrayLower = (u32*)foundArrayBuffLower; // Make really sure they got aligned properly. /FF Assertf((((size_t)foundArrayUpper) & 0xf) == 0, "Got address %p, expected 16 byte alignment.", foundArrayUpper); Assertf((((size_t)foundArrayLower) & 0xf) == 0, "Got address %p, expected 16 byte alignment.", foundArrayLower); u32* RESTRICT foundArrayPtrUpper = foundArrayUpper; u32* RESTRICT foundArrayPtrLower = foundArrayLower; #else // Reserve a vector-aligned array on the stack. /FF // Note: I believe this would work too: ALIGNAS(16) CSpatialArrayNode* foundArrayBuff[kMaxObjForTempBuffer] ; const static int tempArraySize = (kMaxObjForTempBuffer*sizeof(CSpatialArrayNodeAddr))/sizeof(Vec4V); Vec4V foundArrayBuff[ tempArraySize ]; CSpatialArrayNodeAddr* RESTRICT foundArray = (CSpatialArrayNodeAddr*)foundArrayBuff; // Make really sure it got aligned properly. /FF Assertf((((size_t)foundArray) & 0xf) == 0, "Got address %p, expected 16 byte alignment.", foundArray); CSpatialArrayNodeAddr* RESTRICT foundArrayPtr = foundArray; #endif for(int i = 0; i < numObj; i += 4) { const Vec4V xxV = *objXPtr; const Vec4V yyV = *objYPtr; #if SPATIALARRAY64BIT const Vec4V nodesUpperV = *(Vec4V*)&m_NodeArrayUpper[i]; const Vec4V nodesLowerV = *(Vec4V*)&m_NodeArrayLower[i]; #else const Vec4V nodesV = *(Vec4V*)&m_NodeArray[i]; #endif const Vec4V dxV = Subtract(xxV, centerxV); const Vec4V dyV = Subtract(yyV, centeryV); const Vec4V dx2V = Scale(dxV, dxV); const Vec4V d2V = AddScaled(dx2V, dyV, dyV); const Vec4V selectWithinCylV(IsLessThan(d2V, radius2V)); #if SPATIALARRAY64BIT const Vec4V nodesWithinCylUpperV = And(selectWithinCylV, nodesUpperV); const Vec4V nodesWithinCylLowerV = And(selectWithinCylV, nodesLowerV); #else const Vec4V nodesWithinCylV = And(selectWithinCylV, nodesV); #endif objXPtr++; objYPtr++; #if SPATIALARRAY64BIT u32* RESTRICT oldFoundArrayPtrUpper = foundArrayPtrUpper; foundArrayPtrUpper += 4; u32* RESTRICT oldFoundArrayPtrLower = foundArrayPtrLower; foundArrayPtrLower += 4; *(Vec4V*)oldFoundArrayPtrLower = nodesWithinCylLowerV; *(Vec4V*)oldFoundArrayPtrUpper = nodesWithinCylUpperV; #else CSpatialArrayNodeAddr* RESTRICT oldFoundArrayPtr = foundArrayPtr; foundArrayPtr += 4; *(Vec4V*)oldFoundArrayPtr = nodesWithinCylV; #endif } #if SPATIALARRAY64BIT return CreateCompactNodePointerArray(foundArrayUpper, foundArrayLower, numObj, found, maxFound); #else return CreateCompactNodePointerArray(foundArray, numObj, found, maxFound); #endif } int CSpatialArray::FindInSphere(Vec3V_In centerV, float radius, FindResult *found, int maxFound) const { const ScalarV radiusV(LoadScalar32IntoScalarV(radius)); return FindInSphere(centerV, radiusV, found, maxFound); } int CSpatialArray::FindInSphereOfType(Vec3V_In centerV, ScalarV_In radiusV, CSpatialArrayNode **found, int maxFound, const u32 &typeFlagsToCareAbout, const u32 &typeFlagValues) const { SA_PF_FUNC(FindInSphereOfType); // If this fails, there are values set in flagValues that are not in flagsToChange, // which we are probably better off if the user could avoid, so we don't have to // spend time on masking them here. /FF Assert((typeFlagValues & ~typeFlagsToCareAbout) == 0); SPATIALARRAYTHREADLOCK; // TODO: Probably operate on 8 objects instead of 4, to keep vector pipeline busy. // TODO: Maybe use cache prefetch and/or clear instructions. // TODO: Add some protection about assumption of 32 bit pointers, etc. /FF const Vec4V radius2V(Scale(radiusV, radiusV)); const int numObj = m_NumObj; // Load the type flag stuff into vector registers. Note that we intentionally // pass in these by reference, requiring the user to put them in memory, because // if they were passed in in general purpose registers, we would need to store // them to memory and load them back anyway. Could pass them in in vector // registers, of course, but that's probably not worth the trouble. /FF const Vec4V typeFlagsToCareAboutV = Vec4V(LoadScalar32IntoScalarV(typeFlagsToCareAbout)); const Vec4V typeFlagValuesV = Vec4V(LoadScalar32IntoScalarV(typeFlagValues)); const Vec4V* RESTRICT objXPtr = (const Vec4V*)m_PosXArray; const Vec4V* RESTRICT objYPtr = (const Vec4V*)m_PosYArray; const Vec4V* RESTRICT objZPtr = (const Vec4V*)m_PosZArray; const Vec4V* RESTRICT typeFlagPtr = (const Vec4V*)m_TypeFlagArray; const Vec4V centerxV(SplatX(centerV)); const Vec4V centeryV(SplatY(centerV)); const Vec4V centerzV(SplatZ(centerV)); #if SPATIALARRAY64BIT const static int tempArraySize = (kMaxObjForTempBuffer*sizeof(u32))/sizeof(Vec4V); Vec4V foundArrayBuffUpper[tempArraySize]; Vec4V foundArrayBuffLower[tempArraySize]; u32* RESTRICT foundArrayUpper = (u32*)foundArrayBuffUpper; u32* RESTRICT foundArrayLower = (u32*)foundArrayBuffLower; // Make really sure they got aligned properly. /FF Assertf((((size_t)foundArrayUpper) & 0xf) == 0, "Got address %p, expected 16 byte alignment.", foundArrayUpper); Assertf((((size_t)foundArrayLower) & 0xf) == 0, "Got address %p, expected 16 byte alignment.", foundArrayLower); u32* RESTRICT foundArrayPtrUpper = foundArrayUpper; u32* RESTRICT foundArrayPtrLower = foundArrayLower; #else // Reserve a vector-aligned array on the stack. /FF // Note: I believe this would work too: ALIGNAS(16) CSpatialArrayNode* foundArrayBuff[kMaxObjForTempBuffer] ; const static int tempArraySize = (kMaxObjForTempBuffer*sizeof(CSpatialArrayNodeAddr))/sizeof(Vec4V); Vec4V foundArrayBuff[ tempArraySize ]; CSpatialArrayNodeAddr* RESTRICT foundArray = (CSpatialArrayNodeAddr*)foundArrayBuff; // Make really sure it got aligned properly. /FF Assertf((((size_t)foundArray) & 0xf) == 0, "Got address %p, expected 16 byte alignment.", foundArray); CSpatialArrayNodeAddr* RESTRICT foundArrayPtr = foundArray; #endif for(int i = 0; i < numObj; i += 4) { const Vec4V xxV = *objXPtr; const Vec4V yyV = *objYPtr; const Vec4V zzV = *objZPtr; #if SPATIALARRAY64BIT const Vec4V nodesUpperV = *(Vec4V*)&m_NodeArrayUpper[i]; const Vec4V nodesLowerV = *(Vec4V*)&m_NodeArrayLower[i]; #else const Vec4V nodesV = *(Vec4V*)&m_NodeArray[i]; #endif const Vec4V objTypeFlagsV = *typeFlagPtr; const Vec4V dxV = Subtract(xxV, centerxV); const Vec4V dyV = Subtract(yyV, centeryV); const Vec4V dzV = Subtract(zzV, centerzV); const Vec4V dx2V = Scale(dxV, dxV); const Vec4V dxy2V = AddScaled(dx2V, dyV, dyV); const Vec4V d2V = AddScaled(dxy2V, dzV, dzV); const VecBoolV selectWithinSphereV = IsLessThan(d2V, radius2V); // Get the type flags and filter out the ones we don't care about. const Vec4V objTypeFlagsCaredAboutV = And(objTypeFlagsV, typeFlagsToCareAboutV); // See if the remaining ones have the values we want. If so, selectTypeFlagMatchV // should be all 0xffffffff. const VecBoolV selectTypeFlagMatchV = IsEqualInt(objTypeFlagsCaredAboutV, typeFlagValuesV); // Compute the mask for matching both the type and being within the sphere. const VecBoolV selectMatch = And(selectWithinSphereV, selectTypeFlagMatchV); // Mask out the pointers for the nodes that didn't match. #if SPATIALARRAY64BIT const Vec4V matchingNodesUpperV = And((Vec4V)selectMatch, nodesUpperV); const Vec4V matchingNodesLowerV = And((Vec4V)selectMatch, nodesLowerV); #else const Vec4V matchingNodesV = And((Vec4V)selectMatch, nodesV); #endif objXPtr++; objYPtr++; objZPtr++; typeFlagPtr++; #if SPATIALARRAY64BIT u32* RESTRICT oldFoundArrayPtrUpper = foundArrayPtrUpper; foundArrayPtrUpper += 4; u32* RESTRICT oldFoundArrayPtrLower = foundArrayPtrLower; foundArrayPtrLower += 4; *(Vec4V*)oldFoundArrayPtrLower = matchingNodesLowerV; *(Vec4V*)oldFoundArrayPtrUpper = matchingNodesUpperV; #else CSpatialArrayNodeAddr* RESTRICT oldFoundArrayPtr = foundArrayPtr; foundArrayPtr += 4; *(Vec4V*)oldFoundArrayPtr = matchingNodesV; #endif } #if SPATIALARRAY64BIT return CreateCompactNodePointerArray(foundArrayUpper, foundArrayLower, numObj, found, maxFound); #else return CreateCompactNodePointerArray(foundArray, numObj, found, maxFound); #endif } int CSpatialArray::FindBelowZ(ScalarV_In scalar_thresholdZV, CSpatialArrayNode **found, int maxFound) const { SA_PF_FUNC(FindBelowZ); SPATIALARRAYTHREADLOCK; // TODO: Probably operate on 8 objects instead of 4, to keep vector pipeline busy. // TODO: Maybe use cache prefetch and/or clear instructions. // TODO: Add some protection about assumption of 32 bit pointers, etc. /FF const int numObj = m_NumObj; const Vec4V* RESTRICT objZPtr = (const Vec4V*)m_PosZArray; #if SPATIALARRAY64BIT const static int tempArraySize = (kMaxObjForTempBuffer*sizeof(u32))/sizeof(Vec4V); Vec4V foundArrayBuffUpper[tempArraySize]; Vec4V foundArrayBuffLower[tempArraySize]; u32* RESTRICT foundArrayUpper = (u32*)foundArrayBuffUpper; u32* RESTRICT foundArrayLower = (u32*)foundArrayBuffLower; // Make really sure they got aligned properly. /FF Assertf((((size_t)foundArrayUpper) & 0xf) == 0, "Got address %p, expected 16 byte alignment.", foundArrayUpper); Assertf((((size_t)foundArrayLower) & 0xf) == 0, "Got address %p, expected 16 byte alignment.", foundArrayLower); u32* RESTRICT foundArrayPtrUpper = foundArrayUpper; u32* RESTRICT foundArrayPtrLower = foundArrayLower; #else // Reserve a vector-aligned array on the stack. /FF // Note: I believe this would work too: ALIGNAS(16) CSpatialArrayNode* foundArrayBuff[kMaxObjForTempBuffer] ; const static int tempArraySize = (kMaxObjForTempBuffer*sizeof(CSpatialArrayNodeAddr))/sizeof(Vec4V); Vec4V foundArrayBuff[ tempArraySize ]; CSpatialArrayNodeAddr* RESTRICT foundArray = (CSpatialArrayNodeAddr*)foundArrayBuff; // Make really sure it got aligned properly. /FF Assertf((((size_t)foundArray) & 0xf) == 0, "Got address %p, expected 16 byte alignment.", foundArray); CSpatialArrayNodeAddr* RESTRICT foundArrayPtr = foundArray; #endif const Vec4V thresholdZV(scalar_thresholdZV); for(int i = 0; i < numObj; i += 4) { const Vec4V zzV = *objZPtr; #if SPATIALARRAY64BIT const Vec4V nodesUpperV = *(Vec4V*)&m_NodeArrayUpper[i]; const Vec4V nodesLowerV = *(Vec4V*)&m_NodeArrayLower[i]; #else const Vec4V nodesV = *(Vec4V*)&m_NodeArray[i]; #endif const VecBoolV selectMatch = IsLessThan(zzV, thresholdZV); // Mask out the pointers for the nodes that didn't match. #if SPATIALARRAY64BIT const Vec4V matchingNodesUpperV = And((Vec4V)selectMatch, nodesUpperV); const Vec4V matchingNodesLowerV = And((Vec4V)selectMatch, nodesLowerV); #else const Vec4V matchingNodesV = And((Vec4V)selectMatch, nodesV); #endif objZPtr++; #if SPATIALARRAY64BIT u32* RESTRICT oldFoundArrayPtrUpper = foundArrayPtrUpper; foundArrayPtrUpper += 4; u32* RESTRICT oldFoundArrayPtrLower = foundArrayPtrLower; foundArrayPtrLower += 4; *(Vec4V*)oldFoundArrayPtrLower = matchingNodesLowerV; *(Vec4V*)oldFoundArrayPtrUpper = matchingNodesUpperV; #else CSpatialArrayNodeAddr* RESTRICT oldFoundArrayPtr = foundArrayPtr; foundArrayPtr += 4; *(Vec4V*)oldFoundArrayPtr = matchingNodesV; #endif } #if SPATIALARRAY64BIT return CreateCompactNodePointerArray(foundArrayUpper, foundArrayLower, numObj, found, maxFound); #else return CreateCompactNodePointerArray(foundArray, numObj, found, maxFound); #endif } int CSpatialArray::FindInSphereOfType(Vec3V_In centerV, float radius, CSpatialArrayNode **found, int maxFound, const u32 &typeFlagsToCareAbout, const u32 &typeFlagValues) const { const ScalarV radiusV(LoadScalar32IntoScalarV(radius)); return FindInSphereOfType(centerV, radiusV, found, maxFound, typeFlagsToCareAbout, typeFlagValues); } int CSpatialArray::FindNearSegment(Vec3V_In segPos1V, Vec3V_In segPos2V, const float& distSegToObjCenter, CSpatialArrayNode** found, int maxFound) const { SA_PF_FUNC(FindNearSegment); SPATIALARRAYTHREADLOCK; const ScalarV thresholdDistV = LoadScalar32IntoScalarV(distSegToObjCenter); const Vec4V thresholdDistSqV = Vec4V(Scale(thresholdDistV, thresholdDistV)); const int numObj = m_NumObj; const Vec4V* RESTRICT objXPtr = (const Vec4V*)m_PosXArray; const Vec4V* RESTRICT objYPtr = (const Vec4V*)m_PosYArray; const Vec4V* RESTRICT objZPtr = (const Vec4V*)m_PosZArray; #if SPATIALARRAY64BIT const static int tempArraySize = (kMaxObjForTempBuffer*sizeof(u32))/sizeof(Vec4V); Vec4V foundArrayBuffUpper[tempArraySize]; Vec4V foundArrayBuffLower[tempArraySize]; u32* RESTRICT foundArrayUpper = (u32*)foundArrayBuffUpper; u32* RESTRICT foundArrayLower = (u32*)foundArrayBuffLower; // Make really sure they got aligned properly. /FF Assertf((((size_t)foundArrayUpper) & 0xf) == 0, "Got address %p, expected 16 byte alignment.", foundArrayUpper); Assertf((((size_t)foundArrayLower) & 0xf) == 0, "Got address %p, expected 16 byte alignment.", foundArrayLower); u32* RESTRICT foundArrayPtrUpper = foundArrayUpper; u32* RESTRICT foundArrayPtrLower = foundArrayLower; #else // Reserve a vector-aligned array on the stack. /FF // Note: I believe this would work too: ALIGNAS(16) CSpatialArrayNode* foundArrayBuff[kMaxObjForTempBuffer] ; const static int tempArraySize = (kMaxObjForTempBuffer*sizeof(CSpatialArrayNodeAddr))/sizeof(Vec4V); Vec4V foundArrayBuff[ tempArraySize ]; CSpatialArrayNodeAddr* RESTRICT foundArray = (CSpatialArrayNodeAddr*)foundArrayBuff; // Make really sure it got aligned properly. /FF Assertf((((size_t)foundArray) & 0xf) == 0, "Got address %p, expected 16 byte alignment.", foundArray); CSpatialArrayNodeAddr* RESTRICT foundArrayPtr = foundArray; #endif const Vec3V segPos1To2V = Subtract(segPos2V, segPos1V); const Vec4V point1XV(segPos1V.GetX()); const Vec4V point1YV(segPos1V.GetY()); const Vec4V point1ZV(segPos1V.GetZ()); const Vec4V deltaXV(segPos1To2V.GetX()); const Vec4V deltaYV(segPos1To2V.GetY()); const Vec4V deltaZV(segPos1To2V.GetZ()); const Vec4V zeroV(V_ZERO); const Vec4V oneV(V_ONE); for(int i = 0; i < numObj; i += 4) { const Vec4V xxV = *objXPtr; const Vec4V yyV = *objYPtr; const Vec4V zzV = *objZPtr; #if SPATIALARRAY64BIT const Vec4V nodesUpperV = *(Vec4V*)&m_NodeArrayUpper[i]; const Vec4V nodesLowerV = *(Vec4V*)&m_NodeArrayLower[i]; #else const Vec4V nodesV = *(Vec4V*)&m_NodeArray[i]; #endif // Here, we will compute the T values of the closest points on the segment, // for the four points. It's more or less done with the same operations // as in geomTValues::FindTValueSegToOriginV(), and it's even more similar // to sFindTValueSegToPoint() in 'TaskNavBase.cpp'. const Vec4V ptXV = Subtract(xxV, point1XV); const Vec4V ptYV = Subtract(yyV, point1YV); const Vec4V ptZV = Subtract(zzV, point1ZV); const Vec4V oneDotXV = Scale(deltaXV, ptXV); const Vec4V oneDotXYV = AddScaled(oneDotXV, deltaYV, ptYV); const Vec4V oneDotV = AddScaled(oneDotXYV, deltaZV, ptZV); const Vec4V bothDotXV = Scale(deltaXV, deltaXV); const Vec4V bothDotXYV = AddScaled(bothDotXV, deltaYV, deltaYV); const Vec4V bothDotV = AddScaled(bothDotXYV, deltaZV, deltaZV); const Vec4V tOnInfLineV = InvScaleFast(oneDotV, bothDotV); const VecBoolV tMaxMaskV = IsGreaterThanOrEqual(tOnInfLineV, oneV); const Vec4V tClampedMaxV = SelectFT(tMaxMaskV, tOnInfLineV, oneV); const VecBoolV tMinMaskV = IsGreaterThan(oneDotV, zeroV); const Vec4V tClampedV = And(tClampedMaxV, Vec4V(tMinMaskV)); // Next, compute the X, Y, and Z coordinates of the closest points // to each of the four objects. const Vec4V closestPtXV = AddScaled(point1XV, deltaXV, tClampedV); const Vec4V closestPtYV = AddScaled(point1YV, deltaYV, tClampedV); const Vec4V closestPtZV = AddScaled(point1ZV, deltaZV, tClampedV); // Compute the squared distance to each of these. const Vec4V ptToClosestXV = Subtract(closestPtXV, xxV); const Vec4V ptToClosestYV = Subtract(closestPtYV, yyV); const Vec4V ptToClosestZV = Subtract(closestPtZV, zzV); const Vec4V distSqXV = Scale(ptToClosestXV, ptToClosestXV); const Vec4V distSqXYV = AddScaled(distSqXV, ptToClosestYV, ptToClosestYV); const Vec4V distSqV = AddScaled(distSqXYV, ptToClosestZV, ptToClosestZV); // Compute a mask for which objects are close enough, and AND that with // the node addresses. const Vec4V selectNearSegV(IsLessThan(distSqV, thresholdDistSqV)); #if SPATIALARRAY64BIT const Vec4V nodesWithinSphereUpperV = And(selectNearSegV, nodesUpperV); const Vec4V nodesWithinSphereLowerV = And(selectNearSegV, nodesLowerV); #else const Vec4V nodesWithinSphereV = And(selectNearSegV, nodesV); #endif // Advance to the next four objects. objXPtr++; objYPtr++; objZPtr++; #if SPATIALARRAY64BIT u32* RESTRICT oldFoundArrayPtrUpper = foundArrayPtrUpper; foundArrayPtrUpper += 4; u32* RESTRICT oldFoundArrayPtrLower = foundArrayPtrLower; foundArrayPtrLower += 4; *(Vec4V*)oldFoundArrayPtrLower = nodesWithinSphereLowerV; *(Vec4V*)oldFoundArrayPtrUpper = nodesWithinSphereUpperV; #else CSpatialArrayNodeAddr* RESTRICT oldFoundArrayPtr = foundArrayPtr; foundArrayPtr += 4; *(Vec4V*)oldFoundArrayPtr = nodesWithinSphereV; #endif } // Create a compact array of pointers to return to the caller. #if SPATIALARRAY64BIT return CreateCompactNodePointerArray(foundArrayUpper, foundArrayLower, numObj, found, maxFound); #else return CreateCompactNodePointerArray(foundArray, numObj, found, maxFound); #endif } #if __DEV void CSpatialArray::DebugDraw() const { SPATIALARRAYTHREADLOCK; Matrix34 mtrx; mtrx.Identity(); const int numObj = m_NumObj; for(int i = 0; i < numObj; i++) { mtrx.d.x = m_PosXArray[i]; mtrx.d.y = m_PosYArray[i]; mtrx.d.z = m_PosZArray[i]; grcDebugDraw::Axis(mtrx, 1.0f); char buf[16]; formatf(buf, "%04x", m_TypeFlagArray[i]); grcDebugDraw::Text(mtrx.d, Color_white, buf); } } #endif // __DEV #if SPATIALARRAY64BIT int CSpatialArray::CreateCompactNodePointerArray(const u32* foundArrayUpper, const u32* foundArrayLower, int numObj, CSpatialArrayNode** found, int maxFound) #else int CSpatialArray::CreateCompactNodePointerArray(const CSpatialArrayNodeAddr* foundArray, int numObj, CSpatialArrayNode** found, int maxFound) #endif { int numfound = 0; for(int i = 0; i < numObj; i++) { #if SPATIALARRAY64BIT CSpatialArrayNode* addr = NodePtrFromUpperLower(foundArrayUpper[i], foundArrayLower[i]); #else CSpatialArrayNode* addr = (CSpatialArrayNode*)foundArray[i]; #endif if(addr) { found[numfound] = addr; numfound++; if(numfound >= maxFound) { break; } } } return numfound; } //------------------------------------------------------------------------------ /* End of file sagcore/spatialarray.cpp */