Q about possible frustum culling with bepu #123
-
I will soon be implementing frustum culling and I'm evaluating how to approach it. I saw https://forum.bepuentertainment.com/viewtopic.php?t=2513, but that's from before bepu2 was released, and I couldn't find any newer discussions about it. I also skimmed all the samples, but didn't go into details and as such didn't find anything that caught my eye. So I'm wondering: does bepu2 have anything more suitable for this task than bepu1? I would love to avoid reinventing the wheel (especially since bepu is already parallelized & optimized to the moon and whatnot), but if there's nothing better than a specialized BVH/octree/kd-tree for frustum culling, even approximately, that's also OK ;) |
Beta Was this translation helpful? Give feedback.
Replies: 8 comments 20 replies
-
For the purposes of rendering or similar use cases using bepuphysics2 infrastructure, the easiest thing would be to piggy back on the broad phase Tree (or a Tree maintained specifically for whatever the use case is, if it's not the same as physics objects) and implement a frustum-tree test. I haven't implemented that in bepuphysics2 yet unfortunately (all frustum culling is done on the GPU in my projects), but it would not be too hard to do. You could take the existing ray-tree test and replace the ray-AABB test with a frustum-AABB test. I might add such a thing eventually, but if you'd like to give it a shot, here's some HLSL for such a test. First, since the frustum is going to be tested against a lot of AABBs, it caches out separating axis candidates: struct FrustumCorners
{
//Four direction vectors from the frustum's origin along its corner edges; together with the origin they describe the frustum.
float3 V00; //Upper left
float V11X; //Lower right x
float3 V01; //Lower left
float V11Y; //Lower right y
float3 V10; //Upper right
float V11Z; //Lower right z
//The above is packed to stay on 16 byte alignment without padding. This is unlikely to be helpful for a CPU side use case unless you've got a lot of frusta. These are directions from the origin of the frustum pointing along its edges.
};
struct AxisCandidates
{
//Separating axis data computed once per frustum (see CreateSeparatingAxisCandidates) and reused for every AABB test.
float3 F0; //Left face normal.
float3 F1; //Top face normal.
float3 F2; //Right face normal.
float3 F3; //Bottom face normal.
float2 EdgeX; //Best edge axis candidate perpendicular to world X (yz components).
float2 EdgeY; //Best edge axis candidate perpendicular to world Y (xz components).
float2 EdgeZ; //Best edge axis candidate perpendicular to world Z (xy components).
int FaceSigns; //Packed 2-bit-per-axis sign classification of the corner directions; see CreateSeparatingAxisCandidates/CheckFaceSigns.
};
//Evaluates one frustum-edge-vs-world-axis separating axis candidate (as a 2D axis in the plane perpendicular to the world axis)
//and keeps it if it scores better than the current best. faceA/faceB are the adjacent frustum face normals projected into the same plane.
void CalibrateCandidate(float2 candidate, float2 centralDirection, float2 faceA, float2 faceB, inout float bestScore, inout float2 bestCandidate)
{
//Orient the candidate away from the frustum interior so the frustum's interval along it is nonpositive.
if (dot(candidate, centralDirection) > 0)
candidate = -candidate;
//Only use edge pair candidates whose candidate lies within the voronoi region of the contributing frustum edge.
//Otherwise, the interval of the frustum along the candidate won't be (-inf, 0).
float dotA = dot(faceA, candidate);
float dotB = dot(faceB, candidate);
//Rejected candidates are zeroed rather than skipped; a zero axis makes the downstream interval test trivially pass (conservative).
if (dotA <= 0 || dotB <= 0)
candidate = 0;
//Score by how strongly the candidate differs from the adjacent face normals.
float2 extentAlongFaceNormals = float2(dotA, dotB);
float score = dot(extentAlongFaceNormals, extentAlongFaceNormals);
if (score > bestScore)
{
bestScore = score;
bestCandidate = candidate;
}
}
//(x y and z here are just a hacky version of a rotation matrix's rows For Reasons that won't apply elsewhere, perfectly fine to change)
//Builds the per-frustum AxisCandidates: the four face normals, one edge-cross axis per world axis, and the packed face sign flags.
//centralDirection receives the normalized sum of the four (rotated) corner directions.
AxisCandidates CreateSeparatingAxisCandidates(FrustumCorners corners, float3 x, float3 y, float3 z, out float3 centralDirection)
{
//Rotate the corner directions into the working frame.
float3 f00 = Transform(corners.V00, x, y, z);
float3 f01 = Transform(corners.V01, x, y, z);
float3 f10 = Transform(corners.V10, x, y, z);
float3 f11 = Transform(float3(corners.V11X, corners.V11Y, corners.V11Z), x, y, z);
centralDirection = normalize(f00 + f01 + f10 + f11);
AxisCandidates candidates;
//Normalize these so that the subsequent 'best axis' test isn't working with incomparable magnitude axes.
candidates.F0 = normalize(cross(f01, f00)); //left
candidates.F1 = normalize(cross(f11, f01)); //top
candidates.F2 = normalize(cross(f10, f11)); //right
candidates.F3 = normalize(cross(f00, f10)); //bottom
//Pick only one axis for each of the four frustum edges- the one which is most different from the adjacent face normals.
//This means we will get false positive intersection tests since we're not using all 12 potential candidates, but by picking 4 decent candidates,
//we dramatically cut down the VGPR (or hopefully SGPR) usage. Using a full set of 12 axis candidates definitely does hurt occupancy.
//NOTE(review): the corner comments below (00 = lower left, 01 = upper left) disagree with FrustumCorners' labels (V00 = upper left,
//V01 = lower left) — verify which convention is intended before reusing this code.
float scoreX = -MaxFloat;
float2 vX = 0;
CalibrateCandidate(float2(f00.z, -f00.y), centralDirection.yz, candidates.F0.yz, candidates.F3.yz, scoreX, vX); //00 is lower left
CalibrateCandidate(float2(f01.z, -f01.y), centralDirection.yz, candidates.F0.yz, candidates.F1.yz, scoreX, vX); //01 is upper left
CalibrateCandidate(float2(f10.z, -f10.y), centralDirection.yz, candidates.F2.yz, candidates.F3.yz, scoreX, vX); //10 is lower right
CalibrateCandidate(float2(f11.z, -f11.y), centralDirection.yz, candidates.F1.yz, candidates.F2.yz, scoreX, vX); //11 is upper right
candidates.EdgeX = vX;
float scoreY = -MaxFloat;
float2 vY = 0;
CalibrateCandidate(float2(f00.z, -f00.x), centralDirection.xz, candidates.F0.xz, candidates.F3.xz, scoreY, vY);
CalibrateCandidate(float2(f01.z, -f01.x), centralDirection.xz, candidates.F0.xz, candidates.F1.xz, scoreY, vY);
CalibrateCandidate(float2(f10.z, -f10.x), centralDirection.xz, candidates.F2.xz, candidates.F3.xz, scoreY, vY);
CalibrateCandidate(float2(f11.z, -f11.x), centralDirection.xz, candidates.F1.xz, candidates.F2.xz, scoreY, vY);
candidates.EdgeY = vY;
float scoreZ = -MaxFloat;
float2 vZ = 0;
CalibrateCandidate(float2(f00.y, -f00.x), centralDirection.xy, candidates.F0.xy, candidates.F3.xy, scoreZ, vZ);
CalibrateCandidate(float2(f01.y, -f01.x), centralDirection.xy, candidates.F0.xy, candidates.F1.xy, scoreZ, vZ);
CalibrateCandidate(float2(f10.y, -f10.x), centralDirection.xy, candidates.F2.xy, candidates.F3.xy, scoreZ, vZ);
CalibrateCandidate(float2(f11.y, -f11.x), centralDirection.xy, candidates.F1.xy, candidates.F2.xy, scoreZ, vZ);
candidates.EdgeZ = vZ;
//If every frustum corner direction points in the same direction relative to a world axis, then that world axis (bounding box face) is a separating axis candidate.
//We mark these with a sign value: 1 means all frustum corners point along the positive axis, -1 means all frustum corners point along the negative axis, and 0 means there's a mix.
//Because registers are scarce, we pack all of them into a single uint.
//(We're assuming here that this stuff will be put into an SGPR without special effort, since it's all wave constant.
//If we're wrong, then using bools here might actually help, since they can easily be put into an SGPR even when they're thread variable.)
float faceSignsX = (f00.x < 0 && f01.x < 0 && f10.x < 0 && f11.x < 0) ? -1.0 : f00.x > 0 && f01.x > 0 && f10.x > 0 && f11.x > 0 ? 1.0 : 0.0;
float faceSignsY = (f00.y < 0 && f01.y < 0 && f10.y < 0 && f11.y < 0) ? -1.0 : f00.y > 0 && f01.y > 0 && f10.y > 0 && f11.y > 0 ? 1.0 : 0.0;
float faceSignsZ = (f00.z < 0 && f01.z < 0 && f10.z < 0 && f11.z < 0) ? -1.0 : f00.z > 0 && f01.z > 0 && f10.z > 0 && f11.z > 0 ? 1.0 : 0.0;
//Encode as 0 => -1, 1 => 0, 2 => 1 (could just create the masks directly but eh eeehhh)
candidates.FaceSigns = (int(faceSignsX) + 1) | ((int(faceSignsY) + 1) << 2) | ((int(faceSignsZ) + 1) << 4);
return candidates;
} Those candidates are then used per-test: //Note sign is important. When stubbing out bounds with min = maxfloat and max = minfloat, the separation will become nan, and nan > 0 is false so the intersection routine incorrectly returns true.
bool CandidateIntervalOverlaps(float3 boundsCenter, float3 boundsHalfSpan, float3 candidate)
{
//The frustum's interval along every candidate axis is nonpositive, so the intervals overlap unless the
//box's projected interval [center - extent, center + extent] lies entirely on the positive side.
float centerProjection = dot(boundsCenter, candidate);
float extentProjection = dot(boundsHalfSpan, abs(candidate));
return centerProjection <= extentProjection;
}
bool CandidateIntervalOverlaps(float2 boundsCenter, float2 boundsHalfSpan, float2 candidate)
{
//2D overload used for the edge axis candidates, which live in a plane perpendicular to one world axis.
float centerProjection = dot(boundsCenter, candidate);
float extentProjection = dot(boundsHalfSpan, abs(candidate));
return centerProjection <= extentProjection;
}
//Applies the packed world-axis sign classification: for each axis where all frustum corner directions agreed in sign,
//the vector from the frustum origin to the closest point on the bounds must point the same way; returning false means separation.
bool CheckFaceSigns(float3 originToClosestPointOnBounds, uint encodedFaceSigns)
{
//Decode each 2-bit field back to -1, 0, or 1. A 0 (mixed signs) disables that axis, since anything * 0 >= 0.
float3 decodedFaceSigns = float3((encodedFaceSigns & 0x3) - 1.0, ((encodedFaceSigns >> 2) & 0x3) - 1.0, ((encodedFaceSigns >> 4) & 0x3) - 1.0);
return all(originToClosestPointOnBounds * decodedFaceSigns >= 0);
}
//Conservative frustum-AABB separating axis test using the cached AxisCandidates.
//Returns true when no separating axis is found; t receives a conservative distance along the frustum's central ray.
//(Cleanup: the original wrapped the return in a redundant bare scope and followed it with an unreachable 'return false;'.)
bool Intersect(float3 origin, float3 direction, AxisCandidates candidates, float3 minBounds, float3 maxBounds, float maximumT, out float t)
{
//Highly conservative t value chosen by projecting extreme point of bounds onto the frustum central ray.
float3 closestPointOnBounds = min(max(origin, minBounds), maxBounds);
float3 originToClosestPointOnBounds = closestPointOnBounds - origin;
t = dot(originToClosestPointOnBounds, direction);
float3 boundsHalfSpan = 0.5 * (maxBounds - minBounds);
float3 boundsCenter = boundsHalfSpan + minBounds - origin;
//Every clause is a separating axis (or sign) check; the conjunction is the entire test.
return t < maximumT &&
CandidateIntervalOverlaps(boundsCenter, boundsHalfSpan, candidates.F0) &&
CandidateIntervalOverlaps(boundsCenter, boundsHalfSpan, candidates.F1) &&
CandidateIntervalOverlaps(boundsCenter, boundsHalfSpan, candidates.F2) &&
CandidateIntervalOverlaps(boundsCenter, boundsHalfSpan, candidates.F3) &&
CandidateIntervalOverlaps(boundsCenter.yz, boundsHalfSpan.yz, candidates.EdgeX) &&
CandidateIntervalOverlaps(boundsCenter.xz, boundsHalfSpan.xz, candidates.EdgeY) &&
CandidateIntervalOverlaps(boundsCenter.xy, boundsHalfSpan.xy, candidates.EdgeZ) &&
CheckFaceSigns(originToClosestPointOnBounds, candidates.FaceSigns);
} Worth noting that this handles far planes in a slightly odd way- it uses an approximation of the far plane's bound rather than doing a full separating axis test, but for large and very long frusta, it works well. There's also an approximation used for the edge candidates- the heuristic it uses still keeps the test pretty tight, but you could brute force every axis if you really wanted to. |
Beta Was this translation helpful? Give feedback.
-
Full solution: Add
sequential necessary in my intersection test. Well, could be removed but you lose ability to trivially customize if far plane should be ditched or not for infinite frusta add this method to public unsafe void FrustumSweep<TFrustumTester>(in Plane nearPlane, in Plane farPlane, in Plane leftPlane, in Plane rightPlane, in Plane bottomPlane, in Plane topPlane, ref TFrustumTester frustumTester, int id = 0) where TFrustumTester : IBroadPhaseFrustumTester
{
//Bundle the six caller-supplied planes and the user id into the struct the tree traversal consumes.
var frustumData = new FrustumData()
{
nearPlane = nearPlane,
farPlane = farPlane,
leftPlane = leftPlane,
rightPlane = rightPlane,
topPlane = topPlane,
bottomPlane = bottomPlane,
Id = id
};
FrustumLeafTester<TFrustumTester> tester;
tester.LeafTester = frustumTester;
//Sweep both trees with the same frustum; the leaf buffers map tree leaf indices back to collidables.
tester.Leaves = activeLeaves;
ActiveTree.FrustumSweep(&frustumData, ref tester);
tester.Leaves = staticLeaves;
StaticTree.FrustumSweep(&frustumData, ref tester);
//The sweep tester probably relies on mutation to function; copy any mutations back to the original reference.
frustumTester = tester.LeafTester;
} add also this where you added previous method: public interface IBroadPhaseFrustumTester
{
//Callback invoked for every collidable whose bounding box intersects (or is contained by) the frustum.
unsafe void FrustumTest(CollidableReference collidable, FrustumData* frustumData);
}
//Adapts an IBroadPhaseFrustumTester to the tree's IFrustumLeafTester by translating leaf indices into collidable references.
struct FrustumLeafTester<TFrustumTester> : IFrustumLeafTester where TFrustumTester : IBroadPhaseFrustumTester
{
public TFrustumTester LeafTester;
public Buffer<CollidableReference> Leaves;
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public unsafe void TestLeaf(int leafIndex, FrustumData* frustumData)
{
LeafTester.FrustumTest(Leaves[leafIndex], frustumData);
}
} And last but not least add the following to public unsafe static bool IntersectsOrInside(in Vector3 min, in Vector3 max, FrustumData* frustumData)
{
//Standard center-extents AABB vs plane test: the box is rejected if it lies fully behind any of the six planes.
//Conservative: boxes outside a frustum corner can still pass every per-plane test (false positives, never false negatives).
// Convert AABB to center-extents representation
Vector3 c = (max + min) * 0.5f; // Compute AABB center
Vector3 e = max - c; // Compute positive extents
//Walks the six Plane fields by ref arithmetic; assumes FrustumData lays the planes out sequentially — TODO confirm layout.
ref var plane = ref frustumData->nearPlane;
//far plane test can be eliminated by setting id<6 instead. This results in frustum with "infinite" length
for (int id = 1; id < 7; id++)
{
//r is the projection radius of the box onto the plane normal; m is the signed distance of the center from the plane.
float r = e.X * Math.Abs(plane.Normal.X) + e.Y * Math.Abs(plane.Normal.Y) + e.Z * Math.Abs(plane.Normal.Z);
var m = (c.X * plane.Normal.X) + (c.Y * plane.Normal.Y) + (c.Z * plane.Normal.Z) + plane.D;
if (m + r < 0) return false; //outside
//if (m - r < 0) result = 1; //intersect
plane = ref Unsafe.Add(ref plane, 1);
}
return true;
}
public unsafe void FrustumSweep<TLeafTester>(FrustumData* frustumData, ref TLeafTester leafTester) where TLeafTester : IFrustumLeafTester
{
//Nothing to traverse in an empty tree.
if (leafCount == 0)
return;
if (leafCount > 1)
{
//TODO: Explicitly tracking depth in the tree during construction/refinement is practically required to guarantee correctness.
//While it's exceptionally rare that any tree would have more than 256 levels, the worst case of stomping stack memory is not acceptable in the long run.
var stack = stackalloc int[TraversalStackCapacity];
FrustumSweep(0, frustumData, stack, ref leafTester);
}
else
{
//Degenerate single-leaf tree: the root node isn't fully filled, so test its A slot directly.
if (IntersectsOrInside(Nodes[0].A.Min, Nodes[0].A.Max, frustumData))
leafTester.TestLeaf(0, frustumData);
}
}
//Iterative depth-first frustum traversal over the tree. Negative node indices encode leaves (decoded via Encode).
internal unsafe void FrustumSweep<TLeafTester>(int nodeIndex, FrustumData* frustumData, int* stack, ref TLeafTester leafTester) where TLeafTester : IFrustumLeafTester
{
Debug.Assert((nodeIndex >= 0 && nodeIndex < nodeCount) || (Encode(nodeIndex) >= 0 && Encode(nodeIndex) < leafCount));
Debug.Assert(leafCount >= 2, "This implementation assumes all nodes are filled.");
int stackEnd = 0;
while (true)
{
if (nodeIndex < 0)
{
//This is actually a leaf node.
var leafIndex = Encode(nodeIndex);
leafTester.TestLeaf(leafIndex, frustumData);
//Leaves have no children; have to pull from the stack to get a new target.
if (stackEnd == 0)
return;
nodeIndex = stack[--stackEnd];
}
else
{
ref var node = ref Nodes[nodeIndex];
var aIntersected = IntersectsOrInside(node.A.Min, node.A.Max, frustumData);
var bIntersected = IntersectsOrInside(node.B.Min, node.B.Max, frustumData);
//When both children intersect, descend into A immediately and defer B on the stack.
if (aIntersected && !bIntersected)
nodeIndex = node.A.Index;
else if (aIntersected && bIntersected)
{
Debug.Assert(stackEnd < TraversalStackCapacity - 1, "At the moment, we use a fixed size stack. Until we have explicitly tracked depths, watch out for excessive depth traversals.");
nodeIndex = node.A.Index;
stack[stackEnd++] = node.B.Index;
}
else if (bIntersected)
nodeIndex = node.B.Index;
else
{
//No intersection. Need to pull from the stack to get a new target.
if (stackEnd == 0)
return;
nodeIndex = stack[--stackEnd];
}
}
}
} I didnt particularly optimize AABB-Frustum test ( Btw this also work if you are inside aabb bigger than frustum. Test isnt 100% precise there can be false positive but 0 false negatives |
Beta Was this translation helpful? Give feedback.
-
@RossNordby I'm trying to optimize frustum culling and noticed that |
Beta Was this translation helpful? Give feedback.
-
Final, optimized version before multithreading:
//representation of 0b1111_1111_1111_1111_1111_1111_1100_0000 on little endian that also works on big endian
private const uint isInside = 4294967232;
//NOTE(review): a static Dictionary shared by every traversal is not thread-safe and keys by node index across both trees —
//verify/replace before multithreading or sweeping multiple trees with coherence caching enabled.
private static Dictionary<int, int> failedPlane = new Dictionary<int, int>();
//Optimized traversal: carries a plane bitmask downward so planes a parent was fully inside of are skipped for its children,
//and skips all plane tests entirely while within a subtree known to be fully contained in the frustum.
internal unsafe void FrustumSweep<TLeafTester>(int nodeIndex, FrustumData* frustumData, int* stack, ref TLeafTester leafTester) where TLeafTester : IFrustumLeafTester
{
Debug.Assert((nodeIndex >= 0 && nodeIndex < nodeCount) || (Encode(nodeIndex) >= 0 && Encode(nodeIndex) < leafCount));
Debug.Assert(leafCount >= 2, "This implementation assumes all nodes are filled.");
uint planeBitmask = uint.MaxValue;
int stackEnd = 0;
int fullyContainedStack = -1;
while (true)
{
if (nodeIndex < 0)
{
//This is actually a leaf node.
var leafIndex = Encode(nodeIndex);
leafTester.TestLeaf(leafIndex, frustumData);
//Leaves have no children; have to pull from the stack to get a new target.
if (stackEnd == 0)
return;
nodeIndex = stack[--stackEnd];
//check if we are no longer fully inside frustum. if yes reset fullyContainedStack
if (stackEnd == fullyContainedStack)
fullyContainedStack = -1;
//reset bitmask every time when we have to go back and we arent fully inside frustum
//we must make separate test from previous test because for fullyContainedStack=-1 prev test would never be true
//FIX: original compared 'fullyContainedStack < -1', which can never be true (the value is only ever -1 or >= 0),
//so the reset described above was unreachable and stale plane bits leaked into sibling subtrees.
if (fullyContainedStack == -1)
planeBitmask = uint.MaxValue;
}
else
{
ref var node = ref Nodes[nodeIndex];
//skip tests if frustum fully contains childs,
//unset bit means fully inside single plane,
//set bit means intersection with that plane,
//and 7th least significant bit UNset means that AABB is outside frustum
//we have six planes thats why we check 6 zeroes
if (planeBitmask == isInside)
{
Debug.Assert(stackEnd < TraversalStackCapacity - 1, "At the moment, we use a fixed size stack. Until we have explicitly tracked depths, watch out for excessive depth traversals.");
nodeIndex = node.A.Index;
stack[stackEnd++] = node.B.Index;
}
else
{
var aBitmask = IntersectsOrInside(node.A.Min, node.A.Max, frustumData, node.A.Index, planeBitmask);
var bBitmask = IntersectsOrInside(node.B.Min, node.B.Max, frustumData, node.B.Index, planeBitmask);
var aIntersected = (aBitmask & (1 << 6)) != 0;
var bIntersected = (bBitmask & (1 << 6)) != 0;
if (aIntersected && !bIntersected)
{
nodeIndex = node.A.Index;
planeBitmask = aBitmask;
//check if A child is fully contained in frustum
//remember we can still intersect at this point and we need to be fully inside
if (aBitmask == isInside)
fullyContainedStack = stackEnd;
}
else if (aIntersected && bIntersected)
{
Debug.Assert(stackEnd < TraversalStackCapacity - 1, "At the moment, we use a fixed size stack. Until we have explicitly tracked depths, watch out for excessive depth traversals.");
nodeIndex = node.A.Index;
planeBitmask = aBitmask;
//check if both childs are fully contained in frustum
//remember we can still intersect at this point and we need to be fully inside
if (aBitmask == isInside && bBitmask == isInside)
fullyContainedStack = stackEnd;
stack[stackEnd++] = node.B.Index;
}
else if (bIntersected)
{
nodeIndex = node.B.Index;
planeBitmask = bBitmask;
//check if B child is fully contained in frustum
//remember we can still intersect at this point and we need to be fully inside
if (bBitmask == isInside)
fullyContainedStack = stackEnd;
}
else
{
//No intersection. Need to pull from the stack to get a new target.
if (stackEnd == 0)
return;
nodeIndex = stack[--stackEnd];
//check if we are no longer fully inside frustum. if yes reset fullyContainedStack
if (stackEnd < fullyContainedStack)
fullyContainedStack = -1;
//reset bitmask every time when we have to go back and we arent fully inside frustum
//we must make separate test from previous test because for fullyContainedStack=-1 prev test would never be true
if (fullyContainedStack == -1)
planeBitmask = uint.MaxValue;
}
}
}
}
}
}
//Center-extents AABB representation. The explicit padding makes the struct 32 bytes so IntersectsOrInside can
//reinterpret it as a single 8-wide Vector<float>; do not reorder or remove the padding fields.
[StructLayout(LayoutKind.Sequential)]
struct ExtentsRepresentation
{
public Vector3 center; //Stores max + min (NOT halved; compensated by the plane D scaling done in FrustumSweep).
private float padding1;
public Vector3 extents; //Stores max - min (NOT halved, for the same reason).
private float padding2;
}
//Bitmask-returning AABB-frustum test. Bit 6 set means the box intersects or is inside the frustum; an unset bit in 0-5
//means the box is fully inside that plane. Incoming planeBitmask lets callers skip planes a parent was already inside of.
//Uses the failedPlane dictionary for temporal coherence: the plane that culled this node last time is tested first.
//NOTE(review): failedPlane is a static Dictionary — not thread-safe; confirm single-threaded use.
public unsafe static uint IntersectsOrInside(in Vector3 min, in Vector3 max, FrustumData* frustumData, int nodeIndex, uint planeBitmask = uint.MaxValue)
{
var shouldRenumberPlanes = failedPlane.TryGetValue(nodeIndex, out var planeId);
var start = 0;
//far plane test can be eliminated by setting end=5
//and changing all occurences of start==5 to start==4 instead.
//This results in frustum with "infinite" length
var end = 6;
// Convert AABB to center-extents representation
//Note: neither value is halved; FrustumSweep pre-multiplies every plane's D by -2 to compensate.
var eRep = new ExtentsRepresentation()
{
center = max + min, // Compute AABB center
extents = max - min // Compute positive extents
};
//planeAddr is the base of the 12 sequential Plane slots in FrustumData (plane, absolute-normal copy, plane, ...).
ref var planeAddr = ref Unsafe.As<float, Plane>(ref frustumData->nearPlane.Normal.X);
ref var plane = ref Unsafe.As<float, Plane>(ref frustumData->nearPlane.Normal.X);
if (shouldRenumberPlanes)
{
//Start (and wrap around to end) at the plane that culled this node last query.
start = planeId;
end = start;
}
do
{
//Skip planes the caller already knows this subtree is fully inside of.
if (((planeBitmask >> start) & 1) == 0)
{
start = shouldRenumberPlanes && start == 5 ? 0 : start + 1;
continue;
}
//Each logical plane occupies two Plane slots (plane + abs-normal copy), hence start * 2.
plane = ref Unsafe.Add(ref planeAddr, start * 2);
var d = plane.D;
float m, r;
//Vector<float>.Count == 4 is not worth it since built-in VectorX and Plane are already vectorized
//and for Vector<float>.Count == 16 we should fuse A & B into one test
//and that will require sperate method with different signature and NET 5+ since Vector512 is required
if (Vector.IsHardwareAccelerated && Vector<float>.Count == 8)
{
//Reads the plane and its abs-normal copy (32 bytes) as one 8-wide vector, multiplied against the 32-byte eRep.
ref Vector<float> planeData = ref Unsafe.As<Plane, Vector<float>>(ref plane);
ref Vector<float> bbData = ref Unsafe.As<ExtentsRepresentation, Vector<float>>(ref eRep);
var multi = bbData * planeData;
m = multi[0] + multi[1] + multi[2];
r = multi[4] + multi[5] + multi[6];
}
else
{
m = Vector3.Dot(plane.Normal, eRep.center);
plane = ref Unsafe.Add(ref plane, 1);//absolute normal
r = Vector3.Dot(plane.Normal, eRep.extents);
}
if (m + r < d)//outside
{
//Clear bit 6 to report "outside", and remember which plane culled this node for next query.
planeBitmask &= unchecked((uint)(~(1 << 6)));
//no need to renumber planes when id is 0
if (!shouldRenumberPlanes && start != 0 && start!=planeId)
failedPlane.Add(nodeIndex, start);
else if (start != 0 && start!=planeId)
failedPlane[nodeIndex] = start;
return planeBitmask;
}
if (m - r >= d)//inside
{
//Fully inside this plane; descendants can skip it.
planeBitmask &= ~(uint)(1 << start);
}
/*else//intersect
{
}*/
start = shouldRenumberPlanes && start == 5 ? 0 : start + 1;
} while (start != end);
//Node no longer fails its cached plane; drop the coherence entry.
if (shouldRenumberPlanes)
failedPlane.Remove(nodeIndex);
return planeBitmask;
}
//Adapts an IBroadPhaseFrustumTester to the tree's IFrustumLeafTester by translating leaf indices into collidable references.
struct FrustumLeafTester<TFrustumTester> : IFrustumLeafTester where TFrustumTester : IBroadPhaseFrustumTester
{
public TFrustumTester LeafTester;
public Buffer<CollidableReference> Leaves;
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public unsafe void TestLeaf(int leafIndex, FrustumData* frustumData)
{
LeafTester.FrustumTest(Leaves[leafIndex], frustumData);
}
}
public interface IBroadPhaseFrustumTester
{
//Callback invoked for every collidable whose bounding box intersects (or is contained by) the frustum.
unsafe void FrustumTest(CollidableReference collidable, FrustumData* frustumData);
}
/// <summary>
/// Finds any intersections between a frustum and leaf bounding boxes.
/// </summary>
/// <typeparam name="TFrustumTester">Type of the callback to execute on frustum-leaf bounding box intersections.</typeparam>
/// <param name="matrix">Combined view-projection matrix: the view matrix (i.e. the inverse of the camera's model matrix) multiplied by the projection matrix.</param>
/// <param name="frustumTester">Callback to execute on frustum-leaf bounding box intersections.</param>
/// <param name="columnMajor">True if the supplied matrix is column-major; false for row-major.</param>
/// <param name="id">User specified id of the frustum.</param>
public unsafe void FrustumSweep<TFrustumTester>(in Matrix4x4 matrix, ref TFrustumTester frustumTester, bool columnMajor = false, int id = 0) where TFrustumTester : IBroadPhaseFrustumTester
{
//TODO: maybe use preprocessor directive instead of bool?
//#if COLUMNMAJOR
var frustumData = new FrustumData()
{
Id = id
};
//The span covers all 12 Plane slots in FrustumData: 6 planes interleaved with their absolute-normal copies.
var planeSpan = new Span<Plane>(&frustumData.nearPlane, 12);
if (columnMajor)
{
//Reinterpret the matrix rows as columns (the memory holds column-major data).
ref var refMat = ref Unsafe.AsRef(matrix);
ref var lastColumn = ref Unsafe.As<float, Vector4>(ref refMat.M41);
ref var firstColumn = ref Unsafe.As<float, Vector4>(ref refMat.M11);
ref var secondColumn = ref Unsafe.As<float, Vector4>(ref refMat.M21);
ref var thirdColumn = ref Unsafe.As<float, Vector4>(ref refMat.M31);
//NOTE(review): this branch builds the near plane from last + third (GL-style clip z in [-1, 1]) while the row-major
//branch below uses the third column alone (D3D-style z in [0, 1]) — verify both conventions are intended.
// Near clipping plane
planeSpan[0] = Plane.Normalize(new Plane(lastColumn + thirdColumn));
// Top clipping plane
planeSpan[2] = Plane.Normalize(new Plane(lastColumn - secondColumn));
// Bottom clipping plane
planeSpan[4] = Plane.Normalize(new Plane(lastColumn + secondColumn));
// Left clipping plane
planeSpan[6] = Plane.Normalize(new Plane(lastColumn + firstColumn));
// Right clipping plane
planeSpan[8] = Plane.Normalize(new Plane(lastColumn - firstColumn));
// Far clipping plane
planeSpan[10] = Plane.Normalize(new Plane(lastColumn - thirdColumn));
}
else
{
var lastColumn = new Vector4(matrix.M14, matrix.M24, matrix.M34, matrix.M44);
var firstColumn = new Vector4(matrix.M11, matrix.M21, matrix.M31, matrix.M41);
var secondColumn = new Vector4(matrix.M12, matrix.M22, matrix.M32, matrix.M42);
var thirdColumn = new Vector4(matrix.M13, matrix.M23, matrix.M33, matrix.M43);
// Near clipping plane
planeSpan[0] = Plane.Normalize(new Plane(matrix.M13, matrix.M23, matrix.M33, matrix.M43));
// Top clipping plane
planeSpan[2] = Plane.Normalize(new Plane(lastColumn - secondColumn));
// Bottom clipping plane
planeSpan[4] = Plane.Normalize(new Plane(lastColumn + secondColumn));
// Left clipping plane
planeSpan[6] = Plane.Normalize(new Plane(lastColumn + firstColumn));
// Right clipping plane
planeSpan[8] = Plane.Normalize(new Plane(lastColumn - firstColumn));
// Far clipping plane
planeSpan[10] = Plane.Normalize(new Plane(lastColumn - thirdColumn));
}
//Cache each plane's absolute normal in the adjacent slot so the per-node test can load both with one vector read.
planeSpan[1].Normal = Vector3.Abs(planeSpan[0].Normal);
planeSpan[3].Normal = Vector3.Abs(planeSpan[2].Normal);
planeSpan[5].Normal = Vector3.Abs(planeSpan[4].Normal);
planeSpan[7].Normal = Vector3.Abs(planeSpan[6].Normal);
planeSpan[9].Normal = Vector3.Abs(planeSpan[8].Normal);
planeSpan[11].Normal = Vector3.Abs(planeSpan[10].Normal);
//Scale D so IntersectsOrInside can compare against the un-halved (max + min)/(max - min) representation without per-test multiplies.
planeSpan[0].D *= -2;
planeSpan[2].D *= -2;
planeSpan[4].D *= -2;
planeSpan[6].D *= -2;
planeSpan[8].D *= -2;
planeSpan[10].D *= -2;
FrustumLeafTester<TFrustumTester> tester;
tester.LeafTester = frustumTester;
tester.Leaves = activeLeaves;
ActiveTree.FrustumSweep(&frustumData, ref tester);
tester.Leaves = staticLeaves;
StaticTree.FrustumSweep(&frustumData, ref tester);
//The sweep tester probably relies on mutation to function; copy any mutations back to the original reference.
frustumTester = tester.LeafTester;
}
//Frustum plane set consumed by FrustumSweep/IntersectsOrInside. Layout is load-bearing: each plane is immediately
//followed by a slot holding its absolute normal, and traversal code indexes these 12 Plane slots by ref arithmetic.
//Plane D values are pre-scaled by -2 by FrustumSweep; do not store raw plane constants here directly.
[StructLayout(LayoutKind.Sequential)]
public struct FrustumData
{
public Plane nearPlane;
public Plane nearPlaneAbsNormal;
public Plane topPlane;
public Plane topPlaneAbsNormal;
public Plane bottomPlane;
public Plane bottomPlaneAbsNormal;
public Plane leftPlane;
public Plane leftPlaneAbsNormal;
public Plane rightPlane;
public Plane rightPlaneAbsNormal;
public Plane farPlane;
public Plane farPlaneAbsNormal;
public int Id; //User-specified frustum id, passed through to the leaf tester.
} There are still some optimization opportunities outside multithreading but at this point they are difficult in a sense that you have to make tradeoffs, that is disable conflicting optimizations or move to NET 5+ (going ham on SIMD, although it is already SIMDified for 128 and 256 bit to some degree but not for 512 bit and beyond) or change internals eg data representation. if someone wants to try and see which alternative optimizations work for them then i suggest reading https://fgiesen.wordpress.com/2010/10/17/view-frustum-culling/ on top of what i already linked ealier (there is even full SIMD sample at the end for Cell processor from Playstation 3 heydeys which was known for insane vectorization capabilities, heh) changelog:
@RossNordby BTW, would you be interested in a PR with features related to frustum culling (mostly what I showed here)? I'm willing to rewrite fragments if something doesn't suit you. If yes, there are some design and improvement choices to be made (not many), but for that I will create a separate discussion. Thanks for all the help so far. |
Beta Was this translation helpful? Give feedback.
-
Adding cheap tree for culling purposes (or making phys object completely disabled and not awakenable) turned out to be easier than expected (includes multithreading) using BepuUtilities;
using BepuUtilities.Memory;
using BepuPhysics.Collidables;
using System;
using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Numerics;
using BepuPhysics.Trees;
namespace BepuPhysics.CollisionDetection
{
public unsafe partial class BroadPhase : IDisposable
{
//Per-tree leaf buffers mapping tree leaf indices back to collidable references.
internal Buffer<CollidableReference> activeLeaves;
internal Buffer<CollidableReference> staticLeaves;
internal Buffer<CollidableReference> frozenLeaves;
public BufferPool Pool { get; private set; }
public Tree ActiveTree;
public Tree StaticTree;
//Extra tree added for culling-only objects (per the discussion: fully disabled, non-awakenable objects), managed like StaticTree.
public Tree FrozenTree;
Tree.RefitAndRefineMultithreadedContext activeRefineContext;
//TODO: static trees do not need to do nearly as much work as the active; this will change in the future.
Tree.RefitAndRefineMultithreadedContext staticRefineContext;
Tree.RefitAndRefineMultithreadedContext frozenRefineContext;
//Creates the three trees (active, static, frozen), their leaf buffers, and their refinement contexts from the given pool.
public BroadPhase(BufferPool pool, int initialActiveLeafCapacity = 4096, int initialStaticLeafCapacity = 8192, int initialFrozenLeafCapacity = 8192)
{
Pool = pool;
ActiveTree = new Tree(pool, initialActiveLeafCapacity);
StaticTree = new Tree(pool, initialStaticLeafCapacity);
FrozenTree = new Tree(pool, initialFrozenLeafCapacity);
pool.TakeAtLeast(initialActiveLeafCapacity, out activeLeaves);
pool.TakeAtLeast(initialStaticLeafCapacity, out staticLeaves);
pool.TakeAtLeast(initialFrozenLeafCapacity, out frozenLeaves);
activeRefineContext = new Tree.RefitAndRefineMultithreadedContext();
staticRefineContext = new Tree.RefitAndRefineMultithreadedContext();
frozenRefineContext = new Tree.RefitAndRefineMultithreadedContext();
}
//Adds a collidable's bounds to the given tree, growing the parallel leaf buffer when the new leaf index would not fit.
//Returns the new leaf's index in the tree.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int Add(CollidableReference collidable, ref BoundingBox bounds, ref Tree tree, BufferPool pool, ref Buffer<CollidableReference> leaves)
{
var leafIndex = tree.Add(ref bounds, pool);
if (leafIndex >= leaves.Length)
{
pool.ResizeToAtLeast(ref leaves, tree.LeafCount + 1, leaves.Length);
}
leaves[leafIndex] = collidable;
return leafIndex;
}
//Removes the leaf at the given index from the tree and mirrors the removal in the parallel leaf buffer.
//When the tree fills the hole by moving another leaf into the removed slot, movedLeaf reports which
//collidable moved and the function returns true; otherwise movedLeaf is defaulted and false is returned.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool RemoveAt(int index, ref Tree tree, ref Buffer<CollidableReference> leaves, out CollidableReference movedLeaf)
{
Debug.Assert(index >= 0);
var movedLeafIndex = tree.RemoveAt(index);
if (movedLeafIndex < 0)
{
//Nothing was moved to fill the slot.
movedLeaf = new CollidableReference();
return false;
}
movedLeaf = leaves[movedLeafIndex];
leaves[index] = movedLeaf;
return true;
}
//Tree-specific convenience wrappers around the shared Add/RemoveAt helpers.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public int AddActive(CollidableReference collidable, ref BoundingBox bounds)
{
return Add(collidable, ref bounds, ref ActiveTree, Pool, ref activeLeaves);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public bool RemoveActiveAt(int index, out CollidableReference movedLeaf)
{
return RemoveAt(index, ref ActiveTree, ref activeLeaves, out movedLeaf);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public int AddStatic(CollidableReference collidable, ref BoundingBox bounds)
{
return Add(collidable, ref bounds, ref StaticTree, Pool, ref staticLeaves);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public bool RemoveStaticAt(int index, out CollidableReference movedLeaf)
{
return RemoveAt(index, ref StaticTree, ref staticLeaves, out movedLeaf);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public int AddFrozen(CollidableReference collidable, ref BoundingBox bounds)
{
return Add(collidable, ref bounds, ref FrozenTree, Pool, ref frozenLeaves);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public bool RemoveFrozenAt(int index, out CollidableReference movedLeaf)
{
return RemoveAt(index, ref FrozenTree, ref frozenLeaves, out movedLeaf);
}
//Returns pointers to the leaf's min/max bounds stored inside the tree's node memory.
//NOTE(review): these point into tree-owned memory and are presumably invalidated by tree mutation — use promptly; confirm lifetime rules.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static void GetBoundsPointers(int broadPhaseIndex, ref Tree tree, out Vector3* minPointer, out Vector3* maxPointer)
{
var leaf = tree.Leaves[broadPhaseIndex];
var nodeChild = (&tree.Nodes.Memory[leaf.NodeIndex].A) + leaf.ChildIndex;
minPointer = &nodeChild->Min;
maxPointer = &nodeChild->Max;
}
//Tree-specific convenience wrappers around GetBoundsPointers.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void GetActiveBoundsPointers(int index, out Vector3* minPointer, out Vector3* maxPointer)
{
GetBoundsPointers(index, ref ActiveTree, out minPointer, out maxPointer);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void GetStaticBoundsPointers(int index, out Vector3* minPointer, out Vector3* maxPointer)
{
GetBoundsPointers(index, ref StaticTree, out minPointer, out maxPointer);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void GetFrozenBoundsPointers(int index, out Vector3* minPointer, out Vector3* maxPointer)
{
GetBoundsPointers(index, ref FrozenTree, out minPointer, out maxPointer);
}
/// <summary>
/// Applies updated bounds to the given leaf index in the given tree, refitting the tree to match.
/// </summary>
/// <param name="broadPhaseIndex">Index of the leaf in the tree to update.</param>
/// <param name="tree">Tree containing the leaf to update.</param>
/// <param name="min">New minimum bounds for the leaf.</param>
/// <param name="max">New maximum bounds for the leaf.</param>
public unsafe static void UpdateBounds(int broadPhaseIndex, ref Tree tree, in Vector3 min, in Vector3 max)
{
//Write the new bounds directly into node memory, then refit ancestors so parent bounds still contain the leaf.
GetBoundsPointers(broadPhaseIndex, ref tree, out var minPointer, out var maxPointer);
*minPointer = min;
*maxPointer = max;
tree.RefitForNodeBoundsChange(tree.Leaves[broadPhaseIndex].NodeIndex);
}
/// <summary>
/// Applies updated bounds to the given active leaf index, refitting the tree to match.
/// </summary>
/// <param name="broadPhaseIndex">Index of the leaf to update.</param>
/// <param name="min">New minimum bounds for the leaf.</param>
/// <param name="max">New maximum bounds for the leaf.</param>
public void UpdateActiveBounds(int broadPhaseIndex, in Vector3 min, in Vector3 max)
{
    UpdateBounds(broadPhaseIndex, ref ActiveTree, min, max);
}
/// <summary>
/// Applies updated bounds to the given static leaf index, refitting the tree to match.
/// </summary>
/// <param name="broadPhaseIndex">Index of the leaf to update.</param>
/// <param name="min">New minimum bounds for the leaf.</param>
/// <param name="max">New maximum bounds for the leaf.</param>
public void UpdateStaticBounds(int broadPhaseIndex, in Vector3 min, in Vector3 max)
{
    UpdateBounds(broadPhaseIndex, ref StaticTree, min, max);
}
/// <summary>
/// Applies updated bounds to the given frozen leaf index, refitting the tree to match.
/// </summary>
/// <param name="broadPhaseIndex">Index of the leaf to update.</param>
/// <param name="min">New minimum bounds for the leaf.</param>
/// <param name="max">New maximum bounds for the leaf.</param>
public void UpdateFrozenBounds(int broadPhaseIndex, in Vector3 min, in Vector3 max)
{
    UpdateBounds(broadPhaseIndex, ref FrozenTree, min, max);
}
//Frame counter used to seed the incremental refinement rotation across frames.
int frameIndex;
/// <summary>
/// Refits and incrementally refines all three trees (active, static, frozen).
/// </summary>
/// <param name="threadDispatcher">Dispatcher used for multithreaded refinement; when null, refinement runs single threaded.</param>
public void Update(IThreadDispatcher threadDispatcher = null)
{
    //Wrap the frame counter before it can overflow.
    if (frameIndex == int.MaxValue)
        frameIndex = 0;
    //TODO: the static and frozen trees are currently refined exactly like the active tree, which is enormously
    //inefficient compared to the ideal- by nature, static and inactive objects do not move every frame!
    //A dedicated static/inactive refinement approach should replace this. It should also run alongside the
    //active tree to extract more parallelism; in other words, generate jobs from all trees and dispatch over
    //all of them together with no internal dispatch.
    var useDispatcher = threadDispatcher != null;
    if (useDispatcher)
        activeRefineContext.RefitAndRefine(ref ActiveTree, Pool, threadDispatcher, frameIndex);
    else
        ActiveTree.RefitAndRefine(Pool, frameIndex);
    if (useDispatcher)
        staticRefineContext.RefitAndRefine(ref StaticTree, Pool, threadDispatcher, frameIndex);
    else
        StaticTree.RefitAndRefine(Pool, frameIndex);
    if (useDispatcher)
        frozenRefineContext.RefitAndRefine(ref FrozenTree, Pool, threadDispatcher, frameIndex);
    else
        FrozenTree.RefitAndRefine(Pool, frameIndex);
    ++frameIndex;
}
/// <summary>
/// Clears out the broad phase's structures without releasing any resources.
/// All three trees (active, static, frozen) are emptied.
/// </summary>
public void Clear()
{
    ActiveTree.Clear();
    StaticTree.Clear();
    FrozenTree.Clear();
}
/// <summary>
/// Grows a tree and its associated collidable-reference buffer so they can hold at least the given number of leaves.
/// Never shrinks either structure.
/// </summary>
/// <param name="tree">Tree whose capacity should be ensured.</param>
/// <param name="leaves">Collidable-reference buffer paired with the tree.</param>
/// <param name="capacity">Minimum number of leaves to allocate space for.</param>
void EnsureCapacity(ref Tree tree, ref Buffer<CollidableReference> leaves, int capacity)
{
    if (tree.Leaves.Length < capacity)
        tree.Resize(Pool, capacity);
    if (leaves.Length < capacity)
        Pool.ResizeToAtLeast(ref leaves, capacity, tree.LeafCount);
}
/// <summary>
/// Resizes a tree and its associated collidable-reference buffer toward the given capacity.
/// Conservative: the target is clamped up to the current leaf count so no existing leaves are orphaned.
/// </summary>
/// <param name="tree">Tree to resize.</param>
/// <param name="leaves">Collidable-reference buffer paired with the tree.</param>
/// <param name="capacity">Desired leaf capacity.</param>
void ResizeCapacity(ref Tree tree, ref Buffer<CollidableReference> leaves, int capacity)
{
    capacity = Math.Max(capacity, tree.LeafCount);
    //Compare against the pool's rounded-up capacity so a request that maps to the current allocation is a no-op.
    if (tree.Leaves.Length != BufferPool.GetCapacityForCount<Leaf>(capacity))
        tree.Resize(Pool, capacity);
    //NOTE(review): the != check admits shrink requests, but ResizeToAtLeast (by its name) only grows —
    //confirm whether the leaves buffer is intended to shrink here like the tree does.
    if (leaves.Length != BufferPool.GetCapacityForCount<CollidableReference>(capacity))
        Pool.ResizeToAtLeast(ref leaves, capacity, tree.LeafCount);
}
/// <summary>
/// Returns a tree's collidable-reference buffer to the pool and disposes the tree itself.
/// </summary>
/// <param name="tree">Tree to dispose.</param>
/// <param name="leaves">Collidable-reference buffer paired with the tree.</param>
void Dispose(ref Tree tree, ref Buffer<CollidableReference> leaves)
{
    Pool.Return(ref leaves);
    tree.Dispose(Pool);
}
/// <summary>
/// Ensures that the broad phase structures can hold at least the given number of leaves.
/// </summary>
/// <param name="activeCapacity">Number of leaves to allocate space for in the active tree.</param>
/// <param name="staticCapacity">Number of leaves to allocate space for in the static tree.</param>
/// <param name="frozenCapacity">Number of leaves to allocate space for in the frozen tree.
/// Defaults to 0, which leaves the frozen tree untouched (the previous behavior: this overload never covered the frozen tree,
/// even though Dispose does).</param>
public void EnsureCapacity(int activeCapacity, int staticCapacity, int frozenCapacity = 0)
{
    EnsureCapacity(ref ActiveTree, ref activeLeaves, activeCapacity);
    EnsureCapacity(ref StaticTree, ref staticLeaves, staticCapacity);
    //Only touch the frozen tree when explicitly requested to keep existing two-argument callers unchanged.
    if (frozenCapacity > 0)
        EnsureCapacity(ref FrozenTree, ref frozenLeaves, frozenCapacity);
}
/// <summary>
/// Resizes the broad phase structures to hold the given number of leaves. Note that this is conservative; it will never orphan any existing leaves.
/// </summary>
/// <param name="activeCapacity">Number of leaves to allocate space for in the active tree.</param>
/// <param name="staticCapacity">Number of leaves to allocate space for in the static tree.</param>
/// <param name="frozenCapacity">Number of leaves to allocate space for in the frozen tree.
/// Negative values (the default) skip the frozen tree entirely, preserving the previous behavior of this overload
/// (which never covered the frozen tree, even though Dispose does).</param>
public void Resize(int activeCapacity, int staticCapacity, int frozenCapacity = -1)
{
    ResizeCapacity(ref ActiveTree, ref activeLeaves, activeCapacity);
    ResizeCapacity(ref StaticTree, ref staticLeaves, staticCapacity);
    //A sentinel rather than 0 is required here: ResizeCapacity(…, 0) would shrink the frozen tree down to its
    //leaf count, which would be a behavior change for existing two-argument callers.
    if (frozenCapacity >= 0)
        ResizeCapacity(ref FrozenTree, ref frozenLeaves, frozenCapacity);
}
/// <summary>
/// Releases memory used by the broad phase (all three trees and their leaf buffers). Leaves the broad phase unusable.
/// </summary>
public void Dispose()
{
    Dispose(ref ActiveTree, ref activeLeaves);
    Dispose(ref StaticTree, ref staticLeaves);
    Dispose(ref FrozenTree, ref frozenLeaves);
}
}
} all that was required is tweak
in any broad phase query that you want to support (ray cast, frustum, box sweep, etc.). At this point it may also be worth adding a flag enum that indicates which trees you want to query, instead of blindly querying all of them. I didn't tweak any of those. Filling the frozen tree can look like this: public static Simulation simulation;
private static int index = 0; //Monotonically increasing id used to mint StaticHandles for bodies added via AddBody.
/// <summary>
/// Adds a frozen-tree leaf for a body whose local-space bounds are given relative to its position,
/// then regenerates debug cubes for every leaf currently in the frozen tree.
/// </summary>
/// <param name="pos">World position of the body.</param>
/// <param name="min">Minimum corner of the bounds, local to <paramref name="pos"/>.</param>
/// <param name="max">Maximum corner of the bounds, local to <paramref name="pos"/>.</param>
public static void AddBody(Vector3 pos, Vector3 min, Vector3 max)
{
    //Translate the local bounds into world space.
    var minInWorld = pos + min;
    var maxInWorld = pos + max;
    var box = new BepuUtilities.BoundingBox(minInWorld, maxInWorld);
    simulation.BroadPhase.AddFrozen(new CollidableReference(new StaticHandle(index)), ref box);
    index++;
    //Produce bounding boxes for debugging.
    //This produces a BB for every leaf only, not for the internal space partitioning nodes.
    //(The original `foreach (var p in ..LeafCount)` was invalid C#: System.Range is not enumerable
    //without an extension method, so a plain indexed loop is used instead. The unused midpoint
    //and handle locals were also removed.)
    var leafCount = simulation.BroadPhase.FrozenTree.LeafCount;
    for (int i = 0; i < leafCount; i++)
    {
        unsafe
        {
            simulation.BroadPhase.GetFrozenBoundsPointers(i, out var minp, out var maxp);
            var cube = CreatePrimitiveMesh.GenerateCube(Matrix4x4.CreateScale(MathF.Abs(maxp->X - minp->X), MathF.Abs(maxp->Y - minp->Y), MathF.Abs(maxp->Z - minp->Z)), "cube" + i, vertexColor: Manipulators.zColor);
            Pipeline.Get<SharpAsset.Mesh>().Register(cube);
        }
    }
}
} BTW @RossNordby is there builtin function to get space partitioning BB ? doesnt matter if it will include leaves. On [MethodImpl(MethodImplOptions.AggressiveInlining)]
/// <summary>
/// Gets direct pointers to the bounds of both children (A and B) of a node in the given tree.
/// Useful for visualizing the tree's internal space partitioning.
/// </summary>
/// <param name="broadPhaseIndex">NOTE(review): despite the name, this indexes Nodes directly, so it is a node index,
/// not a leaf/broad phase index like in GetBoundsPointers — confirm and consider renaming.</param>
/// <param name="tree">Tree whose node bounds should be looked up.</param>
/// <param name="minPointerA">Pointer to child A's minimum bound.</param>
/// <param name="maxPointerA">Pointer to child A's maximum bound.</param>
/// <param name="minPointerB">Pointer to child B's minimum bound.</param>
/// <param name="maxPointerB">Pointer to child B's maximum bound.</param>
public static void GetSpacePartitionBoundsPointers(int broadPhaseIndex, ref Tree tree, out Vector3* minPointerA, out Vector3* maxPointerA, out Vector3* minPointerB, out Vector3* maxPointerB)
{
    var nodeChildA = &tree.Nodes.Memory[broadPhaseIndex].A;
    var nodeChildB = &tree.Nodes.Memory[broadPhaseIndex].B;
    minPointerA = &nodeChildA->Min;
    maxPointerA = &nodeChildA->Max;
    minPointerB = &nodeChildB->Min;
    maxPointerB = &nodeChildB->Max;
}
Beta Was this translation helpful? Give feedback.
-
@RossNordby hey, I'm currently wrestling with multithreaded frustum culling and I'm almost done, but got stuck at optimization. Is there a helper method that would give me the remaining leaves or nodes under a specified My previous attempt assumed that If that's not possible, I will have to visit every leaf and decrement the leaf count even when a branch got rejected, which leaves a lot to be desired - at least compared to the non-multithreaded algorithm (although, thanks to bitmasking, all that would have to be done is just loop through all rejected nodes without testing, except for checking the bitmask). Or rewrite the algorithm altogether (e.g. go back to subdividing the frustum, but that also leaves a lot to be desired). Also just curious - is there a method to grab all |
Beta Was this translation helpful? Give feedback.
-
Alright, I think I'm finished. It wasn't as bad as I feared, despite the fact that it is my first attempt at seriously multithreading anything (everything in the past was exclusively simple async on a single thread, or no threads at all). Here is the full code for
and
Notes: And btw if you know better/cheaper synchronization object than |
Beta Was this translation helpful? Give feedback.
-
OK i managed to rewrite algorithm into one that has no Also fixed very rare bug in my previous version - i did not realize tree can be degenerated in worst way possible, that is all leafs landed on left or right branch of node 0 except one leaf and the last leaf is in opposite branch. This resulted in 0 index appearing in multithreaded algorithm which was not supported. As i mentioned this was very rare, despite 50+ manual tests (and each test took at least minute during which tree was reconfigured constantly, so in reality it was easily 1k+ tests) it happened only once. Here is new version: [StructLayout(LayoutKind.Sequential)]
//Center/extents representation of an AABB, laid out for direct reinterpretation as a Vector<float> in IntersectsOrInside:
//the padding floats keep each Vector3 on a 16-byte stride so lanes 0-2 are the center and lanes 4-6 are the extents.
//Note: as filled by IntersectsOrInside, these fields actually hold (max + min) and (max - min) — i.e. twice the true
//center and twice the half-extents — so the plane test constants must be scaled to match.
struct ExtentsRepresentation
{
    public Vector3 center;
    private float padding1;
    public Vector3 extents;
    private float padding2;
}
//representation of 0b1111_1111_1111_1111_1111_1111_1100_0000 on little endian that also works on big endian.
//Bits 0-5 cleared mean "fully inside each of the six frustum planes"; bit 6 set means "not outside".
private const uint isInside = 4294967232;
//Per-node cache of the plane that most recently rejected that node; testing it first exploits temporal coherence.
private static ConcurrentDictionary<int, int> failedPlane = new();
//Per-worker lists of leaf indices that passed the frustum test; merged after the dispatch completes.
private static Buffer<QuickList<int>> leavesToTest = new();
//Subtree roots collected before dispatch, consumed by workers as their initial work items.
private static int[] startingIndexes = new int[4];
private static int startingIndexesLength;
//Index of the next entry of startingIndexes to hand out (advanced via Interlocked.Increment).
private static int currentIndex;
//Single-slot mailbox used by the work-stealing protocol; sign encodes whether the subtree is fully contained.
private static int interThreadExchange;
private static IThreadDispatcher threadDispatcher;
//Count of leaves not yet visited or discarded across all workers; the sweep terminates when it reaches zero.
private static int globalRemainingLeavesToVisit;
//NOTE(review): statics mean only one frustum sweep may be in flight process-wide at a time — confirm callers respect this.
private unsafe static FrustumData* frustumData;
/// <summary>
/// Tests every leaf of the tree against the frustum, invoking the leaf tester for each leaf that intersects or is contained.
/// Single threaded entry point.
/// </summary>
/// <param name="frustumData">Frustum to test against; stored in a static field for the duration of the sweep.</param>
/// <param name="pool">Pool available for temporary allocations (unused on this path).</param>
/// <param name="leafTester">Callback receiving each leaf index that passes the frustum test.</param>
public unsafe void FrustumSweep<TLeafTester>(FrustumData* frustumData, BufferPool pool, ref TLeafTester leafTester) where TLeafTester : IFrustumLeafTester
{
    if (leafCount == 0)
        return;
    Tree.frustumData = frustumData;
    if (leafCount == 1)
    {
        //If the first node isn't filled, we have to use a special case.
        //Bit 6 set means the AABB is not outside the frustum (see IntersectsOrInside).
        if ((IntersectsOrInside(Nodes[0].A.Min, Nodes[0].A.Max, frustumData, Nodes[0].A.Index) & (1 << 6)) != 0)
        {
            leafTester.TestLeaf(0, frustumData);
        }
    }
    else
    {
        //TODO: Explicitly tracking depth in the tree during construction/refinement is practically required to guarantee correctness.
        //While it's exceptionally rare that any tree would have more than 256 levels, the worst case of stomping stack memory is not acceptable in the long run.
        var stack = stackalloc int[TraversalStackCapacity];
        FrustumSweep(0, stack, ref leafTester);
    }
}
/// <summary>
/// Tests every leaf of the tree against the frustum using the given dispatcher's workers,
/// invoking the leaf tester for each leaf that intersects or is contained.
/// Falls back to the single threaded sweep when the tree is too small to be worth dispatching.
/// </summary>
/// <param name="frustumData">Frustum to test against; stored in a static field for the duration of the sweep.</param>
/// <param name="pool">Pool used for the per-worker result lists.</param>
/// <param name="leafTester">Callback receiving each leaf index that passes the frustum test.</param>
/// <param name="dispatcher">Thread dispatcher whose workers perform the traversal.</param>
public unsafe void FrustumSweepMultithreaded<TLeafTester>(FrustumData* frustumData, BufferPool pool, ref TLeafTester leafTester, IThreadDispatcher dispatcher) where TLeafTester : IFrustumLeafTester
{
    if (leafCount == 0)
        return;
    Tree.frustumData = frustumData;
    if (leafCount == 1)
    {
        //If the first node isn't filled, we have to use a special case.
        if ((IntersectsOrInside(Nodes[0].A.Min, Nodes[0].A.Max, frustumData, Nodes[0].A.Index) & (1 << 6)) != 0)
        {
            leafTester.TestLeaf(0, frustumData);
        }
    }
    else if (leafCount < dispatcher.ThreadCount * 4)
    {
        //Too few leaves to amortize dispatch overhead; run the single threaded traversal instead.
        //TODO: Explicitly tracking depth in the tree during construction/refinement is practically required to guarantee correctness.
        //While it's exceptionally rare that any tree would have more than 256 levels, the worst case of stomping stack memory is not acceptable in the long run.
        var stack = stackalloc int[TraversalStackCapacity];
        FrustumSweep(0, stack, ref leafTester);
    }
    else
    {
        FrustumSweepMultithreaded(0, pool, ref leafTester, dispatcher);
    }
}
/// <summary>
/// Performs one step of the frustum traversal: either resolves the current leaf, or tests the current node's
/// children against the frustum and advances to the next node (pushing the second surviving child on the stack).
/// Shared by the single threaded and per-worker traversal loops.
/// </summary>
/// <param name="counter">Remaining-leaf counter; decremented for every leaf visited or discarded.</param>
/// <param name="nodeIndex">Current node index (negative values encode leaves); advanced in place.</param>
/// <param name="leafIndex">Set to the decoded leaf index when a leaf is reached; -1 otherwise.</param>
/// <param name="fullyContainedStack">Stack depth at which the traversal entered a fully-contained subtree; -1 when not fully contained.</param>
/// <param name="planeBitmask">Per-plane state bitmask (see isInside); carried down the traversal and reset on backtrack.</param>
/// <param name="stackEnd">Current stack depth.</param>
/// <param name="stack">Traversal stack of pending node indices.</param>
private unsafe void TestAABBs(ref int counter, ref int nodeIndex, ref int leafIndex, ref int fullyContainedStack, ref uint planeBitmask, ref int stackEnd, int* stack)
{
    if (nodeIndex < 0)
    {
        //This is actually a leaf node.
        leafIndex = Encode(nodeIndex);
        counter--;
        //Leaves have no children; have to pull from the stack to get a new target.
        if (stackEnd == 0)
            return;
        nodeIndex = stack[--stackEnd];
        //check if we are no longer fully inside frustum. if yes reset fullyContainedStack
        if (stackEnd < fullyContainedStack)
            fullyContainedStack = -1;
        //reset bitmask every time when we have to go back and we arent fully inside frustum
        //we must make separate test from previous test because for fullyContainedStack=-1 prev test would never be true.
        //BUGFIX: this previously read `fullyContainedStack < -1`, which can never be true (the value is only
        //ever -1 or a nonnegative stack depth), so the bitmask was never reset on this backtrack path and
        //stale per-plane "fully inside" bits could leak into the sibling subtree. The node-rejection backtrack
        //below uses `== -1`; this path now matches it.
        if (fullyContainedStack == -1)
            planeBitmask = uint.MaxValue;
    }
    else
    {
        ref var node = ref Nodes[nodeIndex];
        //skip tests if frustum fully contains childs,
        //unset bit means fully inside single plane,
        //set bit means intersection with that plane,
        //and 7th least significant bit UNset means that AABB is outside frustum
        //we have six planes thats why we check 6 zeroes
        if (planeBitmask == isInside)
        {
            Debug.Assert(stackEnd < TraversalStackCapacity - 1, "At the moment, we use a fixed size stack. Until we have explicitly tracked depths, watch out for excessive depth traversals.");
            nodeIndex = node.A.Index;
            //make sure leaf never lands on stack
            //this is necessary for multithreaded algorithm
            if (node.B.Index < 0)
            {
                //This is actually a leaf node.
                leafIndex = Encode(node.B.Index);
                counter--;
            }
            else
                stack[stackEnd++] = node.B.Index;
        }
        else
        {
            var aBitmask = IntersectsOrInside(node.A.Min, node.A.Max, frustumData, node.A.Index, planeBitmask);
            var bBitmask = IntersectsOrInside(node.B.Min, node.B.Max, frustumData, node.B.Index, planeBitmask);
            var aIntersected = (aBitmask & (1 << 6)) != 0;
            var bIntersected = (bBitmask & (1 << 6)) != 0;
            if (aIntersected && !bIntersected)
            {
                nodeIndex = node.A.Index;
                planeBitmask = aBitmask;
                //One of branches was discarded. Discard all leaves in this branch
                counter -= node.B.LeafCount;
                //check if A child is fully contained in frustum
                //remember we can still intersect at this point and we need to be fully inside
                if (aBitmask == isInside)
                    fullyContainedStack = stackEnd;
            }
            else if (aIntersected && bIntersected)
            {
                Debug.Assert(stackEnd < TraversalStackCapacity - 1, "At the moment, we use a fixed size stack. Until we have explicitly tracked depths, watch out for excessive depth traversals.");
                nodeIndex = node.A.Index;
                planeBitmask = aBitmask;
                //check if both childs are fully contained in frustum
                //remember we can still intersect at this point and we need to be fully inside
                if (aBitmask == isInside && bBitmask == isInside)
                    fullyContainedStack = stackEnd;
                //make sure leaf never lands on stack
                //this is necessary for multithreaded algorithm
                if (node.B.Index < 0)
                {
                    //This is actually a leaf node.
                    leafIndex = Encode(node.B.Index);
                    counter--;
                }
                else
                    stack[stackEnd++] = node.B.Index;
            }
            else if (bIntersected)
            {
                nodeIndex = node.B.Index;
                planeBitmask = bBitmask;
                //One of branches was discarded. Discard all leaves in this branch
                counter -= node.A.LeafCount;
                //check if B child is fully contained in frustum
                //remember we can still intersect at this point and we need to be fully inside
                if (bBitmask == isInside)
                    fullyContainedStack = stackEnd;
            }
            else
            {
                //Both branches were discarded. Discard all leaves in these branches
                counter -= node.A.LeafCount + node.B.LeafCount;
                //No intersection. Need to pull from the stack to get a new target.
                if (stackEnd == 0)
                    return;
                nodeIndex = stack[--stackEnd];
                //check if we are no longer fully inside frustum. if yes reset fullyContainedStack
                if (stackEnd < fullyContainedStack)
                    fullyContainedStack = -1;
                //reset bitmask every time when we have to go back and we arent fully inside frustum
                //we must make separate test from previous test because for fullyContainedStack=-1 prev test would never be true
                if (fullyContainedStack == -1)
                    planeBitmask = uint.MaxValue;
            }
        }
    }
}
/// <summary>
/// Sets up the shared multithreaded-sweep state, partitions the tree into per-worker subtree roots,
/// dispatches the workers, and finally runs the leaf tester over every surviving leaf on the calling thread.
/// </summary>
/// <param name="nodeIndex">Root node index to start from (always 0 from the public entry point).</param>
/// <param name="pool">Pool used to allocate the per-worker result list buffer.</param>
/// <param name="leafTester">Callback receiving each leaf index that passed the frustum test.</param>
/// <param name="dispatcher">Thread dispatcher whose workers perform the traversal.</param>
internal unsafe void FrustumSweepMultithreaded<TLeafTester>(int nodeIndex, BufferPool pool, ref TLeafTester leafTester, IThreadDispatcher dispatcher) where TLeafTester : IFrustumLeafTester
{
    //Reset all the cross-thread coordination state before dispatch.
    threadDispatcher = dispatcher;
    globalRemainingLeavesToVisit = leafCount;
    currentIndex = -1;
    startingIndexesLength = 0;
    interThreadExchange = 0;
    Debug.Assert((nodeIndex >= 0 && nodeIndex < nodeCount) || (Encode(nodeIndex) >= 0 && Encode(nodeIndex) < leafCount));
    Debug.Assert(leafCount >= 2, "This implementation assumes all nodes are filled.");
    //One result list per worker; sized for the worst case where every leaf survives.
    //NOTE(review): these QuickLists are allocated from the per-thread pools but never explicitly disposed here —
    //presumably the thread memory pools are reset elsewhere; confirm to rule out a leak.
    pool.Take(threadDispatcher.ThreadCount, out leavesToTest);
    for (int i = 0; i < threadDispatcher.ThreadCount; i++)
        leavesToTest[i] = new QuickList<int>(leafCount, threadDispatcher.GetThreadMemoryPool(i));
    //Subtrees holding no more than leafCount/ThreadCount leaves become initial work items.
    int multithreadingLeafCountThreshold = leafCount / (threadDispatcher.ThreadCount);
    CollectNodesForMultithreadedCulling(0, multithreadingLeafCountThreshold);
    threadDispatcher.DispatchWorkers(FrustumSweepThreaded);
    //Workers only collect indices; the (not necessarily thread safe) leaf tester runs here on one thread.
    for (var i = 0; i < threadDispatcher.ThreadCount; i++)
    {
        foreach (var leafIndex in leavesToTest[i])
            leafTester.TestLeaf(leafIndex, frustumData);
    }
    pool.Return(ref leavesToTest);
}
/// <summary>
/// Single threaded traversal loop: repeatedly advances through the tree via TestAABBs,
/// invoking the leaf tester for each surviving leaf, until every leaf has been visited or discarded.
/// </summary>
/// <param name="nodeIndex">Node index to start the traversal from.</param>
/// <param name="stack">Preallocated traversal stack of TraversalStackCapacity ints.</param>
/// <param name="leafTester">Callback receiving each leaf index that passes the frustum test.</param>
internal unsafe void FrustumSweep<TLeafTester>(int nodeIndex, int* stack, ref TLeafTester leafTester) where TLeafTester : IFrustumLeafTester
{
    Debug.Assert((nodeIndex >= 0 && nodeIndex < nodeCount) || (Encode(nodeIndex) >= 0 && Encode(nodeIndex) < leafCount));
    Debug.Assert(leafCount >= 2, "This implementation assumes all nodes are filled.");
    globalRemainingLeavesToVisit = leafCount;
    //Plain locals replace the previous `ref X = ref Unsafe.AsRef(rvalue)` pattern: taking a reference to a
    //temporary is fragile and unnecessary here, since TestAABBs accepts refs to ordinary locals directly.
    uint planeBitmask = uint.MaxValue;
    int leafIndex = -1;
    int stackEnd = 0;
    int fullyContainedStack = -1;
    while (true)
    {
        TestAABBs(ref globalRemainingLeavesToVisit, ref nodeIndex, ref leafIndex, ref fullyContainedStack, ref planeBitmask, ref stackEnd, stack);
        if (leafIndex > -1)
        {
            leafTester.TestLeaf(leafIndex, frustumData);
            leafIndex = -1;
        }
        //All leaves have been either visited or discarded; the sweep is complete.
        if (globalRemainingLeavesToVisit is 0)
            return;
    }
}
/// <summary>
/// Per-worker traversal body. Each worker acquires subtree roots from the precollected startingIndexes,
/// traverses them with TestAABBs, records surviving leaf indices into its own list, and participates in
/// work stealing through the single-slot interThreadExchange mailbox when it runs out of work.
/// A negative exchanged index encodes "this subtree is fully contained", so the receiving thread starts
/// with planeBitmask = isInside instead of retesting planes.
/// </summary>
/// <param name="workerIndex">Index of this worker; selects its result list.</param>
private unsafe void FrustumSweepThreaded(int workerIndex)
{
    var stack = stackalloc int[TraversalStackCapacity];
    ref var node = ref Nodes[0];
    //Leaves remaining in the subtrees this worker currently owns.
    var remainingLeavesToVisit = 0;
    //Leaves this worker took ownership of; subtracted from the global counter when the batch completes.
    var takenLeavesCount = 0;
    ref int nodeIndex = ref Unsafe.AsRef(0);
    ref uint planeBitmask = ref Unsafe.AsRef(uint.MaxValue);
    ref int leafIndex = ref Unsafe.AsRef(-1);
    ref int stackEnd = ref Unsafe.AsRef(0);
    ref int fullyContainedStack = ref Unsafe.AsRef(-1);
    //Number of stack slots donated to other threads; the stack pointer is advanced past them (see below).
    var givenUpCount = 0;
    while (true)
    {
        if (leafIndex > -1)
        {
            Debug.Assert(leavesToTest[workerIndex].Contains(leafIndex) is false, "Duplicates are unacceptable");
            leavesToTest[workerIndex].AddUnsafely(leafIndex);
            leafIndex = -1;
        }
        if (remainingLeavesToVisit is 0)
        {
            //This batch is done; retire its leaves from the global counter.
            Interlocked.Add(ref globalRemainingLeavesToVisit, -takenLeavesCount);
            nodeIndex = 0;
            //we no longer have work on this thread. We try to steal some from other threads
            if (currentIndex < startingIndexesLength)
            {
                var nextExtraStartingIndex = Interlocked.Increment(ref currentIndex);
                //due to multithreading it might happen that 2 threads attempt to increment
                //while currentIndex is smaller by 1 than startingIndexesLength
                //resulting in error if we dont check again
                if (nextExtraStartingIndex < startingIndexesLength)
                    nodeIndex = startingIndexes[nextExtraStartingIndex];
            }
            //Spin on the mailbox until some thread donates a subtree or all leaves are accounted for.
            while (nodeIndex is 0 && globalRemainingLeavesToVisit > 0)
                nodeIndex = Interlocked.Exchange(ref interThreadExchange, 0);
            if (nodeIndex != 0)
            {
                //Negative index encodes a fully-contained subtree: skip plane tests inside it.
                if (nodeIndex < 0)
                {
                    nodeIndex = -nodeIndex;
                    planeBitmask = isInside;
                }
                else
                    planeBitmask = uint.MaxValue;
                node = ref Nodes[nodeIndex];
                remainingLeavesToVisit = node.A.LeafCount + node.B.LeafCount;
                takenLeavesCount = remainingLeavesToVisit;
            }
            else return;
            leafIndex = -1;
            fullyContainedStack = -1;
            givenUpCount = 0;
        }
        else if (globalRemainingLeavesToVisit > 0 && stackEnd > 0)
        {
            //We actively give up one branch when other thread stole it,
            //to simplify interthread communication
            //provided we have anything to give up
            //We give up branch at the bottom not at the top
            //because bottom branch usually has the most amount of leaves left to check
            var index = stack[0];
            //fullyContainedStack == 0 means the donated bottom entry lies in a fully-contained subtree; encode that via sign.
            var oldVal = Interlocked.CompareExchange(ref interThreadExchange, fullyContainedStack is 0 ? -index : index, 0);
            if (oldVal is 0)
            {
                //Donation succeeded: remove the bottom entry by sliding the stack base forward one slot.
                stackEnd--;
                node = ref Nodes[index];
                remainingLeavesToVisit -= (node.A.LeafCount + node.B.LeafCount);
                takenLeavesCount -= (node.A.LeafCount + node.B.LeafCount);
                if (fullyContainedStack > 0)
                    fullyContainedStack--;
                if (stackEnd == 0)
                {
                    //Stack emptied: rewind the base pointer back over all donated slots so capacity is restored.
                    stack = (int*)Unsafe.Subtract<int>(stack, givenUpCount);
                    givenUpCount = 0;
                }
                else
                {
                    stack = (int*)Unsafe.Add<int>(stack, 1);
                    givenUpCount++;
                }
            }
        }
        TestAABBs(ref remainingLeavesToVisit, ref nodeIndex, ref leafIndex, ref fullyContainedStack, ref planeBitmask, ref stackEnd, stack);
    }
}
/// <summary>
/// Recursively partitions the tree into subtree roots holding no more than leafCountThreshold leaves,
/// appending them to startingIndexes for the workers to consume. Leaves encountered at this shallow depth
/// are tested immediately and recorded into worker 0's list.
/// </summary>
/// <param name="nodeIndex">Node to partition from.</param>
/// <param name="leafCountThreshold">Maximum leaf count for a subtree to become a single work item.</param>
unsafe void CollectNodesForMultithreadedCulling(int nodeIndex, int leafCountThreshold)
{
    ref var node = ref Nodes[nodeIndex];
    //Positive child index = internal node; negative = encoded leaf. (Index 0 is the root and is
    //presumably never anyone's child, so > 0 rather than >= 0 — confirm against the tree's invariants.)
    if (node.A.Index > 0)
        if (node.A.LeafCount > leafCountThreshold)
        {
            CollectNodesForMultithreadedCulling(node.A.Index, leafCountThreshold);
        }
        else
        {
            startingIndexes[startingIndexesLength] = node.A.Index;
            startingIndexesLength++;
            //Grow when full so the next append always has a free slot.
            if (startingIndexesLength == startingIndexes.Length)
                Array.Resize(ref startingIndexes, startingIndexes.Length * 2);
        }
    else
    {
        //we met leaf very early. we might as well test it since this is very rare
        if ((IntersectsOrInside(node.A.Min, node.A.Max, frustumData, node.A.Index) & (1 << 6)) != 0)
            leavesToTest[0].AddUnsafely(Encode(node.A.Index));
        globalRemainingLeavesToVisit--;
    }
    if (node.B.Index > 0)
        if (node.B.LeafCount > leafCountThreshold)
        {
            CollectNodesForMultithreadedCulling(node.B.Index, leafCountThreshold);
        }
        else
        {
            startingIndexes[startingIndexesLength] = node.B.Index;
            startingIndexesLength++;
            if (startingIndexesLength == startingIndexes.Length)
                Array.Resize(ref startingIndexes, startingIndexes.Length * 2);
        }
    else
    {
        //we met leaf very early. we might as well test it since this is very rare
        if ((IntersectsOrInside(node.B.Min, node.B.Max, frustumData, node.B.Index) & (1 << 6)) != 0)
            leavesToTest[0].AddUnsafely(Encode(node.B.Index));
        globalRemainingLeavesToVisit--;
    }
}
/// <summary>
/// Tests an AABB against the six frustum planes. Returns a bitmask where bits 0-5 cleared mean
/// "fully inside that plane", bits set mean "intersecting that plane", and bit 6 cleared means
/// "outside the frustum" (early-out). Planes already marked fully-inside in the incoming mask are skipped,
/// and the plane that last rejected this node is cached (failedPlane) and tested first on the next call
/// to exploit temporal coherence.
/// NOTE(review): failedPlane is a static keyed only by nodeIndex, so using this with more than one tree
/// (or more than one frustum) concurrently would mix cache entries — confirm single-tree usage.
/// </summary>
/// <param name="min">AABB minimum corner.</param>
/// <param name="max">AABB maximum corner.</param>
/// <param name="frustumData">Frustum planes; laid out as (plane, absolute-normal plane) pairs — see the
/// stride-2 Unsafe.Add below.</param>
/// <param name="nodeIndex">Node index used as the failed-plane cache key.</param>
/// <param name="planeBitmask">Incoming per-plane state from the parent node.</param>
public unsafe static uint IntersectsOrInside(in Vector3 min, in Vector3 max, FrustumData* frustumData, int nodeIndex, uint planeBitmask = uint.MaxValue)
{
    var shouldRenumberPlanes = failedPlane.TryGetValue(nodeIndex, out var planeId);
    var start = 0;
    //far plane test can be eliminated by setting end=5
    //and changing all occurences of start==5 to start==4 instead.
    //This results in frustum with "infinite" length
    var end = 6;
    //Convert AABB to center-extents representation
    //On NET 5+ can skip conversion and instead use Vector.ConditionalSelect & Vector.GreaterThan with Vector128
    //and use p, n-vertex optimization
    //NOTE(review): these are (max+min) and (max-min), i.e. twice the true center/half-extents; the standard
    //plane-AABB test uses halved values, so plane.D must be stored scaled to match — TODO confirm.
    var eRep = new ExtentsRepresentation()
    {
        center = max + min, // Compute AABB center
        extents = max - min // Compute positive extents
    };
    ref var planeAddr = ref Unsafe.As<float, Plane>(ref frustumData->nearPlane.Normal.X);
    ref var plane = ref Unsafe.As<float, Plane>(ref frustumData->nearPlane.Normal.X);
    if (shouldRenumberPlanes)
    {
        //Start the do-while cycle at the cached failing plane; with end==start the loop wraps through all six.
        start = planeId;
        end = start;
    }
    do
    {
        //Skip planes the parent was already fully inside of; containment is inherited by children.
        if (((planeBitmask >> start) & 1) == 0)
        {
            start = shouldRenumberPlanes && start == 5 ? 0 : start + 1;
            continue;
        }
        //Stride of 2: each frustum plane is stored followed by its absolute-normal companion plane.
        plane = ref Unsafe.Add(ref planeAddr, start * 2);
        /*var eRep = new ExtentsRepresentation()
        {
            center = new Vector3(), // Compute AABB center
            extents = new Vector3() // Compute positive extents
        };*/
        var d = plane.D;
        float m, r;
        //Vector<float>.Count == 4 is not worth it since built-in VectorX and Plane are already vectorized
        //and for Vector<float>.Count == 16 we should fuse A & B into one test
        //and that will require sperate method with different signature and NET 5+ since Vector512 is required
        if (Vector.IsHardwareAccelerated && Vector<float>.Count == 8)
        {
            //Vector.ConditionalSelect<float>();
            //Reinterpret (plane, absolute plane) and (center, extents) as 8-wide vectors:
            //lanes 0-2 give center·normal, lanes 4-6 give extents·|normal|.
            ref Vector<float> planeData = ref Unsafe.As<Plane, Vector<float>>(ref plane);
            ref Vector<float> bbData = ref Unsafe.As<ExtentsRepresentation, Vector<float>>(ref eRep);
            var multi = bbData * planeData;
            m = multi[0] + multi[1] + multi[2];
            r = multi[4] + multi[5] + multi[6];
        }
        else
        {
            m = Vector3.Dot(plane.Normal, eRep.center);
            plane = ref Unsafe.Add(ref plane, 1);//absolute normal
            r = Vector3.Dot(plane.Normal, eRep.extents);
        }
        if (m + r < d)//outside
        {
            //Clear bit 6 to signal rejection.
            planeBitmask &= unchecked((uint)(~(1 << 6)));
            //no need to renumber planes when id is 0
            //NOTE(review): TryUpdate(nodeIndex, start, start) only writes when the stored value already
            //equals start, making it effectively a no-op; presumably TryUpdate(nodeIndex, start, planeId)
            //was intended — confirm.
            if (!shouldRenumberPlanes && start != 0 && start != planeId)
                failedPlane.TryAdd(nodeIndex, start);
            else if (start != 0 && start != planeId)
                failedPlane.TryUpdate(nodeIndex, start, start);
            return planeBitmask;
        }
        if (m - r >= d)//inside
        {
            //Fully inside this plane; clear its bit so descendants skip the test.
            planeBitmask &= ~(uint)(1 << start);
        }
        /*else//intersect
        {
        }*/
        start = shouldRenumberPlanes && start == 5 ? 0 : start + 1;
    } while (start != end);
    //Survived all planes; drop the stale failed-plane cache entry for this node.
    if (shouldRenumberPlanes)
        failedPlane.TryRemove(nodeIndex, out _);
    return planeBitmask;
} any extra optimizations could be done after moving to .NET 5/6 but that will have to wait when bepu also moves there. |
Beta Was this translation helpful? Give feedback.
For the purposes of rendering or similar use cases using bepuphysics2 infrastructure, the easiest thing would be to piggy back on the broad phase Tree (or a Tree maintained specifically for whatever the use case is, if it's not the same as physics objects) and implement a frustum-tree test. I haven't implemented that in bepuphysics2 yet unfortunately (all frustum culling is done on the GPU in my projects), but it would not be too hard to do.
You could take the existing ray-tree test and replace the ray-AABB test with a frustum-AABB test. I might add such a thing eventually, but if you'd like to give it a shot, here's some HLSL for such a test.
First, since the frustum is going to be teste…