Init

2025-11-28 23:13:44 +05:30
commit a3a8e79709
7360 changed files with 1156074 additions and 0 deletions
--- a/physx/source/lowleveldynamics/src/DyArticulation.cpp
+++ b/physx/source/lowleveldynamics/src/DyArticulation.cpp
--- a/physx/source/lowleveldynamics/src/DyArticulationContactPrep.cpp
+++ b/physx/source/lowleveldynamics/src/DyArticulationContactPrep.cpp
@ -0,0 +1,542 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#include "foundation/PxPreprocessor.h"
+#include "PsVecMath.h"
+#include "DyArticulationContactPrep.h"
+#include "DySolverConstraintDesc.h"
+#include "DySolverConstraint1D.h"
+#include "DyArticulationHelper.h"
+#include "PxcNpWorkUnit.h"
+#include "PxsMaterialManager.h"
+#include "PxsMaterialCombiner.h"
+#include "DyCorrelationBuffer.h"
+#include "DySolverConstraintExtShared.h"
+#include "DyConstraintPrep.h"
+
+using namespace physx::Gu;
+
+namespace physx
+{
+
+namespace Dy
+{
+
+// constraint-gen only, since these use getVelocity methods
+// which aren't valid during the solver phase
+
+//PX_INLINE void computeFrictionTangents(const Ps::aos::Vec3V& vrel,const Ps::aos::Vec3V& unitNormal, Ps::aos::Vec3V& t0, Ps::aos::Vec3V& t1)
+//{
+//	using namespace Ps::aos;
+//	//PX_ASSERT(PxAbs(unitNormal.magnitude()-1)<1e-3f);
+//
+//	t0 = V3Sub(vrel, V3Scale(unitNormal, V3Dot(unitNormal, vrel)));
+//	const FloatV ll = V3Dot(t0, t0);
+//
+//	if (FAllGrtr(ll, FLoad(1e10f)))										//can set as low as 0.
+//	{
+//		t0 = V3Scale(t0, FRsqrt(ll));
+//		t1 = V3Cross(unitNormal, t0);
+//	}
+//	else
+//		Ps::normalToTangents(unitNormal, t0, t1);		//fallback
+//}
+
+PxReal SolverExtBody::projectVelocity(const PxVec3& linear, const PxVec3& angular) const
+{
+	if(mLinkIndex == PxSolverConstraintDesc::NO_LINK)
+		return mBodyData->projectVelocity(linear, angular);
+	else
+	{
+		const Cm::SpatialVectorV velocity = mArticulation->getLinkVelocity(mLinkIndex);
+
+		const FloatV fv = velocity.dot(Cm::SpatialVector(linear, angular));
+
+		PxF32 f;
+		FStore(fv, &f);
+		return f;
+		/*PxF32 f;
+		FStore(getVelocity(*mFsData)[mLinkIndex].dot(Cm::SpatialVector(linear, angular)), &f);
+		return f;*/
+	}
+}
+
+Ps::aos::FloatV SolverExtBody::projectVelocity(const Ps::aos::Vec3V& linear, const Ps::aos::Vec3V& angular) const
+{
+	if (mLinkIndex == PxSolverConstraintDesc::NO_LINK)
+	{
+		return V3SumElems(V3Add(V3Mul(V3LoadA(mBodyData->linearVelocity), linear), V3Mul(V3LoadA(mBodyData->angularVelocity), angular)));
+	}
+	else
+	{
+		const Cm::SpatialVectorV velocity = mArticulation->getLinkVelocity(mLinkIndex);
+
+		return velocity.dot(Cm::SpatialVectorV(linear, angular));
+	}
+}
+
+
+PxVec3 SolverExtBody::getLinVel() const
+{
+	if(mLinkIndex == PxSolverConstraintDesc::NO_LINK)
+		return mBodyData->linearVelocity;
+	else
+	{
+		const Vec3V linear = mArticulation->getLinkVelocity(mLinkIndex).linear;
+
+		PxVec3 result;
+		V3StoreU(linear, result);
+		/*PxVec3 result;
+		V3StoreU(getVelocity(*mFsData)[mLinkIndex].linear, result);*/
+		return result;
+	}
+}
+
+PxVec3 SolverExtBody::getAngVel() const
+{
+	if(mLinkIndex == PxSolverConstraintDesc::NO_LINK)
+		return mBodyData->angularVelocity;
+	else
+	{
+		const Vec3V angular = mArticulation->getLinkVelocity(mLinkIndex).angular;
+		
+		PxVec3 result;
+		V3StoreU(angular, result);
+		/*PxVec3 result;
+		V3StoreU(getVelocity(*mFsData)[mLinkIndex].angular, result);*/
+		return result;
+	}
+}
+
+Ps::aos::Vec3V SolverExtBody::getLinVelV() const
+{
+	if (mLinkIndex == PxSolverConstraintDesc::NO_LINK)
+		return V3LoadA(mBodyData->linearVelocity);
+	else
+	{
+		return mArticulation->getLinkVelocity(mLinkIndex).linear;
+	}
+}
+
+
+Vec3V SolverExtBody::getAngVelV() const
+{
+	if (mLinkIndex == PxSolverConstraintDesc::NO_LINK)
+		return V3LoadA(mBodyData->angularVelocity);
+	else
+	{
+
+		return mArticulation->getLinkVelocity(mLinkIndex).angular;
+	}
+}
+
+Cm::SpatialVectorV SolverExtBody::getVelocity() const
+{
+	if(mLinkIndex == PxSolverConstraintDesc::NO_LINK)
+		return Cm::SpatialVectorV(V3LoadA(mBodyData->linearVelocity), V3LoadA(mBodyData->angularVelocity));
+	else
+		return mArticulation->getLinkVelocity(mLinkIndex);
+}
+
+Cm::SpatialVector createImpulseResponseVector(const PxVec3& linear, const PxVec3& angular, const SolverExtBody& body)
+{
+	if(body.mLinkIndex == PxSolverConstraintDesc::NO_LINK)
+		return Cm::SpatialVector(linear, body.mBodyData->sqrtInvInertia * angular);
+
+	return Cm::SpatialVector(linear, angular);
+}
+
+Cm::SpatialVectorV createImpulseResponseVector(const Ps::aos::Vec3V& linear, const Ps::aos::Vec3V& angular, const SolverExtBody& body)
+{
+	if (body.mLinkIndex == PxSolverConstraintDesc::NO_LINK)
+	{
+		return Cm::SpatialVectorV(linear, M33MulV3(M33Load(body.mBodyData->sqrtInvInertia), angular));
+	}
+	return Cm::SpatialVectorV(linear, angular);
+}
+
+PxReal getImpulseResponse(const SolverExtBody& b0, const Cm::SpatialVector& impulse0, Cm::SpatialVector& deltaV0, PxReal dom0, PxReal angDom0, 
+	const SolverExtBody& b1, const Cm::SpatialVector& impulse1, Cm::SpatialVector& deltaV1, PxReal dom1, PxReal angDom1, 
+	Cm::SpatialVectorF* Z, bool allowSelfCollision)
+{
+	PxReal response;
+	allowSelfCollision = false;
+	// right now self-collision with contacts crashes the solver
+	
+	//KS - knocked this out to save some space on SPU
+	if(allowSelfCollision && b0.mLinkIndex!=PxSolverConstraintDesc::NO_LINK && b0.mArticulation == b1.mArticulation && 0)
+	{
+		/*ArticulationHelper::getImpulseSelfResponse(*b0.mFsData,b0.mLinkIndex, impulse0, deltaV0, 
+													  b1.mLinkIndex, impulse1, deltaV1);*/
+
+		b0.mArticulation->getImpulseSelfResponse(b0.mLinkIndex, b1.mLinkIndex, Z, impulse0.scale(dom0, angDom0), impulse1.scale(dom1, angDom1), 
+			deltaV0, deltaV1);
+		
+		response = impulse0.dot(deltaV0) + impulse1.dot(deltaV1);
+	}
+	else 
+	{
+		if(b0.mLinkIndex == PxSolverConstraintDesc::NO_LINK)
+		{
+			deltaV0.linear = impulse0.linear * b0.mBodyData->invMass * dom0;
+			deltaV0.angular = impulse0.angular * angDom0;
+		}
+		else
+		{
+			b0.mArticulation->getImpulseResponse(b0.mLinkIndex, Z, impulse0.scale(dom0, angDom0), deltaV0);
+		}
+
+		response = impulse0.dot(deltaV0);
+		if(b1.mLinkIndex == PxSolverConstraintDesc::NO_LINK)
+		{
+			deltaV1.linear = impulse1.linear * b1.mBodyData->invMass * dom1;
+			deltaV1.angular = impulse1.angular * angDom1;
+		}
+		else
+		{
+			b1.mArticulation->getImpulseResponse( b1.mLinkIndex, Z, impulse1.scale(dom1, angDom1), deltaV1);
+		}
+		response += impulse1.dot(deltaV1);
+	}
+
+	return response;
+}
+
+FloatV getImpulseResponse(const SolverExtBody& b0, const Cm::SpatialVectorV& impulse0, Cm::SpatialVectorV& deltaV0, const FloatV& dom0, const FloatV& angDom0,
+	const SolverExtBody& b1, const Cm::SpatialVectorV& impulse1, Cm::SpatialVectorV& deltaV1, const FloatV& dom1, const FloatV& angDom1,
+	Cm::SpatialVectorV* Z, bool /*allowSelfCollision*/)
+{
+	Vec3V response;
+	{
+
+		if (b0.mLinkIndex == PxSolverConstraintDesc::NO_LINK)
+		{
+			deltaV0.linear = V3Scale(impulse0.linear,  FMul(FLoad(b0.mBodyData->invMass), dom0));
+			deltaV0.angular = V3Scale(impulse0.angular, angDom0);
+		}
+		else
+		{
+			b0.mArticulation->getImpulseResponse(b0.mLinkIndex, Z, impulse0.scale(dom0, angDom0), deltaV0);
+		}
+
+		response = V3Add(V3Mul(impulse0.linear, deltaV0.linear), V3Mul(impulse0.angular, deltaV0.angular));
+		if (b1.mLinkIndex == PxSolverConstraintDesc::NO_LINK)
+		{
+			deltaV1.linear = V3Scale(impulse1.linear, FMul(FLoad(b1.mBodyData->invMass), dom1));
+			deltaV1.angular = V3Scale(impulse1.angular, angDom1);
+		}
+		else
+		{
+			b1.mArticulation->getImpulseResponse(b1.mLinkIndex, Z, impulse1.scale(dom1, angDom1), deltaV1);
+		}
+		response = V3Add(response, V3Add(V3Mul(impulse1.linear, deltaV1.linear), V3Mul(impulse1.angular, deltaV1.angular)));
+	}
+
+	return V3SumElems(response);
+}
+
+
+
+void setupFinalizeExtSolverContacts(
+	const ContactPoint* buffer,
+	const CorrelationBuffer& c,
+	const PxTransform& bodyFrame0,
+	const PxTransform& bodyFrame1,
+	PxU8* workspace,
+	const SolverExtBody& b0,
+	const SolverExtBody& b1,
+	const PxReal invDtF32,
+	PxReal bounceThresholdF32,
+	PxReal invMassScale0, PxReal invInertiaScale0,
+	PxReal invMassScale1, PxReal invInertiaScale1,
+	const PxReal restDist,
+	PxU8* frictionDataPtr,
+	PxReal ccdMaxContactDist,
+	Cm::SpatialVectorF* Z)
+{
+	// NOTE II: the friction patches are sparse (some of them have no contact patches, and
+	// therefore did not get written back to the cache) but the patch addresses are dense,
+	// corresponding to valid patches
+
+	/*const bool haveFriction = PX_IR(n.staticFriction) > 0 || PX_IR(n.dynamicFriction) > 0;*/
+
+	const FloatV ccdMaxSeparation = FLoad(ccdMaxContactDist);
+
+	PxU8* PX_RESTRICT ptr = workspace;
+
+	const FloatV zero = FZero();
+
+	const PxF32 maxPenBias0 = b0.mLinkIndex == PxSolverConstraintDesc::NO_LINK ? b0.mBodyData->penBiasClamp : b0.mArticulation->getLinkMaxPenBias(b0.mLinkIndex);
+	const PxF32 maxPenBias1 = b1.mLinkIndex == PxSolverConstraintDesc::NO_LINK ? b1.mBodyData->penBiasClamp : b1.mArticulation->getLinkMaxPenBias(b1.mLinkIndex);
+
+	const FloatV maxPenBias = FLoad(PxMax(maxPenBias0, maxPenBias1));
+
+	const Vec3V frame0p = V3LoadU(bodyFrame0.p);
+	const Vec3V frame1p = V3LoadU(bodyFrame1.p);
+
+	const Cm::SpatialVectorV vel0 = b0.getVelocity();
+	const Cm::SpatialVectorV vel1 = b1.getVelocity();
+
+
+	const FloatV d0 = FLoad(invMassScale0);
+	const FloatV d1 = FLoad(invMassScale1);
+
+	const FloatV angD0 = FLoad(invInertiaScale0);
+	const FloatV angD1 = FLoad(invInertiaScale1);
+
+	Vec4V staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W = V4Zero();
+	staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W=V4SetZ(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W, d0);
+	staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W=V4SetW(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W, d1);
+
+	const FloatV restDistance = FLoad(restDist); 
+
+	PxU32 frictionPatchWritebackAddrIndex = 0;
+	PxU32 contactWritebackCount = 0;
+
+	Ps::prefetchLine(c.contactID);
+	Ps::prefetchLine(c.contactID, 128);
+
+	const FloatV invDt = FLoad(invDtF32);
+	const FloatV p8 = FLoad(0.8f);
+	const FloatV bounceThreshold = FLoad(bounceThresholdF32);
+
+	const FloatV invDtp8 = FMul(invDt, p8);
+
+	PxU8 flags = 0;
+
+	for(PxU32 i=0;i<c.frictionPatchCount;i++)
+	{
+		PxU32 contactCount = c.frictionPatchContactCounts[i];
+		if(contactCount == 0)
+			continue;
+
+		const FrictionPatch& frictionPatch = c.frictionPatches[i];
+		PX_ASSERT(frictionPatch.anchorCount <= 2);  //0==anchorCount is allowed if all the contacts in the manifold have a large offset. 
+
+		const Gu::ContactPoint* contactBase0 = buffer + c.contactPatches[c.correlationListHeads[i]].start;
+		const PxReal combinedRestitution = contactBase0->restitution;
+
+		const PxReal coefficient = (contactBase0->materialFlags & PxMaterialFlag::eIMPROVED_PATCH_FRICTION && frictionPatch.anchorCount == 2) ? 0.5f : 1.f;
+
+		const PxReal staticFriction = contactBase0->staticFriction * coefficient;
+		const PxReal dynamicFriction = contactBase0->dynamicFriction * coefficient;
+		const bool disableStrongFriction = !!(contactBase0->materialFlags & PxMaterialFlag::eDISABLE_FRICTION);
+		staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W=V4SetX(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W, FLoad(staticFriction));
+		staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W=V4SetY(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W, FLoad(dynamicFriction));
+	
+		SolverContactHeader* PX_RESTRICT header = reinterpret_cast<SolverContactHeader*>(ptr);
+		ptr += sizeof(SolverContactHeader);		
+
+		Ps::prefetchLine(ptr + 128);
+		Ps::prefetchLine(ptr + 256);
+		Ps::prefetchLine(ptr + 384);
+		
+		const bool haveFriction = (disableStrongFriction == 0) ;//PX_IR(n.staticFriction) > 0 || PX_IR(n.dynamicFriction) > 0;
+		header->numNormalConstr		= Ps::to8(contactCount);
+		header->numFrictionConstr	= Ps::to8(haveFriction ? frictionPatch.anchorCount*2 : 0);
+	
+		header->type				= Ps::to8(DY_SC_TYPE_EXT_CONTACT);
+
+		header->flags = flags;
+
+		const FloatV restitution = FLoad(combinedRestitution);
+	
+		header->staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W = staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W;
+
+		header->angDom0 = invInertiaScale0;
+		header->angDom1 = invInertiaScale1;
+	
+		const PxU32 pointStride = sizeof(SolverContactPointExt);
+		const PxU32 frictionStride = sizeof(SolverContactFrictionExt);
+
+		const Vec3V normal = V3LoadU(buffer[c.contactPatches[c.correlationListHeads[i]].start].normal);
+		
+		FloatV accumImpulse = FZero();
+
+		for(PxU32 patch=c.correlationListHeads[i]; 
+			patch!=CorrelationBuffer::LIST_END; 
+			patch = c.contactPatches[patch].next)
+		{
+			const PxU32 count = c.contactPatches[patch].count;
+			const Gu::ContactPoint* contactBase = buffer + c.contactPatches[patch].start;
+				
+			PxU8* p = ptr;
+
+			for(PxU32 j=0;j<count;j++)
+			{
+				const Gu::ContactPoint& contact = contactBase[j];
+
+				SolverContactPointExt* PX_RESTRICT solverContact = reinterpret_cast<SolverContactPointExt*>(p);
+				p += pointStride;
+
+				accumImpulse = FAdd(accumImpulse, setupExtSolverContact(b0, b1, d0, d1, angD0, angD1, frame0p, frame1p, normal, invDt, invDtp8, restDistance, maxPenBias, restitution,
+					bounceThreshold, contact, *solverContact, ccdMaxSeparation, Z, vel0, vel1));
+			
+			}
+
+			ptr = p;
+		}
+		contactWritebackCount += contactCount;
+
+		accumImpulse = FDiv(accumImpulse, FLoad(PxF32(contactCount)));
+
+		header->normal_minAppliedImpulseForFrictionW = V4SetW(Vec4V_From_Vec3V(normal), accumImpulse);
+
+		PxF32* forceBuffer = reinterpret_cast<PxF32*>(ptr);
+		PxMemZero(forceBuffer, sizeof(PxF32) * contactCount);
+		ptr += sizeof(PxF32) * ((contactCount + 3) & (~3));
+
+		header->broken = 0;
+
+		if(haveFriction)
+		{
+			//const Vec3V normal = Vec3V_From_PxVec3(buffer.contacts[c.contactPatches[c.correlationListHeads[i]].start].normal);
+			//Vec3V normalS = V3LoadA(buffer[c.contactPatches[c.correlationListHeads[i]].start].normal);
+
+
+			const Vec3V linVrel = V3Sub(vel0.linear, vel1.linear);
+			//const Vec3V normal = Vec3V_From_PxVec3_Aligned(buffer.contacts[c.contactPatches[c.correlationListHeads[i]].start].normal);
+
+			const FloatV orthoThreshold = FLoad(0.70710678f);
+			const FloatV p1 = FLoad(0.0001f);
+			// fallback: normal.cross((1,0,0)) or normal.cross((0,0,1))
+			const FloatV normalX = V3GetX(normal);
+			const FloatV normalY = V3GetY(normal);
+			const FloatV normalZ = V3GetZ(normal);
+
+			Vec3V t0Fallback1 = V3Merge(zero, FNeg(normalZ), normalY);
+			Vec3V t0Fallback2 = V3Merge(FNeg(normalY), normalX, zero);
+			Vec3V t0Fallback = V3Sel(FIsGrtr(orthoThreshold, FAbs(normalX)), t0Fallback1, t0Fallback2);
+
+			Vec3V t0 = V3Sub(linVrel, V3Scale(normal, V3Dot(normal, linVrel)));
+			t0 = V3Sel(FIsGrtr(V3LengthSq(t0), p1), t0, t0Fallback);
+			t0 = V3Normalize(t0);
+
+			const VecCrossV t0Cross = V3PrepareCross(t0);
+
+			const Vec3V t1 = V3Cross(normal, t0Cross);
+			const VecCrossV t1Cross = V3PrepareCross(t1);
+			
+			//We want to set the writeBack ptr to point to the broken flag of the friction patch.
+			//On spu we have a slight problem here because the friction patch array is 
+			//in local store rather than in main memory. The good news is that the address of the friction 
+			//patch array in main memory is stored in the work unit. These two addresses will be equal 
+			//except on spu where one is local store memory and the other is the effective address in main memory.
+			//Using the value stored in the work unit guarantees that the main memory address is used on all platforms.
+			PxU8* PX_RESTRICT writeback = frictionDataPtr + frictionPatchWritebackAddrIndex*sizeof(FrictionPatch);
+
+			header->frictionBrokenWritebackByte = writeback;			
+
+			for(PxU32 j = 0; j < frictionPatch.anchorCount; j++)
+			{
+				SolverContactFrictionExt* PX_RESTRICT f0 = reinterpret_cast<SolverContactFrictionExt*>(ptr);
+				ptr += frictionStride;
+				SolverContactFrictionExt* PX_RESTRICT f1 = reinterpret_cast<SolverContactFrictionExt*>(ptr);
+				ptr += frictionStride;
+
+				Vec3V ra = V3LoadU(bodyFrame0.q.rotate(frictionPatch.body0Anchors[j]));
+				Vec3V rb = V3LoadU(bodyFrame1.q.rotate(frictionPatch.body1Anchors[j]));
+				Vec3V error = V3Sub(V3Add(ra, frame0p), V3Add(rb, frame1p));
+
+				{
+					const Vec3V raXn = V3Cross(ra, t0Cross);
+					const Vec3V rbXn = V3Cross(rb, t0Cross);
+
+					Cm::SpatialVectorV deltaV0, deltaV1;
+
+					const Cm::SpatialVectorV resp0 = createImpulseResponseVector(t0, raXn, b0);
+					const Cm::SpatialVectorV resp1 = createImpulseResponseVector(V3Neg(t0), V3Neg(rbXn), b1);
+					FloatV resp = getImpulseResponse(b0, resp0, deltaV0, d0, angD0,
+															 b1, resp1, deltaV1, d1, angD1, reinterpret_cast<Cm::SpatialVectorV*>(Z));
+
+					const FloatV velMultiplier = FSel(FIsGrtr(resp, FLoad(DY_ARTICULATION_MIN_RESPONSE)), FDiv(p8, resp), zero);
+
+					const PxU32 index = c.contactPatches[c.correlationListHeads[i]].start;
+					FloatV targetVel = V3Dot(V3LoadA(buffer[index].targetVel), t0);
+
+					if(b0.mLinkIndex == PxSolverConstraintDesc::NO_LINK)
+						targetVel = FSub(targetVel, b0.projectVelocity(t0, raXn));
+					else if(b1.mLinkIndex == PxSolverConstraintDesc::NO_LINK)
+						targetVel = FAdd(targetVel, b1.projectVelocity(t0, rbXn));
+
+					f0->normalXYZ_appliedForceW = V4SetW(t0, zero);
+					f0->raXnXYZ_velMultiplierW = V4SetW(Vec4V_From_Vec3V(resp0.angular), velMultiplier);
+					f0->rbXnXYZ_biasW = V4SetW(V4Neg(Vec4V_From_Vec3V(resp1.angular)), FMul(V3Dot(t0, error), invDt));
+					f0->linDeltaVA = deltaV0.linear;
+					f0->angDeltaVA = deltaV0.angular;
+					f0->linDeltaVB = deltaV1.linear;
+					f0->angDeltaVB = deltaV1.angular;
+					FStore(targetVel, &f0->targetVel);
+				}
+
+				{
+					const Vec3V raXn = V3Cross(ra, t1Cross);
+					const Vec3V rbXn = V3Cross(rb, t1Cross);
+
+					Cm::SpatialVectorV deltaV0, deltaV1;
+
+					const Cm::SpatialVectorV resp0 = createImpulseResponseVector(t1, raXn, b0);
+					const Cm::SpatialVectorV resp1 = createImpulseResponseVector(V3Neg(t1), V3Neg(rbXn), b1);
+
+					FloatV resp = getImpulseResponse(b0, resp0, deltaV0, d0, angD0,
+														   b1, resp1, deltaV1, d1, angD1, reinterpret_cast<Cm::SpatialVectorV*>(Z));
+
+					//const FloatV velMultiplier = FSel(FIsGrtr(resp, FLoad(DY_ARTICULATION_MIN_RESPONSE)), FMul(p8, FRecip(resp)), zero);
+
+					const FloatV velMultiplier = FSel(FIsGrtr(resp, FLoad(DY_ARTICULATION_MIN_RESPONSE)), FDiv(p8, resp), zero);
+
+					const PxU32 index = c.contactPatches[c.correlationListHeads[i]].start;
+					FloatV targetVel = V3Dot(V3LoadA(buffer[index].targetVel), t1);
+
+					if(b0.mLinkIndex == PxSolverConstraintDesc::NO_LINK)
+						targetVel = FSub(targetVel, b0.projectVelocity(t1, raXn));
+					else if(b1.mLinkIndex == PxSolverConstraintDesc::NO_LINK)
+						targetVel = FAdd(targetVel, b1.projectVelocity(t1, rbXn));
+
+					f1->normalXYZ_appliedForceW = V4SetW(t1, zero);
+					f1->raXnXYZ_velMultiplierW = V4SetW(Vec4V_From_Vec3V(resp0.angular), velMultiplier);
+					f1->rbXnXYZ_biasW = V4SetW(V4Neg(Vec4V_From_Vec3V(resp1.angular)), FMul(V3Dot(t1, error), invDt));
+					f1->linDeltaVA = deltaV0.linear;
+					f1->angDeltaVA = deltaV0.angular;
+					f1->linDeltaVB = deltaV1.linear;
+					f1->angDeltaVB = deltaV1.angular;
+					FStore(targetVel, &f1->targetVel);
+				}
+			}
+		}
+
+		frictionPatchWritebackAddrIndex++;
+	}
+}
+
+}
+
+
+}
--- a/physx/source/lowleveldynamics/src/DyArticulationContactPrep.h
+++ b/physx/source/lowleveldynamics/src/DyArticulationContactPrep.h
@ -0,0 +1,103 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+   
+
+#ifndef DY_SOLVERCONSTRAINTEXT_H
+#define DY_SOLVERCONSTRAINTEXT_H
+
+#include "DySolverExt.h"
+#include "PsVecMath.h"
+
+namespace physx
+{
+
+struct PxcNpWorkUnit;
+
+
+namespace Gu
+{
+	class ContactBuffer;
+	struct ContactPoint;
+}
+
+namespace Dy
+{
+
+	struct CorrelationBuffer;
+
+	PxReal getImpulseResponse(const SolverExtBody& b0, const Cm::SpatialVector& impulse0, Cm::SpatialVector& deltaV0, PxReal dom0, PxReal angDom0,
+		const SolverExtBody& b1, const Cm::SpatialVector& impulse1, Cm::SpatialVector& deltaV1, PxReal dom1, PxReal angDom1,
+		Cm::SpatialVectorF* Z, bool allowSelfCollision = false);
+
+	Ps::aos::FloatV getImpulseResponse(const SolverExtBody& b0, const Cm::SpatialVectorV& impulse0, Cm::SpatialVectorV& deltaV0, const Ps::aos::FloatV& dom0, const Ps::aos::FloatV& angDom0,
+		const SolverExtBody& b1, const Cm::SpatialVectorV& impulse1, Cm::SpatialVectorV& deltaV1, const Ps::aos::FloatV& dom1, const Ps::aos::FloatV& angDom1,
+		Cm::SpatialVectorV* Z, bool allowSelfCollision = false);
+
+	Cm::SpatialVector createImpulseResponseVector(const PxVec3& linear, const PxVec3& angular, const SolverExtBody& body);
+	Cm::SpatialVectorV createImpulseResponseVector(const Ps::aos::Vec3V& linear, const Ps::aos::Vec3V& angular, const SolverExtBody& body);
+
+	void setupFinalizeExtSolverContacts(
+		const Gu::ContactPoint* buffer,
+		const CorrelationBuffer& c,
+		const PxTransform& bodyFrame0,
+		const PxTransform& bodyFrame1,
+		PxU8* workspace,
+		const SolverExtBody& b0,
+		const SolverExtBody& b1,
+		const PxReal invDtF32,
+		PxReal bounceThresholdF32,
+		PxReal invMassScale0, PxReal invInertiaScale0,
+		PxReal invMassScale1, PxReal invInertiaScale1,
+		PxReal restDistance, PxU8* frictionDataPtr,
+		PxReal ccdMaxContactDist,
+		Cm::SpatialVectorF* Z);
+
+
+	bool setupFinalizeExtSolverContactsCoulomb(
+		const Gu::ContactBuffer& buffer,
+		const CorrelationBuffer& c,
+		const PxTransform& bodyFrame0,
+		const PxTransform& bodyFrame1,
+		PxU8* workspace,
+		PxReal invDt,
+		PxReal bounceThreshold,
+		const SolverExtBody& b0,
+		const SolverExtBody& b1,
+		PxU32 frictionCountPerPoint,
+		PxReal invMassScale0, PxReal invInertiaScale0,
+		PxReal invMassScale1, PxReal invInertiaScale1,
+		PxReal restDist,
+		PxReal ccdMaxContactDist,
+		Cm::SpatialVectorF* Z);
+
+} //namespace Dy
+
+}
+
+#endif //DY_SOLVERCONSTRAINTEXT_H
--- a/physx/source/lowleveldynamics/src/DyArticulationContactPrepPF.cpp
+++ b/physx/source/lowleveldynamics/src/DyArticulationContactPrepPF.cpp
@ -0,0 +1,310 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#include "foundation/PxPreprocessor.h"
+#include "PsVecMath.h"
+#include "DyArticulationContactPrep.h"
+#include "DySolverConstraintDesc.h"
+#include "DySolverConstraint1D.h"
+#include "DySolverContact.h"
+#include "DySolverContactPF.h"
+#include "DyArticulationHelper.h"
+#include "PxcNpWorkUnit.h"
+#include "PxsMaterialManager.h"
+#include "PxsMaterialCombiner.h"
+#include "DyCorrelationBuffer.h"
+#include "DySolverConstraintExtShared.h"
+
+using namespace physx;
+using namespace Gu;
+
+// constraint-gen only, since these use getVelocityFast methods
+// which aren't valid during the solver phase
+
+namespace physx
+{
+
+namespace Dy
+{
+
+
+bool setupFinalizeExtSolverContactsCoulomb(
+						    const ContactBuffer& buffer,
+							const CorrelationBuffer& c,
+							const PxTransform& bodyFrame0,
+							const PxTransform& bodyFrame1,
+							PxU8* workspace,
+							PxReal invDt,
+							PxReal bounceThresholdF32,
+							const SolverExtBody& b0,
+							const SolverExtBody& b1,
+							PxU32 frictionCountPerPoint,
+							PxReal invMassScale0, PxReal invInertiaScale0, 
+							PxReal invMassScale1, PxReal invInertiaScale1,
+							PxReal restDist,
+							PxReal ccdMaxDistance,
+							Cm::SpatialVectorF* Z)
+{
+	// NOTE II: the friction patches are sparse (some of them have no contact patches, and
+	// therefore did not get written back to the cache) but the patch addresses are dense,
+	// corresponding to valid patches
+
+	const FloatV ccdMaxSeparation = FLoad(ccdMaxDistance);
+
+	PxU8* PX_RESTRICT ptr = workspace;
+
+	const PxF32 maxPenBias0 = b0.mLinkIndex == PxSolverConstraintDesc::NO_LINK ? b0.mBodyData->penBiasClamp : b0.mArticulation->getLinkMaxPenBias(b0.mLinkIndex);
+	const PxF32 maxPenBias1 = b1.mLinkIndex == PxSolverConstraintDesc::NO_LINK ? b1.mBodyData->penBiasClamp : b1.mArticulation->getLinkMaxPenBias(b1.mLinkIndex);
+
+	const FloatV maxPenBias = FLoad(PxMax(maxPenBias0, maxPenBias1)/invDt);
+
+	const FloatV restDistance = FLoad(restDist); 
+	const FloatV bounceThreshold = FLoad(bounceThresholdF32);
+
+	const Cm::SpatialVectorV vel0 = b0.getVelocity();
+	const Cm::SpatialVectorV vel1 = b1.getVelocity();
+
+	const FloatV invDtV = FLoad(invDt);
+	const FloatV pt8 = FLoad(0.8f);
+
+	const FloatV invDtp8 = FMul(invDtV, pt8);
+
+	const Vec3V bodyFrame0p = V3LoadU(bodyFrame0.p);
+	const Vec3V bodyFrame1p = V3LoadU(bodyFrame1.p);
+
+	Ps::prefetchLine(c.contactID);
+	Ps::prefetchLine(c.contactID, 128);
+
+	const PxU32 frictionPatchCount = c.frictionPatchCount;
+
+	const PxU32 pointStride = sizeof(SolverContactPointExt);
+	const PxU32 frictionStride = sizeof(SolverContactFrictionExt);
+	const PxU8 pointHeaderType = DY_SC_TYPE_EXT_CONTACT;
+	const PxU8 frictionHeaderType = DY_SC_TYPE_EXT_FRICTION;
+
+	const FloatV d0 = FLoad(invMassScale0);
+	const FloatV d1 = FLoad(invMassScale1);
+	const FloatV angD0 = FLoad(invInertiaScale0);
+	const FloatV angD1 = FLoad(invInertiaScale1);
+
+	PxU8 flags = 0;
+
+	for(PxU32 i=0;i< frictionPatchCount;i++)
+	{
+		const PxU32 contactCount = c.frictionPatchContactCounts[i];
+		if(contactCount == 0)
+			continue;
+
+		const Gu::ContactPoint* contactBase0 = buffer.contacts + c.contactPatches[c.correlationListHeads[i]].start;
+
+		const Vec3V normalV = Ps::aos::V3LoadA(contactBase0->normal);
+		const Vec3V normal = V3LoadA(contactBase0->normal);
+
+		const PxReal combinedRestitution = contactBase0->restitution;
+	
+		
+		SolverContactCoulombHeader* PX_RESTRICT header = reinterpret_cast<SolverContactCoulombHeader*>(ptr);
+		ptr += sizeof(SolverContactCoulombHeader);
+
+		Ps::prefetchLine(ptr, 128);
+		Ps::prefetchLine(ptr, 256);
+		Ps::prefetchLine(ptr, 384);
+
+		const FloatV restitution = FLoad(combinedRestitution);
+
+
+		header->numNormalConstr		= PxU8(contactCount);
+		header->type				= pointHeaderType;
+		//header->setRestitution(combinedRestitution);
+
+		header->setDominance0(d0);
+		header->setDominance1(d1);
+		header->angDom0 = invInertiaScale0;
+		header->angDom1 = invInertiaScale1;
+		header->flags = flags;
+		
+		header->setNormal(normalV);
+		
+		for(PxU32 patch=c.correlationListHeads[i]; 
+			patch!=CorrelationBuffer::LIST_END; 
+			patch = c.contactPatches[patch].next)
+		{
+			const PxU32 count = c.contactPatches[patch].count;
+			const Gu::ContactPoint* contactBase = buffer.contacts + c.contactPatches[patch].start;
+				
+			PxU8* p = ptr;
+			for(PxU32 j=0;j<count;j++)
+			{
+				const Gu::ContactPoint& contact = contactBase[j];
+
+				SolverContactPointExt* PX_RESTRICT solverContact = reinterpret_cast<SolverContactPointExt*>(p);
+				p += pointStride;
+
+				setupExtSolverContact(b0, b1, d0, d1, angD0, angD1, bodyFrame0p, bodyFrame1p, normal, invDtV, invDtp8, restDistance, maxPenBias, restitution,
+					bounceThreshold, contact, *solverContact, ccdMaxSeparation, Z, vel0, vel1);
+			}			
+			ptr = p;
+		}
+	}
+
+	//construct all the frictions
+
+	PxU8* PX_RESTRICT ptr2 = workspace;
+
+	const PxF32 orthoThreshold = 0.70710678f;
+	const PxF32 eps = 0.00001f;
+	bool hasFriction = false;
+
+	for(PxU32 i=0;i< frictionPatchCount;i++)
+	{
+		const PxU32 contactCount = c.frictionPatchContactCounts[i];
+		if(contactCount == 0)
+			continue;
+
+		SolverContactCoulombHeader* header = reinterpret_cast<SolverContactCoulombHeader*>(ptr2); 
+		header->frictionOffset = PxU16(ptr - ptr2);
+		ptr2 += sizeof(SolverContactCoulombHeader) + header->numNormalConstr * pointStride;
+
+		const Gu::ContactPoint* contactBase0 = buffer.contacts + c.contactPatches[c.correlationListHeads[i]].start;
+
+		PxVec3 normal = contactBase0->normal;
+
+		const PxReal staticFriction = contactBase0->staticFriction;
+		const bool disableStrongFriction = !!(contactBase0->materialFlags & PxMaterialFlag::eDISABLE_FRICTION);
+		const bool haveFriction = (disableStrongFriction == 0);
+	
+		SolverFrictionHeader* frictionHeader = reinterpret_cast<SolverFrictionHeader*>(ptr);
+		frictionHeader->numNormalConstr = Ps::to8(c.frictionPatchContactCounts[i]);
+		frictionHeader->numFrictionConstr = Ps::to8(haveFriction ? c.frictionPatchContactCounts[i] * frictionCountPerPoint : 0);
+		frictionHeader->flags = flags;
+		ptr += sizeof(SolverFrictionHeader);
+		PxF32* forceBuffer = reinterpret_cast<PxF32*>(ptr);
+		ptr += frictionHeader->getAppliedForcePaddingSize(c.frictionPatchContactCounts[i]);
+		PxMemZero(forceBuffer, sizeof(PxF32) * c.frictionPatchContactCounts[i]);
+		Ps::prefetchLine(ptr, 128);
+		Ps::prefetchLine(ptr, 256);
+		Ps::prefetchLine(ptr, 384);
+
+
+		const PxVec3 t0Fallback1(0.f, -normal.z, normal.y);
+		const PxVec3 t0Fallback2(-normal.y, normal.x, 0.f) ;
+		const PxVec3 tFallback1 = orthoThreshold > PxAbs(normal.x) ? t0Fallback1 : t0Fallback2;
+		const PxVec3 vrel = b0.getLinVel() - b1.getLinVel();
+		const PxVec3 t0_ = vrel - normal * (normal.dot(vrel));
+		const PxReal sqDist = t0_.dot(t0_);
+		const PxVec3 tDir0 = (sqDist > eps ? t0_: tFallback1).getNormalized();
+		const PxVec3 tDir1 = tDir0.cross(normal);
+		PxVec3 tFallback[2] = {tDir0, tDir1};
+
+		PxU32 ind = 0;
+
+		if(haveFriction)
+		{
+			hasFriction = true;
+			frictionHeader->setStaticFriction(staticFriction);
+			frictionHeader->invMass0D0 = invMassScale0;
+			frictionHeader->invMass1D1 = invMassScale1;
+			frictionHeader->angDom0 = invInertiaScale0;
+			frictionHeader->angDom1 = invInertiaScale1;
+			frictionHeader->type			= frictionHeaderType;
+			
+			PxU32 totalPatchContactCount = 0;
+		
+			for(PxU32 patch=c.correlationListHeads[i]; 
+				patch!=CorrelationBuffer::LIST_END; 
+				patch = c.contactPatches[patch].next)
+			{
+				const PxU32 count = c.contactPatches[patch].count;
+				const PxU32 start = c.contactPatches[patch].start;
+				const Gu::ContactPoint* contactBase = buffer.contacts + start;
+					
+				PxU8* p = ptr;
+
+				for(PxU32 j =0; j < count; j++)
+				{
+					const Gu::ContactPoint& contact = contactBase[j];
+					const Vec3V ra = V3Sub(V3LoadA(contact.point), bodyFrame0p);
+					const Vec3V rb = V3Sub(V3LoadA(contact.point), bodyFrame1p);
+						
+					const Vec3V targetVel = V3LoadA(contact.targetVel);
+					const Vec3V pVRa = V3Add(vel0.linear, V3Cross(vel0.angular, ra));
+					const Vec3V pVRb = V3Add(vel1.linear, V3Cross(vel1.angular, rb));
+					//const PxVec3 vrel = pVRa - pVRb;
+
+					for(PxU32 k = 0; k < frictionCountPerPoint; ++k)
+					{
+						SolverContactFrictionExt* PX_RESTRICT f0 = reinterpret_cast<SolverContactFrictionExt*>(p);
+						p += frictionStride;
+
+						Vec3V t0 = V3LoadU(tFallback[ind]);
+						ind = 1 - ind;
+						const Vec3V raXn = V3Cross(ra, t0); 
+						const Vec3V rbXn = V3Cross(rb, t0);
+						Cm::SpatialVectorV deltaV0, deltaV1;
+
+						const Cm::SpatialVectorV resp0 = createImpulseResponseVector(t0, raXn, b0);
+						const Cm::SpatialVectorV resp1 = createImpulseResponseVector(V3Neg(t0), V3Neg(rbXn), b1);
+
+						FloatV unitResponse = getImpulseResponse(b0, resp0, deltaV0, d0, angD0,
+																 b1, resp1, deltaV1, d1, angD1, reinterpret_cast<Cm::SpatialVectorV*>(Z));
+
+						FloatV tv = V3Dot(targetVel, t0);
+						if(b0.mLinkIndex == PxSolverConstraintDesc::NO_LINK)
+							tv = FAdd(tv, V3Dot(pVRa, t0));
+						else if(b1.mLinkIndex == PxSolverConstraintDesc::NO_LINK)
+							tv = FSub(tv, V3Dot(pVRb, t0));
+
+						const FloatV recipResponse = FSel(FIsGrtr(unitResponse, FZero()), FRecip(unitResponse), FZero());
+
+						f0->raXnXYZ_velMultiplierW = V4SetW(Vec4V_From_Vec3V(resp0.angular), recipResponse);
+						f0->rbXnXYZ_biasW = V4SetW(V4Neg(Vec4V_From_Vec3V(resp1.angular)), FZero());
+						f0->normalXYZ_appliedForceW = V4SetW(Vec4V_From_Vec3V(t0), FZero());
+						FStore(tv, &f0->targetVel);
+						f0->linDeltaVA = deltaV0.linear;
+						f0->angDeltaVA = deltaV0.angular;
+						f0->linDeltaVB = deltaV1.linear;
+						f0->angDeltaVB = deltaV1.angular;
+					}					
+				}
+
+				totalPatchContactCount += c.contactPatches[patch].count;
+				
+				ptr = p;	
+			}
+		}
+	}
+	//PX_ASSERT(ptr - workspace == n.solverConstraintSize);
+	return hasFriction;
+}
+
+
+}
+
+}
--- a/physx/source/lowleveldynamics/src/DyArticulationFnsDebug.h
+++ b/physx/source/lowleveldynamics/src/DyArticulationFnsDebug.h
@ -0,0 +1,262 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+
+#ifndef DY_ARTICULATION_DEBUG_FNS_H
+#define DY_ARTICULATION_DEBUG_FNS_H
+
+#include "DyArticulationFnsScalar.h"
+#include "DyArticulationFnsSimd.h"
+
+namespace physx
+{
+namespace Dy
+{
+#if 0
+	void printMomentum(const char* id, PxTransform* pose, Cm::SpatialVector* velocity, FsInertia* inertia, PxU32 linkCount)
+	{
+		typedef ArticulationFnsScalar Fns;
+
+		Cm::SpatialVector m = Cm::SpatialVector::zero();
+		for(PxU32 i=0;i<linkCount;i++)
+			m += Fns::translateForce(pose[i].p - pose[0].p, Fns::multiply(inertia[i], velocity[i]));
+		printf("momentum (%20s): (%f, %f, %f), (%f, %f, %f)\n", id, m.linear.x, m.linear.y, m.linear.z, m.angular.x, m.angular.y, m.angular.z);
+	}
+#endif
+
+class ArticulationFnsDebug
+{
+	typedef ArticulationFnsSimdBase SimdBase;
+	typedef ArticulationFnsSimd<ArticulationFnsDebug> Simd;
+	typedef ArticulationFnsScalar Scalar;
+
+public:
+
+	static PX_FORCE_INLINE	FsInertia addInertia(const FsInertia& in1, const FsInertia& in2)
+	{
+		return FsInertia(M33Add(in1.ll, in2.ll),
+							M33Add(in1.la, in2.la),
+							M33Add(in1.aa, in2.aa));
+	}
+
+	static PX_FORCE_INLINE	FsInertia subtractInertia(const FsInertia& in1, const FsInertia& in2)
+	{
+		return FsInertia(M33Sub(in1.ll, in2.ll),
+							M33Sub(in1.la, in2.la),
+							M33Sub(in1.aa, in2.aa));
+	}
+
+	static Mat33V invertSym33(const Mat33V &m)
+	{
+		PxMat33 n_ = Scalar::invertSym33(unsimdify(m));
+		Mat33V n = SimdBase::invertSym33(m);
+		compare33(n_, unsimdify(n));
+
+		return n;
+	}
+
+	static Mat33V invSqrt(const Mat33V &m)
+	{
+		PxMat33 n_ = Scalar::invSqrt(unsimdify(m));
+		Mat33V n = SimdBase::invSqrt(m);
+		compare33(n_, unsimdify(n));
+
+		return n;
+	}
+
+
+
+	static FsInertia invertInertia(const FsInertia &I)
+	{
+		SpInertia J_ = Scalar::invertInertia(unsimdify(I));
+		FsInertia J = SimdBase::invertInertia(I);
+		compareInertias(J_,unsimdify(J));
+
+		return J;
+	}
+
+	static Mat33V computeSIS(const FsInertia &I, const Cm::SpatialVectorV S[3], Cm::SpatialVectorV*PX_RESTRICT IS)
+	{
+		Cm::SpatialVector IS_[3];
+		Scalar::multiply(IS_, unsimdify(I), unsimdify(&S[0]));
+		PxMat33 D_ = Scalar::multiplySym(IS_, unsimdify(&S[0]));
+
+		Mat33V D = SimdBase::computeSIS(I, S, IS);
+
+		compare33(unsimdify(D), D_);
+
+		return D;
+	}
+
+
+	static FsInertia multiplySubtract(const FsInertia &I, const Mat33V &D, const Cm::SpatialVectorV IS[3], Cm::SpatialVectorV*PX_RESTRICT DSI)
+	{
+		Cm::SpatialVector DSI_[3];
+
+		Scalar::multiply(DSI_, unsimdify(IS), unsimdify(D));
+		SpInertia J_ = Scalar::multiplySubtract(unsimdify(I), DSI_, unsimdify(IS));
+
+		FsInertia J = SimdBase::multiplySubtract(I, D, IS, DSI);
+
+		compareInertias(unsimdify(J), J_);
+
+		return J;
+	} 
+
+
+	static FsInertia multiplySubtract(const FsInertia &I, const Cm::SpatialVectorV S[3])
+	{
+		SpInertia J_ = Scalar::multiplySubtract(unsimdify(I), unsimdify(S), unsimdify(S));
+		FsInertia J = SimdBase::multiplySubtract(I, S);
+		compareInertias(unsimdify(J), J_);
+		return J;
+	} 
+
+
+	static FsInertia translateInertia(Vec3V offset, const FsInertia &I)
+	{
+		PxVec3 offset_;
+		V3StoreU(offset, offset_);
+		SpInertia J_ = Scalar::translate(offset_, unsimdify(I));
+		FsInertia J = SimdBase::translateInertia(offset, I);
+		compareInertias(J_, unsimdify(J));
+
+		return J;
+	}
+
+
+	static PX_FORCE_INLINE FsInertia propagate(const FsInertia &I,
+												  const Cm::SpatialVectorV S[3],
+												  const Mat33V &load,
+												  const FloatV isf)
+	{
+		SpInertia J_ = Scalar::propagate(unsimdify(I), unsimdify(&S[0]), unsimdify(load), unsimdify(isf));
+		FsInertia J = Simd::propagate(I, S, load, isf);
+
+		compareInertias(J_, unsimdify(J));
+		return J;
+	}
+
+
+	static PX_FORCE_INLINE Mat33V computeDriveInertia(const FsInertia &I0, 
+													  const	FsInertia &I1, 
+													  const Cm::SpatialVectorV S[3])
+	{
+		PxMat33 m_ = Scalar::computeDriveInertia(unsimdify(I0), unsimdify(I1), unsimdify(&S[0]));
+		Mat33V m = Simd::computeDriveInertia(I0, I1, S);
+
+		compare33(m_, unsimdify(m));
+		return m;
+	}
+
+	static const PxMat33 unsimdify(const Mat33V &m)
+	{
+		PX_ALIGN(16, PxMat33) m_;
+		PxMat33_From_Mat33V(m, m_);
+		return m_;
+	}
+
+	static PxReal unsimdify(const FloatV &m)
+	{
+		PxF32 f;
+		FStore(m, &f);
+		return f;
+	}
+
+	static SpInertia unsimdify(const FsInertia &I)
+	{
+		return SpInertia (unsimdify(I.ll),
+						  unsimdify(I.la),
+						  unsimdify(I.aa));
+	}
+
+	static const Cm::SpatialVector* unsimdify(const Cm::SpatialVectorV *S)
+	{
+		return reinterpret_cast<const Cm::SpatialVector*>(S);
+	}
+
+
+private:
+
+	static PxReal absmax(const PxVec3& n)
+	{
+		return PxMax(PxAbs(n.x), PxMax(PxAbs(n.y),PxAbs(n.z)));
+	}
+
+	static PxReal norm(const PxMat33& n)
+	{
+		return PxMax(absmax(n.column0), PxMax(absmax(n.column1), absmax(n.column2)));
+	}
+
+	static void compare33(const PxMat33& ref, const PxMat33& n)
+	{
+		PxReal errNorm = norm(ref-n);	
+		PX_UNUSED(errNorm);
+		PX_ASSERT(errNorm <= PxMax(norm(ref)*1e-3f, 1e-4f));
+	}
+
+	static void compareInertias(const SpInertia& a, const SpInertia& b)
+	{
+		compare33(a.mLL, b.mLL);
+		compare33(a.mLA, b.mLA);
+		compare33(a.mAA, b.mAA);
+	}
+
+
+};
+
+#if DY_ARTICULATION_DEBUG_VERIFY
+static bool isPositiveDefinite(const Mat33V& m)
+{
+	PX_ALIGN_PREFIX(16) PxMat33 m1 PX_ALIGN_SUFFIX(16);
+	PxMat33_From_Mat33V(m, m1);
+	return isPositiveDefinite(m1);
+}
+
+
+static bool isPositiveDefinite(const FsInertia& s)
+{
+	return isPositiveDefinite(ArticulationFnsDebug::unsimdify(s));
+}
+
+static PxReal magnitude(const Cm::SpatialVectorV &v)
+{
+	return PxSqrt(FStore(V3Dot(v.linear, v.linear)) + FStore(V3Dot(v.angular, v.angular)));
+}
+
+static bool almostEqual(const Cm::SpatialVectorV &ref, const Cm::SpatialVectorV& test, PxReal tolerance)
+{
+	return magnitude(ref-test)<=tolerance*magnitude(ref);
+}
+#endif
+}
+}
+
+#endif //DY_ARTICULATION_DEBUG_FNS_H
--- a/physx/source/lowleveldynamics/src/DyArticulationFnsScalar.h
+++ b/physx/source/lowleveldynamics/src/DyArticulationFnsScalar.h
@ -0,0 +1,397 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+
+#ifndef DY_ARTICULATION_SCALAR_FNS_H
+#define DY_ARTICULATION_SCALAR_FNS_H
+
+// Scalar helpers for articulations
+
+#include "DyArticulationUtils.h"
+#include "DyArticulationScalar.h"
+#include "DySpatial.h"
+
+namespace physx
+{
+
+namespace Dy
+{
+
+/*
+namespace
+{
+	static void print(const PxMat33 &m)
+	{
+		printf("(%f, %f, %f)\n(%f, %f, %f)\n(%f, %f, %f)\n\n",
+			m[0][0], m[0][1], m[0][2], m[1][0], m[1][1], m[1][2], m[2][0], m[2][1], m[2][2]);
+	}
+
+	static void print(const Cm::SpatialVector *v, PxU32 count)
+	{
+		for(PxU32 i=0;i<count;i++)
+		{
+			printf("(%f, %f, %f), (%f, %f, %f)\n", 
+				v[i].linear.x, v[i].linear.y, v[i].linear.z,
+				v[i].angular.x, v[i].angular.y, v[i].angular.z);
+		}
+	}
+}
+*/
+
+class ArticulationDiagnostics
+{
+public:
+static bool cholesky(const PxMat33& in, PxMat33& out)
+{
+	out = in;
+
+	if(out[0][0]<=0)
+		return false;
+
+	out[0] /= PxSqrt(out[0][0]);
+	out[1] -= out[0][1]*out[0];
+	out[2] -= out[0][2]*out[0];
+
+	if(out[1][1]<=0)
+		return false;
+
+	out[1] /= PxSqrt(out[1][1]);
+
+	out[2] -= out[1][2]*out[1];
+	if(out[2][2]<=0)
+		return false;
+	out[2] /= PxSqrt(out[2][2]);
+
+	out[1][0] = out[2][0] = out[2][1] = 0;
+	return true;
+}
+
+static bool isSymmetric(const PxMat33&a)
+{
+	return a[0][1] == a[1][0] && a[0][2] == a[2][0] && a[1][2] == a[2][1];
+}
+
+static bool isSymmetric(const Mat33V&a)
+{
+	PxMat33 m;
+	PxMat33_From_Mat33V(a,m);
+	return isSymmetric(m);
+}
+
+static bool isSymmetric(const SpInertia&a)
+{
+	return isSymmetric(a.mLL) && isSymmetric(a.mAA);
+}
+
+
+static bool isPositiveDefinite(const PxMat33& m)
+{
+	PxMat33 _;
+	return cholesky(m, _);
+}
+
+
+static bool isPositiveDefinite(const SpInertia &s)
+{
+	// compute
+	// (a 0)
+	// (b c)
+
+	PxMat33 a;
+	if(!cholesky(s.mLL, a))
+		return false;
+
+	PxMat33 bt = a.getInverse() * s.mLA;
+	PxMat33 x = s.mAA - bt.getTranspose()*bt;
+	PxMat33 c;
+	return cholesky(x, c);
+}
+
+};
+
+class ArticulationFnsScalar
+{
+public:
+
+	static PX_FORCE_INLINE Cm::SpatialVector translateMotion(const PxVec3& p, const Cm::SpatialVector& v)
+	{
+		return Cm::SpatialVector(v.linear + p.cross(v.angular), v.angular);
+	}
+
+	// translate a force resolved at position p to the origin
+
+	static PX_FORCE_INLINE Cm::SpatialVector translateForce(const PxVec3& p, const Cm::SpatialVector& v)
+	{
+		return Cm::SpatialVector(v.linear, v.angular + p.cross(v.linear));
+	}
+
+	static PX_FORCE_INLINE PxMat33 invertSym33(const PxMat33& in)
+	{
+		PxVec3 v0 = in[1].cross(in[2]),
+			   v1 = in[2].cross(in[0]),
+			   v2 = in[0].cross(in[1]);
+
+		PxReal det = v0.dot(in[0]);
+
+
+		PX_ASSERT(det!=0);
+		PxReal recipDet = 1.0f/det;
+
+		return PxMat33(v0 * recipDet,
+					   PxVec3(v0.y, v1.y, v1.z) * recipDet,
+					   PxVec3(v0.z, v1.z, v2.z) * recipDet);
+	}
+
+	static PX_FORCE_INLINE SpInertia multiplySubtract(const SpInertia& I, const Cm::SpatialVector in0[3], const Cm::SpatialVector in1[3])
+	{
+		return I - SpInertia::dyad(in0[0], in1[0])
+				 - SpInertia::dyad(in0[1], in1[1])
+				 - SpInertia::dyad(in0[2], in1[2]);
+	}
+
+	static PX_FORCE_INLINE PxMat33 multiplySym(const Cm::SpatialVector* IS, const Cm::SpatialVector* S)
+	{
+	//		return PxMat33(axisDot(IS, S[0]), axisDot(IS, S[1]), axisDot(IS, S[2]));
+
+		PxReal a00 = IS[0].dot(S[0]), a01 = IS[0].dot(S[1]), a02 = IS[0].dot(S[2]),
+									  a11 = IS[1].dot(S[1]), a12 = IS[1].dot(S[2]),
+															 a22 = IS[2].dot(S[2]);
+
+		return PxMat33(PxVec3(a00, a01, a02),
+					   PxVec3(a01, a11, a12),
+					   PxVec3(a02, a12, a22));
+	}
+
+	static PX_FORCE_INLINE void multiply(Cm::SpatialVector out[3], const SpInertia& I, const Cm::SpatialVector in[3])
+	{
+		out[0] = I * in[0];
+		out[1] = I * in[1];
+		out[2] = I * in[2];
+	}
+
+	static PX_FORCE_INLINE void multiply(Cm::SpatialVector out[3], const Cm::SpatialVector in[3], const PxMat33& D)
+	{
+		out[0] = axisMultiply(in, D[0]);
+		out[1] = axisMultiply(in, D[1]);
+		out[2] = axisMultiply(in, D[2]);
+	}
+
+	static PxMat33 invSqrt(const PxMat33 &m)
+	{
+		// cholesky factor to 
+		// (a 0 0)
+		// (b c 0)
+		// (d e f)
+		// except that a,c,f are the reciprocal sqrts rather than sqrts
+
+		PxVec3 v0 = m.column0, v1 = m.column1, v2 = m.column2;
+
+		PxReal a = PxRecipSqrt(v0.x);
+		PxReal b = v0.y*a;
+		PxReal c = PxRecipSqrt(v1.y - b*b);
+		PxReal d = v0.z*a;
+		PxReal e = (v1.z-d*b) * c;
+		PxReal f = PxRecipSqrt(v2.z - d*d - e*e);
+
+		// invert 
+		PxReal x = -b*a*c, y = (-e*x-d*a)*f, z = -e*c*f;
+
+		PxMat33 r(PxVec3(a, 0,  0 ),
+				  PxVec3(x,  c, 0 ),
+				  PxVec3(y,  z,  f));
+
+		return r;
+	}
+
+
+	static PX_FORCE_INLINE PxMat33 computeSIS(const Cm::SpatialVector S[3], const SpInertia& I)
+	{
+		Cm::SpatialVector IS[3];
+		multiply(IS, I, S);
+		return multiplySym(IS, S);
+	}
+
+	// translate from COM-centered world-aligned inertia matrix to a displaced frame
+	static PX_INLINE SpInertia translate(const PxVec3& p, const SpInertia& i)
+	{
+		PxMat33 S = Ps::star(p), ST = S.getTranspose();
+		PxMat33 sla = S * i.mLA, llst = i.mLL * ST;
+//		return SpInertia(i.mLL, i.mLA + llst, i.mAA + sla + sla.getTranspose() + S * llst);
+
+		// this yields a symmetric result
+		PxMat33 t = sla+S*llst*0.5f;
+		return SpInertia(i.mLL, i.mLA + llst, i.mAA + (t+t.getTranspose()));	}
+
+	static PX_FORCE_INLINE Cm::SpatialVector axisMultiply(const Cm::SpatialVector* a, const PxVec3& v)
+	{
+		return a[0]*v[0]+a[1]*v[1]+a[2]*v[2];
+	}
+
+	static PX_FORCE_INLINE PxVec3 axisDot(const Cm::SpatialVector* a, const Cm::SpatialVector& v)
+	{
+		return PxVec3(a[0].dot(v), a[1].dot(v), a[2].dot(v));
+	}
+
+	static PX_FORCE_INLINE SpInertia invertInertia(const SpInertia& I)
+	{
+		PxMat33 aa = I.mAA, ll = I.mLL, la = I.mLA;
+
+		aa = (aa + aa.getTranspose())*0.5f;
+		ll = (ll + ll.getTranspose())*0.5f;
+
+		PxMat33 AAInv = invertSym33(aa);
+
+		PxMat33 z = -la * AAInv;
+		PxMat33 S = ll + z * la.getTranspose();	// Schur complement of mAA
+		
+		PxMat33 LL = invertSym33(S);
+
+		PxMat33 LA = LL * z;
+		PxMat33 AA = AAInv + z.getTranspose() * LA;
+
+		SpInertia result(LL, LA, AA);
+
+		return result;
+	}
+
+	static SpInertia propagate(const SpInertia& I,
+							   const Cm::SpatialVector S[3],
+							   const PxMat33& load,
+							   PxReal isf)
+	{
+		Cm::SpatialVector IS[3], ISD[3];
+		multiply(IS, I, S);
+		
+		PxMat33 SIS = multiplySym(S, IS);
+
+		// yields a symmetric result
+		PxMat33 D = invSqrt(SIS+load*isf);
+		multiply(ISD, IS, D);
+		return multiplySubtract(I, ISD, ISD);
+	}
+
+	static PxMat33 computeDriveInertia(const SpInertia& I0, 
+									   const SpInertia& I1, 
+									   const Cm::SpatialVector S[3])
+	{
+		// this could be a lot more efficient, especially since it can be combined with
+		// the inertia accumulation. Also it turns out to be symmetric in I0 and I1, which
+		// isn't obvious from the formulation, so it's likely there's a more efficient formulation
+
+		PxMat33 D = invertSym33(computeSIS(S,I0));
+		Cm::SpatialVector IS[3], ISD[3];
+
+		multiply(IS,I0,S);
+		multiply(ISD, IS, D);
+
+		SpInertia tot = multiplySubtract(I0+I1,ISD,IS);
+		SpInertia invTot = invertInertia(tot);
+
+		PxMat33 E = computeSIS(ISD,invTot);
+
+		PxMat33 load = invertSym33(E+D);
+
+		PX_ASSERT(load[0].isFinite() && load[1].isFinite() && load[2].isFinite());
+		PX_ASSERT(ArticulationDiagnostics::isSymmetric(load) && ArticulationDiagnostics::isPositiveDefinite(load));
+		return load;
+	}
+
+	static PX_INLINE Cm::SpatialVector propagateImpulse(const FsRow& row, 
+														const FsJointVectors& jv,
+													    PxVec3& SZ,
+													    const Cm::SpatialVector& Z,
+													    const FsRowAux& aux)
+	{
+		PX_UNUSED(aux);
+		SZ = Z.angular + Z.linear.cross(getJointOffset(jv));
+		Cm::SpatialVector result = translateForce(getParentOffset(jv), Z - axisMultiply(getDSI(row), SZ));
+
+#if DY_ARTICULATION_DEBUG_VERIFY
+	PxVec3 SZcheck;
+	Cm::SpatialVector check = ArticulationRef::propagateImpulse(row, jv, SZcheck, Z, aux);
+	PX_ASSERT((result-check).magnitude()<1e-5*PxMax(check.magnitude(), 1.0f));
+	PX_ASSERT((SZ-SZcheck).magnitude()<1e-5*PxMax(SZcheck.magnitude(), 1.0f));
+#endif
+		return result;
+	}
+
+	static PX_INLINE Cm::SpatialVector propagateVelocity(const FsRow& row, 
+														 const FsJointVectors& jv,
+														 const PxVec3& SZ, 
+														 const Cm::SpatialVector& v,
+														 const FsRowAux& aux)
+	{
+		PX_UNUSED(aux);
+
+		Cm::SpatialVector w = translateMotion(-getParentOffset(jv), v);
+		PxVec3 DSZ = multiply(row.D, SZ);
+
+		PxVec3 n = axisDot(getDSI(row), w) + DSZ;
+		Cm::SpatialVector result = w - Cm::SpatialVector(getJointOffset(jv).cross(n),n);
+
+#if DY_ARTICULATION_DEBUG_VERIFY
+	Cm::SpatialVector check = ArticulationRef::propagateVelocity(row, jv, SZ, v, aux);
+	PX_ASSERT((result-check).magnitude()<1e-5*PxMax(check.magnitude(), 1.0f));
+#endif
+		return result;
+	}
+
+
+	static PX_FORCE_INLINE PxVec3 multiply(const Mat33V& m, const PxVec3& v)
+	{
+		return reinterpret_cast<const PxVec3&>(m.col0) * v.x 
+			 + reinterpret_cast<const PxVec3&>(m.col1) * v.y 
+			 + reinterpret_cast<const PxVec3&>(m.col2) * v.z;
+	}
+
+	static PX_FORCE_INLINE PxVec3 multiplyTranspose(const Mat33V& m, const PxVec3& v)
+	{
+		return PxVec3(v.dot(reinterpret_cast<const PxVec3&>(m.col0)), 
+					  v.dot(reinterpret_cast<const PxVec3&>(m.col1)), 
+					  v.dot(reinterpret_cast<const PxVec3&>(m.col2)));
+	}
+
+	static Cm::SpatialVector multiply(const FsInertia& m, const Cm::SpatialVector& v)
+	{
+		return Cm::SpatialVector(multiply(m.ll,v.linear) + multiply(m.la,v.angular),
+								 multiplyTranspose(m.la, v.linear) + multiply(m.aa, v.angular));
+	}
+
+	static PX_FORCE_INLINE Cm::SpatialVector getRootDeltaV(const FsData& matrix, const Cm::SpatialVector& Z)
+	{
+		return multiply(getRootInverseInertia(matrix), Z);
+	}
+};
+
+}
+
+}
+
+#endif //DY_ARTICULATION_SCALAR_FNS_H
--- a/physx/source/lowleveldynamics/src/DyArticulationFnsSimd.h
+++ b/physx/source/lowleveldynamics/src/DyArticulationFnsSimd.h
@ -0,0 +1,442 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+
+#ifndef DY_ARTICULATION_SIMD_FNS_H
+#define DY_ARTICULATION_SIMD_FNS_H
+
+#include "DyArticulationUtils.h"
+
+namespace physx
+{
+namespace Dy
+{
+
+template <typename T, PxU32 count>
+class PodULike
+{
+	PxU8 space[sizeof(T)*count];
+public:
+	PX_FORCE_INLINE operator T*() { return reinterpret_cast<T*>(space); }
+};
+
+#define POD_U_LIKE(_T, _count, _alignment) PX_ALIGN_PREFIX(_alignment) PodULike<_T, _count> PX_ALIGN_SUFFIX(_alignment)
+
+class ArticulationFnsSimdBase
+{
+public:
+
+	static PX_FORCE_INLINE	FsInertia addInertia(const FsInertia& in1, const FsInertia& in2)
+	{
+		return FsInertia(M33Add(in1.ll, in2.ll),
+							M33Add(in1.la, in2.la),
+							M33Add(in1.aa, in2.aa));
+	}
+
+	static PX_FORCE_INLINE	FsInertia subtractInertia(const FsInertia& in1, const FsInertia& in2)
+	{
+		return FsInertia(M33Sub(in1.ll, in2.ll),
+							M33Sub(in1.la, in2.la),
+							M33Sub(in1.aa, in2.aa));
+	}
+
+	static PX_FORCE_INLINE Vec3V axisDot(const Cm::SpatialVectorV S[3], const Cm::SpatialVectorV &v)
+	{
+		return V3Merge(FAdd(V3Dot(S[0].linear,v.linear), V3Dot(S[0].angular,v.angular)),
+					   FAdd(V3Dot(S[1].linear,v.linear), V3Dot(S[1].angular,v.angular)),
+					   FAdd(V3Dot(S[2].linear,v.linear), V3Dot(S[2].angular,v.angular)));
+	}
+
+	static PX_FORCE_INLINE Cm::SpatialVectorV axisMultiply(const Cm::SpatialVectorV S[3], Vec3V v)
+	{
+		return Cm::SpatialVectorV(V3ScaleAdd(S[0].linear, V3GetX(v), V3ScaleAdd(S[1].linear, V3GetY(v), V3Scale(S[2].linear, V3GetZ(v)))),
+							  V3ScaleAdd(S[0].angular, V3GetX(v), V3ScaleAdd(S[1].angular, V3GetY(v), V3Scale(S[2].angular, V3GetZ(v)))));
+	}
+
+
+	static PX_FORCE_INLINE Cm::SpatialVectorV subtract(const Cm::SpatialVectorV &a, const Cm::SpatialVectorV &b)
+	{
+		return Cm::SpatialVectorV(V3Sub(a.linear, b.linear), V3Sub(a.angular, b.angular));
+	}
+
+	static PX_FORCE_INLINE Cm::SpatialVectorV add(const Cm::SpatialVectorV &a, const Cm::SpatialVectorV &b)
+	{
+		return Cm::SpatialVectorV(V3Add(a.linear, b.linear), V3Add(a.angular, b.angular));
+	}
+
+
+	static PX_FORCE_INLINE Cm::SpatialVectorV multiply(const FsInertia &I, const Cm::SpatialVectorV &S)
+	{
+		return Cm::SpatialVectorV(V3Add(M33MulV3(I.ll,S.linear),	   M33MulV3(I.la,S.angular)),
+							  V3Add(M33TrnspsMulV3(I.la,S.linear), M33MulV3(I.aa,S.angular)));
+	}
+
+
+	static PX_FORCE_INLINE Cm::SpatialVectorV translateMotion(const Vec3V& p, const Cm::SpatialVectorV& v)
+	{
+		return Cm::SpatialVectorV(V3Add(v.linear, V3Cross(p, v.angular)), v.angular);
+	}
+
+	// translate a force resolved at position p to the origin
+
+	static PX_FORCE_INLINE Cm::SpatialVectorV translateForce(const Vec3V& p, const Cm::SpatialVectorV& v)
+	{
+		return Cm::SpatialVectorV(v.linear, V3Add(v.angular, V3Cross(p, v.linear)));
+	}
+
+	static PX_FORCE_INLINE Mat33V invertSym33(const Mat33V &m)
+	{
+		Vec3V a0 = V3Cross(m.col1, m.col2);
+		Vec3V a1 = V3Cross(m.col2, m.col0);
+		Vec3V a2 = V3Cross(m.col0, m.col1);
+		FloatV det = V3Dot(a0, m.col0);
+		FloatV recipDet = FRecip(det);
+
+		a1 = V3SetX(a1, V3GetY(a0));
+		a2 = V3Merge(V3GetZ(a0), V3GetZ(a1), V3GetZ(a2));		// make sure it's symmetric
+
+		return Mat33V(V3Scale(a0, recipDet),
+					  V3Scale(a1, recipDet),
+					  V3Scale(a2, recipDet));
+	}
+
+
+	static PX_FORCE_INLINE FloatV safeInvSqrt(FloatV v)
+	{
+		return FSqrt(FMax(FZero(), FRecip(v)));
+	}
+	static PX_FORCE_INLINE Mat33V invSqrt(const Mat33V& m)
+	{
+		// cholesky factor to 
+		// (a 0 0)
+		// (b c 0)
+		// (d e f)
+		// except that a,c,f are the reciprocal sqrts rather than sqrts
+
+		// PxVec3 v0 = m.column0, v1 = m.column1, v2 = m.column2;
+		Vec3V v0 = m.col0, v1 = m.col1, v2 = m.col2;
+
+		const FloatV x0 = V3GetX(v0), y1 = V3GetY(v1), z2 = V3GetZ(v2);
+		
+		FloatV a	= safeInvSqrt(x0);								// PxReal a = PxRecipSqrt(v0.x);
+	
+		Vec3V abd	= V3Scale(v0, a);								// PxReal b = v0.y*a;
+		FloatV b	= V3GetY(abd);
+
+		FloatV c2	= FNegScaleSub(b, b, y1);						// PxReal c = PxRecipSqrt(v1.y - b*b);
+		FloatV c	= safeInvSqrt(c2);
+
+		FloatV d	= V3GetZ(abd);									// PxReal d = v0.z*a;
+		
+		FloatV e	= FMul(FNegScaleSub(b, d, V3GetZ(v1)), c);		// PxReal e = (v1.z-d*b) * c;
+
+		FloatV f2	= FNegScaleSub(d, d, FNegScaleSub(e, e, z2));	// PxReal f = PxRecipSqrt(v2.z - d*d - e*e);
+		FloatV f	= safeInvSqrt(f2);
+
+		// invert 
+		FloatV x = FMul(FMul(b,a),c),								// x = -b*a*c
+			   y = FMul((FNegScaleSub(d,a, FMul(e,x))), f),			// y = (-e*x-d*a)*f 
+			   z = FMul(e, FMul(c,f));								// z = -e*c*f
+
+		return Mat33V(V3Merge(a,		FZero(),		FZero()),
+					  V3Merge(FNeg(x),	c,				FZero()),
+					  V3Merge(y,		FNeg(z),		f));
+	}
+
+
+	static PX_FORCE_INLINE FsInertia invertInertia(const FsInertia &I)
+	{
+		Mat33V aa = M33Scale(M33Add(I.aa, M33Trnsps(I.aa)), FHalf());
+		Mat33V ll = M33Scale(M33Add(I.ll, M33Trnsps(I.ll)), FHalf());
+
+		Mat33V AAInv = invertSym33(aa);
+		Mat33V z = M33MulM33(M33Neg(I.la), AAInv);
+		Mat33V S = M33Add(ll, M33MulM33(z, M33Trnsps(I.la)));
+
+		Mat33V LL = invertSym33(S);
+		Mat33V LA = M33MulM33(LL, z);
+		Mat33V AA = M33Add(AAInv, M33MulM33(M33Trnsps(z), LA));
+
+		return FsInertia(LL, LA, AA);
+	}
+
+	static PX_NOINLINE Mat33V computeSIS(const FsInertia &I, const Cm::SpatialVectorV S[3], Cm::SpatialVectorV IS[3])
+	{
+		Vec3V S0l = S[0].linear, S0a = S[0].angular;
+		Vec3V S1l = S[1].linear, S1a = S[1].angular;
+		Vec3V S2l = S[2].linear, S2a = S[2].angular;
+
+		Vec3V IS0l = V3Add(M33MulV3(I.ll,S0l), M33MulV3(I.la,S0a));
+		Vec3V IS0a = V3Add(M33TrnspsMulV3(I.la,S0l), M33MulV3(I.aa,S0a));
+		Vec3V IS1l = V3Add(M33MulV3(I.ll,S1l), M33MulV3(I.la,S1a));
+		Vec3V IS1a = V3Add(M33TrnspsMulV3(I.la,S1l), M33MulV3(I.aa,S1a));
+		Vec3V IS2l = V3Add(M33MulV3(I.ll,S2l), M33MulV3(I.la,S2a));
+		Vec3V IS2a = V3Add(M33TrnspsMulV3(I.la,S2l), M33MulV3(I.aa,S2a));
+
+		// compute SIS
+		FloatV a00 = FAdd(V3Dot(S0l, IS0l), V3Dot(S0a, IS0a));
+		FloatV a01 = FAdd(V3Dot(S0l, IS1l), V3Dot(S0a, IS1a));
+		FloatV a02 = FAdd(V3Dot(S0l, IS2l), V3Dot(S0a, IS2a));
+		FloatV a11 = FAdd(V3Dot(S1l, IS1l), V3Dot(S1a, IS1a));
+		FloatV a12 = FAdd(V3Dot(S1l, IS2l), V3Dot(S1a, IS2a));
+		FloatV a22 = FAdd(V3Dot(S2l, IS2l), V3Dot(S2a, IS2a));
+
+		// write IS, a useful side-effect
+		IS[0].linear = IS0l; IS[0].angular = IS0a;
+		IS[1].linear = IS1l; IS[1].angular = IS1a;
+		IS[2].linear = IS2l; IS[2].angular = IS2a;
+
+		return Mat33V(V3Merge(a00, a01, a02),
+					  V3Merge(a01, a11, a12),
+					  V3Merge(a02, a12, a22));
+	}
+
+
+	static PX_FORCE_INLINE FsInertia multiplySubtract(const FsInertia &I, const Mat33V &D, const Cm::SpatialVectorV IS[3], Cm::SpatialVectorV DSI[3])
+	{
+		// cut'n'paste, how I love ya, how I love ya
+
+		Vec3V IS0l = IS[0].linear, IS0a = IS[0].angular;
+		Vec3V IS1l = IS[1].linear, IS1a = IS[1].angular;
+		Vec3V IS2l = IS[2].linear, IS2a = IS[2].angular;
+
+		Vec3V D0 = D.col0, D1 = D.col1, D2 = D.col2;
+
+		// compute IDS
+		Vec3V DSI0l = V3ScaleAdd(IS0l, V3GetX(D0), V3ScaleAdd(IS1l, V3GetY(D0), V3Scale(IS2l, V3GetZ(D0))));
+		Vec3V DSI1l = V3ScaleAdd(IS0l, V3GetX(D1), V3ScaleAdd(IS1l, V3GetY(D1), V3Scale(IS2l, V3GetZ(D1))));
+		Vec3V DSI2l = V3ScaleAdd(IS0l, V3GetX(D2), V3ScaleAdd(IS1l, V3GetY(D2), V3Scale(IS2l, V3GetZ(D2))));
+
+		Vec3V DSI0a = V3ScaleAdd(IS0a, V3GetX(D0), V3ScaleAdd(IS1a, V3GetY(D0), V3Scale(IS2a, V3GetZ(D0))));
+		Vec3V DSI1a = V3ScaleAdd(IS0a, V3GetX(D1), V3ScaleAdd(IS1a, V3GetY(D1), V3Scale(IS2a, V3GetZ(D1))));
+		Vec3V DSI2a = V3ScaleAdd(IS0a, V3GetX(D2), V3ScaleAdd(IS1a, V3GetY(D2), V3Scale(IS2a, V3GetZ(D2))));
+
+		// compute J = I - DSI' IS. Each row of DSI' IS generates an inertia dyad
+
+		Vec3V ll0 = I.ll.col0, ll1 = I.ll.col1, ll2 = I.ll.col2;
+		Vec3V la0 = I.la.col0, la1 = I.la.col1, la2 = I.la.col2;
+		Vec3V aa0 = I.aa.col0, aa1 = I.aa.col1, aa2 = I.aa.col2;
+
+#define SUBTRACT_DYAD(_a, _b) \
+	ll0 = V3NegScaleSub(_b##l, V3GetX(_a##l), ll0);	la0 = V3NegScaleSub(_b##l, V3GetX(_a##a), la0);	aa0 = V3NegScaleSub(_b##a, V3GetX(_a##a), aa0); \
+	ll1 = V3NegScaleSub(_b##l, V3GetY(_a##l), ll1);	la1 = V3NegScaleSub(_b##l, V3GetY(_a##a), la1);	aa1 = V3NegScaleSub(_b##a, V3GetY(_a##a), aa1); \
+	ll2 = V3NegScaleSub(_b##l, V3GetZ(_a##l), ll2);	la2 = V3NegScaleSub(_b##l, V3GetZ(_a##a), la2);	aa2 = V3NegScaleSub(_b##a, V3GetZ(_a##a), aa2); 
+
+		SUBTRACT_DYAD(IS0, DSI0);
+		SUBTRACT_DYAD(IS1, DSI1);
+		SUBTRACT_DYAD(IS2, DSI2);
+#undef SUBTRACT_DYAD
+
+		DSI[0].linear = DSI0l;	DSI[0].angular = DSI0a;
+		DSI[1].linear = DSI1l;	DSI[1].angular = DSI1a;
+		DSI[2].linear = DSI2l;	DSI[2].angular = DSI2a;
+
+		return FsInertia(Mat33V(ll0, ll1, ll2), 
+							Mat33V(la0, la1, la2),
+							Mat33V(aa0, aa1, aa2));
+	} 
+
+
+	static PX_FORCE_INLINE FsInertia multiplySubtract(const FsInertia &I, const Cm::SpatialVectorV S[3])
+	{
+		// cut'n'paste, how I love ya, how I love ya
+
+		const Vec3V S0l = S[0].linear, S0a = S[0].angular;
+		const Vec3V S1l = S[1].linear, S1a = S[1].angular;
+		const Vec3V S2l = S[2].linear, S2a = S[2].angular;
+
+		// compute J = I - DSI' IS. Each row of DSI' IS generates an inertia dyad
+
+		Vec3V ll0 = I.ll.col0, ll1 = I.ll.col1, ll2 = I.ll.col2;
+		Vec3V la0 = I.la.col0, la1 = I.la.col1, la2 = I.la.col2;
+		Vec3V aa0 = I.aa.col0, aa1 = I.aa.col1, aa2 = I.aa.col2;
+
+#define SUBTRACT_DYAD(_a, _b) \
+	ll0 = V3NegScaleSub(_b##l, V3GetX(_a##l), ll0);	la0 = V3NegScaleSub(_b##l, V3GetX(_a##a), la0);	aa0 = V3NegScaleSub(_b##a, V3GetX(_a##a), aa0); \
+	ll1 = V3NegScaleSub(_b##l, V3GetY(_a##l), ll1);	la1 = V3NegScaleSub(_b##l, V3GetY(_a##a), la1);	aa1 = V3NegScaleSub(_b##a, V3GetY(_a##a), aa1); \
+	ll2 = V3NegScaleSub(_b##l, V3GetZ(_a##l), ll2);	la2 = V3NegScaleSub(_b##l, V3GetZ(_a##a), la2);	aa2 = V3NegScaleSub(_b##a, V3GetZ(_a##a), aa2); 
+
+	SUBTRACT_DYAD(S0, S0);
+	SUBTRACT_DYAD(S1, S1);
+	SUBTRACT_DYAD(S2, S2);
+#undef SUBTRACT_DYAD
+
+		return FsInertia(Mat33V(ll0, ll1, ll2), 
+							Mat33V(la0, la1, la2),
+							Mat33V(aa0, aa1, aa2));
+	} 
+
+
+	static PX_FORCE_INLINE FsInertia translateInertia(Vec3V a, const FsInertia &input)
+	{
+		Vec3V b = V3Neg(a);
+		
+		Vec3V la0 = input.la.col0, la1 = input.la.col1, la2 = input.la.col2;
+		Vec3V ll0 = input.ll.col0, ll1 = input.ll.col1, ll2 = input.ll.col2;
+		Vec3V aa0 = input.aa.col0, aa1 = input.aa.col1, aa2 = input.aa.col2;
+
+		FloatV aX = V3GetX(a), aY = V3GetY(a), aZ = V3GetZ(a);
+		FloatV bX = V3GetX(b), bY = V3GetY(b), bZ = V3GetZ(b);
+		FloatV Z = FZero();
+
+		// s - star matrix of a
+		Vec3V s0 = V3Merge(Z, aZ, bY),
+			  s1 = V3Merge(bZ, Z, aX),
+			  s2 = V3Merge(aY, bX, Z);
+
+		// s * la
+		Vec3V sla0 = V3ScaleAdd(s0, V3GetX(la0), V3ScaleAdd(s1, V3GetY(la0), V3Scale(s2, V3GetZ(la0))));
+		Vec3V sla1 = V3ScaleAdd(s0, V3GetX(la1), V3ScaleAdd(s1, V3GetY(la1), V3Scale(s2, V3GetZ(la1))));
+		Vec3V sla2 = V3ScaleAdd(s0, V3GetX(la2), V3ScaleAdd(s1, V3GetY(la2), V3Scale(s2, V3GetZ(la2))));
+
+		// ll * s.transpose() (ll is symmetric)
+		Vec3V llst0 = V3ScaleAdd(ll2, aY, V3Scale(ll1, bZ)),
+			  llst1 = V3ScaleAdd(ll0, aZ, V3Scale(ll2, bX)),
+			  llst2 = V3ScaleAdd(ll1, aX, V3Scale(ll0, bY));
+
+		// t = sla+S*llst*0.5f;
+
+		Vec3V sllst0 = V3ScaleAdd(s2, V3GetZ(llst0), V3ScaleAdd(s1, V3GetY(llst0), V3Scale(s0, V3GetX(llst0))));
+		Vec3V sllst1 = V3ScaleAdd(s2, V3GetZ(llst1), V3ScaleAdd(s1, V3GetY(llst1), V3Scale(s0, V3GetX(llst1))));
+		Vec3V sllst2 = V3ScaleAdd(s2, V3GetZ(llst2), V3ScaleAdd(s1, V3GetY(llst2), V3Scale(s0, V3GetX(llst2))));
+
+		Vec3V t0 = V3ScaleAdd(sllst0, FHalf(), sla0);
+		Vec3V t1 = V3ScaleAdd(sllst1, FHalf(), sla1);
+		Vec3V t2 = V3ScaleAdd(sllst2, FHalf(), sla2);
+		
+		// t+t.transpose()
+		Vec3V r0 = V3Add(t0, V3Merge(V3GetX(t0), V3GetX(t1), V3GetX(t2))),
+			  r1 = V3Add(t1, V3Merge(V3GetY(t0), V3GetY(t1), V3GetY(t2))),
+			  r2 = V3Add(t2, V3Merge(V3GetZ(t0), V3GetZ(t1), V3GetZ(t2)));
+
+		return FsInertia(Mat33V(ll0, ll1, ll2),
+
+							Mat33V(V3Add(la0, llst0),
+								   V3Add(la1, llst1),
+								   V3Add(la2, llst2)),
+
+							Mat33V(V3Add(aa0, r0),
+								   V3Add(aa1, r1),
+								   V3Add(aa2, r2)));
+	}
+
+};
+
+template<class Base>
+class ArticulationFnsSimd : public Base
+{
+	static PX_FORCE_INLINE void axisMultiplyLowerTriangular(Cm::SpatialVectorV ES[3], const Mat33V&E, const Cm::SpatialVectorV S[3])
+	{
+		const Vec3V l0 = S[0].linear,  l1 = S[1].linear,  l2 = S[2].linear;
+		const Vec3V a0 = S[0].angular, a1 = S[1].angular, a2 = S[2].angular;
+		ES[0] = Cm::SpatialVectorV(V3Scale(l0, V3GetX(E.col0)),
+							   V3Scale(a0, V3GetX(E.col0)));
+		ES[1] = Cm::SpatialVectorV(V3ScaleAdd(l0, V3GetX(E.col1), V3Scale(l1, V3GetY(E.col1))),
+							   V3ScaleAdd(a0, V3GetX(E.col1), V3Scale(a1, V3GetY(E.col1))));
+		ES[2] = Cm::SpatialVectorV(V3ScaleAdd(l0, V3GetX(E.col2), V3ScaleAdd(l1, V3GetY(E.col2), V3Scale(l2, V3GetZ(E.col2)))),
+							   V3ScaleAdd(a0, V3GetX(E.col2), V3ScaleAdd(a1, V3GetY(E.col2), V3Scale(a2, V3GetZ(E.col2)))));
+	}
+
+public:
+	static PX_FORCE_INLINE FsInertia propagate(const FsInertia &I,
+												  const Cm::SpatialVectorV S[3],
+												  const Mat33V &load,
+												  const FloatV isf)
+	{
+		Cm::SpatialVectorV IS[3], ISE[3];
+		Mat33V D = Base::computeSIS(I, S, IS);
+
+		D.col0 = V3ScaleAdd(load.col0, isf, D.col0);
+		D.col1 = V3ScaleAdd(load.col1, isf, D.col1);
+		D.col2 = V3ScaleAdd(load.col2, isf, D.col2);
+
+		axisMultiplyLowerTriangular(ISE, Base::invSqrt(D), IS);
+		return Base::multiplySubtract(I, ISE);
+	}
+
+
+
+	static PX_INLINE Cm::SpatialVectorV propagateImpulse(const FsRow& row, 
+													 const FsJointVectors& jv,
+												     Vec3V& SZ,
+												     const Cm::SpatialVectorV& Z,
+												     const FsRowAux& aux)
+	{
+		PX_UNUSED(aux);
+
+		SZ = V3Add(Z.angular, V3Cross(Z.linear, jv.jointOffset));
+		return Base::translateForce(jv.parentOffset, Z - Base::axisMultiply(row.DSI, SZ));
+	}
+
+	static PX_INLINE Cm::SpatialVectorV propagateVelocity(const FsRow& row, 
+													  const FsJointVectors& jv,
+													  const Vec3V& SZ, 
+													  const Cm::SpatialVectorV& v,
+													  const FsRowAux& aux)
+	{
+		PX_UNUSED(aux);
+
+		Cm::SpatialVectorV w = Base::translateMotion(V3Neg(jv.parentOffset), v);
+		Vec3V DSZ = M33MulV3(row.D, SZ);
+
+		Vec3V n = V3Add(Base::axisDot(row.DSI, w), DSZ);
+		return w - Cm::SpatialVectorV(V3Cross(jv.jointOffset, n), n);
+	}
+
+
+
+
+
+	static
+#if !(PX_ANDROID && PX_DEBUG) // Not inlining for Android Debug to avoid "conditional branch out of range" compilation error on arm64-v8a ABI
+	PX_FORCE_INLINE
+#endif
+	Mat33V computeDriveInertia(const FsInertia &I0, 
+													  const FsInertia &I1, 
+													  const Cm::SpatialVectorV S[3])
+	{
+		POD_U_LIKE(Cm::SpatialVectorV, 3, 16) IS, ISD, dummy;		
+		Mat33V D = Base::computeSIS(I0, S, IS);
+		Mat33V DInv = Base::invertSym33(D);
+
+		FsInertia tmp = Base::addInertia(I0, I1);
+		tmp = Base::multiplySubtract(tmp, DInv, IS, ISD);
+		FsInertia J = Base::invertInertia(tmp);
+
+		Mat33V E = Base::computeSIS(J, ISD, dummy);
+		return Base::invertSym33(M33Add(DInv,E));
+
+	}
+};
+
+}
+}
+
+#endif //DY_ARTICULATION_SIMD_FNS_H
--- a/physx/source/lowleveldynamics/src/DyArticulationHelper.cpp
+++ b/physx/source/lowleveldynamics/src/DyArticulationHelper.cpp
@ -0,0 +1,500 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+#include "foundation/PxVec3.h"
+#include "foundation/PxMath.h"
+#include "foundation/PxMemory.h"
+#include "common/PxProfileZone.h"
+
+#include "PsUtilities.h"
+#include "CmSpatialVector.h"
+#include "DyArticulationHelper.h"
+#include "DyArticulationReference.h"
+#include "DyArticulationFnsSimd.h"
+#include "DyArticulationFnsScalar.h"
+#include "DyArticulationFnsDebug.h"
+#include "DySolverConstraintDesc.h"
+#include "PxvDynamics.h"
+#include "DyArticulation.h"
+#include "CmConeLimitHelper.h"
+#include "DySolverConstraint1D.h"
+#include "PxcConstraintBlockStream.h"
+#include "DySolverConstraint1D.h"
+#include "DyArticulationPImpl.h"
+#include "PsFoundation.h"
+
+namespace physx
+{
+
+namespace Dy
+{
+
+void getImpulseResponseSlow(const FsData& matrix, 
+							PxU32 linkID0, const Cm::SpatialVectorV& impulse0, Cm::SpatialVectorV& deltaV0,
+							PxU32 linkID1, const Cm::SpatialVectorV& impulse1, Cm::SpatialVectorV& deltaV1)
+{
+	typedef ArticulationFnsSimd<ArticulationFnsSimdBase> Fns;
+
+	const FsRow* rows = getFsRows(matrix);
+	const FsRowAux* aux = getAux(matrix);
+	const FsJointVectors* jointVectors = getJointVectors(matrix);
+
+	PX_ASSERT(matrix.linkCount<=DY_ARTICULATION_MAX_SIZE);
+	PxU32 stack[DY_ARTICULATION_MAX_SIZE];
+	Vec3V SZ[DY_ARTICULATION_MAX_SIZE];
+
+	PxU32 i0, i1, ic;
+	
+	for(i0 = linkID0, i1 = linkID1; i0!=i1;)	// find common path
+	{
+		if(i0<i1)
+			i1 = matrix.parent[i1];
+		else
+			i0 = matrix.parent[i0];
+	}
+
+	PxU32 common = i0;
+
+	Cm::SpatialVectorV Z0 = -impulse0, Z1 = -impulse1;
+	for(i0 = 0; linkID0!=common; linkID0 = matrix.parent[linkID0])
+	{
+		Z0 = Fns::propagateImpulse(rows[linkID0], jointVectors[linkID0], SZ[linkID0], Z0, aux[linkID0]);
+		stack[i0++] = linkID0;
+	}
+
+	for(i1 = i0; linkID1!=common; linkID1 = matrix.parent[linkID1])
+	{
+		Z1 = Fns::propagateImpulse(rows[linkID1], jointVectors[linkID1], SZ[linkID1], Z1, aux[linkID1]);
+		stack[i1++] = linkID1;
+	}
+
+	Cm::SpatialVectorV Z = Z0 + Z1;
+	for(ic = i1; common; common = matrix.parent[common])
+	{
+		Z = Fns::propagateImpulse(rows[common], jointVectors[common], SZ[common], Z, aux[common]);
+		stack[ic++] = common;
+	}
+
+	Cm::SpatialVectorV v = Fns::multiply(getRootInverseInertia(matrix), -Z);
+
+	for(PxU32 index = ic; index-->i1 ;)
+		v = Fns::propagateVelocity(rows[stack[index]], jointVectors[stack[index]], SZ[stack[index]], v, aux[stack[index]]);
+
+	deltaV1 = v;
+	for(PxU32 index = i1; index-->i0 ;)
+		deltaV1 = Fns::propagateVelocity(rows[stack[index]], jointVectors[stack[index]], SZ[stack[index]], deltaV1, aux[stack[index]]);
+
+	deltaV0 = v;
+	for(PxU32 index = i0; index-->0;)
+		deltaV0 = Fns::propagateVelocity(rows[stack[index]], jointVectors[stack[index]], SZ[stack[index]], deltaV0, aux[stack[index]]);
+}
+
+void PxcFsGetImpulseResponse(const FsData& matrix,
+							 PxU32 linkID, const Cm::SpatialVectorV& impulse, Cm::SpatialVectorV& deltaV)
+{
+	typedef ArticulationFnsSimd<ArticulationFnsSimdBase> Fns;
+
+	PX_ASSERT(matrix.linkCount<=DY_ARTICULATION_MAX_SIZE);
+	Vec3V SZ[DY_ARTICULATION_MAX_SIZE];
+
+	const FsRow* rows = getFsRows(matrix);
+	const FsRowAux* aux = getAux(matrix);
+	const FsJointVectors* jointVectors = getJointVectors(matrix);
+
+	Cm::SpatialVectorV Z = -impulse;
+	
+	for(PxU32 i = linkID; i; i = matrix.parent[i])
+		Z = Fns::propagateImpulse(rows[i], jointVectors[i], SZ[i], Z, aux[i]);
+
+	deltaV = Fns::multiply(getRootInverseInertia(matrix), -Z);
+
+	PX_ASSERT(rows[linkID].pathToRoot&1);
+
+	for(ArticulationBitField i=rows[linkID].pathToRoot-1; i; i &= (i-1))
+	{
+		const PxU32 index = ArticulationLowestSetBit(i);
+		deltaV = Fns::propagateVelocity(rows[index], jointVectors[index], SZ[index], deltaV, aux[index]);
+	}
+}
+
+void PxcFsGetImpulseSelfResponse(const FsData& matrix, 
+								 PxU32 linkID0, const Cm::SpatialVectorV& impulse0, Cm::SpatialVectorV& deltaV0,
+								 PxU32 linkID1, const Cm::SpatialVectorV& impulse1, Cm::SpatialVectorV& deltaV1)
+{
+	typedef ArticulationFnsSimd<ArticulationFnsSimdBase> Fns;
+
+	PX_ASSERT(linkID0 != linkID1);
+
+	const FsRow* rows = getFsRows(matrix);
+	const FsRowAux* aux = getAux(matrix);
+	const FsJointVectors* jointVectors = getJointVectors(matrix);
+
+	// standard case: parent-child limit
+	if(matrix.parent[linkID1] == linkID0)
+	{
+		Vec3V SZ;
+		const Cm::SpatialVectorV Z = impulse0 - Fns::propagateImpulse(rows[linkID1], jointVectors[linkID1], SZ, -impulse1, aux[linkID1]);
+		PxcFsGetImpulseResponse(matrix, linkID0, Z, deltaV0);
+		deltaV1 = Fns::propagateVelocity(rows[linkID1], jointVectors[linkID1], SZ, deltaV0, aux[linkID1]);
+	}
+	else
+		getImpulseResponseSlow(matrix, linkID0, impulse0, deltaV0, linkID1, impulse1, deltaV1);
+
+#if DY_ARTICULATION_DEBUG_VERIFY
+	Cm::SpatialVector V[DY_ARTICULATION_MAX_SIZE];
+	for(PxU32 i=0;i<matrix.linkCount;i++) V[i] = Cm::SpatialVector::zero();
+	ArticulationRef::applyImpulse(matrix,V,linkID0, reinterpret_cast<const Cm::SpatialVector&>(impulse0));
+	ArticulationRef::applyImpulse(matrix,V,linkID1, reinterpret_cast<const Cm::SpatialVector&>(impulse1));
+
+	Cm::SpatialVector refV0 = V[linkID0];
+	Cm::SpatialVector refV1 = V[linkID1];
+#endif
+}
+
+namespace
+{
+	PX_FORCE_INLINE Cm::SpatialVectorV getImpulseResponseSimd(const FsData& matrix, PxU32 linkID, Vec3V lZ, Vec3V aZ)
+	{
+		PX_ASSERT(matrix.linkCount<=DY_ARTICULATION_MAX_SIZE);
+		Vec3V SZ[DY_ARTICULATION_MAX_SIZE];
+		PxU32 indices[DY_ARTICULATION_MAX_SIZE], iCount = 0;
+
+		const FsRow*PX_RESTRICT rows = getFsRows(matrix);
+		const FsRowAux*PX_RESTRICT aux = getAux(matrix);
+		const FsJointVectors* jointVectors = getJointVectors(matrix);
+
+		PX_UNUSED(aux);
+		PX_ASSERT(rows[linkID].pathToRoot&1);
+
+		lZ = V3Neg(lZ);
+		aZ = V3Neg(aZ);
+
+		for(PxU32 i = linkID; i; i = matrix.parent[i])
+		{
+			const FsRow& r = rows[i];
+			const FsJointVectors& j = jointVectors[i];
+
+			Vec3V sz = V3Add(aZ, V3Cross(lZ, j.jointOffset));
+			SZ[iCount] = sz;
+			
+			lZ = V3NegScaleSub(r.DSI[0].linear, V3GetX(sz), V3NegScaleSub(r.DSI[1].linear, V3GetY(sz), V3NegScaleSub(r.DSI[2].linear, V3GetZ(sz), lZ)));
+			aZ = V3NegScaleSub(r.DSI[0].angular, V3GetX(sz), V3NegScaleSub(r.DSI[1].angular, V3GetY(sz), V3NegScaleSub(r.DSI[2].angular, V3GetZ(sz), aZ)));
+
+			aZ = V3Add(aZ, V3Cross(j.parentOffset, lZ));
+			indices[iCount++] = i;
+		}
+
+		const FsInertia& I = getRootInverseInertia(matrix);
+
+		Vec3V lV = V3Neg(V3Add(M33MulV3(I.ll, lZ), M33MulV3(I.la, aZ)));
+		Vec3V aV = V3Neg(V3Add(M33TrnspsMulV3(I.la, lZ), M33MulV3(I.aa, aZ)));
+
+		while(iCount)
+		{
+			PxU32 i = indices[--iCount];
+			const FsRow& r = rows[i];
+			const FsJointVectors& j = jointVectors[i];
+
+			lV = V3Sub(lV, V3Cross(j.parentOffset, aV));
+
+			Vec3V n = V3Add(V3Merge(V3Dot(r.DSI[0].linear, lV),  V3Dot(r.DSI[1].linear, lV),  V3Dot(r.DSI[2].linear, lV)),
+							V3Merge(V3Dot(r.DSI[0].angular, aV), V3Dot(r.DSI[1].angular, aV), V3Dot(r.DSI[2].angular, aV)));
+
+			n = V3Add(n, M33MulV3(r.D, SZ[iCount]));
+			lV = V3Sub(lV, V3Cross(j.jointOffset, n));
+			aV = V3Sub(aV, n);
+		}
+
+		return Cm::SpatialVectorV(lV, aV);
+	}
+}
+					
+void ArticulationHelper::getImpulseResponse(const FsData& matrix,
+											PxU32 linkID, const Cm::SpatialVectorV& impulse, Cm::SpatialVectorV& deltaV)
+{
+	PX_ASSERT(matrix.linkCount<=DY_ARTICULATION_MAX_SIZE);
+
+	deltaV = getImpulseResponseSimd(matrix, linkID, impulse.linear, impulse.angular);
+
+#if DY_ARTICULATION_DEBUG_VERIFY
+	Cm::SpatialVectorV deltaV_;
+	PxcFsGetImpulseResponse(matrix, linkID, impulse, deltaV_);
+	PX_ASSERT(almostEqual(deltaV_, deltaV,1e-3f));
+#endif
+}
+
+void ArticulationHelper::getImpulseSelfResponse(const FsData& matrix,
+												PxU32 linkID0, const Cm::SpatialVectorV& impulse0, Cm::SpatialVectorV& deltaV0,
+												PxU32 linkID1, const Cm::SpatialVectorV& impulse1, Cm::SpatialVectorV& deltaV1)
+{
+	PX_ASSERT(linkID0 != linkID1);
+
+	const FsRow* rows = getFsRows(matrix);
+	const FsRowAux* aux = getAux(matrix);
+	const FsJointVectors* jointVectors = getJointVectors(matrix);
+
+	PX_UNUSED(aux);
+
+	Cm::SpatialVectorV& dV0 = deltaV0, 
+				  & dV1 = deltaV1;
+
+	// standard case: parent-child limit
+	if(matrix.parent[linkID1] == linkID0)
+	{
+		const FsRow& r = rows[linkID1];
+		const FsJointVectors& j = jointVectors[linkID1];
+
+		Vec3V lZ = V3Neg(impulse1.linear),
+			  aZ = V3Neg(impulse1.angular);
+
+		Vec3V sz = V3Add(aZ, V3Cross(lZ, j.jointOffset));
+		
+		lZ = V3Sub(lZ, V3ScaleAdd(r.DSI[0].linear, V3GetX(sz), V3ScaleAdd(r.DSI[1].linear, V3GetY(sz), V3Scale(r.DSI[2].linear, V3GetZ(sz)))));
+		aZ = V3Sub(aZ, V3ScaleAdd(r.DSI[0].angular, V3GetX(sz), V3ScaleAdd(r.DSI[1].angular, V3GetY(sz), V3Scale(r.DSI[2].angular, V3GetZ(sz)))));
+
+		aZ = V3Add(aZ, V3Cross(j.parentOffset, lZ));
+
+		lZ = V3Sub(impulse0.linear, lZ);
+		aZ = V3Sub(impulse0.angular, aZ);
+
+		dV0 = getImpulseResponseSimd(matrix, linkID0, lZ, aZ);
+
+		Vec3V aV = dV0.angular;
+		Vec3V lV = V3Sub(dV0.linear, V3Cross(j.parentOffset, aV));
+
+		Vec3V n = V3Add(V3Merge(V3Dot(r.DSI[0].linear, lV),  V3Dot(r.DSI[1].linear, lV),  V3Dot(r.DSI[2].linear, lV)),
+						V3Merge(V3Dot(r.DSI[0].angular, aV), V3Dot(r.DSI[1].angular, aV), V3Dot(r.DSI[2].angular, aV)));
+
+		n = V3Add(n, M33MulV3(r.D, sz));
+		lV = V3Sub(lV, V3Cross(j.jointOffset, n));
+		aV = V3Sub(aV, n);
+
+		dV1 = Cm::SpatialVectorV(lV, aV);
+	}
+	else
+	{
+#if 0
+		getImpulseResponseSlow(matrix, linkID0, impulse0, deltaV0, linkID1, impulse1, deltaV1);
+#else
+		getImpulseResponse(matrix, linkID0, impulse0, deltaV0);
+		getImpulseResponse(matrix, linkID1, impulse1, deltaV1);
+#endif
+	}
+
+#if DY_ARTICULATION_DEBUG_VERIFY
+	Cm::SpatialVectorV dV0_, dV1_;
+	PxcFsGetImpulseSelfResponse(matrix, linkID0, impulse0, dV0_, linkID1, impulse1, dV1_);
+
+	PX_ASSERT(almostEqual(dV0_, dV0, 1e-3f));
+	PX_ASSERT(almostEqual(dV1_, dV1, 1e-3f));
+#endif
+}
+
+PxU32 ArticulationHelper::getFsDataSize(PxU32 linkCount)
+{
+	return sizeof(FsInertia) + sizeof(FsRow) * linkCount;
+}
+
+PxU32 ArticulationHelper::getLtbDataSize(PxU32 linkCount)
+{
+	return sizeof(LtbRow) * linkCount;
+}
+
+void ArticulationHelper::createHardLimit(	const FsData& fsData,
+											const ArticulationLink* links,
+											PxU32 linkIndex,
+											SolverConstraint1DExt& s, 
+											const PxVec3& axis, 
+											PxReal err,
+											PxReal recipDt)
+{
+	init(s, PxVec3(0), PxVec3(0), axis, axis, 0, PX_MAX_F32);
+
+	ArticulationHelper::getImpulseSelfResponse(fsData, 
+												  links[linkIndex].parent,Cm::SpatialVector(PxVec3(0), axis), s.deltaVA,
+												  linkIndex, Cm::SpatialVector(PxVec3(0), -axis), s.deltaVB);
+
+	const PxReal unitResponse = axis.dot(reinterpret_cast<PxVec3&>(s.deltaVA.angular)) - axis.dot(reinterpret_cast<PxVec3&>(s.deltaVB.angular));
+	if(unitResponse<0.0f)
+		Ps::getFoundation().error(PxErrorCode::eDEBUG_WARNING, __FILE__, __LINE__, "Warning: articulation ill-conditioned or under severe stress, joint limit ignored");
+
+	const PxReal recipResponse = unitResponse>0.0f ? 1.0f/unitResponse : 0.0f;
+
+	s.constant = recipResponse * -err * recipDt;
+	s.unbiasedConstant = err>0.0f ? s.constant : 0.0f;
+	s.velMultiplier = -recipResponse;
+	s.impulseMultiplier = 1.0f;
+}
+
+void ArticulationHelper::createTangentialSpring(const FsData& fsData,
+												const ArticulationLink* links,
+												PxU32 linkIndex,
+												SolverConstraint1DExt& s, 
+												const PxVec3& axis, 
+												PxReal stiffness,
+												PxReal damping,
+												PxReal dt)
+{
+	init(s, PxVec3(0), PxVec3(0), axis, axis, -PX_MAX_F32, PX_MAX_F32);
+
+	Cm::SpatialVector axis6(PxVec3(0), axis);
+	PxU32 parent = links[linkIndex].parent;
+	getImpulseSelfResponse(fsData, parent, axis6, s.deltaVA, linkIndex, -axis6, s.deltaVB);
+
+	const PxReal unitResponse = axis.dot(reinterpret_cast<PxVec3&>(s.deltaVA.angular)) - axis.dot(reinterpret_cast<PxVec3&>(s.deltaVB.angular));
+	if(unitResponse<0.0f)
+		Ps::getFoundation().error(PxErrorCode::eDEBUG_WARNING, __FILE__, __LINE__, "Warning: articulation ill-conditioned or under severe stress, tangential spring ignored");
+	const PxReal recipResponse = unitResponse>0.0F ? 1.0f/unitResponse : 0.0f;
+
+	// this is a specialization of the spring code in setSolverConstants() for acceleration springs.
+	// general case is  b = dt * (c.mods.spring.damping * c.velocityTarget - c.mods.spring.stiffness * geomError);
+    // but geomError and velocityTarget are both zero
+
+	const PxReal a = dt * dt * stiffness + dt * damping;
+    const PxReal x = 1.0f/(1.0f+a);
+    s.constant = s.unbiasedConstant = 0.0f;
+    s.velMultiplier = -x * recipResponse * a;
+    s.impulseMultiplier = 1.0f - x;
+}
+
+PxU32 ArticulationHelper::setupSolverConstraints(	Articulation& articulation, PxU32 /*solverDataSize*/,
+													PxConstraintAllocator& allocator,
+													PxSolverConstraintDesc* constraintDesc,
+													const ArticulationLink* links,
+													const ArticulationJointTransforms* jointTransforms,
+													PxReal dt,
+													PxU32& acCount)
+{
+	acCount = 0;
+
+	FsData& fsData = *articulation.getFsDataPtr();
+	const PxU16 linkCount = fsData.linkCount;
+	PxU32 descCount = 0;
+	const PxReal recipDt = 1.0f/dt;
+
+	const PxConstraintInvMassScale ims(1.0f, 1.0f, 1.0f, 1.0f);
+
+	for(PxU16 i=1;i<linkCount;i++)
+	{
+		const ArticulationJointCore& j = static_cast<const ArticulationJointCore&>(*links[i].inboundJoint);
+
+		if(i+1<linkCount)
+		{
+			Ps::prefetch(links[i+1].inboundJoint, sizeof (ArticulationJointCore));
+			Ps::prefetch(&jointTransforms[i+1], sizeof(ArticulationJointTransforms));
+		}
+		
+		if(!(j.twistLimited || j.swingLimited))
+			continue;
+
+		PxQuat swing, twist;
+		Ps::separateSwingTwist(jointTransforms[i].cB2cA.q, swing, twist);
+	
+		Cm::ConeLimitHelper eh(j.tanQSwingY, j.tanQSwingZ, j.tanQSwingPad);
+		PxVec3 swingLimitAxis;
+		PxReal swingLimitError = 0.0f;
+
+		const bool swingLimited = j.swingLimited && eh.getLimit(swing, swingLimitAxis, swingLimitError);
+		const bool tangentialStiffness = swingLimited && (j.tangentialStiffness>0 || j.tangentialDamping>0);
+
+		const PxVec3 twistAxis = jointTransforms[i].cB2w.rotate(PxVec3(1.0f,0,0));
+		const PxReal tqTwistAngle = Ps::tanHalf(twist.x, twist.w);
+
+		const bool twistLowerLimited = j.twistLimited && tqTwistAngle < Cm::tanAdd(j.tanQTwistLow, j.tanQTwistPad);
+		const bool twistUpperLimited = j.twistLimited && tqTwistAngle > Cm::tanAdd(j.tanQTwistHigh, -j.tanQTwistPad);
+	
+		const PxU8 constraintCount = PxU8(swingLimited + tangentialStiffness + twistUpperLimited + twistLowerLimited);
+		if(!constraintCount)
+			continue;
+
+		PxSolverConstraintDesc& desc = constraintDesc[descCount++];
+
+		desc.articulationA = &articulation;
+		desc.linkIndexA = Ps::to16(links[i].parent);
+
+		desc.articulationB = &articulation;
+		desc.linkIndexB = i;
+
+		const PxU32 constraintLength = sizeof(SolverConstraint1DHeader) + 
+								 sizeof(SolverConstraint1DExt) * constraintCount;
+
+		PX_ASSERT(0==(constraintLength & 0x0f));
+		desc.constraintLengthOver16 = Ps::to16(constraintLength/16);
+		
+		//desc.constraint = stream.reserve(constraintLength + 16u, constraintBlockManager);
+		desc.constraint = allocator.reserveConstraintData(constraintLength + 16u);
+
+		desc.writeBack = NULL;
+		
+		SolverConstraint1DHeader* header = reinterpret_cast<SolverConstraint1DHeader*>(desc.constraint);
+		SolverConstraint1DExt* constraints = reinterpret_cast<SolverConstraint1DExt*>(desc.constraint + sizeof(SolverConstraint1DHeader));
+
+		init(*header, constraintCount, true, ims);
+
+		PxU32 cIndex = 0;
+
+		if(swingLimited)
+		{
+			const PxVec3 normal = jointTransforms[i].cA2w.rotate(swingLimitAxis);
+			createHardLimit(fsData, links, i, constraints[cIndex++], normal, swingLimitError, recipDt);
+			if(tangentialStiffness)
+			{
+				const PxVec3 tangent = twistAxis.cross(normal).getNormalized();
+				createTangentialSpring(fsData, links, i, constraints[cIndex++], tangent, j.tangentialStiffness, j.tangentialDamping, dt);
+			}
+		}
+
+		if(twistUpperLimited)
+			createHardLimit(fsData, links, i, constraints[cIndex++], twistAxis, (j.tanQTwistHigh - tqTwistAngle)*4, recipDt);
+
+		if(twistLowerLimited)
+			createHardLimit(fsData, links, i, constraints[cIndex++], -twistAxis, -(j.tanQTwistLow - tqTwistAngle)*4, recipDt);
+
+		*(desc.constraint + getConstraintLength(desc)) = 0;
+
+		PX_ASSERT(cIndex == constraintCount);
+		acCount += constraintCount;
+	}
+
+	return descCount;
+}
+
+ArticulationPImpl::ComputeUnconstrainedVelocitiesFn ArticulationPImpl::sComputeUnconstrainedVelocities[2] = { NULL, NULL };
+ArticulationPImpl::UpdateBodiesFn ArticulationPImpl::sUpdateBodies[2] = { NULL, NULL };
+ArticulationPImpl::UpdateBodiesFn ArticulationPImpl::sUpdateBodiesTGS[2] = { NULL, NULL };
+ArticulationPImpl::SaveVelocityFn ArticulationPImpl::sSaveVelocity[2] = { NULL, NULL };
+ArticulationPImpl::SaveVelocityTGSFn ArticulationPImpl::sSaveVelocityTGS[2] = { NULL, NULL };
+
+ArticulationPImpl::UpdateDeltaMotionFn ArticulationPImpl::sUpdateDeltaMotion[2] = { NULL, NULL };
+ArticulationPImpl::DeltaMotionToMotionVelFn ArticulationPImpl::sDeltaMotionToMotionVel[2] = { NULL, NULL };
+ArticulationPImpl::ComputeUnconstrainedVelocitiesTGSFn ArticulationPImpl::sComputeUnconstrainedVelocitiesTGS[2] = { NULL, NULL };
+
+ArticulationPImpl::SetupInternalConstraintsTGSFn ArticulationPImpl::sSetupInternalConstraintsTGS[2] = { NULL, NULL };
+}
+}
--- a/physx/source/lowleveldynamics/src/DyArticulationHelper.h
+++ b/physx/source/lowleveldynamics/src/DyArticulationHelper.h
@ -0,0 +1,209 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef DY_ARTICULATION_HELPER_H
+#define DY_ARTICULATION_HELPER_H
+
+
+#include "DyArticulation.h"
+
+namespace physx
+{
+struct PxsBodyCore;
+struct PxSolverConstraintDesc;
+struct PxSolverConstraintDesc;
+
+namespace Dy
+{
+	struct FsInertia;
+	struct SolverConstraint1DExt;
+	struct ArticulationJointCore;
+	struct ArticulationSolverDesc;
+	struct SolverConstraint1DExtStep;
+	struct PxcFsScratchAllocator;
+
+struct ArticulationJointTransforms
+{
+	PxTransform		cA2w;	// joint parent frame in world space 
+	PxTransform		cB2w;	// joint child frame in world space
+	PxTransform		cB2cA;	// joint relative pose in world space
+};
+
+class ArticulationHelper
+{
+public:
+
+	static void		getImpulseResponse(	const FsData& matrix, 
+										PxU32 linkID,
+										const Cm::SpatialVectorV& impulse,
+										Cm::SpatialVectorV& deltaV);
+
+
+	static PX_FORCE_INLINE 
+			void	getImpulseResponse(	const FsData& matrix, 
+										PxU32 linkID,
+										const Cm::SpatialVector& impulse,
+										Cm::SpatialVector& deltaV)
+	{
+		getImpulseResponse(matrix, linkID, reinterpret_cast<const Cm::SpatialVectorV&>(impulse), reinterpret_cast<Cm::SpatialVectorV&>(deltaV));
+	}
+
+	static void		getImpulseSelfResponse(const FsData& matrix, 
+										   PxU32 linkID0, 
+										   const Cm::SpatialVectorV& impulse0,
+										   Cm::SpatialVectorV& deltaV0,
+										   PxU32 linkID1,
+										   const Cm::SpatialVectorV& impulse1,
+										   Cm::SpatialVectorV& deltaV1);
+
+	//static void		flushVelocity(FsData& matrix);
+
+	static void		saveVelocity(const ArticulationSolverDesc& m, Cm::SpatialVectorF* deltaV);
+
+	static void		saveVelocityTGS(const ArticulationSolverDesc& m, const PxReal invDt);
+
+	//static void		recordDeltaMotion(const ArticulationSolverDesc& m, PxReal dt);
+
+	//static void		deltaMotionToMotionVelocity(const ArticulationSolverDesc& m, PxReal invDt);
+
+	//static void		getDataSizes(PxU32 linkCount, PxU32 &solverDataSize, PxU32& totalSize, PxU32& scratchSize);
+
+	/*static void		initializeDriveCache(Articulation& articulation, 
+										FsData &data,
+										 PxU16 linkCount,
+										 const ArticulationLink* links,
+										 PxReal compliance,
+										 PxU32 iterations,
+										 char* scratchMemory,
+										 PxU32 scratchMemorySize);*/
+
+	//static PxU32	getDriveCacheLinkCount(const FsData& cache);
+
+	/*static void		applyImpulses(const FsData& matrix,
+								  Cm::SpatialVectorV* Z,
+								  Cm::SpatialVectorV* V);*/
+
+//private:
+	static PxU32	getLtbDataSize(PxU32 linkCount);
+	static PxU32	getFsDataSize(PxU32 linkCount);
+
+	//static void		prepareDataBlock(ArticulationV& articualtion,
+	//								 const ArticulationLink* links,
+	//								 PxU16 linkCount,	
+	//								 PxTransform* poses,
+	//							 	 FsInertia *baseInertia,
+	//								 ArticulationJointTransforms* jointTransforms,
+	//								 PxU32 expectedSize);
+
+	//static void		setInertia(FsInertia& inertia,
+	//						   const PxsBodyCore& body,
+	//						   const PxTransform& pose);
+
+	//static void		setJointTransforms(ArticulationJointTransforms& transforms,
+	//								   const PxTransform& parentPose,
+	//								   const PxTransform& childPose,
+	//								   const ArticulationJointCore& joint);
+
+	/*static void		prepareLtbMatrix(FsData& fsData,
+									 const FsInertia* baseInertia,
+									 const PxTransform* poses,
+									 const ArticulationJointTransforms* jointTransforms,
+									 PxReal recipDt);*/
+
+	//static void		prepareFsData(FsData& fsData,
+	//							  const ArticulationLink* links);
+
+	//static PX_FORCE_INLINE PxReal getResistance(PxReal compliance);
+
+
+	static void		createHardLimit(const FsData& fsData,
+									const ArticulationLink* links,
+									PxU32 linkIndex,
+									SolverConstraint1DExt& s, 
+									const PxVec3& axis, 
+									PxReal err,
+									PxReal recipDt);
+
+	static void		createTangentialSpring(const FsData& fsData,
+										   const ArticulationLink* links,
+										   PxU32 linkIndex,
+										   SolverConstraint1DExt& s, 
+										   const PxVec3& axis, 
+										   PxReal stiffness,
+										   PxReal damping,
+										   PxReal dt);
+
+	static void		createHardLimitTGS(const FsData& fsData,
+		const ArticulationLink* links,
+		PxU32 linkIndex,
+		SolverConstraint1DExtStep& s,
+		const PxVec3& axis,
+		PxReal err,
+		PxReal recipDt);
+
+	static void		createTangentialSpringTGS(const FsData& fsData,
+		const ArticulationLink* links,
+		PxU32 linkIndex,
+		SolverConstraint1DExtStep& s,
+		const PxVec3& axis,
+		PxReal stiffness,
+		PxReal damping,
+		PxReal dt);
+
+	static PxU32 setupSolverConstraints(Articulation& articulation, PxU32 solverDataSize,
+													PxConstraintAllocator& allocator,
+													PxSolverConstraintDesc* constraintDesc,
+													const ArticulationLink* links,
+													const ArticulationJointTransforms* jointTransforms,
+													PxReal dt,
+													PxU32& acCount);
+
+	/*static void	computeUnconstrainedVelocitiesInternal(const ArticulationSolverDesc& desc,
+		PxReal dt,
+		const PxVec3& gravity, PxU64 contextID,
+		FsInertia* PX_RESTRICT baseInertia,
+		ArticulationJointTransforms* PX_RESTRICT jointTransforms,
+		PxcFsScratchAllocator& allocator);*/
+
+	/*static void		computeJointDrives(FsData& fsData,
+									   Ps::aos::Vec3V* drives, 
+									   const ArticulationLink* links,
+									   const PxTransform* poses, 
+									   const ArticulationJointTransforms* transforms, 
+									   const Ps::aos::Mat33V* loads, 
+									   PxReal dt);*/
+
+};
+
+}
+
+}
+
+#endif //DY_ARTICULATION_HELPER_H
--- a/physx/source/lowleveldynamics/src/DyArticulationPImpl.h
+++ b/physx/source/lowleveldynamics/src/DyArticulationPImpl.h
@ -0,0 +1,202 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+
+#ifndef DY_ARTICULATION_INTERFACE_H
+#define DY_ARTICULATION_INTERFACE_H
+
+#include "DyArticulationUtils.h"
+
+namespace physx
+{
+
+class PxcConstraintBlockStream;
+class PxcScratchAllocator;
+class PxsConstraintBlockManager;
+struct PxSolverConstraintDesc;
+
+namespace Dy
+{
+	
+	struct ArticulationSolverDesc;
+
+
+class ArticulationPImpl
+{
+public:
+
+	typedef PxU32 (*ComputeUnconstrainedVelocitiesFn)(const ArticulationSolverDesc& desc,
+													 PxReal dt,
+													 PxConstraintAllocator& allocator,
+													 PxSolverConstraintDesc* constraintDesc,
+													 PxU32& acCount,
+													 const PxVec3& gravity, PxU64 contextID,
+													 Cm::SpatialVectorF* Z, Cm::SpatialVectorF* DeltaV);
+
+	typedef void (*UpdateBodiesFn)(const ArticulationSolverDesc& desc, PxReal dt);
+
+	typedef void (*SaveVelocityFn)(const ArticulationSolverDesc &m, Cm::SpatialVectorF* deltaV);
+
+	typedef void(*SaveVelocityTGSFn)(const ArticulationSolverDesc& m, PxReal invDtF32);
+
+	typedef PxU32(*SetupInternalConstraintsTGSFn)(const ArticulationSolverDesc& desc,
+		PxcConstraintBlockStream& stream,
+		PxSolverConstraintDesc* constraintDesc,
+		PxReal dt,
+		PxReal invDt,
+		PxReal totalDt,
+		PxU32& acCount,
+		PxsConstraintBlockManager& constraintBlockManager,
+		Cm::SpatialVectorF* Z);
+
+	typedef void(*ComputeUnconstrainedVelocitiesTGSFn)(const ArticulationSolverDesc& desc,
+		PxReal dt,
+		const PxVec3& gravity, PxU64 contextID, Cm::SpatialVectorF* Z, Cm::SpatialVectorF* DeltaV);
+
+	typedef void(*UpdateDeltaMotionFn)(const ArticulationSolverDesc &m, const PxReal dt, Cm::SpatialVectorF* DeltaV, const PxReal totalInvDt);
+
+	typedef void(*DeltaMotionToMotionVelFn)(const ArticulationSolverDesc &m, const PxReal dt);
+
+	static ComputeUnconstrainedVelocitiesFn sComputeUnconstrainedVelocities[2];
+	static UpdateBodiesFn sUpdateBodies[2];
+	static UpdateBodiesFn sUpdateBodiesTGS[2];
+	static SaveVelocityFn sSaveVelocity[2];
+	static SaveVelocityTGSFn sSaveVelocityTGS[2];
+
+	static UpdateDeltaMotionFn sUpdateDeltaMotion[2];
+	static DeltaMotionToMotionVelFn sDeltaMotionToMotionVel[2];
+	static ComputeUnconstrainedVelocitiesTGSFn sComputeUnconstrainedVelocitiesTGS[2];
+	static SetupInternalConstraintsTGSFn sSetupInternalConstraintsTGS[2];
+
+	static PxU32 computeUnconstrainedVelocities(const ArticulationSolverDesc& desc,
+										   PxReal dt,
+										   PxConstraintAllocator& allocator,
+										   PxSolverConstraintDesc* constraintDesc,
+										   PxU32& acCount,
+										   PxcScratchAllocator&,
+										   const PxVec3& gravity, PxU64 contextID,
+										   Cm::SpatialVectorF* Z, Cm::SpatialVectorF* deltaV)
+	{
+		PxU32 type = desc.articulation->getType();
+		PX_ASSERT(sComputeUnconstrainedVelocities[type]);
+		if (sComputeUnconstrainedVelocities[type])
+			return (sComputeUnconstrainedVelocities[type])(desc, dt, allocator, constraintDesc, acCount,
+				gravity, contextID, Z, deltaV);
+		else
+			return 0;
+	}
+
+	static void	updateBodies(const ArticulationSolverDesc& desc,
+						 PxReal dt)
+	{
+		PxU32 type = desc.articulation->getType();
+		PX_ASSERT(sUpdateBodies[type]);
+		if (sUpdateBodies[type])
+			(*sUpdateBodies[type])(desc, dt);
+	}
+
+	static void	updateBodiesTGS(const ArticulationSolverDesc& desc,
+		PxReal dt)
+	{
+		PxU32 type = desc.articulation->getType();
+		PX_ASSERT(sUpdateBodiesTGS[type]);
+		if (sUpdateBodiesTGS[type])
+			(*sUpdateBodiesTGS[type])(desc, dt);
+	}
+
+	static void	saveVelocity(const ArticulationSolverDesc& desc, Cm::SpatialVectorF* deltaV)
+	{
+		PxU32 type = desc.articulation->getType();
+		PX_ASSERT(sSaveVelocity[type]);
+		if (sSaveVelocity[type])
+			(*sSaveVelocity[type])(desc, deltaV);
+	}
+
+
+	static void	saveVelocityTGS(const ArticulationSolverDesc& desc, PxReal invDtF32)
+	{
+		PX_UNUSED(desc);
+		PX_UNUSED(invDtF32);
+		PxU32 type = desc.articulation->getType();
+		PX_ASSERT(sSaveVelocityTGS[type]);
+		if (sSaveVelocityTGS[type])
+			(*sSaveVelocityTGS[type])(desc, invDtF32);
+	}
+
+	static void computeUnconstrainedVelocitiesTGS(const ArticulationSolverDesc& desc,
+		PxReal dt,
+		const PxVec3& gravity, PxU64 contextID, Cm::SpatialVectorF* Z, Cm::SpatialVectorF* DeltaV)
+	{
+		PxU32 type = desc.articulation->getType();
+		PX_ASSERT(sComputeUnconstrainedVelocitiesTGS[type]);
+		if (sComputeUnconstrainedVelocitiesTGS[type])
+		(sComputeUnconstrainedVelocitiesTGS[type])(desc, dt, gravity, contextID, Z, DeltaV);
+	}
+
+	static void	updateDeltaMotion(const ArticulationSolverDesc& desc, const PxReal dt, Cm::SpatialVectorF* DeltaV, const PxReal totalInvDt)
+	{
+		PxU32 type = desc.articulation->getType();
+		PX_ASSERT(sUpdateDeltaMotion[type]);
+		if (sUpdateDeltaMotion[type])
+			(*sUpdateDeltaMotion[type])(desc, dt, DeltaV, totalInvDt);
+	}
+
+	static void	deltaMotionToMotionVel(const ArticulationSolverDesc& desc, const PxReal invDt)
+	{
+		PxU32 type = desc.articulation->getType();
+		PX_ASSERT(sDeltaMotionToMotionVel[type]);
+		if (sDeltaMotionToMotionVel[type])
+			(*sDeltaMotionToMotionVel)(desc, invDt);
+	}
+
+	static PxU32 setupSolverInternalConstraintsTGS(const ArticulationSolverDesc& desc,
+		PxcConstraintBlockStream& stream,
+		PxSolverConstraintDesc* constraintDesc,
+		PxReal dt,
+		PxReal invDt,
+		PxReal totalDt,
+		PxU32& acCount,
+		PxsConstraintBlockManager& constraintBlockManager,
+		Cm::SpatialVectorF* Z)
+	{
+		PxU32 type = desc.articulation->getType();
+		PX_ASSERT(sSetupInternalConstraintsTGS[type]);
+		if (sSetupInternalConstraintsTGS[type])
+			return sSetupInternalConstraintsTGS[type](desc, stream, constraintDesc, dt, invDt, totalDt, acCount, constraintBlockManager, Z);
+		return 0;
+
+	}
+};
+
+
+}
+}
+#endif //DY_ARTICULATION_INTERFACE_H
+
--- a/physx/source/lowleveldynamics/src/DyArticulationReference.h
+++ b/physx/source/lowleveldynamics/src/DyArticulationReference.h
@ -0,0 +1,92 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+
+#ifndef DY_ARTICULATION_REFERENCE_H
+#define DY_ARTICULATION_REFERENCE_H
+
+// a per-row struct where we put extra data for debug and setup - ultimately this will move to be just
+// debug only
+
+
+
+#include "DyArticulationUtils.h"
+#include "DyArticulationScalar.h"
+#include "DyArticulationFnsScalar.h"
+#include "DySpatial.h"
+
+#if DY_ARTICULATION_DEBUG_VERIFY
+
+namespace physx
+{
+
+PX_FORCE_INLINE Cm::SpatialVector propagateVelocity(const FsRow& row, 
+													const FsJointVectors& jv,
+													const PxVec3& SZ, 
+													const Cm::SpatialVector& v,
+													const FsRowAux& aux)
+{
+	typedef ArticulationFnsScalar Fns;
+
+	Cm::SpatialVector w = Fns::translateMotion(-getParentOffset(jv), v);
+	PxVec3 DSZ = Fns::multiply(row.D, SZ);
+
+	PxVec3 n = Fns::axisDot(getDSI(row), w) + DSZ;
+	Cm::SpatialVector result = w - Cm::SpatialVector(getJointOffset(jv).cross(n),n);
+#if DY_ARTICULATION_DEBUG_VERIFY
+	Cm::SpatialVector check = ArticulationRef::propagateVelocity(row, jv, SZ, v, aux);
+	PX_ASSERT((result-check).magnitude()<1e-5*PxMax(check.magnitude(), 1.0f));
+#endif
+	return result;
+}
+
+PX_FORCE_INLINE Cm::SpatialVector propagateImpulse(const FsRow& row, 
+												   const FsJointVectors& jv,
+												   PxVec3& SZ, 
+												   const Cm::SpatialVector& Z,
+												   const FsRowAux& aux)
+{
+	typedef ArticulationFnsScalar Fns;
+
+	SZ = Z.angular + Z.linear.cross(getJointOffset(jv));
+	Cm::SpatialVector result = Fns::translateForce(getParentOffset(jv), Z - Fns::axisMultiply(getDSI(row), SZ));
+#if DY_ARTICULATION_DEBUG_VERIFY
+	PxVec3 SZcheck;
+	Cm::SpatialVector check = ArticulationRef::propagateImpulse(row, jv, SZcheck, Z, aux);
+	PX_ASSERT((result-check).magnitude()<1e-5*PxMax(check.magnitude(), 1.0f));
+	PX_ASSERT((SZ-SZcheck).magnitude()<1e-5*PxMax(SZcheck.magnitude(), 1.0f));
+#endif
+	return result;
+}
+
+}
+#endif
+
+#endif //DY_ARTICULATION_REFERENCE_H
--- a/physx/source/lowleveldynamics/src/DyArticulationSIMD.cpp
+++ b/physx/source/lowleveldynamics/src/DyArticulationSIMD.cpp
@ -0,0 +1,317 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#include "foundation/PxPreprocessor.h"
+#include "DySpatial.h"
+#include "DyArticulation.h"
+#include "DyArticulationScalar.h"
+#include "DyArticulationFnsScalar.h"
+#include "DyArticulationReference.h"
+#include "DyArticulationFnsSimd.h"
+
+
+namespace physx
+{
+namespace Dy
+{
+
+#if DY_ARTICULATION_DEBUG_VERIFY
+namespace 
+{
+	Cm::SpatialVector SpV(Vec3V linear, Vec3V angular) 
+	{
+		return Cm::SpatialVector((PxVec3 &)linear, (PxVec3&)angular);
+	}
+}
+#endif
+
+//void PxcFsApplyImpulse(ArticulationV &articulationV,
+//					   PxU32 linkID, 
+//					   Vec3V linear,
+//					   Vec3V angular)
+//{
+//#if DY_ARTICULATION_DEBUG_VERIFY
+//	{	
+//		Cm::SpatialVectorV imp(linear, angular);
+//		ArticulationRef::applyImpulse(matrix, reinterpret_cast<Cm::SpatialVector *>(getRefVelocity(matrix)), linkID, reinterpret_cast<Cm::SpatialVector&>(imp));
+//	}
+//#endif
+//	Articulation& articulation = static_cast<>(articulationV
+//	FsData& matrix = *articulation.getFsDataPtr();
+//	Vec3V linZ = V3Neg(linear);
+//	Vec3V angZ = V3Neg(angular);
+//
+//	const FsRow *rows = getFsRows(matrix);
+//	const FsJointVectors* jointVectors = getJointVectors(matrix);
+//
+//#if DY_ARTICULATION_DEBUG_VERIFY
+//	const FsRowAux *aux = getAux(matrix);
+//#endif
+//	Vec3V *deferredSZ = getDeferredSZ(matrix);
+//
+//	for(PxU32 i = linkID; i!=0; i = matrix.parent[i])
+//	{
+//		const FsRow &row = rows[i];
+//		const FsJointVectors& jv = jointVectors[i];
+//
+//#if DY_ARTICULATION_DEBUG_VERIFY
+//		PxVec3 SZcheck;
+//		Cm::SpatialVector Zcheck = ArticulationRef::propagateImpulse(row, jv, SZcheck, SpV(linZ, angZ), aux[i]);
+//#endif
+//
+//		Vec3V SZ = V3Add(angZ, V3Cross(linZ, jv.jointOffset));
+//		Vec3V lrLinear = V3Sub(linZ,  V3ScaleAdd(row.DSI[0].linear, V3GetX(SZ),
+//									   V3ScaleAdd(row.DSI[1].linear, V3GetY(SZ),
+//									       V3Scale(row.DSI[2].linear, V3GetZ(SZ)))));
+//
+//		Vec3V lrAngular = V3Sub(angZ,  V3ScaleAdd(row.DSI[0].angular, V3GetX(SZ),
+//									    V3ScaleAdd(row.DSI[1].angular, V3GetY(SZ),
+//									        V3Scale(row.DSI[2].angular, V3GetZ(SZ)))));
+//
+//		linZ = lrLinear;
+//		angZ = V3Add(lrAngular, V3Cross(jv.parentOffset, lrLinear));
+//		deferredSZ[i] = V3Add(deferredSZ[i], SZ);
+//
+//		PX_ASSERT(Ps::aos::isFiniteVec3V(linZ));
+//		PX_ASSERT(Ps::aos::isFiniteVec3V(angZ));
+//
+//#if DY_ARTICULATION_DEBUG_VERIFY
+//		Cm::SpatialVector Z = SpV(linZ,angZ);
+//		PX_ASSERT((Z - Zcheck).magnitude()<1e-4*PxMax(Zcheck.magnitude(), 1.0f));
+//		PX_ASSERT(((PxVec3&)SZ-SZcheck).magnitude()<1e-4*PxMax(SZcheck.magnitude(), 1.0f));
+//#endif
+//	}
+//
+//	matrix.deferredZ.linear = V3Add(matrix.deferredZ.linear, linZ);
+//	matrix.deferredZ.angular = V3Add(matrix.deferredZ.angular, angZ);
+//
+//	matrix.dirty |= rows[linkID].pathToRoot;
+//}
+
+//Cm::SpatialVectorV PxcFsGetVelocity(Articulation &articulation,
+//								PxU32 linkID)
+//{
+//	FsData& matrix = *articulation.getFsDataPtr();
+//	const FsRow *rows = getFsRows(matrix);
+//	const FsJointVectors* jointVectors = getJointVectors(matrix);
+//
+//#if DY_ARTICULATION_DEBUG_VERIFY
+//	const FsRowAux *aux = getAux(matrix);
+//#endif
+//	Cm::SpatialVectorV* PX_RESTRICT V = getVelocity(matrix);
+//
+//	// find the dirty node on the path (including the root) with the lowest index
+//	ArticulationBitField toUpdate = rows[linkID].pathToRoot & matrix.dirty;
+//
+//
+//	if(toUpdate)
+//	{
+//		// store the dV elements densely and use an array map to decode - hopefully cache friendlier
+//		PxU32 indexToStackLoc[DY_ARTICULATION_MAX_SIZE], count = 0;
+//		Cm::SpatialVectorV dVStack[DY_ARTICULATION_MAX_SIZE];
+//
+//		ArticulationBitField ignoreNodes = (toUpdate & (0-toUpdate))-1;
+//		ArticulationBitField path = rows[linkID].pathToRoot & ~ignoreNodes, p = path;
+//		ArticulationBitField newDirty = 0;
+//
+//		Vec3V ldV = V3Zero(), adV = V3Zero();
+//		Cm::SpatialVectorV* PX_RESTRICT defV = getDeferredVel(matrix);
+//		Vec3V* PX_RESTRICT SZ = getDeferredSZ(matrix);
+//
+//		if(p & 1)
+//		{
+//			const FsInertia &m = getRootInverseInertia(matrix);
+//			Vec3V lZ = V3Neg(matrix.deferredZ.linear);
+//			Vec3V aZ = V3Neg(matrix.deferredZ.angular);
+//
+//			ldV = V3Add(M33MulV3(m.ll,lZ),		 M33MulV3(m.la,aZ));
+//			adV = V3Add(M33TrnspsMulV3(m.la,lZ), M33MulV3(m.aa,aZ));
+//
+//			V[0].linear = V3Add(V[0].linear, ldV);
+//			V[0].angular = V3Add(V[0].angular, adV);
+//
+//			matrix.deferredZ.linear = V3Zero();
+//			matrix.deferredZ.angular = V3Zero();
+//
+//			indexToStackLoc[0] = count;
+//			Cm::SpatialVectorV &e = dVStack[count++];
+//
+//			e.linear = ldV;
+//			e.angular = adV;
+//
+//			newDirty = rows[0].children;
+//			p--;
+//		}
+//
+//		
+//		while(p)	// using "for(;p;p &= (p-1))" here generates LHSs from the ArticulationLowestSetBit
+//		{
+//			PxU32 i = ArticulationLowestSetBit(p);
+//			const FsJointVectors& jv = jointVectors[i];
+//
+//			p &= (p-1);
+//	
+//			const FsRow* PX_RESTRICT row = rows + i;
+//
+//			ldV = V3Add(ldV, defV[i].linear);
+//			adV = V3Add(adV, defV[i].angular);
+//
+//#if DY_ARTICULATION_DEBUG_VERIFY
+//			Cm::SpatialVector dVcheck = ArticulationRef::propagateVelocity(*row, jv, (PxVec3&)SZ[i], SpV(ldV,adV), aux[i]);
+//#endif
+//
+//			Vec3V DSZ = M33MulV3(row->D, SZ[i]);
+//
+//			Vec3V lW = V3Add(ldV, V3Cross(adV,jv.parentOffset));
+//			Vec3V aW = adV;
+//
+//			const Cm::SpatialVectorV*PX_RESTRICT DSI = row->DSI;
+//			Vec3V lN = V3Merge(V3Dot(DSI[0].linear, lW),   V3Dot(DSI[1].linear, lW),  V3Dot(DSI[2].linear, lW));
+//			Vec3V aN = V3Merge(V3Dot(DSI[0].angular, aW),  V3Dot(DSI[1].angular, aW), V3Dot(DSI[2].angular, aW));
+//
+//			Vec3V n = V3Add(V3Add(lN, aN), DSZ);
+//
+//			ldV = V3Sub(lW, V3Cross(jv.jointOffset,n));
+//			adV = V3Sub(aW, n);
+//
+//#if DY_ARTICULATION_DEBUG_VERIFY
+//			Cm::SpatialVector dV = SpV(ldV,adV);
+//			PX_ASSERT((dV-dVcheck).magnitude()<1e-4*PxMax(dVcheck.magnitude(), 1.0f));
+//#endif
+//
+//			V[i].linear = V3Add(V[i].linear, ldV);
+//			V[i].angular = V3Add(V[i].angular, adV);
+//
+//			defV[i].linear = V3Zero();
+//			defV[i].angular = V3Zero();
+//			SZ[i] = V3Zero();
+//
+//			indexToStackLoc[i] = count;
+//			Cm::SpatialVectorV &e = dVStack[count++];
+//			newDirty |= rows[i].children;
+//
+//			e.linear = ldV;
+//			e.angular = adV;
+//		}
+//
+//		for(ArticulationBitField defer = newDirty&~path; defer; defer &= (defer-1))
+//		{
+//			PxU32 i = ArticulationLowestSetBit(defer);
+//			PxU32 parent = indexToStackLoc[matrix.parent[i]];
+//
+//			defV[i].linear = V3Add(defV[i].linear, dVStack[parent].linear);
+//			defV[i].angular = V3Add(defV[i].angular, dVStack[parent].angular);
+//		}
+//
+//		matrix.dirty = (matrix.dirty | newDirty)&~path;
+//	}
+//#if DY_ARTICULATION_DEBUG_VERIFY
+//	Cm::SpatialVector v = reinterpret_cast<Cm::SpatialVector&>(V[linkID]);
+//	Cm::SpatialVector rv = reinterpret_cast<Cm::SpatialVector&>(getRefVelocity(matrix)[linkID]);
+//	PX_ASSERT((v-rv).magnitude()<1e-4f * PxMax(rv.magnitude(),1.0f));
+//#endif
+//
+//	return V[linkID];
+//}
+
+Cm::SpatialVectorV PxcFsGetMotionVector(ArticulationV& articulation,
+	PxU32 linkID)
+{
+
+	/*const Cm::SpatialVectorV* PX_RESTRICT V = articulation.getMotionVector();
+	return V[linkID];*/
+
+	return articulation.getLinkMotionVector(linkID);
+}
+
+PX_FORCE_INLINE Cm::SpatialVectorV propagateVelocitySIMD(const FsRow& row,
+														const FsJointVectors& jv,
+														const Vec3V& SZ,
+														const Cm::SpatialVectorV& v,
+														const FsRowAux& aux)
+{
+	PX_UNUSED(aux);
+
+	typedef ArticulationFnsSimd<ArticulationFnsSimdBase> Fns;
+
+	Cm::SpatialVectorV w(V3Add(v.linear, V3Cross(v.angular, jv.parentOffset)), v.angular);
+	Vec3V DSZ = M33MulV3(row.D, SZ);
+
+	Vec3V n = V3Add(Fns::axisDot(row.DSI, w), DSZ);
+	Cm::SpatialVectorV result = w - Cm::SpatialVectorV(V3Cross(jv.jointOffset,n), n);
+
+#if DY_ARTICULATION_DEBUG_VERIFY
+	Cm::SpatialVector check = ArticulationRef::propagateVelocity(row, jv, reinterpret_cast<const PxVec3&>(SZ), reinterpret_cast<const Cm::SpatialVector&>(v), aux);
+	PX_ASSERT((reinterpret_cast<const Cm::SpatialVector&>(result)-check).magnitude()<1e-4*PxMax(check.magnitude(), 1.0f));
+#endif
+
+	return result;
+}
+
+void PxcFsFlushVelocity(FsData& matrix)
+{
+	typedef ArticulationFnsSimd<ArticulationFnsSimdBase> Fns;
+
+	const FsRow* PX_RESTRICT rows = getFsRows(matrix);
+	const FsRowAux* PX_RESTRICT aux = getAux(matrix);
+	const FsJointVectors*PX_RESTRICT jointVectors = getJointVectors(matrix);
+
+	Cm::SpatialVectorV V0 =  Fns::multiply(getRootInverseInertia(matrix), -matrix.deferredZ);
+	matrix.deferredZ = Cm::SpatialVectorV(PxZero);
+
+	getVelocity(matrix)[0] += V0;
+	for(ArticulationBitField defer = rows[0].children; defer; defer &= (defer-1))
+		getDeferredVel(matrix)[ArticulationLowestSetBit(defer)] += V0;
+
+	for(PxU32 i = 1; i<matrix.linkCount; i++)
+	{
+		Cm::SpatialVectorV V = propagateVelocitySIMD(rows[i], jointVectors[i], getDeferredSZ(matrix)[i], getDeferredVel(matrix)[i], aux[i]);
+		getDeferredVel(matrix)[i] = Cm::SpatialVectorV(PxZero);
+		getDeferredSZ(matrix)[i] = V3Zero();
+		getVelocity(matrix)[i] += V;
+		for(ArticulationBitField defer = rows[i].children; defer; defer &= (defer-1))
+			getDeferredVel(matrix)[ArticulationLowestSetBit(defer)] += V;
+	}
+
+#if DY_ARTICULATION_DEBUG_VERIFY
+	for(PxU32 i=0;i<matrix.linkCount;i++)
+	{
+		Cm::SpatialVector v = velocityRef(matrix,i), rv = reinterpret_cast<Cm::SpatialVector&>(getRefVelocity(matrix)[i]);
+		Cm::SpatialVector diff = v-rv;
+		PxReal m = rv.magnitude();
+		PX_UNUSED(m);
+		PX_ASSERT(diff.magnitude()<1e-4*PxMax(1.0f,m));
+	}
+#endif
+
+	matrix.dirty = 0;
+}
+}
+}
--- a/physx/source/lowleveldynamics/src/DyArticulationScalar.cpp
+++ b/physx/source/lowleveldynamics/src/DyArticulationScalar.cpp
@ -0,0 +1,575 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#include "DyArticulationUtils.h"
+#include "DyArticulationScalar.h"
+#include "DyArticulationReference.h"
+#include "DyArticulationFnsDebug.h"
+
+namespace physx
+{
+namespace Dy
+{
+namespace ArticulationRef
+{
+	Cm::SpatialVector propagateImpulse(const FsRow& row, 
+									   const FsJointVectors& j,
+									   PxVec3& SZ,
+									   const Cm::SpatialVector& Z,
+									   const FsRowAux& aux)
+	{
+		typedef ArticulationFnsScalar Fns;
+	
+		SZ = Fns::axisDot(reinterpret_cast<const Cm::SpatialVector*>(aux.S), Z);
+		return Fns::translateForce(getParentOffset(j), Z - Fns::axisMultiply(getDSI(row), SZ));
+	}
+
+	Cm::SpatialVector propagateVelocity(const FsRow& row,
+									    const FsJointVectors& j,
+										const PxVec3& SZ,
+										const Cm::SpatialVector& v,
+										const FsRowAux& aux)
+	{
+		typedef ArticulationFnsScalar Fns;
+
+		Cm::SpatialVector w = Fns::translateMotion(-getParentOffset(j), v);
+		PxVec3 DSZ = Fns::multiply(row.D, SZ);
+
+		return w - Fns::axisMultiply(reinterpret_cast<const Cm::SpatialVector*>(aux.S), DSZ + Fns::axisDot(getDSI(row), w));
+	}
+
+	void applyImpulse(const FsData& matrix, 
+					  Cm::SpatialVector* velocity,
+					  PxU32 linkID, 
+					  const Cm::SpatialVector& impulse)
+	{
+		typedef ArticulationFnsScalar Fns;
+
+		PX_ASSERT(matrix.linkCount<=DY_ARTICULATION_MAX_SIZE);
+
+		const FsRow* rows = getFsRows(matrix);
+		const FsRowAux* aux = getAux(matrix);
+		const FsJointVectors* jointVectors = getJointVectors(matrix);
+
+		Cm::SpatialVector dV[DY_ARTICULATION_MAX_SIZE];
+		PxVec3 SZ[DY_ARTICULATION_MAX_SIZE];
+
+		for(PxU32 i=0;i<matrix.linkCount;i++)
+			SZ[i] = PxVec3(0);
+
+		Cm::SpatialVector Z = -impulse;
+
+		for(;linkID!=0; linkID = matrix.parent[linkID])
+			Z = ArticulationRef::propagateImpulse(rows[linkID], jointVectors[linkID], SZ[linkID], Z, aux[linkID]);
+
+		dV[0] = Fns::getRootDeltaV(matrix,-Z);
+
+		for(PxU32 i=1;i<matrix.linkCount; i++)
+			dV[i] = ArticulationRef::propagateVelocity(rows[i], jointVectors[i], SZ[i], dV[matrix.parent[i]], aux[i]);
+
+		for(PxU32 i=0;i<matrix.linkCount;i++)
+			velocity[i] += dV[i];
+	}
+
+	void ltbFactor(FsData& m)
+	{
+		typedef ArticulationFnsScalar Fns;
+		LtbRow* rows = getLtbRows(m);
+
+		SpInertia inertia[DY_ARTICULATION_MAX_SIZE];
+		for(PxU32 i=0;i<m.linkCount;i++)
+			inertia[i] = ArticulationFnsDebug::unsimdify(rows[i].inertia);
+
+		Cm::SpatialVector j[3];
+		for(PxU32 i=m.linkCount; --i>0;)
+		{
+			LtbRow& b = rows[i];
+			inertia[i] = Fns::invertInertia(inertia[i]);
+			PxU32 p = m.parent[i];
+
+			Cm::SpatialVector* j0 = &reinterpret_cast<Cm::SpatialVector&>(*b.j0),
+							 * j1 = &reinterpret_cast<Cm::SpatialVector&>(*b.j1);
+
+			Fns::multiply(j, inertia[i], j1);
+			PxMat33 jResponse = Fns::invertSym33(-Fns::multiplySym(j, j1));
+			j1[0] = j[0]; j1[1] = j[1]; j1[2] = j[2];
+
+			b.jResponse = Mat33V_From_PxMat33(jResponse);
+			Fns::multiply(j, j0, jResponse);
+			inertia[p] = Fns::multiplySubtract(inertia[p], j, j0);
+			j0[0] = j[0]; j0[1] = j[1]; j0[2] = j[2];
+		}
+
+		rows[0].inertia = Fns::invertInertia(inertia[0]);
+		for(PxU32 i=1;i<m.linkCount;i++)
+			rows[i].inertia = inertia[i];
+	}
+
+
+}
+
+#if 0
+
+
+void ltbSolve(const FsData& m, 
+			  Vec3V* c,					// rhs error to solve for
+			  Cm::SpatialVector* y)		// velocity delta output
+{
+	typedef ArticulationFnsScalar Fns;
+
+	PxVec4* b = reinterpret_cast<PxVec4*>(c);
+	const LtbRow* rows = getLtbRows(m);
+	PxMemZero(y, m.linkCount*sizeof(Cm::SpatialVector));
+
+	for(PxU32 i=m.linkCount;i-->1;)
+	{
+		PxU32 p = m.parent[i];
+		const LtbRow& r = rows[i];
+		b[i] -= PxVec4(Fns::axisDot(&static_cast<const Cm::SpatialVector&>(*r.j1), y[i]),0);
+		y[p] -= Fns::axisMultiply(&static_cast<const Cm::SpatialVector&>(*r.j0), b[i].getXYZ());
+	}
+
+	y[0] = Fns::multiply(rows[0].inertia,y[0]);
+
+	for(PxU32 i=1; i<m.linkCount; i++)
+	{
+		PxU32 p = m.parent[i];
+		const LtbRow& r = rows[i];
+		PxVec3 t = Fns::multiply(r.jResponse, b[i].getXYZ()) - Fns::axisDot(&static_cast<const Cm::SpatialVector&>(*r.j0), y[p]);
+		y[i] = Fns::multiply(r.inertia, y[i]) - Fns::axisMultiply(&static_cast<const Cm::SpatialVector&>(*r.j1), t);
+	}
+}
+
+void PxcFsPropagateDrivenInertiaScalar(FsData& matrix,
+								 const FsInertia* baseInertia,
+								 const PxReal* isf,
+								 const Mat33V* load)
+{
+	typedef ArticulationFnsScalar Fns;
+
+	Cm::SpatialVector IS[3], DSI[3];
+	PxMat33 D;
+
+	FsRow* rows = getFsRows(matrix);
+	const FsRowAux* aux = getAux(matrix);
+	const FsJointVectors* jointVectors = getJointVectors(matrix);
+
+	SpInertia inertia[DY_ARTICULATION_MAX_SIZE];
+	for(PxU32 i=0;i<matrix.linkCount;i++)
+		inertia[i] = ArticulationFnsDebug::unsimdify(baseInertia[i]);
+
+	for(PxU32 i=matrix.linkCount; --i>0;)
+	{
+		FsRow& r = rows[i];
+		const FsRowAux& a = aux[i];
+		const FsJointVectors& jv = jointVectors[i];
+
+		Fns::multiply(IS, inertia[i], &static_cast<const Cm::SpatialVector&>(*a.S));
+
+		PX_ALIGN(16, PxMat33) L;
+		PxMat33_From_Mat33V(load[i], L);
+		D = Fns::invertSym33(Fns::multiplySym(&static_cast<const Cm::SpatialVector&>(*a.S), IS) + L*isf[i]);
+
+		Fns::multiply(DSI, IS, D);
+
+		r.D = Mat33V_From_PxMat33(D);
+		static_cast<Cm::SpatialVector&>(r.DSI[0]) = DSI[0];
+		static_cast<Cm::SpatialVector&>(r.DSI[1]) = DSI[1];
+		static_cast<Cm::SpatialVector&>(r.DSI[2]) = DSI[2];
+
+		inertia[matrix.parent[i]] += Fns::translate(getParentOffset(jv), Fns::multiplySubtract(inertia[i], DSI, IS));
+	}
+
+	FsInertia& m = getRootInverseInertia(matrix);
+	m = FsInertia(Fns::invertInertia(inertia[0]));
+}
+
+// no need to compile this ecxcept for verification, and it consumes huge amounts of stack space
+void PxcFsComputeJointLoadsScalar(const FsData& matrix,
+							      const FsInertia*PX_RESTRICT baseInertia,
+							      Mat33V*PX_RESTRICT load,
+   							      const PxReal*PX_RESTRICT isf,
+							      PxU32 linkCount,
+							      PxU32 maxIterations)
+{
+	typedef ArticulationFnsScalar Fns;
+
+	// the childward S
+	SpInertia leafwardInertia[DY_ARTICULATION_MAX_SIZE];
+	SpInertia rootwardInertia[DY_ARTICULATION_MAX_SIZE];
+	SpInertia inertia[DY_ARTICULATION_MAX_SIZE];
+	SpInertia contribToParent[DY_ARTICULATION_MAX_SIZE];
+
+	// total articulated inertia assuming the articulation is rooted here
+
+	const FsRow* row = getFsRows(matrix);
+	const FsRowAux* aux = getAux(matrix);
+	const FsJointVectors* jointVectors = getJointVectors(matrix);
+
+	PX_UNUSED(row);
+
+	PxMat33 load_[DY_ARTICULATION_MAX_SIZE];
+
+	for(PxU32 iter=0;iter<maxIterations;iter++)
+	{
+		for(PxU32 i=0;i<linkCount;i++)
+			inertia[i] = ArticulationFnsDebug::unsimdify(baseInertia[i]);
+			
+		for(PxU32 i=linkCount;i-->1;)
+		{
+			const FsJointVectors& j = jointVectors[i];
+
+			leafwardInertia[i] = inertia[i];
+			contribToParent[i] = Fns::propagate(inertia[i], &static_cast<const Cm::SpatialVector&>(*aux[i].S), load_[i], isf[i]);
+			inertia[matrix.parent[i]] += Fns::translate((PxVec3&)j.parentOffset, contribToParent[i]);
+		}
+
+		for(PxU32 i=1;i<linkCount;i++)
+		{
+			rootwardInertia[i] = Fns::translate(-(PxVec3&)jointVectors[i].parentOffset, inertia[matrix.parent[i]]) - contribToParent[i];				
+			inertia[i] += Fns::propagate(rootwardInertia[i], &static_cast<const Cm::SpatialVector&>(*aux[i].S), load_[i], isf[i]);
+		}
+
+		for(PxU32 i=1;i<linkCount;i++)
+		{
+			load_[i] = Fns::computeDriveInertia(leafwardInertia[i], rootwardInertia[i], &static_cast<const Cm::SpatialVector&>(*aux[i].S));
+			PX_ASSERT(load_[i][0].isFinite() && load_[i][1].isFinite() && load_[2][i].isFinite());
+		}					
+	}
+	for(PxU32 i=1;i<linkCount;i++)
+		load[i] = Mat33V_From_PxMat33(load_[i]);
+}
+
+
+void PxcFsApplyImpulse(const FsData& matrix, 
+					   PxU32 linkID, 
+					   const Cm::SpatialVector& impulse)
+{
+#if DY_ARTICULATION_DEBUG_VERIFY
+	PxcFsRefApplyImpulse(matrix, state.refVelocity, linkID, impulse);
+#endif
+
+	Cm::SpatialVector Z = -impulse;
+
+	for(PxU32 i = linkID; i!=0; i = matrix.row[i].parent)
+	{
+		PxVec3 SZ;
+		Z = propagateImpulse(matrix.row[i], SZ, Z, matrix.aux[i]);
+		deferredSZRef(state,i) += SZ;
+	}
+
+	static_cast<Cm::SpatialVector &>(state.deferredZ) += Z;
+	state.dirty |= matrix.row[linkID].pathToRoot;
+}
+
+Cm::SpatialVector PxcFsGetVelocity(const FsData& matrix,
+								  PxU32 linkID)
+{
+	// find the dirty node on the path (including the root) with the lowest index
+	ArticulationBitField toUpdate = matrix.row[linkID].pathToRoot & state.dirty;
+
+	if(toUpdate)
+	{
+		ArticulationBitField ignoreNodes = (toUpdate & (0-toUpdate))-1;
+		ArticulationBitField path = matrix.row[linkID].pathToRoot & ~ignoreNodes, p = path;
+		ArticulationBitField newDirty = 0;
+
+		Cm::SpatialVector dV = Cm::SpatialVector::zero();
+		if(p & 1)
+		{
+			dV = getRootDeltaV(matrix, -deferredZ(state));
+
+			velocityRef(state, 0) += dV;
+			for(ArticulationBitField defer = matrix.row[0].children & ~path; defer; defer &= (defer-1))
+				deferredVelRef(state, ArticulationLowestSetBit(defer)) += dV;
+
+			deferredZRef(state) = Cm::SpatialVector::zero();
+			newDirty = matrix.row[0].children;
+			p--;
+		}
+
+		for(; p; p &= (p-1))
+		{
+			PxU32 i = ArticulationLowestSetBit(p);
+
+			dV = propagateVelocity(matrix.row[i], deferredSZ(state,i), dV + state.deferredVel[i], matrix.aux[i]);
+
+			velocityRef(state,i) += dV;
+			for(ArticulationBitField defer = matrix.row[i].children & ~path; defer; defer &= (defer-1))
+				deferredVelRef(state,ArticulationLowestSetBit(defer)) += dV;
+
+			newDirty |= matrix.row[i].children;
+			deferredVelRef(state,i) = Cm::SpatialVector::zero();
+			deferredSZRef(state,i) = PxVec3(0);
+		}
+
+		state.dirty = (state.dirty | newDirty)&~path;
+	}
+#if DY_ARTICULATION_DEBUG_VERIFY
+	Cm::SpatialVector v = state.velocity[linkID];
+	Cm::SpatialVector rv = state.refVelocity[linkID];
+	PX_ASSERT((v-rv).magnitude()<1e-4f * rv.magnitude());
+#endif
+
+	return state.velocity[linkID];
+}
+
+void PxcFsFlushVelocity(const FsData& matrix)
+{
+	Cm::SpatialVector V = getRootDeltaV(matrix, -deferredZ(state));
+	deferredZRef(state) = Cm::SpatialVector::zero();
+	velocityRef(state,0) += V;
+	for(ArticulationBitField defer = matrix.row[0].children; defer; defer &= (defer-1))
+		deferredVelRef(state,ArticulationLowestSetBit(defer)) += V;
+
+	for(PxU32 i = 1; i<matrix.linkCount; i++)
+	{
+		Cm::SpatialVector V = propagateVelocity(matrix.row[i], deferredSZ(state,i), state.deferredVel[i], matrix.aux[i]);
+		deferredVelRef(state,i) = Cm::SpatialVector::zero();
+		deferredSZRef(state,i) = PxVec3(0);
+		velocityRef(state,i) += V;
+		for(ArticulationBitField defer = matrix.row[i].children; defer; defer &= (defer-1))
+			deferredVelRef(state,ArticulationLowestSetBit(defer)) += V;
+	}
+
+	state.dirty = 0;
+}
+
+void PxcFsPropagateDrivenInertiaScalar(FsData& matrix,
+									   const FsInertia* baseInertia,
+									   const PxReal* isf,
+									   const Mat33V* load,
+									   PxcFsScratchAllocator allocator)
+{
+	typedef ArticulationFnsSimd<ArticulationFnsSimdBase> Fns;
+
+	Cm::SpatialVectorV IS[3];
+	PxMat33 D;
+
+	FsRow* rows = getFsRows(matrix);
+	const FsRowAux* aux = getAux(matrix);
+	const FsJointVectors* jointVectors = getJointVectors(matrix);
+
+	FsInertia *inertia = allocator.alloc<FsInertia>(matrix.linkCount);
+	PxMemCopy(inertia, baseInertia, matrix.linkCount*sizeof(FsInertia));
+
+	for(PxU32 i=matrix.linkCount; --i>0;)
+	{
+		FsRow& r = rows[i];
+		const FsRowAux& a = aux[i];
+		const FsJointVectors& jv = jointVectors[i];
+
+		Mat33V m = Fns::computeSIS(inertia[i], a.S, IS);
+		FloatV f = FLoad(isf[i]);
+
+		Mat33V D = Fns::invertSym33(Mat33V(V3ScaleAdd(load[i].col0, f, m.col0),
+										   V3ScaleAdd(load[i].col1, f, m.col1),
+										   V3ScaleAdd(load[i].col2, f, m.col2)));
+		r.D = D;
+
+		inertia[matrix.parent[i]] = Fns::addInertia(inertia[matrix.parent[i]], 
+													Fns::translateInertia(jv.parentOffset, Fns::multiplySubtract(inertia[i], D,  IS,  r.DSI)));
+	}
+
+	getRootInverseInertia(matrix) = Fns::invertInertia(inertia[0]);
+}
+
+void PxcLtbSolve(const FsData& m, 
+				 Vec3V* c,					// rhs error to solve for
+				 Cm::SpatialVector* y)		// velocity delta output
+{
+	typedef ArticulationFnsScalar Fns;
+
+	PxVec4* b = reinterpret_cast<PxVec4*>(c);
+	const LtbRow* rows = getLtbRows(m);
+	PxMemZero(y, m.linkCount*sizeof(Cm::SpatialVector));
+
+	for(PxU32 i=m.linkCount;i-->1;)
+	{
+		PxU32 p = m.parent[i];
+		const LtbRow& r = rows[i];
+		b[i] -= PxVec4(Fns::axisDot(&static_cast<const Cm::SpatialVector&>(*r.j1), y[i]),0);
+		y[p] -= Fns::axisMultiply(&static_cast<const Cm::SpatialVector&>(*r.j0), b[i].getXYZ());
+	}
+
+	y[0] = Fns::multiply(rows[0].inertia,y[0]);
+
+	for(PxU32 i=1; i<m.linkCount; i++)
+	{
+		PxU32 p = m.parent[i];
+		const LtbRow& r = rows[i];
+		PxVec3 t = Fns::multiply(r.jResponse, b[i].getXYZ()) - Fns::axisDot(&static_cast<const Cm::SpatialVector&>(*r.j0), y[p]);
+		y[i] = Fns::multiply(r.inertia, y[i]) - Fns::axisMultiply(&static_cast<const Cm::SpatialVector&>(*r.j1), t);
+	}
+}
+
+
+#endif
+
+
+#if DY_ARTICULATION_DEBUG_VERIFY
+void PxcLtbFactorScalar(FsData& m)
+{
+	typedef ArticulationFnsScalar Fns;
+	LtbRow* rows = getLtbRows(m);
+
+	SpInertia inertia[DY_ARTICULATION_MAX_SIZE];
+	for(PxU32 i=0;i<m.linkCount;i++)
+		inertia[i] = ArticulationFnsDebug::unsimdify(rows[i].inertia);
+
+	Cm::SpatialVector j[3];
+	for(PxU32 i=m.linkCount; --i>0;)
+	{
+		LtbRow& b = rows[i];
+		inertia[i] = Fns::invertInertia(inertia[i]);
+		PxU32 p = m.parent[i];
+
+		Cm::SpatialVector* j0 = &reinterpret_cast<Cm::SpatialVector&>(*b.j0),
+						 * j1 = &reinterpret_cast<Cm::SpatialVector&>(*b.j1);
+
+		Fns::multiply(j, inertia[i], j1);
+		PxMat33 jResponse = Fns::invertSym33(-Fns::multiplySym(j, j1));
+		j1[0] = j[0]; j1[1] = j[1]; j1[2] = j[2];
+
+		b.jResponse = Mat33V_From_PxMat33(jResponse);
+		Fns::multiply(j, j0, jResponse);
+		inertia[p] = Fns::multiplySubtract(inertia[p], j, j0);
+		j0[0] = j[0]; j0[1] = j[1]; j0[2] = j[2];
+	}
+
+	rows[0].inertia = Fns::invertInertia(inertia[0]);
+	for(PxU32 i=1;i<m.linkCount;i++)
+		rows[i].inertia = inertia[i];
+}
+
+void PxcFsPropagateDrivenInertiaScalar(FsData& matrix,
+									   const FsInertia* baseInertia,
+									   const PxReal* isf,
+									   const Mat33V* load)
+{
+	typedef ArticulationFnsScalar Fns;
+
+	Cm::SpatialVector IS[3], DSI[3];
+	PxMat33 D;
+
+	FsRow* rows = getFsRows(matrix);
+	const FsRowAux* aux = getAux(matrix);
+	const FsJointVectors* jointVectors = getJointVectors(matrix);
+
+	SpInertia inertia[DY_ARTICULATION_MAX_SIZE];
+	for(PxU32 i=0;i<matrix.linkCount;i++)
+		inertia[i] = ArticulationFnsDebug::unsimdify(baseInertia[i]);
+
+	for(PxU32 i=matrix.linkCount; --i>0;)
+	{
+		FsRow& r = rows[i];
+		const FsRowAux& a = aux[i];
+		const FsJointVectors& jv = jointVectors[i];
+
+		Fns::multiply(IS, inertia[i], &reinterpret_cast<const Cm::SpatialVector&>(*a.S));
+
+		PX_ALIGN(16, PxMat33) L;
+		PxMat33_From_Mat33V(load[i], L);
+		D = Fns::invertSym33(Fns::multiplySym(&reinterpret_cast<const Cm::SpatialVector&>(*a.S), IS) + L*isf[i]);
+
+		Fns::multiply(DSI, IS, D);
+
+		r.D = Mat33V_From_PxMat33(D);
+		reinterpret_cast<Cm::SpatialVector&>(r.DSI[0]) = DSI[0];
+		reinterpret_cast<Cm::SpatialVector&>(r.DSI[1]) = DSI[1];
+		reinterpret_cast<Cm::SpatialVector&>(r.DSI[2]) = DSI[2];
+
+		inertia[matrix.parent[i]] += Fns::translate(getParentOffset(jv), Fns::multiplySubtract(inertia[i], DSI, IS));
+	}
+
+	FsInertia& m = getRootInverseInertia(matrix);
+	m = FsInertia(Fns::invertInertia(inertia[0]));
+}
+
+// no need to compile this ecxcept for verification, and it consumes huge amounts of stack space
+void PxcFsComputeJointLoadsScalar(const FsData& matrix,
+								  const FsInertia*PX_RESTRICT baseInertia,
+								  Mat33V*PX_RESTRICT load,
+								  const PxReal*PX_RESTRICT isf,
+								  PxU32 linkCount,
+								  PxU32 maxIterations)
+{
+	typedef ArticulationFnsScalar Fns;
+
+	// the childward S
+	SpInertia leafwardInertia[DY_ARTICULATION_MAX_SIZE];
+	SpInertia rootwardInertia[DY_ARTICULATION_MAX_SIZE];
+	SpInertia inertia[DY_ARTICULATION_MAX_SIZE];
+	SpInertia contribToParent[DY_ARTICULATION_MAX_SIZE];
+
+	// total articulated inertia assuming the articulation is rooted here
+
+	const FsRow* row = getFsRows(matrix);
+	const FsRowAux* aux = getAux(matrix);
+	const FsJointVectors* jointVectors = getJointVectors(matrix);
+
+	PX_UNUSED(row);
+
+	PxMat33 load_[DY_ARTICULATION_MAX_SIZE];
+
+	for(PxU32 iter=0;iter<maxIterations;iter++)
+	{
+		for(PxU32 i=0;i<linkCount;i++)
+			inertia[i] = ArticulationFnsDebug::unsimdify(baseInertia[i]);
+			
+		for(PxU32 i=linkCount;i-->1;)
+		{
+			const FsJointVectors& j = jointVectors[i];
+
+			leafwardInertia[i] = inertia[i];
+			contribToParent[i] = Fns::propagate(inertia[i], &reinterpret_cast<const Cm::SpatialVector&>(*aux[i].S), load_[i], isf[i]);
+			inertia[matrix.parent[i]] += Fns::translate((PxVec3&)j.parentOffset, contribToParent[i]);
+		}
+
+		for(PxU32 i=1;i<linkCount;i++)
+		{
+			rootwardInertia[i] = Fns::translate(-(PxVec3&)jointVectors[i].parentOffset, inertia[matrix.parent[i]]) - contribToParent[i];				
+			inertia[i] += Fns::propagate(rootwardInertia[i], &reinterpret_cast<const Cm::SpatialVector&>(*aux[i].S), load_[i], isf[i]);
+		}
+
+		for(PxU32 i=1;i<linkCount;i++)
+		{
+			load_[i] = Fns::computeDriveInertia(leafwardInertia[i], rootwardInertia[i], &reinterpret_cast<const Cm::SpatialVector&>(*aux[i].S));
+			PX_ASSERT(load_[i][0].isFinite() && load_[i][1].isFinite() && load_[2][i].isFinite());
+		}					
+	}
+	for(PxU32 i=1;i<linkCount;i++)
+		load[i] = Mat33V_From_PxMat33(load_[i]);
+}
+#endif
+
+}
+
+}
--- a/physx/source/lowleveldynamics/src/DyArticulationScalar.h
+++ b/physx/source/lowleveldynamics/src/DyArticulationScalar.h
@ -0,0 +1,101 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+
+#ifndef DY_ARTICULATION_SCALAR_H
+#define DY_ARTICULATION_SCALAR_H
+
+// Scalar helpers for articulations
+
+#include "foundation/PxUnionCast.h"
+#include "DyArticulationUtils.h"
+#include "DySpatial.h"
+#include "PsFPU.h"
+
+namespace physx
+{
+
+namespace Dy
+{
+
+PX_FORCE_INLINE Cm::SpatialVector&	velocityRef(FsData &m, PxU32 i)
+{	
+	return reinterpret_cast<Cm::SpatialVector&>(getVelocity(m)[i]); 
+}
+
+PX_FORCE_INLINE Cm::SpatialVector&	deferredVelRef(FsData &m, PxU32 i)
+{	
+	return reinterpret_cast<Cm::SpatialVector&>(getDeferredVel(m)[i]); 
+}
+
+PX_FORCE_INLINE PxVec3& deferredSZRef(FsData &m, PxU32 i)
+{	
+	return reinterpret_cast<PxVec3 &>(getDeferredSZ(m)[i]); 
+}
+
+PX_FORCE_INLINE const PxVec3& deferredSZ(const FsData &s, PxU32 i) 
+{	
+	return reinterpret_cast<const PxVec3 &>(getDeferredSZ(s)[i]); 
+}
+
+PX_FORCE_INLINE Cm::SpatialVector& deferredZRef(FsData &s)
+{
+	return unsimdRef(s.deferredZ);
+}
+
+
+PX_FORCE_INLINE const Cm::SpatialVector& deferredZ(const FsData &s)
+{
+	return unsimdRef(s.deferredZ);
+}
+
+PX_FORCE_INLINE const PxVec3& getJointOffset(const FsJointVectors& j) 
+{	
+	return reinterpret_cast<const PxVec3& >(j.jointOffset);		
+}
+
+PX_FORCE_INLINE const PxVec3& getParentOffset(const FsJointVectors& j) 
+{	
+	return reinterpret_cast<const PxVec3&>(j.parentOffset);		
+}
+
+
+
+
+PX_FORCE_INLINE const Cm::SpatialVector* getDSI(const FsRow& row)
+{	
+	return PxUnionCast<const Cm::SpatialVector*,const Cm::SpatialVectorV*>(row.DSI); //reinterpret_cast<const Cm::SpatialVector*>(row.DSI); 
+}
+
+}
+
+}
+
+#endif //DY_ARTICULATION_SCALAR_H
--- a/physx/source/lowleveldynamics/src/DyArticulationUtils.h
+++ b/physx/source/lowleveldynamics/src/DyArticulationUtils.h
@ -0,0 +1,336 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+
+#ifndef DY_ARTICULATION_H
+#define DY_ARTICULATION_H
+
+#include "PsVecMath.h"
+#include "CmSpatialVector.h"
+#include "DySpatial.h"
+#include "PsBitUtils.h"
+#include "DyArticulation.h"
+#include "DyArticulationHelper.h"
+
+namespace physx
+{
+
+namespace Dy
+{
+	struct ArticulationCore;
+	struct ArticulationLink;
+	typedef size_t ArticulationLinkHandle;
+	class Articulation;
+
+#define DY_ARTICULATION_DEBUG_VERIFY 0
+
+PX_FORCE_INLINE PxU32 ArticulationLowestSetBit(ArticulationBitField val)
+{
+	PxU32 low = PxU32(val&0xffffffff), high = PxU32(val>>32);
+	PxU32 mask = PxU32((!low)-1);
+	PxU32 result = (mask&Ps::lowestSetBitUnsafe(low)) | ((~mask)&(Ps::lowestSetBitUnsafe(high)+32));
+	PX_ASSERT(val & (PxU64(1)<<result));
+	PX_ASSERT(!(val & ((PxU64(1)<<result)-1)));
+	return result;
+}
+
+PX_FORCE_INLINE PxU32 ArticulationHighestSetBit(ArticulationBitField val)
+{
+	PxU32 low = PxU32(val & 0xffffffff), high = PxU32(val >> 32);
+	PxU32 mask = PxU32((!high) - 1);
+	PxU32 result = ((~mask)&Ps::highestSetBitUnsafe(low)) | ((mask)&(Ps::highestSetBitUnsafe(high) + 32));
+	PX_ASSERT(val & (PxU64(1) << result));
+	return result;
+}
+
+using namespace Ps::aos;
+
+
+
+PX_FORCE_INLINE Cm::SpatialVector& unsimdRef(Cm::SpatialVectorV& v)				{ return reinterpret_cast<Cm::SpatialVector&>(v); }
+PX_FORCE_INLINE const Cm::SpatialVector& unsimdRef(const Cm::SpatialVectorV& v) { return reinterpret_cast<const Cm::SpatialVector&>(v); }
+
+
+PX_ALIGN_PREFIX(16)
+struct FsJointVectors
+{
+	Vec3V					parentOffset;		// 16 bytes world-space offset from parent to child
+	Vec3V					jointOffset;		// 16 bytes world-space offset from child to joint
+}
+PX_ALIGN_SUFFIX(16);
+
+PX_ALIGN_PREFIX(16)
+struct FsRow
+{	
+	Cm::SpatialVectorV			DSI[3];				// 96 bytes
+	Mat33V					D;					// 48 bytes
+	ArticulationBitField	children;			// 8 bytes bitmap of children
+	ArticulationBitField	pathToRoot;			// 8 bytes bitmap of nodes to root, including self and root
+}
+PX_ALIGN_SUFFIX(16);
+
+PX_COMPILE_TIME_ASSERT(sizeof(FsRow)==160);
+
+
+
+PX_ALIGN_PREFIX(16)
+struct FsInertia
+{
+	Mat33V ll, la, aa;
+	PX_FORCE_INLINE FsInertia(const Mat33V& _ll, const Mat33V& _la, const Mat33V& _aa): ll(_ll), la(_la), aa(_aa) {}
+	PX_FORCE_INLINE FsInertia(const SpInertia& I)
+	: ll(Mat33V_From_PxMat33(I.mLL)), la(Mat33V_From_PxMat33(I.mLA)), aa(Mat33V_From_PxMat33(I.mAA)) {}
+	PX_FORCE_INLINE FsInertia() {}
+
+	PX_FORCE_INLINE void operator=(const FsInertia& other)
+	{
+		ll.col0 = other.ll.col0;	ll.col1 = other.ll.col1;	ll.col2 = other.ll.col2;
+		la.col0 = other.la.col0;	la.col1 = other.la.col1;	la.col2 = other.la.col2;
+		aa.col0 = other.aa.col0;	aa.col1 = other.aa.col1;	aa.col2 = other.aa.col2;
+	}
+
+	PX_FORCE_INLINE FsInertia(const FsInertia& other)
+	{
+		ll.col0 = other.ll.col0;	ll.col1 = other.ll.col1;	ll.col2 = other.ll.col2;
+		la.col0 = other.la.col0;	la.col1 = other.la.col1;	la.col2 = other.la.col2;
+		aa.col0 = other.aa.col0;	aa.col1 = other.aa.col1;	aa.col2 = other.aa.col2;
+	}
+
+}PX_ALIGN_SUFFIX(16);
+
+PX_ALIGN_PREFIX(16)
+struct LtbRow
+{
+	FsInertia		inertia;			// body inertia in world space
+	Cm::SpatialVectorV		j0[3], j1[3];		// jacobians
+	Mat33V				jResponse;			// inverse response matrix of joint
+	Vec3V				jC;
+} PX_ALIGN_SUFFIX(16);
+
+PX_ALIGN_PREFIX(16)
+struct FsRowAux
+{
+	Cm::SpatialVectorV		S[3];				// motion subspace
+}PX_ALIGN_SUFFIX(16);
+
+
+struct FsData
+{
+	//Articulation*	articulationX;																//4
+	
+#if !PX_P64_FAMILY
+	PxU32				pad0;																		//8	
+#endif
+	PxU16				linkCount;						// number of links							//10
+	PxU16				jointVectorOffset;				// offset of read-only data					//12
+//	PxU16				maxSolverNormalProgress;													//14
+//	PxU16				maxSolverFrictionProgress;													//16
+
+	PxU64				dirty;																		//24
+	PxU16				ltbDataOffset;					// offset of save-velocity data				//26
+	PxU16				fsDataOffset;					// offset of joint references				//28
+	//PxU32				solverProgress;																//32
+	
+
+	Cm::SpatialVectorV		deferredZ;																	//64
+	PxU8				parent[DY_ARTICULATION_MAX_SIZE];											//128
+};
+
+PX_COMPILE_TIME_ASSERT(0 == (sizeof(FsData) & 0x0f));
+
+#define SOLVER_BODY_SOLVER_PROGRESS_OFFSET 28	
+#define SOLVER_BODY_MAX_SOLVER_PROGRESS_OFFSET 12
+
+namespace
+{
+	template<class T> PX_FORCE_INLINE T addAddr(void* addr, PxU32 increment) 
+	{ 
+		return reinterpret_cast<T>(reinterpret_cast<char*>(addr)+increment);
+	}
+
+	template<class T> PX_FORCE_INLINE T addAddr(const void* addr, PxU32 increment) 
+	{ 
+		return reinterpret_cast<T>(reinterpret_cast<const char*>(addr)+increment);
+	}
+}
+
+PX_FORCE_INLINE Cm::SpatialVectorV* getVelocity(FsData& matrix)
+{
+	return addAddr<Cm::SpatialVectorV*>(&matrix, sizeof(FsData));
+}
+
+PX_FORCE_INLINE const Cm::SpatialVectorV* getVelocity(const FsData& matrix)
+{
+	return addAddr<const Cm::SpatialVectorV*>(&matrix, sizeof(FsData));
+}
+
+PX_FORCE_INLINE Cm::SpatialVectorV* getDeferredVel(FsData& matrix)
+{
+	return addAddr<Cm::SpatialVectorV*>(getVelocity(matrix), sizeof(Cm::SpatialVectorV) * matrix.linkCount);
+}
+
+PX_FORCE_INLINE const Cm::SpatialVectorV* getDeferredVel(const FsData& matrix)
+{
+	return addAddr<const Cm::SpatialVectorV*>(getVelocity(matrix), sizeof(Cm::SpatialVectorV) * matrix.linkCount);
+}
+
+PX_FORCE_INLINE const Cm::SpatialVectorV* getMotionVector(const FsData& matrix)
+{
+	return addAddr<const Cm::SpatialVectorV*>(getDeferredVel(matrix), sizeof(Cm::SpatialVectorV) * matrix.linkCount);
+}
+
+PX_FORCE_INLINE Cm::SpatialVectorV* getMotionVector(FsData& matrix)
+{
+	return addAddr<Cm::SpatialVectorV*>(getDeferredVel(matrix), sizeof(Cm::SpatialVectorV) * matrix.linkCount);
+}
+
+PX_FORCE_INLINE Vec3V* getDeferredSZ(FsData& matrix)
+{
+	return addAddr<Vec3V*>(getMotionVector(matrix), sizeof(Cm::SpatialVectorV) * matrix.linkCount);
+}
+
+PX_FORCE_INLINE const Vec3V* getDeferredSZ(const FsData& matrix)
+{
+	return addAddr<const Vec3V*>(getMotionVector(matrix), sizeof(Cm::SpatialVectorV) * matrix.linkCount);
+}
+
+PX_FORCE_INLINE const PxReal* getMaxPenBias(const FsData& matrix)
+{
+	return addAddr<const PxReal*>(getDeferredSZ(matrix), sizeof(Vec3V) * matrix.linkCount);
+}
+
+PX_FORCE_INLINE PxReal* getMaxPenBias(FsData& matrix)
+{
+	return addAddr<PxReal*>(getDeferredSZ(matrix), sizeof(Vec3V) * matrix.linkCount);
+}
+
+
+PX_FORCE_INLINE FsJointVectors* getJointVectors(FsData& matrix)
+{
+	return addAddr<FsJointVectors *>(&matrix,matrix.jointVectorOffset);
+}
+
+PX_FORCE_INLINE const FsJointVectors* getJointVectors(const FsData& matrix)
+{
+	return addAddr<const FsJointVectors *>(&matrix,matrix.jointVectorOffset);
+}
+
+PX_FORCE_INLINE FsInertia& getRootInverseInertia(FsData& matrix)
+{
+	return *addAddr<FsInertia*>(&matrix,matrix.fsDataOffset);
+}
+
+PX_FORCE_INLINE const FsInertia& getRootInverseInertia(const FsData& matrix)
+{
+	return *addAddr<const FsInertia*>(&matrix,matrix.fsDataOffset);
+	
+}
+
+PX_FORCE_INLINE FsRow* getFsRows(FsData& matrix)
+{
+	return addAddr<FsRow*>(&getRootInverseInertia(matrix),sizeof(FsInertia));
+}
+
+PX_FORCE_INLINE const FsRow* getFsRows(const FsData& matrix)
+{
+	return addAddr<const FsRow*>(&getRootInverseInertia(matrix),sizeof(FsInertia));
+}
+
+
+PX_FORCE_INLINE LtbRow* getLtbRows(FsData& matrix)
+{
+	return addAddr<LtbRow*>(&matrix,matrix.ltbDataOffset);
+}
+
+PX_FORCE_INLINE const LtbRow* getLtbRows(const FsData& matrix)
+{
+	return addAddr<const LtbRow*>(&matrix,matrix.ltbDataOffset);
+}
+
+
+PX_FORCE_INLINE Cm::SpatialVectorV* getRefVelocity(FsData& matrix)
+{
+	return addAddr<Cm::SpatialVectorV*>(getLtbRows(matrix), sizeof(LtbRow)*matrix.linkCount);
+}
+
+PX_FORCE_INLINE const Cm::SpatialVectorV* getRefVelocity(const FsData& matrix)
+{
+	return addAddr<const Cm::SpatialVectorV*>(getLtbRows(matrix), sizeof(LtbRow)*matrix.linkCount);
+}
+
+PX_FORCE_INLINE FsRowAux* getAux(FsData& matrix)
+{
+	return addAddr<FsRowAux*>(getRefVelocity(matrix),sizeof(Cm::SpatialVectorV)*matrix.linkCount);
+}
+
+PX_FORCE_INLINE const FsRowAux* getAux(const FsData& matrix)
+{
+	return addAddr<const FsRowAux*>(getRefVelocity(matrix),sizeof(Cm::SpatialVectorV)*matrix.linkCount);
+}
+
+//void PxcFsApplyImpulse(ArticulationV& articulation,
+//					   PxU32 linkID,
+//					   Vec3V linear,
+//					   Vec3V angular);
+
+//Cm::SpatialVectorV PxcFsGetVelocity(ArticulationV& articulation,
+//							    PxU32 linkID);
+
+Cm::SpatialVectorV PxcFsGetMotionVector(ArticulationV& articulation,
+	PxU32 linkID);
+
+
+#if DY_ARTICULATION_DEBUG_VERIFY
+namespace ArticulationRef 
+{	
+	Cm::SpatialVector propagateVelocity(const FsRow& row,
+										const FsJointVectors& jv,
+										const PxVec3& SZ, 
+										const Cm::SpatialVector& v, 
+										const FsRowAux& aux); 
+
+	Cm::SpatialVector propagateImpulse(const FsRow& row, 
+									   const FsJointVectors& jv,
+									   PxVec3& SZ, 
+									   const Cm::SpatialVector& Z,	
+									   const FsRowAux& aux); 
+
+	void applyImpulse(const FsData& matrix,
+					  Cm::SpatialVector* velocity,
+					  PxU32 linkID, 
+					  const Cm::SpatialVector& impulse);
+
+}
+#endif
+
+}
+}
+
+#endif //DY_ARTICULATION_H
--- a/physx/source/lowleveldynamics/src/DyBodyCoreIntegrator.h
+++ b/physx/source/lowleveldynamics/src/DyBodyCoreIntegrator.h
@ -0,0 +1,404 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef DY_BODYCORE_INTEGRATOR_H
+#define DY_BODYCORE_INTEGRATOR_H
+
+#include "CmPhysXCommon.h"
+#include "PxvDynamics.h"
+#include "PsMathUtils.h"
+#include "PxsRigidBody.h"
+#include "DySolverBody.h"
+#include "DySleepingConfigulation.h"
+#include "PxsIslandSim.h"
+
+namespace physx
+{
+
+namespace Dy
+{
+
+PX_FORCE_INLINE void bodyCoreComputeUnconstrainedVelocity
+(const PxVec3& gravity, const PxReal dt, const PxReal linearDamping, const PxReal angularDamping, const PxReal accelScale, 
+const PxReal maxLinearVelocitySq, const PxReal maxAngularVelocitySq, PxVec3& inOutLinearVelocity, PxVec3& inOutAngularVelocity,
+bool disableGravity)
+{
+
+	//Multiply everything that needs multiplied by dt to improve code generation.
+
+	PxVec3 linearVelocity = inOutLinearVelocity;
+	PxVec3 angularVelocity = inOutAngularVelocity;
+	
+	const PxReal linearDampingTimesDT=linearDamping*dt;
+	const PxReal angularDampingTimesDT=angularDamping*dt;
+	const PxReal oneMinusLinearDampingTimesDT=1.0f-linearDampingTimesDT;
+	const PxReal oneMinusAngularDampingTimesDT=1.0f-angularDampingTimesDT;
+
+	//TODO context-global gravity
+	if (!disableGravity)
+	{
+		const PxVec3 linearAccelTimesDT = gravity*dt *accelScale;
+		linearVelocity += linearAccelTimesDT;
+	}
+
+	//Apply damping.
+	const PxReal linVelMultiplier = physx::intrinsics::fsel(oneMinusLinearDampingTimesDT, oneMinusLinearDampingTimesDT, 0.0f);
+	const PxReal angVelMultiplier = physx::intrinsics::fsel(oneMinusAngularDampingTimesDT, oneMinusAngularDampingTimesDT, 0.0f);
+	linearVelocity*=linVelMultiplier;
+	angularVelocity*=angVelMultiplier;
+
+	// Clamp velocity
+	const PxReal linVelSq = linearVelocity.magnitudeSquared();
+	if(linVelSq > maxLinearVelocitySq)
+	{
+		linearVelocity *= PxSqrt(maxLinearVelocitySq / linVelSq);
+	}
+	const PxReal angVelSq = angularVelocity.magnitudeSquared();
+	if(angVelSq > maxAngularVelocitySq)
+	{
+		angularVelocity *= PxSqrt(maxAngularVelocitySq / angVelSq);
+	}
+
+	inOutLinearVelocity = linearVelocity;
+	inOutAngularVelocity = angularVelocity;
+}
+
+
+PX_FORCE_INLINE void integrateCore(PxVec3& motionLinearVelocity, PxVec3& motionAngularVelocity, PxSolverBody& solverBody, PxSolverBodyData& solverBodyData, const PxF32 dt)
+{
+	PxU32 lockFlags = solverBodyData.lockFlags;
+	if (lockFlags)
+	{
+		if (lockFlags & PxRigidDynamicLockFlag::eLOCK_LINEAR_X)
+		{
+			motionLinearVelocity.x = 0.f;
+			solverBody.linearVelocity.x = 0.f;
+		}
+		if (lockFlags & PxRigidDynamicLockFlag::eLOCK_LINEAR_Y)
+		{
+			motionLinearVelocity.y = 0.f;
+			solverBody.linearVelocity.y = 0.f;
+		}
+		if (lockFlags & PxRigidDynamicLockFlag::eLOCK_LINEAR_Z)
+		{
+			motionLinearVelocity.z = 0.f;
+			solverBody.linearVelocity.z = 0.f;
+		}
+		
+		//The angular velocity should be 0 because it is now impossible to make it rotate around that axis!
+		if (lockFlags & PxRigidDynamicLockFlag::eLOCK_ANGULAR_X)
+		{
+			motionAngularVelocity.x = 0.f;
+			solverBody.angularState.x = 0.f;
+		}
+		if (lockFlags & PxRigidDynamicLockFlag::eLOCK_ANGULAR_Y)
+		{
+			motionAngularVelocity.y = 0.f;
+			solverBody.angularState.y = 0.f;
+		}
+		if (lockFlags & PxRigidDynamicLockFlag::eLOCK_ANGULAR_Z)
+		{
+			motionAngularVelocity.z = 0.f;
+			solverBody.angularState.z = 0.f;
+		}
+	}
+
+	// Integrate linear part
+	PxVec3 linearMotionVel = solverBodyData.linearVelocity + motionLinearVelocity;
+	PxVec3 delta = linearMotionVel * dt;
+	PxVec3 angularMotionVel = solverBodyData.angularVelocity + solverBodyData.sqrtInvInertia * motionAngularVelocity;
+	PxReal w = angularMotionVel.magnitudeSquared();
+	solverBodyData.body2World.p += delta;
+	PX_ASSERT(solverBodyData.body2World.p.isFinite());
+
+	//Store back the linear and angular velocities
+	//core.linearVelocity += solverBody.linearVelocity * solverBodyData.sqrtInvMass;
+	solverBodyData.linearVelocity += solverBody.linearVelocity;
+	solverBodyData.angularVelocity += solverBodyData.sqrtInvInertia * solverBody.angularState;
+	
+	// Integrate the rotation using closed form quaternion integrator
+	if (w != 0.0f)
+	{
+		w = PxSqrt(w);
+		// Perform a post-solver clamping
+		// TODO(dsequeira): ignore this for the moment
+		//just clamp motionVel to half float-range
+		const PxReal maxW = 1e+7f;		//Should be about sqrt(PX_MAX_REAL/2) or smaller
+		if (w > maxW)
+		{
+			angularMotionVel = angularMotionVel.getNormalized() * maxW;
+			w = maxW;
+		}
+		const PxReal v = dt * w * 0.5f;
+		PxReal s, q;
+		Ps::sincos(v, s, q);
+		s /= w;
+
+		const PxVec3 pqr = angularMotionVel * s;
+		const PxQuat quatVel(pqr.x, pqr.y, pqr.z, 0);
+		PxQuat result = quatVel * solverBodyData.body2World.q;
+
+		result += solverBodyData.body2World.q * q;
+
+		solverBodyData.body2World.q = result.getNormalized();
+		PX_ASSERT(solverBodyData.body2World.q.isSane());
+		PX_ASSERT(solverBodyData.body2World.q.isFinite());
+	}
+
+	motionLinearVelocity = linearMotionVel;
+	motionAngularVelocity = angularMotionVel;
+}
+
+
+PX_FORCE_INLINE PxReal updateWakeCounter(PxsRigidBody* originalBody, PxReal dt, PxReal /*invDt*/, const bool enableStabilization, const bool useAdaptiveForce, Cm::SpatialVector& motionVelocity,
+	bool hasStaticTouch)
+{
+	//KS - at most one of these features can be enabled at any time
+	PX_ASSERT(!useAdaptiveForce || !enableStabilization);
+	PxsBodyCore& bodyCore = originalBody->getCore();
+
+	// update the body's sleep state and 
+	PxReal wakeCounterResetTime = 20.0f*0.02f;
+
+	PxReal wc = bodyCore.wakeCounter;
+
+	{
+		if (enableStabilization)
+		{
+			bool freeze = false;
+			const PxTransform& body2World = bodyCore.body2World;
+
+			// calculate normalized energy: kinetic energy divided by mass
+
+			const PxVec3 t = bodyCore.inverseInertia;
+			const PxVec3 inertia(t.x > 0.f ? 1.0f / t.x : 1.f, t.y > 0.f ? 1.0f / t.y : 1.f, t.z > 0.f ? 1.0f / t.z : 1.f);
+
+
+			PxVec3 sleepLinVelAcc = motionVelocity.linear;
+			PxVec3 sleepAngVelAcc = body2World.q.rotateInv(motionVelocity.angular);
+
+			// scale threshold by cluster factor (more contacts => higher sleep threshold)
+			//const PxReal clusterFactor = PxReal(1u + getNumUniqueInteractions());
+
+			PxReal invMass = bodyCore.inverseMass;
+			if (invMass == 0.f)
+				invMass = 1.f;
+
+			const PxReal angular = sleepAngVelAcc.multiply(sleepAngVelAcc).dot(inertia) * invMass;
+			const PxReal linear = sleepLinVelAcc.magnitudeSquared();
+			PxReal frameNormalizedEnergy = 0.5f * (angular + linear);
+
+			const PxReal cf = hasStaticTouch ? PxReal(PxMin(10u, bodyCore.numBodyInteractions)) : 0.f;
+			const PxReal freezeThresh = cf*bodyCore.freezeThreshold;
+
+			originalBody->freezeCount = PxMax(originalBody->freezeCount - dt, 0.0f);
+			bool settled = true;
+
+			PxReal accelScale = PxMin(1.f, originalBody->accelScale + dt);
+
+			if (frameNormalizedEnergy >= freezeThresh)
+			{
+				settled = false;
+				originalBody->freezeCount = PXD_FREEZE_INTERVAL;
+			}
+
+			if (!hasStaticTouch)
+			{
+				accelScale = 1.f;
+				settled = false;
+			}
+
+
+			if (settled)
+			{
+				//Dampen bodies that are just about to go to sleep
+				if (cf > 1.f)
+				{
+					const PxReal sleepDamping = PXD_SLEEP_DAMPING;
+					const PxReal sleepDampingTimesDT = sleepDamping*dt;
+					const PxReal d = 1.0f - sleepDampingTimesDT;
+					bodyCore.linearVelocity = bodyCore.linearVelocity * d;
+					bodyCore.angularVelocity = bodyCore.angularVelocity * d;
+					accelScale = accelScale * 0.75f + 0.25f*PXD_FREEZE_SCALE;
+				}
+				freeze = originalBody->freezeCount == 0.f && frameNormalizedEnergy < (bodyCore.freezeThreshold * PXD_FREEZE_TOLERANCE);
+			}
+
+			originalBody->accelScale = accelScale;
+
+			const PxU32 wasFrozen = originalBody->mInternalFlags & PxsRigidBody::eFROZEN;
+			PxU16 flags;
+			if(freeze)
+			{
+				//current flag isn't frozen but freeze flag raise so we need to raise the frozen flag in this frame
+				flags = PxU16(PxsRigidBody::eFROZEN);
+				if(!wasFrozen)
+					flags |= PxsRigidBody::eFREEZE_THIS_FRAME;
+				bodyCore.body2World = originalBody->getLastCCDTransform();
+			}
+			else
+			{
+				flags = 0;
+				if(wasFrozen)
+					flags |= PxsRigidBody::eUNFREEZE_THIS_FRAME;
+			}
+			originalBody->mInternalFlags = flags;
+
+			/*KS: New algorithm for sleeping when using stabilization:
+			* Energy *this frame* must be higher than sleep threshold and accumulated energy over previous frames
+			* must be higher than clusterFactor*energyThreshold.
+			*/
+			if (wc < wakeCounterResetTime * 0.5f || wc < dt)
+			{
+				//Accumulate energy
+				originalBody->sleepLinVelAcc += sleepLinVelAcc;
+				originalBody->sleepAngVelAcc += sleepAngVelAcc;
+
+				//If energy this frame is high
+				if (frameNormalizedEnergy >= bodyCore.sleepThreshold)
+				{
+					//Compute energy over sleep preparation time
+					const PxReal sleepAngular = originalBody->sleepAngVelAcc.multiply(originalBody->sleepAngVelAcc).dot(inertia) * invMass;
+					const PxReal sleepLinear = originalBody->sleepLinVelAcc.magnitudeSquared();
+					PxReal normalizedEnergy = 0.5f * (sleepAngular + sleepLinear);
+					const PxReal sleepClusterFactor = PxReal(1u + bodyCore.numCountedInteractions);
+					// scale threshold by cluster factor (more contacts => higher sleep threshold)
+					const PxReal threshold = sleepClusterFactor*bodyCore.sleepThreshold;
+
+					//If energy over sleep preparation time is high
+					if (normalizedEnergy >= threshold)
+					{
+						//Wake up
+						//PX_ASSERT(isActive());
+						originalBody->sleepAngVelAcc = PxVec3(0);
+						originalBody->sleepLinVelAcc = PxVec3(0);
+
+						const float factor = bodyCore.sleepThreshold == 0.f ? 2.0f : PxMin(normalizedEnergy / threshold, 2.0f);
+						PxReal oldWc = wc;
+						wc = factor * 0.5f * wakeCounterResetTime + dt * (sleepClusterFactor - 1.0f);
+						bodyCore.solverWakeCounter = wc;
+						//if (oldWc == 0.0f)  // for the case where a sleeping body got activated by the system (not the user) AND got processed by the solver as well
+						//	notifyNotReadyForSleeping(bodyCore.nodeIndex);
+
+						if (oldWc == 0.0f)
+							originalBody->mInternalFlags |= PxsRigidBody::eACTIVATE_THIS_FRAME;
+
+						return wc;
+					}
+				}
+			}
+
+		}
+		else 
+		{
+			if (useAdaptiveForce)
+			{
+				if (hasStaticTouch && bodyCore.numBodyInteractions > 1)
+					originalBody->accelScale = 1.f / PxReal(bodyCore.numBodyInteractions);
+				else
+					originalBody->accelScale = 1.f;
+			}
+			if (wc < wakeCounterResetTime * 0.5f || wc < dt)
+			{
+				const PxTransform& body2World = bodyCore.body2World;
+
+				// calculate normalized energy: kinetic energy divided by mass
+				const PxVec3 t = bodyCore.inverseInertia;
+				const PxVec3 inertia(t.x > 0.f ? 1.0f / t.x : 1.f, t.y > 0.f ? 1.0f / t.y : 1.f, t.z > 0.f ? 1.0f / t.z : 1.f);
+
+				PxVec3 sleepLinVelAcc = motionVelocity.linear;
+				PxVec3 sleepAngVelAcc = body2World.q.rotateInv(motionVelocity.angular);
+
+				originalBody->sleepLinVelAcc += sleepLinVelAcc;
+				originalBody->sleepAngVelAcc += sleepAngVelAcc;
+
+				PxReal invMass = bodyCore.inverseMass;
+				if (invMass == 0.f)
+					invMass = 1.f;
+
+				const PxReal angular = originalBody->sleepAngVelAcc.multiply(originalBody->sleepAngVelAcc).dot(inertia) * invMass;
+				const PxReal linear = originalBody->sleepLinVelAcc.magnitudeSquared();
+				PxReal normalizedEnergy = 0.5f * (angular + linear);
+
+				// scale threshold by cluster factor (more contacts => higher sleep threshold)
+				const PxReal clusterFactor = PxReal(1 + bodyCore.numCountedInteractions);
+				const PxReal threshold = clusterFactor*bodyCore.sleepThreshold;
+
+				if (normalizedEnergy >= threshold)
+				{
+					//PX_ASSERT(isActive());
+					originalBody->sleepLinVelAcc = PxVec3(0);
+					originalBody->sleepAngVelAcc = PxVec3(0);
+					const float factor = threshold == 0.f ? 2.0f : PxMin(normalizedEnergy / threshold, 2.0f);
+					PxReal oldWc = wc;
+					wc = factor * 0.5f * wakeCounterResetTime + dt * (clusterFactor - 1.0f);
+					bodyCore.solverWakeCounter = wc;
+					PxU16 flags = 0;
+					if (oldWc == 0.0f)  // for the case where a sleeping body got activated by the system (not the user) AND got processed by the solver as well
+					{
+						flags |= PxsRigidBody::eACTIVATE_THIS_FRAME;
+						//notifyNotReadyForSleeping(bodyCore.nodeIndex);
+					}
+
+					originalBody->mInternalFlags = flags;
+
+					return wc;
+				}
+			}
+		}
+	}
+
+	wc = PxMax(wc - dt, 0.0f);
+	bodyCore.solverWakeCounter = wc;
+	return wc;
+}
+
+PX_FORCE_INLINE void sleepCheck(PxsRigidBody* originalBody, const PxReal dt, const PxReal intDt, const bool enableStabilization, bool useAdaptiveForce, Cm::SpatialVector& motionVelocity,
+	bool hasStaticTouch)
+{
+
+	PxReal wc = updateWakeCounter(originalBody, dt, intDt, enableStabilization, useAdaptiveForce, motionVelocity, hasStaticTouch);
+	bool wakeCounterZero = (wc == 0.0f);
+
+	if (wakeCounterZero)
+	{
+		//PxsBodyCore& bodyCore = originalBody->getCore();
+		originalBody->mInternalFlags |= PxsRigidBody::eDEACTIVATE_THIS_FRAME;
+		//	notifyReadyForSleeping(bodyCore.nodeIndex);
+		originalBody->sleepLinVelAcc = PxVec3(0);
+		originalBody->sleepAngVelAcc = PxVec3(0);
+	}
+}
+
+}
+
+}
+
+#endif //DY_BODYCORE_INTEGRATOR_H
--- a/physx/source/lowleveldynamics/src/DyConstraintPartition.cpp
+++ b/physx/source/lowleveldynamics/src/DyConstraintPartition.cpp
@ -0,0 +1,926 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+
+#include "DyConstraintPartition.h"
+#include "DyArticulationUtils.h"
+
+#define INTERLEAVE_SELF_CONSTRAINTS 1
+
+
+namespace physx
+{
+namespace Dy
+{
+
+namespace
+{
+
+PX_FORCE_INLINE PxU32 getArticulationIndex(const uintptr_t eaArticulation, const uintptr_t* eaArticulations, const PxU32 numEas)
+{
+	PxU32 index=0xffffffff;
+	for(PxU32 i=0;i<numEas;i++)
+	{
+		if(eaArticulations[i]== eaArticulation)
+		{
+			index=i;
+			break;
+		}
+	}
+	PX_ASSERT(index!=0xffffffff);
+	return index;
+}
+
+
+
+#define MAX_NUM_PARTITIONS 32
+
+
+class RigidBodyClassification
+{
+	PxU8* PX_RESTRICT mBodies;
+	PxU32 mBodySize;
+	PxU32 mBodyStride;
+	PxU32 mBodyCount;
+
+public:
+	RigidBodyClassification(PxU8* PX_RESTRICT bodies, PxU32 bodyCount, PxU32 bodyStride) : mBodies(bodies), mBodySize(bodyCount*bodyStride), mBodyStride(bodyStride),
+		mBodyCount(bodyCount)
+	{
+	}
+
+	//Returns true if it is a dynamic-dynamic constriant; false if it is a dynamic-static or dynamic-kinematic constraint
+	PX_FORCE_INLINE bool classifyConstraint(const PxSolverConstraintDesc& desc, uintptr_t& indexA, uintptr_t& indexB, 
+		bool& activeA, bool& activeB, PxU32& bodyAProgress, PxU32& bodyBProgress) const
+	{
+		indexA=uintptr_t(reinterpret_cast<PxU8*>(desc.bodyA) - mBodies)/mBodyStride;
+		indexB=uintptr_t(reinterpret_cast<PxU8*>(desc.bodyB) - mBodies)/mBodyStride;
+		activeA = indexA < mBodyCount;
+		activeB = indexB < mBodyCount;
+		bodyAProgress = desc.bodyA->solverProgress;
+		bodyBProgress = desc.bodyB->solverProgress;
+		return activeA && activeB;
+	}
+
+	PX_FORCE_INLINE void getProgress(const PxSolverConstraintDesc& desc,
+		PxU32& bodyAProgress, PxU32& bodyBProgress)
+	{
+		bodyAProgress = desc.bodyA->solverProgress;
+		bodyBProgress = desc.bodyB->solverProgress;
+	}
+
+	PX_FORCE_INLINE void storeProgress(const PxSolverConstraintDesc& desc, const PxU32 bodyAProgress, const PxU32 bodyBProgress,
+		const PxU16 availablePartition)
+	{
+		desc.bodyA->solverProgress = bodyAProgress;
+		desc.bodyA->maxSolverNormalProgress = PxMax(desc.bodyA->maxSolverNormalProgress, availablePartition);
+		desc.bodyB->solverProgress = bodyBProgress;
+		desc.bodyB->maxSolverNormalProgress = PxMax(desc.bodyB->maxSolverNormalProgress, availablePartition);
+	}
+
+	PX_FORCE_INLINE void storeProgress(const PxSolverConstraintDesc& desc, const PxU32 bodyAProgress, const PxU32 bodyBProgress)
+	{
+		desc.bodyA->solverProgress = bodyAProgress;
+		desc.bodyB->solverProgress = bodyBProgress;
+	}
+
+	PX_FORCE_INLINE PxU32 getStaticContactWriteIndex(const PxSolverConstraintDesc& desc,
+		bool activeA, bool activeB)
+	{
+		if (activeA)
+			return PxU32(desc.bodyA->maxSolverNormalProgress + desc.bodyA->maxSolverFrictionProgress++);
+		else if (activeB)
+			return PxU32(desc.bodyB->maxSolverNormalProgress + desc.bodyB->maxSolverFrictionProgress++);
+	
+		return 0xffffffff;
+	
+	}
+
+	PX_FORCE_INLINE void recordStaticConstraint(const PxSolverConstraintDesc& desc, bool& activeA, bool& activeB) const
+	{
+		if (activeA)
+		{
+			desc.bodyA->maxSolverFrictionProgress++;
+		}
+
+		if (activeB)
+		{
+			desc.bodyB->maxSolverFrictionProgress++;
+		}
+	}
+
+	PX_FORCE_INLINE void clearState()
+	{
+		for(PxU32 a = 0; a < mBodySize; a+= mBodyStride)
+			reinterpret_cast<PxSolverBody*>(mBodies+a)->solverProgress = 0;
+	}
+
+	PX_FORCE_INLINE void reserveSpaceForStaticConstraints(Ps::Array<PxU32>& numConstraintsPerPartition)
+	{
+		for(PxU32 a = 0; a < mBodySize; a += mBodyStride)
+		{
+			PxSolverBody& body = *reinterpret_cast<PxSolverBody*>(mBodies+a);
+			body.solverProgress = 0;
+
+			PxU32 requiredSize = PxU32(body.maxSolverNormalProgress + body.maxSolverFrictionProgress);
+			if(requiredSize > numConstraintsPerPartition.size())
+			{
+				numConstraintsPerPartition.resize(requiredSize);
+			}
+
+			for(PxU32 b = 0; b < body.maxSolverFrictionProgress; ++b)
+			{
+				numConstraintsPerPartition[body.maxSolverNormalProgress + b]++;
+			}
+		}
+	}
+};
+
+class ExtendedRigidBodyClassification
+{
+	PxU8* PX_RESTRICT mBodies;
+	PxU32 mBodyCount;
+	PxU32 mBodySize;
+	PxU32 mStride;
+	//uintptr_t* PX_RESTRICT mFsDatas;
+	uintptr_t* PX_RESTRICT mArticulations;
+	PxU32 mNumArticulations;
+
+public:
+
+	ExtendedRigidBodyClassification(PxU8* PX_RESTRICT bodies, PxU32 numBodies, PxU32 stride, uintptr_t* articulations, PxU32 numArticulations)
+		: mBodies(bodies), mBodyCount(numBodies), mBodySize(numBodies*stride), mStride(stride), mArticulations(articulations), mNumArticulations(numArticulations)
+	{
+	}
+
+	//Returns true if it is a dynamic-dynamic constriant; false if it is a dynamic-static or dynamic-kinematic constraint
+	PX_FORCE_INLINE bool classifyConstraint(const PxSolverConstraintDesc& desc, uintptr_t& indexA, uintptr_t& indexB, 
+		bool& activeA, bool& activeB, PxU32& bodyAProgress, PxU32& bodyBProgress) const
+	{
+		bool hasStatic = false;
+		if(PxSolverConstraintDesc::NO_LINK == desc.linkIndexA)
+		{
+			indexA=uintptr_t(reinterpret_cast<PxU8*>(desc.bodyA) - mBodies)/mStride;
+			activeA = indexA < mBodyCount;
+			hasStatic = !activeA;//desc.bodyADataIndex == 0;
+			bodyAProgress = activeA ? desc.bodyA->solverProgress: 0;
+		}
+		else
+		{
+			
+			ArticulationV* articulationA = desc.articulationA;
+			indexA=mBodyCount+getArticulationIndex(uintptr_t(articulationA), mArticulations ,mNumArticulations);
+			//bodyAProgress = articulationA->getFsDataPtr()->solverProgress;
+			bodyAProgress = articulationA->solverProgress;
+			activeA = true;
+		}
+		if(PxSolverConstraintDesc::NO_LINK == desc.linkIndexB)
+		{
+			indexB=uintptr_t(reinterpret_cast<PxU8*>(desc.bodyB) - mBodies)/mStride;
+			activeB = indexB < mBodyCount;
+			hasStatic = hasStatic || !activeB;
+			bodyBProgress = activeB ? desc.bodyB->solverProgress : 0;
+		}
+		else
+		{
+			ArticulationV* articulationB = desc.articulationB;
+			indexB=mBodyCount+getArticulationIndex(uintptr_t(articulationB), mArticulations, mNumArticulations);
+			activeB = true;
+			bodyBProgress = articulationB->solverProgress;
+		}
+		return !hasStatic;
+	}
+
+	PX_FORCE_INLINE void getProgress(const PxSolverConstraintDesc& desc,
+		PxU32& bodyAProgress, PxU32& bodyBProgress) const
+	{
+		if (PxSolverConstraintDesc::NO_LINK == desc.linkIndexA)
+		{
+			bodyAProgress = desc.bodyA->solverProgress;
+		}
+		else
+		{
+			ArticulationV* articulationA = desc.articulationA;
+			bodyAProgress = articulationA->solverProgress;
+		}
+
+		if (PxSolverConstraintDesc::NO_LINK == desc.linkIndexB)
+		{
+			
+			bodyBProgress = desc.bodyB->solverProgress;
+		}
+		else
+		{
+			ArticulationV* articulationB = desc.articulationB;
+			bodyBProgress = articulationB->solverProgress;
+		}
+	}
+
+
+	PX_FORCE_INLINE void storeProgress(const PxSolverConstraintDesc& desc, const PxU32 bodyAProgress, const PxU32 bodyBProgress,
+		const PxU16 availablePartition)
+	{
+		if (PxSolverConstraintDesc::NO_LINK == desc.linkIndexA)
+		{
+			desc.bodyA->solverProgress = bodyAProgress;
+			desc.bodyA->maxSolverNormalProgress = PxMax(desc.bodyA->maxSolverNormalProgress, availablePartition);
+		}
+		else
+		{
+			ArticulationV* articulationA = desc.articulationA;
+			articulationA->solverProgress = bodyAProgress;
+			articulationA->maxSolverNormalProgress = PxMax(articulationA->maxSolverNormalProgress, availablePartition);
+		}
+
+		if (PxSolverConstraintDesc::NO_LINK == desc.linkIndexB)
+		{
+			desc.bodyB->solverProgress = bodyBProgress;
+			desc.bodyB->maxSolverNormalProgress = PxMax(desc.bodyB->maxSolverNormalProgress, availablePartition);
+		}
+		else
+		{
+			ArticulationV* articulationB = desc.articulationB;
+			articulationB->solverProgress = bodyBProgress;
+			articulationB->maxSolverNormalProgress = PxMax(articulationB->maxSolverNormalProgress, availablePartition);
+		}
+	}
+
+	PX_FORCE_INLINE void storeProgress(const PxSolverConstraintDesc& desc, const PxU32 bodyAProgress,
+		const PxU32 bodyBProgress)
+	{
+		if (PxSolverConstraintDesc::NO_LINK == desc.linkIndexA)
+		{
+			desc.bodyA->solverProgress = bodyAProgress;
+		}
+		else
+		{
+			ArticulationV* articulationA = desc.articulationA;
+			articulationA->solverProgress = bodyAProgress;
+		}
+
+		if (PxSolverConstraintDesc::NO_LINK == desc.linkIndexB)
+		{
+			desc.bodyB->solverProgress = bodyBProgress;
+		}
+		else
+		{
+			ArticulationV* articulationB = desc.articulationB;
+			articulationB->solverProgress = bodyBProgress;
+		}
+	}
+
+	PX_FORCE_INLINE void recordStaticConstraint(const PxSolverConstraintDesc& desc, bool& activeA, bool& activeB)
+	{
+		if (activeA)
+		{
+			if (PxSolverConstraintDesc::NO_LINK == desc.linkIndexA)
+				desc.bodyA->maxSolverFrictionProgress++;
+			else
+			{
+				ArticulationV* articulationA = desc.articulationA;
+				if(!articulationA->willStoreStaticConstraint())
+					articulationA->maxSolverFrictionProgress++;
+			}
+		}
+
+		if (activeB)
+		{
+			if (PxSolverConstraintDesc::NO_LINK == desc.linkIndexB)
+				desc.bodyB->maxSolverFrictionProgress++;
+			else
+			{
+				ArticulationV* articulationB = desc.articulationB;
+				if (!articulationB->willStoreStaticConstraint())
+					articulationB->maxSolverFrictionProgress++;
+			}
+		}
+	}
+
+	PX_FORCE_INLINE PxU32 getStaticContactWriteIndex(const PxSolverConstraintDesc& desc,
+		bool activeA, bool activeB)
+	{
+		if (activeA)
+		{
+			if (PxSolverConstraintDesc::NO_LINK == desc.linkIndexA)
+			{
+				return PxU32(desc.bodyA->maxSolverNormalProgress + desc.bodyA->maxSolverFrictionProgress++);
+			}
+			else
+			{
+				ArticulationV* articulationA = desc.articulationA;
+				//Attempt to store static constraints on the articulation (only supported with the reduced coordinate articulations).
+				//This acts as an optimization
+				if(articulationA->storeStaticConstraint(desc))
+					return 0xffffffff;
+				return PxU32(articulationA->maxSolverNormalProgress + articulationA->maxSolverFrictionProgress++);
+			}
+		}
+		else if (activeB)
+		{
+			if (PxSolverConstraintDesc::NO_LINK == desc.linkIndexB)
+			{
+				return PxU32(desc.bodyB->maxSolverNormalProgress + desc.bodyB->maxSolverFrictionProgress++);
+			}
+			else
+			{
+				ArticulationV* articulationB = desc.articulationB;
+				//Attempt to store static constraints on the articulation (only supported with the reduced coordinate articulations).
+				//This acts as an optimization
+				if (articulationB->storeStaticConstraint(desc))
+					return 0xffffffff;
+				return PxU32(articulationB->maxSolverNormalProgress + articulationB->maxSolverFrictionProgress++);
+			}
+		}
+
+		return 0xffffffff;
+	}
+
+	PX_FORCE_INLINE void clearState()
+	{
+		for(PxU32 a = 0; a < mBodySize; a+= mStride)
+			reinterpret_cast<PxSolverBody*>(mBodies+a)->solverProgress = 0;
+
+		for(PxU32 a = 0; a < mNumArticulations; ++a)
+			(reinterpret_cast<ArticulationV*>(mArticulations[a]))->solverProgress = 0;
+	}
+
+	PX_FORCE_INLINE void reserveSpaceForStaticConstraints(Ps::Array<PxU32>& numConstraintsPerPartition)
+	{
+		for(PxU32 a = 0; a < mBodySize; a+= mStride)
+		{
+			PxSolverBody& body = *reinterpret_cast<PxSolverBody*>(mBodies+a);
+			body.solverProgress = 0;
+
+			PxU32 requiredSize = PxU32(body.maxSolverNormalProgress + body.maxSolverFrictionProgress);
+			if(requiredSize > numConstraintsPerPartition.size())
+			{
+				numConstraintsPerPartition.resize(requiredSize);
+			}
+
+			for(PxU32 b = 0; b < body.maxSolverFrictionProgress; ++b)
+			{
+				numConstraintsPerPartition[body.maxSolverNormalProgress + b]++;
+			}
+		}
+
+		for(PxU32 a = 0; a < mNumArticulations; ++a)
+		{
+			ArticulationV* articulation = reinterpret_cast<ArticulationV*>(mArticulations[a]);
+			articulation->solverProgress = 0;
+
+			PxU32 requiredSize = PxU32(articulation->maxSolverNormalProgress + articulation->maxSolverFrictionProgress);
+			if(requiredSize > numConstraintsPerPartition.size())
+			{
+				numConstraintsPerPartition.resize(requiredSize);
+			}
+
+			for(PxU32 b = 0; b < articulation->maxSolverFrictionProgress; ++b)
+			{
+				numConstraintsPerPartition[articulation->maxSolverNormalProgress + b]++;
+			}
+		}
+	}
+
+};
+
+template <typename Classification>
+void classifyConstraintDesc(const PxSolverConstraintDesc* PX_RESTRICT descs, const PxU32 numConstraints, Classification& classification, 
+							Ps::Array<PxU32>& numConstraintsPerPartition, PxSolverConstraintDesc* PX_RESTRICT eaTempConstraintDescriptors)
+{
+	const PxSolverConstraintDesc* _desc = descs;
+	const PxU32 numConstraintsMin1 = numConstraints - 1;
+
+	PxU32 numUnpartitionedConstraints = 0;
+
+	numConstraintsPerPartition.forceSize_Unsafe(32);
+
+	PxMemZero(numConstraintsPerPartition.begin(), sizeof(PxU32) * 32);
+
+	for(PxU32 i = 0; i < numConstraints; ++i, _desc++)
+	{
+		const PxU32 prefetchOffset = PxMin(numConstraintsMin1 - i, 4u);
+		Ps::prefetchLine(_desc[prefetchOffset].constraint);
+		Ps::prefetchLine(_desc[prefetchOffset].bodyA);
+		Ps::prefetchLine(_desc[prefetchOffset].bodyB);
+		Ps::prefetchLine(_desc + 8);
+
+		uintptr_t indexA, indexB;
+		bool activeA, activeB;
+
+		PxU32 partitionsA, partitionsB;
+		const bool notContainsStatic = classification.classifyConstraint(*_desc, indexA, indexB, activeA, activeB,
+			partitionsA, partitionsB);
+		
+		if(notContainsStatic)
+		{			
+			PxU32 availablePartition;
+			{
+				const PxU32 combinedMask = (~partitionsA & ~partitionsB);
+				availablePartition = combinedMask == 0 ? MAX_NUM_PARTITIONS : Ps::lowestSetBit(combinedMask);
+				if(availablePartition == MAX_NUM_PARTITIONS)
+				{
+					eaTempConstraintDescriptors[numUnpartitionedConstraints++] = *_desc;
+					continue;
+				}
+
+				const PxU32 partitionBit = (1u << availablePartition);
+				if (activeA)
+					partitionsA |= partitionBit;
+				if(activeB)
+					partitionsB |= partitionBit;
+			}
+
+			numConstraintsPerPartition[availablePartition]++;
+			availablePartition++;
+
+			classification.storeProgress(*_desc, partitionsA, partitionsB, PxU16(availablePartition));
+		}
+		else
+		{
+			classification.recordStaticConstraint(*_desc, activeA, activeB);
+		}
+	}
+
+	PxU32 partitionStartIndex = 0;
+
+	while(numUnpartitionedConstraints > 0)
+	{
+		classification.clearState();
+
+		partitionStartIndex += 32;
+		//Keep partitioning the un-partitioned constraints and blat the whole thing to 0!
+		numConstraintsPerPartition.resize(32 + numConstraintsPerPartition.size());
+		PxMemZero(numConstraintsPerPartition.begin() + partitionStartIndex, sizeof(PxU32) * 32);
+
+		PxU32 newNumUnpartitionedConstraints = 0;
+		PxU32 partitionsA, partitionsB;
+		bool activeA, activeB;
+		uintptr_t indexA, indexB;
+		for(PxU32 i = 0; i < numUnpartitionedConstraints; ++i)
+		{
+			const PxSolverConstraintDesc& desc = eaTempConstraintDescriptors[i];
+			
+			classification.classifyConstraint(desc, indexA, indexB, activeA, activeB,
+				partitionsA, partitionsB);
+			
+			PxU32 availablePartition;
+			{
+				const PxU32 combinedMask = (~partitionsA & ~partitionsB);
+				availablePartition = combinedMask == 0 ? MAX_NUM_PARTITIONS : Ps::lowestSetBit(combinedMask);
+				if(availablePartition == MAX_NUM_PARTITIONS)
+				{
+					//Need to shuffle around unpartitioned constraints...
+					eaTempConstraintDescriptors[newNumUnpartitionedConstraints++] = desc;
+					continue;
+				}
+
+				const PxU32 partitionBit = (1u << availablePartition);
+				if(activeA)
+					partitionsA |= partitionBit;
+				if(activeB)
+					partitionsB |= partitionBit;
+			}
+
+			
+			/*desc.bodyA->solverProgress = partitionsA;
+			desc.bodyB->solverProgress = partitionsB;*/
+			availablePartition += partitionStartIndex;
+			numConstraintsPerPartition[availablePartition]++;
+			availablePartition++;
+
+			classification.storeProgress(desc, partitionsA, partitionsB, PxU16(availablePartition) );
+
+		/*	desc.bodyA->maxSolverNormalProgress = PxMax(desc.bodyA->maxSolverNormalProgress, PxU16(availablePartition));
+			desc.bodyB->maxSolverNormalProgress = PxMax(desc.bodyB->maxSolverNormalProgress, PxU16(availablePartition));*/
+		}
+
+		numUnpartitionedConstraints = newNumUnpartitionedConstraints;
+	}
+
+	classification.reserveSpaceForStaticConstraints(numConstraintsPerPartition);
+
+}
+
+template <typename Classification>
+PxU32 writeConstraintDesc(const PxSolverConstraintDesc* PX_RESTRICT descs, const PxU32 numConstraints, Classification& classification,
+						 Ps::Array<PxU32>& accumulatedConstraintsPerPartition, PxSolverConstraintDesc* eaTempConstraintDescriptors,
+							PxSolverConstraintDesc* PX_RESTRICT eaOrderedConstraintDesc)
+{
+	PX_UNUSED(eaTempConstraintDescriptors);
+	const PxSolverConstraintDesc* _desc = descs;
+	const PxU32 numConstraintsMin1 = numConstraints - 1;
+
+	PxU32 numUnpartitionedConstraints = 0;
+	PxU32 numStaticConstraints = 0;
+
+	for(PxU32 i = 0; i < numConstraints; ++i, _desc++)
+	{
+		const PxU32 prefetchOffset = PxMin(numConstraintsMin1 - i, 4u);
+		Ps::prefetchLine(_desc[prefetchOffset].constraint);
+		Ps::prefetchLine(_desc[prefetchOffset].bodyA);
+		Ps::prefetchLine(_desc[prefetchOffset].bodyB);
+		Ps::prefetchLine(_desc + 8);
+
+		uintptr_t indexA, indexB;
+		bool activeA, activeB;
+		PxU32 partitionsA, partitionsB;
+		const bool notContainsStatic = classification.classifyConstraint(*_desc, indexA, indexB, activeA, activeB, partitionsA, partitionsB);
+
+		if(notContainsStatic)
+		{
+			PxU32 availablePartition;
+			{
+				const PxU32 combinedMask = (~partitionsA & ~partitionsB);
+				availablePartition = combinedMask == 0 ? MAX_NUM_PARTITIONS : Ps::lowestSetBit(combinedMask);
+				if(availablePartition == MAX_NUM_PARTITIONS)
+				{
+					eaTempConstraintDescriptors[numUnpartitionedConstraints++] = *_desc;
+					continue;
+				}
+
+				const PxU32 partitionBit = (1u << availablePartition);
+				if(activeA)
+					partitionsA |= partitionBit;
+				if(activeB)
+					partitionsB |= partitionBit;
+			}
+
+			classification.storeProgress(*_desc, partitionsA, partitionsB, PxU16(availablePartition + 1));
+
+			eaOrderedConstraintDesc[accumulatedConstraintsPerPartition[availablePartition]++] = *_desc;
+		}
+		else
+		{
+			//Just count the number of static constraints and store in maxSolverFrictionProgress...
+			PxU32 index = classification.getStaticContactWriteIndex(*_desc, activeA, activeB);
+			if (index != 0xffffffff)
+			{
+				eaOrderedConstraintDesc[accumulatedConstraintsPerPartition[index]++] = *_desc;
+			}
+			else
+				numStaticConstraints++;
+		}
+	}
+
+	PxU32 partitionStartIndex = 0;
+
+	while(numUnpartitionedConstraints > 0)
+	{
+		classification.clearState();
+
+		partitionStartIndex += 32;	
+		PxU32 newNumUnpartitionedConstraints = 0;
+
+		PxU32 partitionsA, partitionsB;
+		bool activeA, activeB;
+		uintptr_t indexA, indexB;
+		for(PxU32 i = 0; i < numUnpartitionedConstraints; ++i)
+		{
+			const PxSolverConstraintDesc& desc = eaTempConstraintDescriptors[i];
+			
+		/*	PxU32 partitionsA=desc.bodyA->solverProgress;
+			PxU32 partitionsB=desc.bodyB->solverProgress;*/
+
+			classification.classifyConstraint(desc, indexA, indexB,
+				activeA, activeB, partitionsA, partitionsB);
+				
+			PxU32 availablePartition;
+			{
+				const PxU32 combinedMask = (~partitionsA & ~partitionsB);
+				availablePartition = combinedMask == 0 ? MAX_NUM_PARTITIONS : Ps::lowestSetBit(combinedMask);
+				if(availablePartition == MAX_NUM_PARTITIONS)
+				{
+					//Need to shuffle around unpartitioned constraints...
+					eaTempConstraintDescriptors[newNumUnpartitionedConstraints++] = desc;
+					continue;
+				}
+
+				const PxU32 partitionBit = (1u << availablePartition);
+
+				if(activeA)
+					partitionsA |= partitionBit;
+				if(activeB)
+					partitionsB |= partitionBit;
+			}
+
+			/*desc.bodyA->solverProgress = partitionsA;
+			desc.bodyB->solverProgress = partitionsB;
+*/
+			classification.storeProgress(desc, partitionsA, partitionsB);
+			availablePartition += partitionStartIndex;
+			eaOrderedConstraintDesc[accumulatedConstraintsPerPartition[availablePartition]++] = desc;
+		}
+
+		numUnpartitionedConstraints = newNumUnpartitionedConstraints;
+	}
+
+	return numStaticConstraints;
+}
+
+}
+
+#define PX_NORMALIZE_PARTITIONS 1
+
+#if PX_NORMALIZE_PARTITIONS
+
+template<typename Classification>
+PxU32 normalizePartitions(Ps::Array<PxU32>& accumulatedConstraintsPerPartition, PxSolverConstraintDesc* PX_RESTRICT eaOrderedConstraintDescriptors, 
+	const PxU32 numConstraintDescriptors, Ps::Array<PxU32>& bitField, const Classification& classification, const PxU32 numBodies, const PxU32 numArticulations)
+{
+	PxU32 numPartitions = 0;
+	
+	PxU32 prevAccumulation = 0;
+	for(; numPartitions < accumulatedConstraintsPerPartition.size() && accumulatedConstraintsPerPartition[numPartitions] > prevAccumulation; 
+		prevAccumulation = accumulatedConstraintsPerPartition[numPartitions++]);
+
+	PxU32 targetSize = (numPartitions == 0 ? 0 : (numConstraintDescriptors)/numPartitions);
+
+	bitField.reserve((numBodies + numArticulations + 31)/32);
+	bitField.forceSize_Unsafe((numBodies + numArticulations + 31)/32);
+
+	for(PxU32 i = numPartitions; i > 0; i--)
+	{
+		PxU32 partitionIndex = i-1;
+
+		//Build the partition mask...
+
+		PxU32 startIndex = partitionIndex == 0 ? 0 : accumulatedConstraintsPerPartition[partitionIndex-1];
+		PxU32 endIndex = accumulatedConstraintsPerPartition[partitionIndex];
+
+		//If its greater than target size, there's nothing that will be pulled into it from earlier partitions
+		if((endIndex - startIndex) >= targetSize)
+			continue;
+
+
+		PxMemZero(bitField.begin(), sizeof(PxU32)*bitField.size());
+
+		for(PxU32 a = startIndex; a < endIndex; ++a)
+		{
+			PxSolverConstraintDesc& desc = eaOrderedConstraintDescriptors[a];
+
+			uintptr_t indexA, indexB;
+			bool activeA, activeB;
+			PxU32 partitionsA, partitionsB;
+
+			classification.classifyConstraint(desc, indexA, indexB, activeA, activeB, partitionsA, partitionsB);
+
+			if (activeA)
+				bitField[PxU32(indexA) / 32] |= (1u << (indexA & 31)); 
+			if(activeB)
+				bitField[PxU32(indexB)/32] |= (1u << (indexB & 31));
+		}
+
+		bool bTerm = false;
+		for(PxU32 a = partitionIndex; a > 0 && !bTerm; --a)
+		{
+			PxU32 pInd = a-1;
+
+			PxU32 si = pInd == 0 ? 0 : accumulatedConstraintsPerPartition[pInd-1];
+			PxU32 ei = accumulatedConstraintsPerPartition[pInd];
+
+			for(PxU32 b = ei; b > si && !bTerm; --b)
+			{
+				PxU32 ind = b-1;
+				PxSolverConstraintDesc& desc = eaOrderedConstraintDescriptors[ind];
+
+				uintptr_t indexA, indexB;
+				bool activeA, activeB;
+				PxU32 partitionsA, partitionsB;
+
+				classification.classifyConstraint(desc, indexA, indexB, activeA, activeB, partitionsA, partitionsB);
+
+				bool canAdd = true;
+
+				if(activeA && (bitField[PxU32(indexA)/32] & (1u << (indexA & 31))))
+					canAdd = false;
+				if(activeB && (bitField[PxU32(indexB)/32] & (1u << (indexB & 31))))
+					canAdd = false;
+
+				if(canAdd)
+				{
+					PxSolverConstraintDesc tmp = eaOrderedConstraintDescriptors[ind];
+
+					if(activeA)
+						bitField[PxU32(indexA)/32] |= (1u << (indexA & 31));
+					if(activeB)
+						bitField[PxU32(indexB)/32] |= (1u << (indexB & 31));
+
+					PxU32 index = ind;
+					for(PxU32 c = pInd; c < partitionIndex; ++c)
+					{
+						PxU32 newIndex = --accumulatedConstraintsPerPartition[c];
+						if(index != newIndex)
+							eaOrderedConstraintDescriptors[index] = eaOrderedConstraintDescriptors[newIndex];	
+						index = newIndex;
+					}
+
+					if(index != ind)
+						eaOrderedConstraintDescriptors[index] = tmp;
+
+					if((accumulatedConstraintsPerPartition[partitionIndex] - accumulatedConstraintsPerPartition[partitionIndex-1]) >= targetSize)
+					{
+						bTerm = true;
+						break;
+					}
+				}
+			}
+		}
+	}
+		
+	PxU32 partitionCount = 0;
+	PxU32 lastPartitionCount = 0;
+	for (PxU32 a = 0; a < numPartitions; ++a)
+	{
+		const PxU32 constraintCount = accumulatedConstraintsPerPartition[a];
+		accumulatedConstraintsPerPartition[partitionCount] = constraintCount;
+		if (constraintCount != lastPartitionCount)
+		{
+			lastPartitionCount = constraintCount;
+			partitionCount++;
+		}
+	}
+
+	accumulatedConstraintsPerPartition.forceSize_Unsafe(partitionCount);
+
+	return partitionCount;
+}
+
+#endif
+
+PxU32 partitionContactConstraints(ConstraintPartitionArgs& args) 
+{
+	PxU32 maxPartition = 0;
+	//Unpack the input data.
+	const PxU32 numBodies=args.mNumBodies;
+	const PxU32	numArticulations=args.mNumArticulationPtrs;
+	
+	const PxU32 numConstraintDescriptors=args.mNumContactConstraintDescriptors;
+
+	PxSolverConstraintDesc* PX_RESTRICT eaConstraintDescriptors=args.mContactConstraintDescriptors;
+	PxSolverConstraintDesc* PX_RESTRICT eaOrderedConstraintDescriptors=args.mOrderedContactConstraintDescriptors;
+	PxSolverConstraintDesc* PX_RESTRICT eaTempConstraintDescriptors=args.mTempContactConstraintDescriptors;
+
+	Ps::Array<PxU32>& constraintsPerPartition = *args.mConstraintsPerPartition;
+	constraintsPerPartition.forceSize_Unsafe(0);
+
+	const PxU32 stride = args.mStride;
+
+	for(PxU32 a = 0, offset = 0; a < numBodies; ++a, offset += stride)
+	{
+		PxSolverBody& body = *reinterpret_cast<PxSolverBody*>(args.mBodies + offset);
+		body.solverProgress = 0;
+		//We re-use maxSolverFrictionProgress and maxSolverNormalProgress to record the
+		//maximum partition used by dynamic constraints and the number of static constraints affecting
+		//a body. We use this to make partitioning much cheaper and be able to support 
+		body.maxSolverFrictionProgress = 0;
+		body.maxSolverNormalProgress = 0;
+	}
+
+	PxU32 numOrderedConstraints=0;	
+	PxU32 numStaticConstraints = 0;
+
+	if(numArticulations == 0)
+	{
+		RigidBodyClassification classification(args.mBodies, numBodies, stride);
+		classifyConstraintDesc(eaConstraintDescriptors, numConstraintDescriptors, classification, constraintsPerPartition,
+			eaTempConstraintDescriptors);
+		
+		PxU32 accumulation = 0;
+		for(PxU32 a = 0; a < constraintsPerPartition.size(); ++a)
+		{
+			PxU32 count = constraintsPerPartition[a];
+			constraintsPerPartition[a] = accumulation;
+			accumulation += count;
+		}
+
+		for(PxU32 a = 0, offset = 0; a < numBodies; ++a, offset += stride)
+		{
+			PxSolverBody& body = *reinterpret_cast<PxSolverBody*>(args.mBodies + offset);
+			Ps::prefetchLine(&args.mBodies[a], 256);
+			body.solverProgress = 0;
+			//Keep the dynamic constraint count but bump the static constraint count back to 0.
+			//This allows us to place the static constraints in the appropriate place when we see them
+			//because we know the maximum index for the dynamic constraints...
+			body.maxSolverFrictionProgress = 0;
+		}
+
+		writeConstraintDesc(eaConstraintDescriptors, numConstraintDescriptors, classification, constraintsPerPartition, 
+			eaTempConstraintDescriptors, eaOrderedConstraintDescriptors);
+
+		numOrderedConstraints = numConstraintDescriptors;
+
+		/*if(!args.enhancedDeterminism)
+			maxPartition = normalizePartitions(constraintsPerPartition, eaOrderedConstraintDescriptors, numConstraintDescriptors, *args.mBitField,
+				classification, numBodies, 0);*/
+
+	}
+	else
+	{
+		
+		const ArticulationSolverDesc* articulationDescs=args.mArticulationPtrs;
+		PX_ALLOCA(_eaArticulations, uintptr_t, numArticulations);
+		uintptr_t* eaArticulations = _eaArticulations;
+		for(PxU32 i=0;i<numArticulations;i++)
+		{
+			ArticulationV* articulation = articulationDescs[i].articulation;
+			eaArticulations[i]=uintptr_t(articulation);
+			articulation->solverProgress = 0;
+			articulation->maxSolverFrictionProgress = 0;
+			articulation->maxSolverNormalProgress = 0;
+		}
+		ExtendedRigidBodyClassification classification(args.mBodies, numBodies, stride, eaArticulations, numArticulations);
+
+		classifyConstraintDesc(eaConstraintDescriptors, numConstraintDescriptors, classification, 
+			constraintsPerPartition, eaTempConstraintDescriptors);
+
+		PxU32 accumulation = 0;
+		for(PxU32 a = 0; a < constraintsPerPartition.size(); ++a)
+		{
+			PxU32 count = constraintsPerPartition[a];
+			constraintsPerPartition[a] = accumulation;
+			accumulation += count;
+		}
+
+		for(PxU32 a = 0, offset = 0; a < numBodies; ++a, offset += stride)
+		{
+			PxSolverBody& body = *reinterpret_cast<PxSolverBody*>(args.mBodies+offset);
+			body.solverProgress = 0;
+			//Keep the dynamic constraint count but bump the static constraint count back to 0.
+			//This allows us to place the static constraints in the appropriate place when we see them
+			//because we know the maximum index for the dynamic constraints...
+			body.maxSolverFrictionProgress = 0;
+		}
+
+		for(PxU32 a = 0; a < numArticulations; ++a)
+		{
+			ArticulationV* articulation = reinterpret_cast<ArticulationV*>(eaArticulations[a]);
+			articulation->solverProgress = 0;
+			articulation->maxSolverFrictionProgress = 0;
+		}
+
+		numStaticConstraints = writeConstraintDesc(eaConstraintDescriptors, numConstraintDescriptors, classification, constraintsPerPartition, 
+			eaTempConstraintDescriptors, eaOrderedConstraintDescriptors);
+
+		numOrderedConstraints = numConstraintDescriptors - numStaticConstraints;
+
+		/*if (!args.enhancedDeterminism)
+			maxPartition = normalizePartitions(constraintsPerPartition, eaOrderedConstraintDescriptors,  
+				numOrderedConstraints, *args.mBitField, classification, numBodies, numArticulations);*/
+
+	}
+
+
+
+	const PxU32 numConstraintsDifferentBodies=numOrderedConstraints;
+
+	//PX_ASSERT(numConstraintsDifferentBodies == numConstraintDescriptors);
+
+	//Now handle the articulated self-constraints.
+	PxU32 totalConstraintCount = numConstraintsDifferentBodies;	
+
+	args.mNumDifferentBodyConstraints=numConstraintsDifferentBodies;
+	args.mNumSelfConstraints=totalConstraintCount-numConstraintsDifferentBodies;
+
+	args.mNumStaticConstraints = numStaticConstraints;
+
+	//if (args.enhancedDeterminism)
+	{
+		PxU32 prevPartitionSize = 0;
+		maxPartition = 0;
+		for (PxU32 a = 0; a < constraintsPerPartition.size(); ++a, maxPartition++)
+		{
+			if (constraintsPerPartition[a] == prevPartitionSize)
+				break;
+			prevPartitionSize = constraintsPerPartition[a];
+		}
+	}
+
+	return maxPartition;
+}
+
+}
+
+}
--- a/physx/source/lowleveldynamics/src/DyConstraintPartition.h
+++ b/physx/source/lowleveldynamics/src/DyConstraintPartition.h
@ -0,0 +1,75 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef DY_CONSTRAINTPARTITION_H
+#define DY_CONSTRAINTPARTITION_H
+
+#include "DyDynamics.h"
+
+
+
+namespace physx
+{
+
+namespace Dy
+{
+struct ConstraintPartitionArgs
+{
+
+	//Input
+	PxU8*									mBodies;
+	PxU32									mNumBodies;
+	PxU32									mStride;
+	ArticulationSolverDesc*					mArticulationPtrs;
+	PxU32									mNumArticulationPtrs;
+	PxSolverConstraintDesc*					mContactConstraintDescriptors;
+	PxU32									mNumContactConstraintDescriptors;
+	//output
+	PxSolverConstraintDesc*					mOrderedContactConstraintDescriptors;
+	PxSolverConstraintDesc*					mTempContactConstraintDescriptors;
+	PxU32									mNumDifferentBodyConstraints;
+	PxU32									mNumSelfConstraints;
+	PxU32									mNumStaticConstraints;
+	Ps::Array<PxU32>*						mConstraintsPerPartition;
+	Ps::Array<PxU32>*						mBitField;
+
+	bool									enhancedDeterminism;
+};
+
+PxU32 partitionContactConstraints(ConstraintPartitionArgs& args);
+
+} // namespace physx
+
+}
+
+
+
+#endif // DY_CONSTRAINTPARTITION_H  
+
--- a/physx/source/lowleveldynamics/src/DyConstraintPrep.h
+++ b/physx/source/lowleveldynamics/src/DyConstraintPrep.h
@ -0,0 +1,102 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef DY_CONSTRAINTSHADER_H
+#define DY_CONSTRAINTSHADER_H
+
+#include "DyConstraint.h"
+
+#include "DySolverConstraintDesc.h"
+#include "PsArray.h"
+
+#define DY_ARTICULATION_MIN_RESPONSE 1e-5f
+#define DY_ARTICULATION_CFM	1e-4f
+
+#define DY_ARTICULATION_BAD_RESPONSE 0.02f
+
+namespace physx
+{
+
+class PxcConstraintBlockStream;
+class PxsConstraintBlockManager;
+struct PxSolverBody;
+struct PxSolverBodyData;
+struct PxSolverConstraintDesc;
+
+namespace Cm
+{
+	struct SpatialVectorF;
+}
+
+namespace Dy
+{
+
+	static const PxU32 MAX_CONSTRAINT_ROWS = 12;
+
+struct SolverConstraintShaderPrepDesc
+{
+	const Constraint* constraint;
+	PxConstraintSolverPrep solverPrep;
+	const void* constantBlock;
+	PxU32 constantBlockByteSize;
+};
+
+SolverConstraintPrepState::Enum setupSolverConstraint4
+	(SolverConstraintShaderPrepDesc* PX_RESTRICT constraintShaderDescs,
+	PxSolverConstraintPrepDesc* PX_RESTRICT constraintDescs,
+		const PxReal dt, const PxReal recipdt, PxU32& totalRows,
+		 PxConstraintAllocator& allocator);
+
+SolverConstraintPrepState::Enum setupSolverConstraint4
+	(PxSolverConstraintPrepDesc* PX_RESTRICT constraintDescs,
+	const PxReal dt, const PxReal recipdt, PxU32& totalRows,
+	PxConstraintAllocator& allocator, PxU32 maxRows);
+
+PxU32 SetupSolverConstraint(SolverConstraintShaderPrepDesc& shaderDesc,
+							PxSolverConstraintPrepDesc& prepDesc,
+							   PxConstraintAllocator& allocator,
+							   PxReal dt, PxReal invdt, Cm::SpatialVectorF* Z);
+
+
+class ConstraintHelper
+{
+public:
+
+	static PxU32 setupSolverConstraint(
+		PxSolverConstraintPrepDesc& prepDesc,
+		PxConstraintAllocator& allocator,
+		PxReal dt, PxReal invdt, Cm::SpatialVectorF* Z);
+};
+
+}
+
+}
+
+#endif //DY_CONSTRAINTSHADER_H
--- a/physx/source/lowleveldynamics/src/DyConstraintSetup.cpp
+++ b/physx/source/lowleveldynamics/src/DyConstraintSetup.cpp
@ -0,0 +1,599 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#include "foundation/PxMemory.h"
+#include "DyConstraintPrep.h"
+#include "PxsRigidBody.h"
+#include "DySolverConstraint1D.h"
+#include "PsSort.h"
+#include "DySolverConstraintDesc.h"
+#include "PxcConstraintBlockStream.h"
+#include "DyArticulationContactPrep.h"
+#include "PsFoundation.h"
+
+namespace physx
+{
+namespace Dy
+{
+	// dsequeira:
+	//
+	// we can choose any linear combination of equality constraints and get the same solution
+	// Hence we can orthogonalize the constraints using the inner product given by the
+	// inverse mass matrix, so that when we use PGS, solving a constraint row for a joint 
+	// don't disturb the solution of prior rows.
+	//
+	// We also eliminate the equality constraints from the hard inequality constraints - 
+	// (essentially projecting the direction corresponding to the lagrange multiplier 
+	// onto the equality constraint subspace) but 'til I've verified this generates 
+	// exactly the same KKT/complementarity conditions, status is 'experimental'. 	
+	//
+	// since for equality constraints the resulting rows have the property that applying
+	// an impulse along one row doesn't alter the projected velocity along another row, 
+	// all equality constraints (plus one inequality constraint) can be processed in parallel
+	// using SIMD
+	//
+	// Eliminating the inequality constraints from each other would require a solver change
+	// and not give us any more parallelism, although we might get better convergence.
+
+namespace
+{
+	PX_FORCE_INLINE Vec3V V3FromV4(Vec4V x)			{ return Vec3V_From_Vec4V(x); }
+	PX_FORCE_INLINE Vec3V V3FromV4Unsafe(Vec4V x)	{ return Vec3V_From_Vec4V_WUndefined(x); }
+	PX_FORCE_INLINE Vec4V V4FromV3(Vec3V x)			{ return Vec4V_From_Vec3V(x); }
+	//PX_FORCE_INLINE Vec4V V4ClearW(Vec4V x)			{ return V4SetW(x, FZero()); }
+
+struct MassProps
+{
+	FloatV invMass0;
+	FloatV invMass1;
+	FloatV invInertiaScale0;
+	FloatV invInertiaScale1;
+
+	PX_FORCE_INLINE MassProps(const PxReal imass0, const PxReal imass1, const PxConstraintInvMassScale& ims)
+	:	
+		invMass0(FLoad(imass0 * ims.linear0))
+	,	invMass1(FLoad(imass1 * ims.linear1))
+	,	invInertiaScale0(FLoad(ims.angular0))
+	,	invInertiaScale1(FLoad(ims.angular1))
+	{}
+};
+
+
+PX_FORCE_INLINE PxReal innerProduct(const Px1DConstraint& row0, Px1DConstraint& row1, 
+								 PxVec4& row0AngSqrtInvInertia0, PxVec4& row0AngSqrtInvInertia1, 
+								 PxVec4& row1AngSqrtInvInertia0, PxVec4& row1AngSqrtInvInertia1, const MassProps& m)
+{
+	const Vec3V l0 = V3Mul(V3Scale(V3LoadA(row0.linear0), m.invMass0), V3LoadA(row1.linear0));
+	const Vec3V l1 = V3Mul(V3Scale(V3LoadA(row0.linear1), m.invMass1), V3LoadA(row1.linear1));
+	Vec4V r0ang0 = V4LoadA(&row0AngSqrtInvInertia0.x);
+	Vec4V r1ang0 = V4LoadA(&row1AngSqrtInvInertia0.x);
+	Vec4V r0ang1 = V4LoadA(&row0AngSqrtInvInertia1.x);
+	Vec4V r1ang1 = V4LoadA(&row1AngSqrtInvInertia1.x);
+
+	const Vec3V i0 = V3ScaleAdd(V3Mul(Vec3V_From_Vec4V(r0ang0), Vec3V_From_Vec4V(r1ang0)), m.invInertiaScale0, l0);
+	const Vec3V i1 = V3ScaleAdd(V3MulAdd(Vec3V_From_Vec4V(r0ang1), Vec3V_From_Vec4V(r1ang1), i0), m.invInertiaScale1, l1);
+	PxF32 f;
+	FStore(V3SumElems(i1), &f);
+	return f;
+}
+
+
+// indexed rotation around axis, with sine and cosine of half-angle
+PX_FORCE_INLINE PxQuat indexedRotation(PxU32 axis, PxReal s, PxReal c)
+{
+	PxQuat q(0,0,0,c);
+	reinterpret_cast<PxReal*>(&q)[axis] = s;
+	return q;
+}
+
+PxQuat diagonalize(const PxMat33& m)	// jacobi rotation using quaternions 
+{
+	const PxU32 MAX_ITERS = 5;
+
+	PxQuat q = PxQuat(PxIdentity);
+
+	PxMat33 d;
+	for(PxU32 i=0; i < MAX_ITERS;i++)
+	{
+		const PxMat33 axes(q);
+		d = axes.getTranspose() * m * axes;
+
+		const PxReal d0 = PxAbs(d[1][2]), d1 = PxAbs(d[0][2]), d2 = PxAbs(d[0][1]);
+		const PxU32 a = PxU32(d0 > d1 && d0 > d2 ? 0 : d1 > d2 ? 1 : 2);						// rotation axis index, from largest off-diagonal element
+
+		const PxU32 a1 = Ps::getNextIndex3(a), a2 = Ps::getNextIndex3(a1);											
+		if(d[a1][a2] == 0.0f || PxAbs(d[a1][a1]-d[a2][a2]) > 2e6f*PxAbs(2.0f*d[a1][a2]))
+			break;
+
+		const PxReal w = (d[a1][a1]-d[a2][a2]) / (2.0f*d[a1][a2]);					// cot(2 * phi), where phi is the rotation angle
+		const PxReal absw = PxAbs(w);
+
+		PxQuat r;
+		if(absw>1000)
+			r = indexedRotation(a, 1.0f/(4.0f*w), 1.f);									// h will be very close to 1, so use small angle approx instead
+		else
+		{
+  			const PxReal t = 1 / (absw + PxSqrt(w*w+1));								// absolute value of tan phi
+			const PxReal h = 1 / PxSqrt(t*t+1);										// absolute value of cos phi
+
+			PX_ASSERT(h!=1);													// |w|<1000 guarantees this with typical IEEE754 machine eps (approx 6e-8)
+			r = indexedRotation(a, PxSqrt((1-h)/2) * PxSign(w), PxSqrt((1+h)/2));
+		}
+	
+		q = (q*r).getNormalized();
+	}
+
+	return q;
+}
+
+
+PX_FORCE_INLINE void rescale(const Mat33V& m, PxVec3& a0, PxVec3& a1, PxVec3& a2)
+{
+	const Vec3V va0 = V3LoadU(a0);
+	const Vec3V va1 = V3LoadU(a1);
+	const Vec3V va2 = V3LoadU(a2);
+
+	const Vec3V b0 = V3ScaleAdd(va0, V3GetX(m.col0), V3ScaleAdd(va1, V3GetY(m.col0), V3Scale(va2, V3GetZ(m.col0))));
+	const Vec3V b1 = V3ScaleAdd(va0, V3GetX(m.col1), V3ScaleAdd(va1, V3GetY(m.col1), V3Scale(va2, V3GetZ(m.col1))));
+	const Vec3V b2 = V3ScaleAdd(va0, V3GetX(m.col2), V3ScaleAdd(va1, V3GetY(m.col2), V3Scale(va2, V3GetZ(m.col2))));
+
+	V3StoreU(b0, a0);
+	V3StoreU(b1, a1);
+	V3StoreU(b2, a2);
+}
+
+PX_FORCE_INLINE void rescale4(const Mat33V& m, PxReal* a0, PxReal* a1, PxReal* a2)
+{
+	const Vec4V va0 = V4LoadA(a0);
+	const Vec4V va1 = V4LoadA(a1);
+	const Vec4V va2 = V4LoadA(a2);
+
+	const Vec4V b0 = V4ScaleAdd(va0, V3GetX(m.col0), V4ScaleAdd(va1, V3GetY(m.col0), V4Scale(va2, V3GetZ(m.col0))));
+	const Vec4V b1 = V4ScaleAdd(va0, V3GetX(m.col1), V4ScaleAdd(va1, V3GetY(m.col1), V4Scale(va2, V3GetZ(m.col1))));
+	const Vec4V b2 = V4ScaleAdd(va0, V3GetX(m.col2), V4ScaleAdd(va1, V3GetY(m.col2), V4Scale(va2, V3GetZ(m.col2))));
+
+	V4StoreA(b0, a0);
+	V4StoreA(b1, a1);
+	V4StoreA(b2, a2);
+}
+
+void diagonalize(Px1DConstraint** row,
+				 PxVec4* angSqrtInvInertia0,
+				 PxVec4* angSqrtInvInertia1,
+				 const MassProps &m)
+{
+	PxReal a00 = innerProduct(*row[0], *row[0], angSqrtInvInertia0[0], angSqrtInvInertia1[0], angSqrtInvInertia0[0], angSqrtInvInertia1[0], m);
+	PxReal a01 = innerProduct(*row[0], *row[1], angSqrtInvInertia0[0], angSqrtInvInertia1[0], angSqrtInvInertia0[1], angSqrtInvInertia1[1], m);
+	PxReal a02 = innerProduct(*row[0], *row[2], angSqrtInvInertia0[0], angSqrtInvInertia1[0], angSqrtInvInertia0[2], angSqrtInvInertia1[2], m);
+	PxReal a11 = innerProduct(*row[1], *row[1], angSqrtInvInertia0[1], angSqrtInvInertia1[1], angSqrtInvInertia0[1], angSqrtInvInertia1[1], m);
+	PxReal a12 = innerProduct(*row[1], *row[2], angSqrtInvInertia0[1], angSqrtInvInertia1[1], angSqrtInvInertia0[2], angSqrtInvInertia1[2], m);
+	PxReal a22 = innerProduct(*row[2], *row[2], angSqrtInvInertia0[2], angSqrtInvInertia1[2], angSqrtInvInertia0[2], angSqrtInvInertia1[2], m);
+
+	PxMat33 a(PxVec3(a00, a01, a02),
+			  PxVec3(a01, a11, a12),
+			  PxVec3(a02, a12, a22));
+
+	PxQuat q = diagonalize(a);
+
+	PxMat33 n(-q);
+
+	Mat33V mn(V3LoadU(n.column0), V3LoadU(n.column1), V3LoadU(n.column2));
+
+	//KS - We treat as a Vec4V so that we get geometricError rescaled for free along with linear0
+	rescale4(mn, &row[0]->linear0.x, &row[1]->linear0.x, &row[2]->linear0.x);
+	rescale(mn, row[0]->linear1, row[1]->linear1, row[2]->linear1);
+	//KS - We treat as a PxVec4 so that we get velocityTarget rescaled for free 
+	rescale4(mn, &row[0]->angular0.x, &row[1]->angular0.x, &row[2]->angular0.x);
+	rescale(mn, row[0]->angular1, row[1]->angular1, row[2]->angular1);
+	rescale4(mn, &angSqrtInvInertia0[0].x, &angSqrtInvInertia0[1].x, &angSqrtInvInertia0[2].x);
+	rescale4(mn, &angSqrtInvInertia1[0].x, &angSqrtInvInertia1[1].x, &angSqrtInvInertia1[2].x);
+	
+}
+
+void orthogonalize(Px1DConstraint** row,
+				   PxVec4* angSqrtInvInertia0,
+				   PxVec4* angSqrtInvInertia1,
+				   PxU32 rowCount,
+				   PxU32 eqRowCount,
+				   const MassProps &m)
+{
+	PX_ASSERT(eqRowCount<=6);
+
+	const FloatV zero = FZero();
+
+	Vec3V lin1m[6], ang1m[6], lin1[6], ang1[6];	
+	Vec4V lin0m[6], ang0m[6];			// must have 0 in the W-field
+	Vec4V lin0AndG[6], ang0AndT[6];
+
+	for(PxU32 i=0;i<rowCount;i++)
+	{
+		Vec4V l0AndG = V4LoadA(&row[i]->linear0.x);		// linear0 and geometric error
+		Vec4V a0AndT = V4LoadA(&row[i]->angular0.x);	// angular0 and velocity target
+
+		Vec3V l1 = V3FromV4(V4LoadA(&row[i]->linear1.x));
+		Vec3V a1 = V3FromV4(V4LoadA(&row[i]->angular1.x));
+
+		Vec4V angSqrtL0 = V4LoadA(&angSqrtInvInertia0[i].x);
+		Vec4V angSqrtL1 = V4LoadA(&angSqrtInvInertia1[i].x);
+
+		PxU32 eliminationRows = PxMin<PxU32>(i, eqRowCount);
+		for(PxU32 j=0;j<eliminationRows;j++)
+		{
+			const Vec3V s0 = V3MulAdd(l1, lin1m[j], V3FromV4Unsafe(V4Mul(l0AndG, lin0m[j])));
+			const Vec3V s1 = V3MulAdd(V3FromV4Unsafe(angSqrtL1), ang1m[j], V3FromV4Unsafe(V4Mul(angSqrtL0, ang0m[j])));
+			FloatV t = V3SumElems(V3Add(s0, s1));
+
+			l0AndG = V4NegScaleSub(lin0AndG[j], t, l0AndG);
+			a0AndT = V4NegScaleSub(ang0AndT[j], t, a0AndT);
+			l1 = V3NegScaleSub(lin1[j], t, l1);
+			a1 = V3NegScaleSub(ang1[j], t, a1);
+			angSqrtL0 = V4NegScaleSub(V4LoadA(&angSqrtInvInertia0[j].x), t, angSqrtL0);
+			angSqrtL1 = V4NegScaleSub(V4LoadA(&angSqrtInvInertia1[j].x), t, angSqrtL1);
+		}
+
+		V4StoreA(l0AndG, &row[i]->linear0.x);
+		V4StoreA(a0AndT, &row[i]->angular0.x);
+		V3StoreA(l1, row[i]->linear1);
+		V3StoreA(a1, row[i]->angular1);
+		V4StoreA(angSqrtL0, &angSqrtInvInertia0[i].x);
+		V4StoreA(angSqrtL1, &angSqrtInvInertia1[i].x);
+
+		if(i<eqRowCount)
+		{
+			lin0AndG[i] = l0AndG;	
+			ang0AndT[i] = a0AndT;
+			lin1[i] = l1;	
+			ang1[i] = a1;	
+			
+			const Vec3V l0 = V3FromV4(l0AndG);
+
+			const Vec3V l0m = V3Scale(l0, m.invMass0);
+			const Vec3V l1m = V3Scale(l1, m.invMass1);
+			const Vec4V a0m = V4Scale(angSqrtL0, m.invInertiaScale0);
+			const Vec4V a1m = V4Scale(angSqrtL1, m.invInertiaScale1);
+
+			const Vec3V s0 = V3MulAdd(l0, l0m, V3Mul(l1, l1m));
+			const Vec4V s1 = V4MulAdd(a0m, angSqrtL0, V4Mul(a1m, angSqrtL1));
+			const FloatV s = V3SumElems(V3Add(s0, V3FromV4Unsafe(s1)));
+			const FloatV a = FSel(FIsGrtr(s, zero), FRecip(s), zero);	// with mass scaling, it's possible for the inner product of a row to be zero
+
+			lin0m[i] = V4Scale(V4ClearW(V4FromV3(l0m)), a);	
+			ang0m[i] = V4Scale(V4ClearW(a0m), a);
+			lin1m[i] = V3Scale(l1m, a);
+			ang1m[i] = V3Scale(V3FromV4Unsafe(a1m), a);
+		}
+	}
+}
+}
+
+
+void preprocessRows(Px1DConstraint** sorted, 
+					Px1DConstraint* rows,
+					PxVec4* angSqrtInvInertia0,
+					PxVec4* angSqrtInvInertia1,
+					PxU32 rowCount,
+					const PxMat33& sqrtInvInertia0F32,
+					const PxMat33& sqrtInvInertia1F32,
+					const PxReal invMass0,
+					const PxReal invMass1,
+					const PxConstraintInvMassScale& ims,
+					bool disablePreprocessing,
+					bool diagonalizeDrive,
+					bool preprocessLinear)
+{
+	// j is maxed at 12, typically around 7, so insertion sort is fine
+	for(PxU32 i=0; i<rowCount; i++)
+	{
+		Px1DConstraint* r = rows+i;
+		
+		PxU32 j = i;
+		for(;j>0 && r->solveHint < sorted[j-1]->solveHint; j--)
+			sorted[j] = sorted[j-1];
+
+		sorted[j] = r;
+	}
+
+	for(PxU32 i=0;i<rowCount-1;i++)
+		PX_ASSERT(sorted[i]->solveHint <= sorted[i+1]->solveHint);
+
+	for (PxU32 i = 0; i<rowCount; i++)
+		rows[i].forInternalUse = rows[i].flags & Px1DConstraintFlag::eKEEPBIAS ? rows[i].geometricError : 0;
+
+
+	const Mat33V sqrtInvInertia0 = Mat33V(V3LoadU(sqrtInvInertia0F32.column0), V3LoadU(sqrtInvInertia0F32.column1),
+		V3LoadU(sqrtInvInertia0F32.column2));
+
+	const Mat33V sqrtInvInertia1 = Mat33V(V3LoadU(sqrtInvInertia1F32.column0), V3LoadU(sqrtInvInertia1F32.column1),
+		V3LoadU(sqrtInvInertia1F32.column2));
+
+	PX_ASSERT(((uintptr_t(angSqrtInvInertia0)) & 0xF) == 0);
+	PX_ASSERT(((uintptr_t(angSqrtInvInertia1)) & 0xF) == 0);
+
+	for(PxU32 i = 0; i < rowCount; ++i)
+	{
+		const Vec3V angDelta0 = M33MulV3(sqrtInvInertia0, V3LoadU(sorted[i]->angular0));
+		const Vec3V angDelta1 = M33MulV3(sqrtInvInertia1, V3LoadU(sorted[i]->angular1));
+		V4StoreA(Vec4V_From_Vec3V(angDelta0), &angSqrtInvInertia0[i].x);
+		V4StoreA(Vec4V_From_Vec3V(angDelta1), &angSqrtInvInertia1[i].x);
+	}
+
+	if(disablePreprocessing)
+		return;
+
+	MassProps m(invMass0, invMass1, ims);
+	for(PxU32 i=0;i<rowCount;)
+	{
+		const PxU32 groupMajorId = PxU32(sorted[i]->solveHint>>8), start = i++;
+		while(i<rowCount && PxU32(sorted[i]->solveHint>>8) == groupMajorId)
+			i++;
+
+		if(groupMajorId == 4 || (groupMajorId == 8 && preprocessLinear))
+		{
+			PxU32 bCount = start;		// count of bilateral constraints 
+			for(; bCount<i && (sorted[bCount]->solveHint&255)==0; bCount++)
+				;
+			orthogonalize(sorted+start, angSqrtInvInertia0+start, angSqrtInvInertia1+start, i-start, bCount-start, m);
+		}
+
+		if(groupMajorId == 1 && diagonalizeDrive)
+		{			
+			PxU32 slerp = start;		// count of bilateral constraints 
+			for(; slerp<i && (sorted[slerp]->solveHint&255)!=2; slerp++)
+				;
+			if(slerp+3 == i)
+				diagonalize(sorted+slerp, angSqrtInvInertia0+slerp, angSqrtInvInertia1+slerp, m);
+
+			PX_ASSERT(i-start==3);
+			diagonalize(sorted+start, angSqrtInvInertia0+start, angSqrtInvInertia1+start, m);
+		}
+	}
+}
+
+
+
+
+
+PxU32 ConstraintHelper::setupSolverConstraint(
+PxSolverConstraintPrepDesc& prepDesc,
+PxConstraintAllocator& allocator,
+PxReal dt, PxReal invdt,
+Cm::SpatialVectorF* Z)
+{
+	if (prepDesc.numRows == 0)
+	{
+		prepDesc.desc->constraint = NULL;
+		prepDesc.desc->writeBack = NULL;
+		prepDesc.desc->constraintLengthOver16 = 0;
+		prepDesc.desc->writeBackLengthOver4 = 0;
+		return 0;
+	}
+
+	PxSolverConstraintDesc& desc = *prepDesc.desc;
+
+	bool isExtended = desc.linkIndexA != PxSolverConstraintDesc::NO_LINK
+		|| desc.linkIndexB != PxSolverConstraintDesc::NO_LINK;
+
+	PxU32 stride = isExtended ? sizeof(SolverConstraint1DExt) : sizeof(SolverConstraint1D);
+	const PxU32 constraintLength = sizeof(SolverConstraint1DHeader) + stride * prepDesc.numRows;
+	
+	//KS - +16 is for the constraint progress counter, which needs to be the last element in the constraint (so that we
+	//know SPU DMAs have completed)
+	PxU8* ptr = allocator.reserveConstraintData(constraintLength + 16u);
+	if(NULL == ptr || (reinterpret_cast<PxU8*>(-1))==ptr)
+	{
+		if(NULL==ptr)
+		{
+			PX_WARN_ONCE(
+				"Reached limit set by PxSceneDesc::maxNbContactDataBlocks - ran out of buffer space for constraint prep. "
+				"Either accept joints detaching/exploding or increase buffer size allocated for constraint prep by increasing PxSceneDesc::maxNbContactDataBlocks.");
+			return 0;
+		}
+		else
+		{
+			PX_WARN_ONCE(
+				"Attempting to allocate more than 16K of constraint data. "
+				"Either accept joints detaching/exploding or simplify constraints.");
+			ptr=NULL;
+			return 0;
+		}
+	}
+	desc.constraint = ptr;
+
+	setConstraintLength(desc,constraintLength);
+
+	desc.writeBack = prepDesc.writeback;
+	setWritebackLength(desc, sizeof(ConstraintWriteback));
+
+	memset(desc.constraint, 0, constraintLength);
+
+	SolverConstraint1DHeader* header = reinterpret_cast<SolverConstraint1DHeader*>(desc.constraint);
+	PxU8* constraints = desc.constraint + sizeof(SolverConstraint1DHeader);
+	init(*header, Ps::to8(prepDesc.numRows), isExtended, prepDesc.invMassScales);
+	header->body0WorldOffset = prepDesc.body0WorldOffset;
+	header->linBreakImpulse = prepDesc.linBreakForce * dt;
+	header->angBreakImpulse = prepDesc.angBreakForce * dt;
+	header->breakable = PxU8((prepDesc.linBreakForce != PX_MAX_F32) || (prepDesc.angBreakForce != PX_MAX_F32));
+	header->invMass0D0 = prepDesc.data0->invMass * prepDesc.invMassScales.linear0;
+	header->invMass1D1 = prepDesc.data1->invMass * prepDesc.invMassScales.linear1;
+
+
+	PX_ALIGN(16, PxVec4) angSqrtInvInertia0[MAX_CONSTRAINT_ROWS];
+	PX_ALIGN(16, PxVec4) angSqrtInvInertia1[MAX_CONSTRAINT_ROWS];
+	
+	Px1DConstraint* sorted[MAX_CONSTRAINT_ROWS];
+
+	preprocessRows(sorted, prepDesc.rows, angSqrtInvInertia0, angSqrtInvInertia1, prepDesc.numRows, 
+		prepDesc.data0->sqrtInvInertia, prepDesc.data1->sqrtInvInertia, prepDesc.data0->invMass, prepDesc.data1->invMass, 
+		prepDesc.invMassScales, isExtended || prepDesc.disablePreprocessing, prepDesc.improvedSlerp, true);
+
+	const PxReal erp = 1.0f;
+	for (PxU32 i = 0; i<prepDesc.numRows; i++)
+	{
+		Ps::prefetchLine(constraints, 128);
+		SolverConstraint1D &s = *reinterpret_cast<SolverConstraint1D *>(constraints);
+		Px1DConstraint& c = *sorted[i];
+
+		PxReal driveScale = c.flags&Px1DConstraintFlag::eHAS_DRIVE_LIMIT && prepDesc.driveLimitsAreForces ? PxMin(dt, 1.0f) : 1.0f;
+
+		PxReal unitResponse;
+		PxReal normalVel = 0.0f;
+		PxReal initVel = 0.f;
+
+		PxReal minResponseThreshold = prepDesc.minResponseThreshold;
+
+		if(!isExtended)
+		{
+			init(s, c.linear0, c.linear1, PxVec3(angSqrtInvInertia0[i].x, angSqrtInvInertia0[i].y, angSqrtInvInertia0[i].z),
+				PxVec3(angSqrtInvInertia1[i].x, angSqrtInvInertia1[i].y, angSqrtInvInertia1[i].z), c.minImpulse * driveScale, c.maxImpulse * driveScale);
+			s.ang0Writeback = c.angular0;
+			PxReal resp0 = s.lin0.magnitudeSquared() * prepDesc.data0->invMass * prepDesc.invMassScales.linear0 + s.ang0.magnitudeSquared() * prepDesc.invMassScales.angular0;
+			PxReal resp1 = s.lin1.magnitudeSquared() * prepDesc.data1->invMass * prepDesc.invMassScales.linear1 + s.ang1.magnitudeSquared() * prepDesc.invMassScales.angular1;
+			unitResponse = resp0 + resp1;
+			initVel = normalVel = prepDesc.data0->projectVelocity(c.linear0, c.angular0) - prepDesc.data1->projectVelocity(c.linear1, c.angular1);
+		}
+		else
+		{
+			init(s, c.linear0, c.linear1, c.angular0, c.angular1, c.minImpulse * driveScale, c.maxImpulse * driveScale);
+			SolverConstraint1DExt& e = static_cast<SolverConstraint1DExt&>(s);
+
+			const SolverExtBody eb0(reinterpret_cast<const void*>(prepDesc.body0), prepDesc.data0, desc.linkIndexA);
+			const SolverExtBody eb1(reinterpret_cast<const void*>(prepDesc.body1), prepDesc.data1, desc.linkIndexB);
+
+			const Cm::SpatialVector resp0 = createImpulseResponseVector(e.lin0, e.ang0, eb0);
+			const Cm::SpatialVector resp1 = createImpulseResponseVector(-e.lin1, -e.ang1, eb1);
+			unitResponse = getImpulseResponse(eb0, resp0, unsimdRef(e.deltaVA), prepDesc.invMassScales.linear0, prepDesc.invMassScales.angular0,
+				eb1, resp1, unsimdRef(e.deltaVB), prepDesc.invMassScales.linear1, prepDesc.invMassScales.angular1, Z, false);
+
+			//Add CFM term!
+			/*if(unitResponse > DY_ARTICULATION_MIN_RESPONSE)
+				unitResponse += DY_ARTICULATION_CFM;*/
+			
+			s.ang0Writeback = c.angular0;
+			s.lin0 = resp0.linear;
+			s.ang0 = resp0.angular;
+			s.lin1 = -resp1.linear;
+			s.ang1 = -resp1.angular;
+			PxReal vel0, vel1;
+			if(needsNormalVel(c) || eb0.mLinkIndex == PxSolverConstraintDesc::NO_LINK || eb1.mLinkIndex == PxSolverConstraintDesc::NO_LINK)
+			{
+				vel0 = eb0.projectVelocity(c.linear0, c.angular0);
+				vel1 = eb1.projectVelocity(c.linear1, c.angular1);
+
+				normalVel = vel0 - vel1;
+
+				//normalVel = eb0.projectVelocity(s.lin0, s.ang0) - eb1.projectVelocity(s.lin1, s.ang1);
+				if(eb0.mLinkIndex == PxSolverConstraintDesc::NO_LINK)
+					initVel = vel0;
+				else if(eb1.mLinkIndex == PxSolverConstraintDesc::NO_LINK)
+					initVel = -vel1;
+
+			}
+
+			minResponseThreshold = PxMax(minResponseThreshold, DY_ARTICULATION_MIN_RESPONSE);
+		}
+
+		setSolverConstants(s.constant, s.unbiasedConstant, s.velMultiplier, s.impulseMultiplier, 
+			c, normalVel, unitResponse, minResponseThreshold, erp, dt, invdt);
+
+		//s.targetVelocity = initVel;
+		const PxReal velBias = initVel * s.velMultiplier;
+		s.constant += velBias;
+		s.unbiasedConstant += velBias;
+
+		if(c.flags & Px1DConstraintFlag::eOUTPUT_FORCE)
+			s.flags |= DY_SC_FLAG_OUTPUT_FORCE;
+
+		constraints += stride;
+	}
+
+	//KS - Set the solve count at the end to 0 
+	*(reinterpret_cast<PxU32*>(constraints)) = 0;
+	*(reinterpret_cast<PxU32*>(constraints + 4)) = 0;
+	PX_ASSERT(desc.constraint + getConstraintLength(desc) == constraints);
+	return prepDesc.numRows;
+}
+
+PxU32 SetupSolverConstraint(SolverConstraintShaderPrepDesc& shaderDesc,
+	PxSolverConstraintPrepDesc& prepDesc,
+	PxConstraintAllocator& allocator,
+	PxReal dt, PxReal invdt, Cm::SpatialVectorF* Z)
+{
+	// LL shouldn't see broken constraints
+	
+	PX_ASSERT(!(reinterpret_cast<ConstraintWriteback*>(prepDesc.writeback)->broken));
+
+	setConstraintLength(*prepDesc.desc, 0);
+
+	if (!shaderDesc.solverPrep)
+		return 0;
+
+	//PxU32 numAxisConstraints = 0;
+
+	Px1DConstraint rows[MAX_CONSTRAINT_ROWS];
+
+	// This is necessary so that there will be sensible defaults and shaders will
+	// continue to work (albeit with a recompile) if the row format changes.
+	// It's a bit inefficient because it fills in all constraint rows even if there
+	// is only going to be one generated. A way around this would be for the shader to
+	// specify the maximum number of rows it needs, or it could call a subroutine to
+	// prep the row before it starts filling it it.
+
+	PxMemZero(rows, sizeof(Px1DConstraint)*MAX_CONSTRAINT_ROWS);
+
+	for (PxU32 i = 0; i<MAX_CONSTRAINT_ROWS; i++)
+	{
+		Px1DConstraint& c = rows[i];
+		//Px1DConstraintInit(c);
+		c.minImpulse = -PX_MAX_REAL;
+		c.maxImpulse = PX_MAX_REAL;
+	}
+
+	prepDesc.invMassScales.linear0 = prepDesc.invMassScales.linear1 = prepDesc.invMassScales.angular0 = prepDesc.invMassScales.angular1 = 1.f;
+
+	PxVec3 body0WorldOffset(0.f);
+	PxVec3 ra, rb;
+	PxU32 constraintCount = (*shaderDesc.solverPrep)(rows,
+		body0WorldOffset,
+		MAX_CONSTRAINT_ROWS,
+		prepDesc.invMassScales,
+		shaderDesc.constantBlock,
+		prepDesc.bodyFrame0, prepDesc.bodyFrame1, prepDesc.extendedLimits, ra, rb);
+
+	prepDesc.rows = rows;
+	prepDesc.numRows = constraintCount;
+
+	prepDesc.body0WorldOffset = body0WorldOffset;
+
+	return ConstraintHelper::setupSolverConstraint(prepDesc, allocator, dt, invdt, Z);
+}
+
+}
+
+}
--- a/physx/source/lowleveldynamics/src/DyConstraintSetupBlock.cpp
+++ b/physx/source/lowleveldynamics/src/DyConstraintSetupBlock.cpp
@ -0,0 +1,541 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#include "foundation/PxMemory.h"
+#include "DyConstraintPrep.h"
+#include "PxsRigidBody.h"
+#include "DySolverConstraint1D.h"
+#include "DySolverConstraint1D4.h"
+#include "PsSort.h"
+#include "PxcConstraintBlockStream.h"
+#include "DyArticulationContactPrep.h"
+#include "PsFoundation.h"
+namespace physx
+{
+
+namespace Dy
+{
+
+	void preprocessRows(Px1DConstraint** sorted,
+		Px1DConstraint* rows,
+		PxVec4* angSqrtInvInertia0,
+		PxVec4* angSqrtInvInertia1,
+		PxU32 rowCount,
+		const PxMat33& sqrtInvInertia0F32,
+		const PxMat33& sqrtInvInertia1F32,
+		const PxReal invMass0,
+		const PxReal invMass1,
+		const PxConstraintInvMassScale& ims,
+		bool disablePreprocessing,
+		bool diagonalizeDrive,
+		bool preprocessLinear = true);
+
+
+namespace
+{
+void setConstants(PxReal& constant, PxReal& unbiasedConstant, PxReal& velMultiplier, PxReal& impulseMultiplier,
+				  const Px1DConstraint& c, PxReal unitResponse, PxReal minRowResponse, PxReal erp, PxReal dt, PxReal recipdt,
+				  const PxSolverBodyData& b0, const PxSolverBodyData& b1, const bool finished)
+{
+	if(finished)
+	{
+		constant = 0.f;
+		unbiasedConstant = 0.f;
+		velMultiplier = 0.f;
+		impulseMultiplier = 0.f;
+		return;
+	}
+	PxReal nv = needsNormalVel(c) ? b0.projectVelocity(c.linear0, c.angular0) - b1.projectVelocity(c.linear1, c.angular1)
+								  : 0;
+	
+	setSolverConstants(constant, unbiasedConstant, velMultiplier, impulseMultiplier, 
+					   c, nv, unitResponse, minRowResponse, erp, dt, recipdt);
+}
+}
+
+SolverConstraintPrepState::Enum setupSolverConstraint4
+		(PxSolverConstraintPrepDesc* PX_RESTRICT constraintDescs,
+		const PxReal dt, const PxReal recipdt, PxU32& totalRows,
+		PxConstraintAllocator& allocator, PxU32 maxRows);
+
+SolverConstraintPrepState::Enum setupSolverConstraint4
+(SolverConstraintShaderPrepDesc* PX_RESTRICT constraintShaderDescs,
+PxSolverConstraintPrepDesc* PX_RESTRICT constraintDescs,
+const PxReal dt, const PxReal recipdt, PxU32& totalRows,
+PxConstraintAllocator& allocator)
+
+{
+	//KS - we will never get here with constraints involving articulations so we don't need to stress about those in here
+
+	totalRows = 0;
+
+	Px1DConstraint allRows[MAX_CONSTRAINT_ROWS * 4];
+	
+	PxU32 numRows = 0;
+
+	PxU32 maxRows = 0;
+	PxU32 preppedIndex = 0;
+
+	for (PxU32 a = 0; a < 4; ++a)
+	{
+		Px1DConstraint* rows = allRows + numRows;
+		SolverConstraintShaderPrepDesc& shaderDesc = constraintShaderDescs[a];
+		PxSolverConstraintPrepDesc& desc = constraintDescs[a];
+
+		if (!shaderDesc.solverPrep)
+			return SolverConstraintPrepState::eUNBATCHABLE;
+
+		PxMemZero(rows + preppedIndex, sizeof(Px1DConstraint)*(MAX_CONSTRAINT_ROWS));
+		for (PxU32 b = preppedIndex; b < MAX_CONSTRAINT_ROWS; ++b)
+		{
+			Px1DConstraint& c = rows[b];
+			//Px1DConstraintInit(c);
+			c.minImpulse = -PX_MAX_REAL;
+			c.maxImpulse = PX_MAX_REAL;
+		}
+
+		desc.invMassScales.linear0 = desc.invMassScales.linear1 = desc.invMassScales.angular0 = desc.invMassScales.angular1 = 1.f;
+
+		desc.body0WorldOffset = PxVec3(0.f);
+
+		PxVec3 ra, rb;
+
+		PxU32 constraintCount = (*shaderDesc.solverPrep)(rows,
+			desc.body0WorldOffset,
+			MAX_CONSTRAINT_ROWS,
+			desc.invMassScales,
+			shaderDesc.constantBlock,
+			desc.bodyFrame0, desc.bodyFrame1, desc.extendedLimits, ra, rb);
+
+		preppedIndex = MAX_CONSTRAINT_ROWS - constraintCount;
+
+		maxRows = PxMax(constraintCount, maxRows);
+
+		if (constraintCount == 0)
+			return SolverConstraintPrepState::eUNBATCHABLE;
+
+		desc.rows = rows;
+		desc.numRows = constraintCount;
+		numRows += constraintCount;
+	}
+
+	return setupSolverConstraint4(constraintDescs, dt, recipdt, totalRows, allocator, maxRows);
+}
+
+SolverConstraintPrepState::Enum setupSolverConstraint4
+(PxSolverConstraintPrepDesc* PX_RESTRICT constraintDescs,
+const PxReal dt, const PxReal recipdt, PxU32& totalRows,
+PxConstraintAllocator& allocator, PxU32 maxRows)
+{
+	const Vec4V zero = V4Zero();
+	Px1DConstraint* allSorted[MAX_CONSTRAINT_ROWS * 4];
+	PxU32 startIndex[4];
+	PX_ALIGN(16, PxVec4) angSqrtInvInertia0[MAX_CONSTRAINT_ROWS * 4];
+	PX_ALIGN(16, PxVec4) angSqrtInvInertia1[MAX_CONSTRAINT_ROWS * 4];
+
+	PxU32 numRows = 0;
+
+	for (PxU32 a = 0; a < 4; ++a)
+	{
+		startIndex[a] = numRows;
+		PxSolverConstraintPrepDesc& desc = constraintDescs[a];
+		Px1DConstraint** sorted = allSorted + numRows;
+
+		preprocessRows(sorted, desc.rows, angSqrtInvInertia0 + numRows, angSqrtInvInertia1 + numRows, desc.numRows, 
+			desc.data0->sqrtInvInertia, desc.data1->sqrtInvInertia, desc.data0->invMass, desc.data1->invMass, 
+			desc.invMassScales, desc.disablePreprocessing, desc.improvedSlerp);
+
+		numRows += desc.numRows;
+	}
+
+
+	PxU32 stride = sizeof(SolverConstraint1DDynamic4);
+
+	
+	const PxU32 constraintLength = sizeof(SolverConstraint1DHeader4) + stride * maxRows;
+
+	//KS - +16 is for the constraint progress counter, which needs to be the last element in the constraint (so that we
+	//know SPU DMAs have completed)
+	PxU8* ptr = allocator.reserveConstraintData(constraintLength + 16u);
+	if(NULL == ptr || (reinterpret_cast<PxU8*>(-1))==ptr)
+	{
+		for(PxU32 a = 0; a < 4; ++a)
+		{
+			PxSolverConstraintPrepDesc& desc = constraintDescs[a];
+			desc.desc->constraint = NULL;
+			setConstraintLength(*desc.desc, 0);
+			desc.desc->writeBack = desc.writeback;
+		}
+
+		if(NULL==ptr)
+		{
+			PX_WARN_ONCE(
+				"Reached limit set by PxSceneDesc::maxNbContactDataBlocks - ran out of buffer space for constraint prep. "
+				"Either accept joints detaching/exploding or increase buffer size allocated for constraint prep by increasing PxSceneDesc::maxNbContactDataBlocks.");
+			return SolverConstraintPrepState::eOUT_OF_MEMORY;
+		}
+		else
+		{
+			PX_WARN_ONCE(
+				"Attempting to allocate more than 16K of constraint data. "
+				"Either accept joints detaching/exploding or simplify constraints.");
+			ptr=NULL;
+			return SolverConstraintPrepState::eOUT_OF_MEMORY;
+		}
+	}
+	//desc.constraint = ptr;
+
+	totalRows = numRows;
+
+	for(PxU32 a = 0; a < 4; ++a)
+	{
+		PxSolverConstraintPrepDesc& desc = constraintDescs[a];
+		desc.desc->constraint = ptr;
+		setConstraintLength(*desc.desc, constraintLength);
+		desc.desc->writeBack = desc.writeback;
+	}
+
+	const PxReal erp[4] = { 1.0f, 1.0f, 1.0f, 1.0f};
+	//OK, now we build all 4 constraints into a single set of rows
+
+	{
+		PxU8* currPtr = ptr;
+		SolverConstraint1DHeader4* header = reinterpret_cast<SolverConstraint1DHeader4*>(currPtr);
+		currPtr += sizeof(SolverConstraint1DHeader4);
+
+		const PxSolverBodyData& bd00 = *constraintDescs[0].data0;
+		const PxSolverBodyData& bd01 = *constraintDescs[1].data0;
+		const PxSolverBodyData& bd02 = *constraintDescs[2].data0;
+		const PxSolverBodyData& bd03 = *constraintDescs[3].data0;
+
+		const PxSolverBodyData& bd10 = *constraintDescs[0].data1;
+		const PxSolverBodyData& bd11 = *constraintDescs[1].data1;
+		const PxSolverBodyData& bd12 = *constraintDescs[2].data1;
+		const PxSolverBodyData& bd13 = *constraintDescs[3].data1;
+
+		//Load up masses, invInertia, velocity etc.
+
+		const Vec4V invMassScale0 = V4LoadXYZW(constraintDescs[0].invMassScales.linear0, constraintDescs[1].invMassScales.linear0, 
+			constraintDescs[2].invMassScales.linear0, constraintDescs[3].invMassScales.linear0);
+		const Vec4V invMassScale1 = V4LoadXYZW(constraintDescs[0].invMassScales.linear1, constraintDescs[1].invMassScales.linear1, 
+			constraintDescs[2].invMassScales.linear1, constraintDescs[3].invMassScales.linear1);
+
+
+		const Vec4V iMass0 = V4LoadXYZW(bd00.invMass, bd01.invMass, bd02.invMass, bd03.invMass);
+
+		const Vec4V iMass1 = V4LoadXYZW(bd10.invMass, bd11.invMass, bd12.invMass, bd13.invMass);
+
+		const Vec4V invMass0 = V4Mul(iMass0, invMassScale0);
+		const Vec4V invMass1 = V4Mul(iMass1, invMassScale1);
+
+
+		const Vec4V invInertiaScale0 = V4LoadXYZW(constraintDescs[0].invMassScales.angular0, constraintDescs[1].invMassScales.angular0, 
+			constraintDescs[2].invMassScales.angular0, constraintDescs[3].invMassScales.angular0);
+		const Vec4V invInertiaScale1 = V4LoadXYZW(constraintDescs[0].invMassScales.angular1, constraintDescs[1].invMassScales.angular1, 
+			constraintDescs[2].invMassScales.angular1, constraintDescs[3].invMassScales.angular1);
+
+		//Velocities
+		Vec4V linVel00 = V4LoadA(&bd00.linearVelocity.x);
+		Vec4V linVel01 = V4LoadA(&bd10.linearVelocity.x);
+		Vec4V angVel00 = V4LoadA(&bd00.angularVelocity.x);
+		Vec4V angVel01 = V4LoadA(&bd10.angularVelocity.x);
+
+		Vec4V linVel10 = V4LoadA(&bd01.linearVelocity.x);
+		Vec4V linVel11 = V4LoadA(&bd11.linearVelocity.x);
+		Vec4V angVel10 = V4LoadA(&bd01.angularVelocity.x);
+		Vec4V angVel11 = V4LoadA(&bd11.angularVelocity.x);
+
+		Vec4V linVel20 = V4LoadA(&bd02.linearVelocity.x);
+		Vec4V linVel21 = V4LoadA(&bd12.linearVelocity.x);
+		Vec4V angVel20 = V4LoadA(&bd02.angularVelocity.x);
+		Vec4V angVel21 = V4LoadA(&bd12.angularVelocity.x);
+
+		Vec4V linVel30 = V4LoadA(&bd03.linearVelocity.x);
+		Vec4V linVel31 = V4LoadA(&bd13.linearVelocity.x);
+		Vec4V angVel30 = V4LoadA(&bd03.angularVelocity.x);
+		Vec4V angVel31 = V4LoadA(&bd13.angularVelocity.x);
+
+
+		Vec4V linVel0T0, linVel0T1, linVel0T2;
+		Vec4V linVel1T0, linVel1T1, linVel1T2;
+		Vec4V angVel0T0, angVel0T1, angVel0T2;
+		Vec4V angVel1T0, angVel1T1, angVel1T2;
+
+
+		PX_TRANSPOSE_44_34(linVel00, linVel10, linVel20, linVel30, linVel0T0, linVel0T1, linVel0T2);
+		PX_TRANSPOSE_44_34(linVel01, linVel11, linVel21, linVel31, linVel1T0, linVel1T1, linVel1T2);
+		PX_TRANSPOSE_44_34(angVel00, angVel10, angVel20, angVel30, angVel0T0, angVel0T1, angVel0T2);
+		PX_TRANSPOSE_44_34(angVel01, angVel11, angVel21, angVel31, angVel1T0, angVel1T1, angVel1T2);
+
+
+
+		//body world offsets
+		Vec4V workOffset0 = Vec4V_From_Vec3V(V3LoadU(constraintDescs[0].body0WorldOffset));
+		Vec4V workOffset1 = Vec4V_From_Vec3V(V3LoadU(constraintDescs[1].body0WorldOffset));
+		Vec4V workOffset2 = Vec4V_From_Vec3V(V3LoadU(constraintDescs[2].body0WorldOffset));
+		Vec4V workOffset3 = Vec4V_From_Vec3V(V3LoadU(constraintDescs[3].body0WorldOffset));
+
+		Vec4V workOffsetX, workOffsetY, workOffsetZ;
+
+		PX_TRANSPOSE_44_34(workOffset0, workOffset1, workOffset2, workOffset3, workOffsetX, workOffsetY, workOffsetZ);
+
+		const FloatV dtV = FLoad(dt);
+		Vec4V linBreakForce = V4LoadXYZW(constraintDescs[0].linBreakForce, constraintDescs[1].linBreakForce,
+			constraintDescs[2].linBreakForce, constraintDescs[3].linBreakForce);
+		Vec4V angBreakForce = V4LoadXYZW(constraintDescs[0].angBreakForce, constraintDescs[1].angBreakForce,
+			constraintDescs[2].angBreakForce, constraintDescs[3].angBreakForce);
+
+		
+		header->break0 = PxU8((constraintDescs[0].linBreakForce != PX_MAX_F32) || (constraintDescs[0].angBreakForce != PX_MAX_F32));
+		header->break1 = PxU8((constraintDescs[1].linBreakForce != PX_MAX_F32) || (constraintDescs[1].angBreakForce != PX_MAX_F32));
+		header->break2 = PxU8((constraintDescs[2].linBreakForce != PX_MAX_F32) || (constraintDescs[2].angBreakForce != PX_MAX_F32));
+		header->break3 = PxU8((constraintDescs[3].linBreakForce != PX_MAX_F32) || (constraintDescs[3].angBreakForce != PX_MAX_F32));
+
+
+		//OK, I think that's everything loaded in
+
+		header->invMass0D0 = invMass0;
+		header->invMass1D1 = invMass1;
+		header->angD0 = invInertiaScale0;
+		header->angD1 = invInertiaScale1;
+		header->body0WorkOffsetX = workOffsetX;
+		header->body0WorkOffsetY = workOffsetY;
+		header->body0WorkOffsetZ = workOffsetZ;
+
+		header->count = maxRows;
+		header->type = DY_SC_TYPE_BLOCK_1D;
+		header->linBreakImpulse = V4Scale(linBreakForce, dtV);
+		header->angBreakImpulse = V4Scale(angBreakForce, dtV);
+		header->count0 = Ps::to8(constraintDescs[0].numRows);
+		header->count1 = Ps::to8(constraintDescs[1].numRows);
+		header->count2 = Ps::to8(constraintDescs[2].numRows);
+		header->count3 = Ps::to8(constraintDescs[3].numRows);
+
+		//Now we loop over the constraints and build the results...
+
+		PxU32 index0 = 0;
+		PxU32 endIndex0 = constraintDescs[0].numRows - 1;
+		PxU32 index1 = startIndex[1];
+		PxU32 endIndex1 = index1 + constraintDescs[1].numRows - 1;
+		PxU32 index2 = startIndex[2];
+		PxU32 endIndex2 = index2 + constraintDescs[2].numRows - 1;
+		PxU32 index3 = startIndex[3];
+		PxU32 endIndex3 = index3 + constraintDescs[3].numRows - 1;
+
+		const FloatV one = FOne();
+
+		for(PxU32 a = 0; a < maxRows; ++a)
+		{	
+			SolverConstraint1DDynamic4* c = reinterpret_cast<SolverConstraint1DDynamic4*>(currPtr);
+			currPtr += stride;
+
+			Px1DConstraint* con0 = allSorted[index0];
+			Px1DConstraint* con1 = allSorted[index1];
+			Px1DConstraint* con2 = allSorted[index2];
+			Px1DConstraint* con3 = allSorted[index3];
+
+			Vec4V cangDelta00 = V4LoadA(&angSqrtInvInertia0[index0].x);
+			Vec4V cangDelta01 = V4LoadA(&angSqrtInvInertia0[index1].x);
+			Vec4V cangDelta02 = V4LoadA(&angSqrtInvInertia0[index2].x);
+			Vec4V cangDelta03 = V4LoadA(&angSqrtInvInertia0[index3].x);
+
+			Vec4V cangDelta10 = V4LoadA(&angSqrtInvInertia1[index0].x);
+			Vec4V cangDelta11 = V4LoadA(&angSqrtInvInertia1[index1].x);
+			Vec4V cangDelta12 = V4LoadA(&angSqrtInvInertia1[index2].x);
+			Vec4V cangDelta13 = V4LoadA(&angSqrtInvInertia1[index3].x);
+
+			index0 = index0 == endIndex0 ? index0 : index0 + 1;
+			index1 = index1 == endIndex1 ? index1 : index1 + 1;
+			index2 = index2 == endIndex2 ? index2 : index2 + 1;
+			index3 = index3 == endIndex3 ? index3 : index3 + 1;
+
+			Vec4V driveScale = V4Splat(one);
+			if (con0->flags&Px1DConstraintFlag::eHAS_DRIVE_LIMIT && constraintDescs[0].driveLimitsAreForces)
+				driveScale = V4SetX(driveScale, FMin(one, dtV));
+			if (con1->flags&Px1DConstraintFlag::eHAS_DRIVE_LIMIT && constraintDescs[1].driveLimitsAreForces)
+				driveScale = V4SetY(driveScale, FMin(one, dtV));
+			if (con2->flags&Px1DConstraintFlag::eHAS_DRIVE_LIMIT && constraintDescs[2].driveLimitsAreForces)
+				driveScale = V4SetZ(driveScale, FMin(one, dtV));
+			if (con3->flags&Px1DConstraintFlag::eHAS_DRIVE_LIMIT && constraintDescs[3].driveLimitsAreForces)
+				driveScale = V4SetW(driveScale, FMin(one, dtV));
+
+
+			Vec4V clin00 = V4LoadA(&con0->linear0.x);
+			Vec4V clin01 = V4LoadA(&con1->linear0.x);
+			Vec4V clin02 = V4LoadA(&con2->linear0.x);
+			Vec4V clin03 = V4LoadA(&con3->linear0.x);
+
+			Vec4V cang00 = V4LoadA(&con0->angular0.x);
+			Vec4V cang01 = V4LoadA(&con1->angular0.x);
+			Vec4V cang02 = V4LoadA(&con2->angular0.x);
+			Vec4V cang03 = V4LoadA(&con3->angular0.x);
+
+			Vec4V clin0X, clin0Y, clin0Z;
+			Vec4V cang0X, cang0Y, cang0Z;
+			
+			PX_TRANSPOSE_44_34(clin00, clin01, clin02, clin03, clin0X, clin0Y, clin0Z);
+			PX_TRANSPOSE_44_34(cang00, cang01, cang02, cang03, cang0X, cang0Y, cang0Z);
+			
+			const Vec4V maxImpulse = V4LoadXYZW(con0->maxImpulse, con1->maxImpulse, con2->maxImpulse, con3->maxImpulse);
+			const Vec4V minImpulse = V4LoadXYZW(con0->minImpulse, con1->minImpulse, con2->minImpulse, con3->minImpulse);
+
+			Vec4V angDelta0X, angDelta0Y, angDelta0Z;
+
+			PX_TRANSPOSE_44_34(cangDelta00, cangDelta01, cangDelta02, cangDelta03, angDelta0X, angDelta0Y, angDelta0Z);
+
+			c->flags[0] = 0;
+			c->flags[1] = 0;
+			c->flags[2] = 0;
+			c->flags[3] = 0;
+
+			c->lin0X = clin0X;
+			c->lin0Y = clin0Y;
+			c->lin0Z = clin0Z;
+			c->ang0X = angDelta0X;
+			c->ang0Y = angDelta0Y;
+			c->ang0Z = angDelta0Z;
+			c->ang0WritebackX = cang0X;
+			c->ang0WritebackY = cang0Y;
+			c->ang0WritebackZ = cang0Z;
+
+			c->minImpulse = V4Mul(minImpulse, driveScale);
+			c->maxImpulse = V4Mul(maxImpulse, driveScale);
+			c->appliedForce = zero;
+
+			const Vec4V lin0MagSq = V4MulAdd(clin0Z, clin0Z, V4MulAdd(clin0Y, clin0Y, V4Mul(clin0X, clin0X)));
+			const Vec4V cang0DotAngDelta = V4MulAdd(angDelta0Z, angDelta0Z, V4MulAdd(angDelta0Y, angDelta0Y, V4Mul(angDelta0X, angDelta0X)));
+			c->flags[0] = 0;
+			c->flags[1] = 0;
+			c->flags[2] = 0;
+			c->flags[3] = 0;
+
+			Vec4V unitResponse = V4MulAdd(lin0MagSq, invMass0, V4Mul(cang0DotAngDelta, invInertiaScale0));
+
+			Vec4V clin10 = V4LoadA(&con0->linear1.x);
+			Vec4V clin11 = V4LoadA(&con1->linear1.x);
+			Vec4V clin12 = V4LoadA(&con2->linear1.x);
+			Vec4V clin13 = V4LoadA(&con3->linear1.x);
+
+			Vec4V cang10 = V4LoadA(&con0->angular1.x);
+			Vec4V cang11 = V4LoadA(&con1->angular1.x);
+			Vec4V cang12 = V4LoadA(&con2->angular1.x);
+			Vec4V cang13 = V4LoadA(&con3->angular1.x);
+
+			Vec4V clin1X, clin1Y, clin1Z;
+			Vec4V cang1X, cang1Y, cang1Z;
+			PX_TRANSPOSE_44_34(clin10, clin11, clin12, clin13, clin1X, clin1Y, clin1Z);
+			PX_TRANSPOSE_44_34(cang10, cang11, cang12, cang13, cang1X, cang1Y, cang1Z);
+
+			Vec4V angDelta1X, angDelta1Y, angDelta1Z;
+
+			PX_TRANSPOSE_44_34(cangDelta10, cangDelta11, cangDelta12, cangDelta13, angDelta1X, angDelta1Y, angDelta1Z);
+
+			const Vec4V lin1MagSq = V4MulAdd(clin1Z, clin1Z, V4MulAdd(clin1Y, clin1Y, V4Mul(clin1X, clin1X)));
+			const Vec4V cang1DotAngDelta = V4MulAdd(angDelta1Z, angDelta1Z, V4MulAdd(angDelta1Y, angDelta1Y, V4Mul(angDelta1X, angDelta1X)));
+
+			c->lin1X = clin1X;
+			c->lin1Y = clin1Y;
+			c->lin1Z = clin1Z;
+
+			c->ang1X = angDelta1X;
+			c->ang1Y = angDelta1Y;
+			c->ang1Z = angDelta1Z;
+
+			unitResponse = V4Add(unitResponse, V4MulAdd(lin1MagSq, invMass1, V4Mul(cang1DotAngDelta, invInertiaScale1)));
+
+			Vec4V linProj0(V4Mul(clin0X, linVel0T0));
+			Vec4V linProj1(V4Mul(clin1X, linVel1T0));
+			Vec4V angProj0(V4Mul(cang0X, angVel0T0));
+			Vec4V angProj1(V4Mul(cang1X, angVel1T0));
+
+			linProj0 = V4MulAdd(clin0Y, linVel0T1, linProj0);
+			linProj1 = V4MulAdd(clin1Y, linVel1T1, linProj1);
+			angProj0 = V4MulAdd(cang0Y, angVel0T1, angProj0);
+			angProj1 = V4MulAdd(cang1Y, angVel1T1, angProj1);
+			
+			linProj0 = V4MulAdd(clin0Z, linVel0T2, linProj0);
+			linProj1 = V4MulAdd(clin1Z, linVel1T2, linProj1);
+			angProj0 = V4MulAdd(cang0Z, angVel0T2, angProj0);
+			angProj1 = V4MulAdd(cang1Z, angVel1T2, angProj1);
+
+			const Vec4V projectVel0 = V4Add(linProj0, angProj0);
+			const Vec4V projectVel1 = V4Add(linProj1, angProj1);
+			
+			const Vec4V normalVel = V4Sub(projectVel0, projectVel1);
+
+
+			{
+				const PxVec4& ur				= reinterpret_cast<const PxVec4&>(unitResponse);
+				PxVec4& cConstant				= reinterpret_cast<PxVec4&>(c->constant);
+				PxVec4& cUnbiasedConstant		= reinterpret_cast<PxVec4&>(c->unbiasedConstant);
+				PxVec4& cVelMultiplier			= reinterpret_cast<PxVec4&>(c->velMultiplier);
+				PxVec4& cImpulseMultiplier		= reinterpret_cast<PxVec4&>(c->impulseMultiplier);
+
+				setConstants(cConstant.x, cUnbiasedConstant.x, cVelMultiplier.x, cImpulseMultiplier.x, 
+							 *con0, ur.x, constraintDescs[0].minResponseThreshold, erp[0], dt, recipdt, 
+							 *constraintDescs[0].data0, *constraintDescs[0].data1, a >= constraintDescs[0].numRows);
+
+				setConstants(cConstant.y, cUnbiasedConstant.y, cVelMultiplier.y, cImpulseMultiplier.y, 
+							 *con1, ur.y, constraintDescs[1].minResponseThreshold, erp[1], dt, recipdt, 
+							 *constraintDescs[1].data0, *constraintDescs[1].data1, a >= constraintDescs[1].numRows);
+				
+				setConstants(cConstant.z, cUnbiasedConstant.z, cVelMultiplier.z, cImpulseMultiplier.z, 
+							 *con2, ur.z, constraintDescs[2].minResponseThreshold, erp[2], dt, recipdt, 
+							 *constraintDescs[2].data0, *constraintDescs[2].data1, a >= constraintDescs[2].numRows);
+
+				setConstants(cConstant.w, cUnbiasedConstant.w, cVelMultiplier.w, cImpulseMultiplier.w, 
+							 *con3, ur.w, constraintDescs[3].minResponseThreshold, erp[3], dt, recipdt, 
+							 *constraintDescs[3].data0, *constraintDescs[3].data1, a >= constraintDescs[3].numRows);
+			}
+
+			const Vec4V velBias = V4Mul(c->velMultiplier, normalVel);
+			c->constant = V4Add(c->constant, velBias);
+			c->unbiasedConstant = V4Add(c->unbiasedConstant, velBias);
+
+			if(con0->flags & Px1DConstraintFlag::eOUTPUT_FORCE)
+				c->flags[0] |= DY_SC_FLAG_OUTPUT_FORCE;
+			if(con1->flags & Px1DConstraintFlag::eOUTPUT_FORCE)
+				c->flags[1] |= DY_SC_FLAG_OUTPUT_FORCE;
+			if(con2->flags & Px1DConstraintFlag::eOUTPUT_FORCE)
+				c->flags[2] |= DY_SC_FLAG_OUTPUT_FORCE;
+			if(con3->flags & Px1DConstraintFlag::eOUTPUT_FORCE)
+				c->flags[3] |= DY_SC_FLAG_OUTPUT_FORCE;
+		}
+		*(reinterpret_cast<PxU32*>(currPtr)) = 0;
+		*(reinterpret_cast<PxU32*>(currPtr + 4)) = 0;
+	}
+	
+	//OK, we're ready to allocate and solve prep these constraints now :-)
+	return SolverConstraintPrepState::eSUCCESS;
+}
+
+}
+
+}
--- a/physx/source/lowleveldynamics/src/DyContactPrep.cpp
+++ b/physx/source/lowleveldynamics/src/DyContactPrep.cpp
@ -0,0 +1,824 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+     
+#include "foundation/PxPreprocessor.h"
+#include "PxSceneDesc.h"
+#include "PsVecMath.h"
+#include "PsMathUtils.h"
+#include "DySolverContact.h"
+#include "DySolverContact4.h"
+#include "DySolverConstraintTypes.h"
+#include "PxcNpWorkUnit.h"
+#include "DyThreadContext.h"
+#include "DyContactPrep.h"
+#include "PxcNpContactPrepShared.h"
+#include "PxvDynamics.h"
+#include "DyCorrelationBuffer.h"
+#include "DyDynamics.h"
+#include "DyArticulationContactPrep.h"
+#include "PxsContactManager.h"
+#include "PsFoundation.h"
+#include "DyConstraintPrep.h"
+
+using namespace physx;
+using namespace Gu;
+
+
+#include "PsVecMath.h"
+#include "PxContactModifyCallback.h"
+#include "PxsMaterialManager.h"
+#include "PxsMaterialCombiner.h"
+#include "DyContactPrepShared.h"
+
+using namespace Ps::aos;
+
+namespace physx
+{
+namespace Dy
+{
+
+PxcCreateFinalizeSolverContactMethod createFinalizeMethods[3] =
+{
+	createFinalizeSolverContacts,
+	createFinalizeSolverContactsCoulomb1D,
+	createFinalizeSolverContactsCoulomb2D
+};
+
+
+
+static void setupFinalizeSolverConstraints(Sc::ShapeInteraction* shapeInteraction,
+						    const ContactPoint* buffer,
+							const CorrelationBuffer& c,
+							const PxTransform& bodyFrame0,
+							const PxTransform& bodyFrame1,
+							PxU8* workspace,
+							const PxSolverBodyData& data0,
+							const PxSolverBodyData& data1,
+							const PxReal invDtF32,
+							PxReal bounceThresholdF32,
+							PxReal invMassScale0, PxReal invInertiaScale0, 
+							PxReal invMassScale1, PxReal invInertiaScale1, 
+							bool hasForceThreshold, bool staticOrKinematicBody,
+							const PxReal restDist, PxU8* frictionDataPtr,
+							const PxReal maxCCDSeparation,
+							const PxReal solverOffsetSlopF32)
+{
+	// NOTE II: the friction patches are sparse (some of them have no contact patches, and
+	// therefore did not get written back to the cache) but the patch addresses are dense,
+	// corresponding to valid patches
+
+	const Vec3V solverOffsetSlop = V3Load(solverOffsetSlopF32);
+
+	const FloatV ccdMaxSeparation = FLoad(maxCCDSeparation);
+
+	PxU8 flags = PxU8(hasForceThreshold ? SolverContactHeader::eHAS_FORCE_THRESHOLDS : 0);
+
+	PxU8* PX_RESTRICT ptr = workspace;
+
+	PxU8 type = Ps::to8(staticOrKinematicBody ? DY_SC_TYPE_STATIC_CONTACT
+									                 : DY_SC_TYPE_RB_CONTACT);
+
+	const FloatV zero=FZero();
+	const Vec3V v3Zero = V3Zero();
+
+	const FloatV d0 = FLoad(invMassScale0);
+	const FloatV d1 = FLoad(invMassScale1);
+	const FloatV angD0 = FLoad(invInertiaScale0);
+	const FloatV angD1 = FLoad(invInertiaScale1);
+	
+	const FloatV nDom1fV = FNeg(d1);
+
+	const FloatV invMass0 = FLoad(data0.invMass);
+	const FloatV invMass1 = FLoad(data1.invMass);
+
+	const FloatV invMass0_dom0fV = FMul(d0, invMass0);
+	const FloatV invMass1_dom1fV = FMul(nDom1fV, invMass1);
+
+
+	Vec4V staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W = V4Zero();
+	staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W=V4SetZ(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W, invMass0_dom0fV);
+	staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W=V4SetW(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W, invMass1_dom1fV);
+
+	const FloatV restDistance = FLoad(restDist); 
+
+	const FloatV maxPenBias = FMax(FLoad(data0.penBiasClamp), FLoad(data1.penBiasClamp));
+
+	const QuatV bodyFrame0q = QuatVLoadU(&bodyFrame0.q.x);
+	const Vec3V bodyFrame0p = V3LoadU(bodyFrame0.p);
+	
+	const QuatV bodyFrame1q = QuatVLoadU(&bodyFrame1.q.x);
+	const Vec3V bodyFrame1p = V3LoadU(bodyFrame1.p);
+
+	PxU32 frictionPatchWritebackAddrIndex = 0;
+	PxU32 contactWritebackCount = 0;
+
+	Ps::prefetchLine(c.contactID);
+	Ps::prefetchLine(c.contactID, 128);
+
+	const Vec3V linVel0 = V3LoadU_SafeReadW(data0.linearVelocity);	// PT: safe because 'invMass' follows 'initialLinVel' in PxSolverBodyData
+	const Vec3V linVel1 = V3LoadU_SafeReadW(data1.linearVelocity);	// PT: safe because 'invMass' follows 'initialLinVel' in PxSolverBodyData
+	const Vec3V angVel0 = V3LoadU_SafeReadW(data0.angularVelocity);	// PT: safe because 'reportThreshold' follows 'initialAngVel' in PxSolverBodyData
+	const Vec3V angVel1 = V3LoadU_SafeReadW(data1.angularVelocity);	// PT: safe because 'reportThreshold' follows 'initialAngVel' in PxSolverBodyData
+
+	PX_ALIGN(16, const Mat33V invSqrtInertia0)
+	(
+		V3LoadU_SafeReadW(data0.sqrtInvInertia.column0),	// PT: safe because 'column1' follows 'column0' in PxMat33
+		V3LoadU_SafeReadW(data0.sqrtInvInertia.column1),	// PT: safe because 'column2' follows 'column1' in PxMat33
+		V3LoadU(data0.sqrtInvInertia.column2)
+	);
+	
+	PX_ALIGN(16, const Mat33V invSqrtInertia1)
+	(
+		V3LoadU_SafeReadW(data1.sqrtInvInertia.column0),	// PT: safe because 'column1' follows 'column0' in PxMat33
+		V3LoadU_SafeReadW(data1.sqrtInvInertia.column1),	// PT: safe because 'column2' follows 'column1' in PxMat33
+		V3LoadU(data1.sqrtInvInertia.column2)
+	);
+
+	const FloatV invDt = FLoad(invDtF32);
+	const FloatV p8 = FLoad(0.8f);
+	const FloatV bounceThreshold = FLoad(bounceThresholdF32);
+
+	const FloatV invDtp8 = FMul(invDt, p8);
+
+
+	for(PxU32 i=0;i<c.frictionPatchCount;i++)
+	{
+		PxU32 contactCount = c.frictionPatchContactCounts[i];
+		if(contactCount == 0)
+			continue;
+
+		const FrictionPatch& frictionPatch = c.frictionPatches[i];
+		PX_ASSERT(frictionPatch.anchorCount <= 2);
+
+		PxU32 firstPatch = c.correlationListHeads[i];
+		const Gu::ContactPoint* contactBase0 = buffer + c.contactPatches[firstPatch].start;
+
+		const PxReal combinedRestitution = contactBase0->restitution;
+		
+		SolverContactHeader* PX_RESTRICT header = reinterpret_cast<SolverContactHeader*>(ptr);
+		ptr += sizeof(SolverContactHeader);		
+
+
+		Ps::prefetchLine(ptr, 128);
+		Ps::prefetchLine(ptr, 256);
+
+		header->shapeInteraction = shapeInteraction;
+		header->flags = flags;
+		FStore(invMass0_dom0fV, &header->invMass0);
+		FStore(FNeg(invMass1_dom1fV), &header->invMass1);
+		const FloatV restitution = FLoad(combinedRestitution);
+	
+		PxU32 pointStride = sizeof(SolverContactPoint);
+		PxU32 frictionStride = sizeof(SolverContactFriction);
+
+		const Vec3V normal = V3LoadA(buffer[c.contactPatches[c.correlationListHeads[i]].start].normal);
+		const FloatV normalLenSq = V3LengthSq(normal);
+		const VecCrossV norCross = V3PrepareCross(normal);
+		const FloatV norVel = V3SumElems(V3NegMulSub(normal, linVel1, V3Mul(normal, linVel0)));
+
+		const FloatV invMassNorLenSq0 = FMul(invMass0_dom0fV, normalLenSq);
+		const FloatV invMassNorLenSq1 = FMul(invMass1_dom1fV, normalLenSq);
+
+		header->normal_minAppliedImpulseForFrictionW = Vec4V_From_Vec3V(normal);
+		
+		for(PxU32 patch=c.correlationListHeads[i]; 
+			patch!=CorrelationBuffer::LIST_END; 
+			patch = c.contactPatches[patch].next)
+		{
+			const PxU32 count = c.contactPatches[patch].count;
+			const Gu::ContactPoint* contactBase = buffer + c.contactPatches[patch].start;
+				
+			PxU8* p = ptr;
+			
+			for(PxU32 j=0;j<count;j++)
+			{
+				Ps::prefetchLine(p, 256);
+				const Gu::ContactPoint& contact = contactBase[j];
+
+				SolverContactPoint* PX_RESTRICT solverContact = reinterpret_cast<SolverContactPoint*>(p);
+				p += pointStride;
+
+				constructContactConstraint(invSqrtInertia0, invSqrtInertia1, invMassNorLenSq0, 
+					invMassNorLenSq1, angD0, angD1, bodyFrame0p, bodyFrame1p,
+					normal, norVel, norCross, angVel0, angVel1,
+					invDt, invDtp8, restDistance, maxPenBias,  restitution,
+					bounceThreshold, contact, *solverContact,
+					ccdMaxSeparation, solverOffsetSlop);
+			}
+
+			ptr = p;
+		}
+		contactWritebackCount += contactCount;
+
+		PxF32* forceBuffers = reinterpret_cast<PxF32*>(ptr);
+		PxMemZero(forceBuffers, sizeof(PxF32) * contactCount);
+		ptr += ((contactCount + 3) & (~3)) * sizeof(PxF32); // jump to next 16-byte boundary
+		
+		const PxReal frictionCoefficient = (contactBase0->materialFlags & PxMaterialFlag::eIMPROVED_PATCH_FRICTION && frictionPatch.anchorCount == 2) ? 0.5f : 1.f;
+
+		const PxReal staticFriction = contactBase0->staticFriction * frictionCoefficient;
+		const PxReal dynamicFriction = contactBase0->dynamicFriction* frictionCoefficient;
+
+		const bool disableStrongFriction = !!(contactBase0->materialFlags & PxMaterialFlag::eDISABLE_FRICTION);
+		staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W=V4SetX(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W, FLoad(staticFriction));
+		staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W=V4SetY(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W, FLoad(dynamicFriction));
+
+		const bool haveFriction = (disableStrongFriction == 0 && frictionPatch.anchorCount != 0) ;//PX_IR(n.staticFriction) > 0 || PX_IR(n.dynamicFriction) > 0;
+		header->numNormalConstr		= Ps::to8(contactCount);
+		header->numFrictionConstr	= Ps::to8(haveFriction ? frictionPatch.anchorCount*2 : 0);
+	
+		header->type				= type;
+
+		header->staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W = staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W;
+		FStore(angD0, &header->angDom0);
+		FStore(angD1, &header->angDom1);
+
+		header->broken = 0;
+
+		if(haveFriction)
+		{
+			const Vec3V linVrel = V3Sub(linVel0, linVel1);
+			//const Vec3V normal = Vec3V_From_PxVec3_Aligned(buffer.contacts[c.contactPatches[c.correlationListHeads[i]].start].normal);
+
+			const FloatV orthoThreshold = FLoad(0.70710678f);
+			const FloatV p1 = FLoad(0.0001f);
+			// fallback: normal.cross((1,0,0)) or normal.cross((0,0,1))
+			const FloatV normalX = V3GetX(normal);
+			const FloatV normalY = V3GetY(normal);
+			const FloatV normalZ = V3GetZ(normal);
+			
+			Vec3V t0Fallback1 = V3Merge(zero, FNeg(normalZ), normalY);
+			Vec3V t0Fallback2 = V3Merge(FNeg(normalY), normalX, zero) ;
+			Vec3V t0Fallback = V3Sel(FIsGrtr(orthoThreshold, FAbs(normalX)), t0Fallback1, t0Fallback2);
+
+			Vec3V t0 = V3Sub(linVrel, V3Scale(normal, V3Dot(normal, linVrel)));
+			t0 = V3Sel(FIsGrtr(V3LengthSq(t0), p1), t0, t0Fallback);
+			t0 = V3Normalize(t0);
+
+			const VecCrossV t0Cross = V3PrepareCross(t0);
+
+			const Vec3V t1 = V3Cross(norCross, t0Cross);
+			const VecCrossV t1Cross = V3PrepareCross(t1);
+
+			
+			// since we don't even have the body velocities we can't compute the tangent dirs, so 
+			// the only thing we can do right now is to write the geometric information (which is the
+			// same for both axis constraints of an anchor) We put ra in the raXn field, rb in the rbXn
+			// field, and the error in the normal field. See corresponding comments in
+			// completeContactFriction()
+
+			//We want to set the writeBack ptr to point to the broken flag of the friction patch.
+			//On spu we have a slight problem here because the friction patch array is 
+			//in local store rather than in main memory. The good news is that the address of the friction 
+			//patch array in main memory is stored in the work unit. These two addresses will be equal 
+			//except on spu where one is local store memory and the other is the effective address in main memory.
+			//Using the value stored in the work unit guarantees that the main memory address is used on all platforms.
+			PxU8* PX_RESTRICT writeback = frictionDataPtr + frictionPatchWritebackAddrIndex*sizeof(FrictionPatch);
+
+			header->frictionBrokenWritebackByte = writeback;
+
+			for(PxU32 j = 0; j < frictionPatch.anchorCount; j++)
+			{
+				Ps::prefetchLine(ptr, 256);
+				Ps::prefetchLine(ptr, 384);
+				SolverContactFriction* PX_RESTRICT f0 = reinterpret_cast<SolverContactFriction*>(ptr);
+				ptr += frictionStride;
+				SolverContactFriction* PX_RESTRICT f1 = reinterpret_cast<SolverContactFriction*>(ptr);
+				ptr += frictionStride;
+
+				Vec3V body0Anchor = V3LoadU(frictionPatch.body0Anchors[j]);
+				Vec3V body1Anchor = V3LoadU(frictionPatch.body1Anchors[j]);
+
+				const Vec3V ra = QuatRotate(bodyFrame0q, body0Anchor);
+				const Vec3V rb = QuatRotate(bodyFrame1q, body1Anchor);
+
+				/*ra = V3Sel(V3IsGrtr(solverOffsetSlop, V3Abs(ra)), v3Zero, ra);
+				rb = V3Sel(V3IsGrtr(solverOffsetSlop, V3Abs(rb)), v3Zero, rb);*/
+
+				PxU32 index = c.contactID[i][j];
+				Vec3V error = V3Sub(V3Add(ra, bodyFrame0p), V3Add(rb, bodyFrame1p));
+				error = V3Sel(V3IsGrtr(solverOffsetSlop, V3Abs(error)), v3Zero, error);
+
+				index = index == 0xFFFF ? c.contactPatches[c.correlationListHeads[i]].start : index;
+
+				const Vec3V tvel = V3LoadA(buffer[index].targetVel);
+				
+				{
+					Vec3V raXn = V3Cross(ra, t0Cross);
+					Vec3V rbXn = V3Cross(rb, t0Cross);
+
+					raXn = V3Sel(V3IsGrtr(solverOffsetSlop, V3Abs(raXn)), V3Zero(), raXn);
+					rbXn = V3Sel(V3IsGrtr(solverOffsetSlop, V3Abs(rbXn)), V3Zero(), rbXn);
+
+					const Vec3V raXnSqrtInertia = M33MulV3(invSqrtInertia0, raXn);
+					const Vec3V rbXnSqrtInertia = M33MulV3(invSqrtInertia1, rbXn);	
+
+
+					const FloatV resp0 = FAdd(invMass0_dom0fV, FMul(angD0, V3Dot(raXnSqrtInertia, raXnSqrtInertia)));
+					const FloatV resp1 = FSub(FMul(angD1, V3Dot(rbXnSqrtInertia, rbXnSqrtInertia)), invMass1_dom1fV);
+					const FloatV resp = FAdd(resp0, resp1);
+
+					const FloatV velMultiplier = FSel(FIsGrtr(resp, zero), FDiv(p8, resp), zero);
+
+					FloatV targetVel = V3Dot(tvel, t0);
+
+					const FloatV vrel1 = FAdd(V3Dot(t0, linVel0), V3Dot(raXn, angVel0));
+					const FloatV vrel2 = FAdd(V3Dot(t0, linVel1), V3Dot(rbXn, angVel1));
+					const FloatV vrel = FSub(vrel1, vrel2);
+
+					targetVel = FSub(targetVel, vrel);
+
+					f0->normalXYZ_appliedForceW = V4SetW(t0, zero);
+					f0->raXnXYZ_velMultiplierW = V4SetW(raXnSqrtInertia, velMultiplier);
+					f0->rbXnXYZ_biasW = V4SetW(rbXnSqrtInertia, FMul(V3Dot(t0, error), invDt));
+					FStore(targetVel, &f0->targetVel);
+				}
+
+				{
+
+					Vec3V raXn = V3Cross(ra, t1Cross);
+					Vec3V rbXn = V3Cross(rb, t1Cross);
+
+					raXn = V3Sel(V3IsGrtr(solverOffsetSlop, V3Abs(raXn)), V3Zero(), raXn);
+					rbXn = V3Sel(V3IsGrtr(solverOffsetSlop, V3Abs(rbXn)), V3Zero(), rbXn);
+
+					const Vec3V raXnSqrtInertia = M33MulV3(invSqrtInertia0, raXn);
+					const Vec3V rbXnSqrtInertia = M33MulV3(invSqrtInertia1, rbXn);	
+
+					const FloatV resp0 = FAdd(invMass0_dom0fV, FMul(angD0, V3Dot(raXnSqrtInertia, raXnSqrtInertia)));
+					const FloatV resp1 = FSub(FMul(angD1, V3Dot(rbXnSqrtInertia, rbXnSqrtInertia)), invMass1_dom1fV);
+					const FloatV resp = FAdd(resp0, resp1);
+
+					const FloatV velMultiplier = FSel(FIsGrtr(resp, zero), FDiv(p8, resp), zero);
+
+					FloatV targetVel = V3Dot(tvel, t1);
+
+					const FloatV vrel1 = FAdd(V3Dot(t1, linVel0), V3Dot(raXn, angVel0));
+					const FloatV vrel2 = FAdd(V3Dot(t1, linVel1), V3Dot(rbXn, angVel1));
+					const FloatV vrel = FSub(vrel1, vrel2);
+
+					targetVel = FSub(targetVel, vrel);
+
+					f1->normalXYZ_appliedForceW = V4SetW(t1, zero);
+					f1->raXnXYZ_velMultiplierW = V4SetW(raXnSqrtInertia, velMultiplier);
+					f1->rbXnXYZ_biasW = V4SetW(rbXnSqrtInertia, FMul(V3Dot(t1, error), invDt));
+					FStore(targetVel, &f1->targetVel);
+				}
+			}
+		}
+
+		frictionPatchWritebackAddrIndex++;
+	}
+}
+
+
+PX_FORCE_INLINE void computeBlockStreamByteSizes(const bool useExtContacts, const CorrelationBuffer& c,
+								PxU32& _solverConstraintByteSize, PxU32& _frictionPatchByteSize, PxU32& _numFrictionPatches,
+								PxU32& _axisConstraintCount)
+{
+	PX_ASSERT(0 == _solverConstraintByteSize);
+	PX_ASSERT(0 == _frictionPatchByteSize);
+	PX_ASSERT(0 == _numFrictionPatches);
+	PX_ASSERT(0 == _axisConstraintCount);
+
+	// PT: use local vars to remove LHS
+	PxU32 solverConstraintByteSize = 0;
+	PxU32 numFrictionPatches = 0;
+	PxU32 axisConstraintCount = 0;
+
+	
+	for(PxU32 i = 0; i < c.frictionPatchCount; i++)
+	{
+		//Friction patches.
+		if(c.correlationListHeads[i] != CorrelationBuffer::LIST_END)
+			numFrictionPatches++;
+
+		const FrictionPatch& frictionPatch = c.frictionPatches[i];
+
+		const bool haveFriction = (frictionPatch.materialFlags & PxMaterialFlag::eDISABLE_FRICTION) == 0;
+
+		//Solver constraint data.
+		if(c.frictionPatchContactCounts[i]!=0)
+		{
+			solverConstraintByteSize += sizeof(SolverContactHeader);
+			solverConstraintByteSize += useExtContacts ? c.frictionPatchContactCounts[i] * sizeof(SolverContactPointExt) 
+				: c.frictionPatchContactCounts[i] * sizeof(SolverContactPoint);
+			solverConstraintByteSize += sizeof(PxF32) * ((c.frictionPatchContactCounts[i] + 3)&(~3)); //Add on space for applied impulses
+
+			axisConstraintCount += c.frictionPatchContactCounts[i];
+
+			if(haveFriction)
+			{
+				solverConstraintByteSize += useExtContacts ? c.frictionPatches[i].anchorCount * 2 * sizeof(SolverContactFrictionExt)
+					: c.frictionPatches[i].anchorCount * 2 * sizeof(SolverContactFriction);
+				axisConstraintCount += c.frictionPatches[i].anchorCount * 2;
+
+			}
+		}
+	}
+	PxU32 frictionPatchByteSize = numFrictionPatches*sizeof(FrictionPatch);
+
+	_numFrictionPatches = numFrictionPatches;
+	_axisConstraintCount = axisConstraintCount;
+
+	//16-byte alignment.
+	_frictionPatchByteSize = ((frictionPatchByteSize + 0x0f) & ~0x0f);
+	_solverConstraintByteSize =  ((solverConstraintByteSize + 0x0f) & ~0x0f);
+	PX_ASSERT(0 == (_solverConstraintByteSize & 0x0f));
+	PX_ASSERT(0 == (_frictionPatchByteSize & 0x0f));
+}
+
+static bool reserveBlockStreams(const bool useExtContacts, Dy::CorrelationBuffer& cBuffer,
+						PxU8*& solverConstraint,
+						FrictionPatch*& _frictionPatches,
+						PxU32& numFrictionPatches, PxU32& solverConstraintByteSize,
+						PxU32& axisConstraintCount, PxConstraintAllocator& constraintAllocator)
+{
+	PX_ASSERT(NULL == solverConstraint);
+	PX_ASSERT(NULL == _frictionPatches);
+	PX_ASSERT(0 == numFrictionPatches);
+	PX_ASSERT(0 == solverConstraintByteSize);
+	PX_ASSERT(0 == axisConstraintCount);
+
+	//From frictionPatchStream we just need to reserve a single buffer.
+	PxU32 frictionPatchByteSize = 0;
+	//Compute the sizes of all the buffers.
+	computeBlockStreamByteSizes(
+		useExtContacts, cBuffer,
+		solverConstraintByteSize, frictionPatchByteSize, numFrictionPatches,
+		axisConstraintCount);
+
+	//Reserve the buffers.
+
+	//First reserve the accumulated buffer size for the constraint block.
+	PxU8* constraintBlock = NULL;
+	const PxU32 constraintBlockByteSize = solverConstraintByteSize;
+	if(constraintBlockByteSize > 0)
+	{
+		constraintBlock = constraintAllocator.reserveConstraintData(constraintBlockByteSize + 16u);
+
+		if(0==constraintBlock || (reinterpret_cast<PxU8*>(-1))==constraintBlock)
+		{
+			if(0==constraintBlock)
+			{
+				PX_WARN_ONCE(
+					"Reached limit set by PxSceneDesc::maxNbContactDataBlocks - ran out of buffer space for constraint prep. "
+					"Either accept dropped contacts or increase buffer size allocated for narrow phase by increasing PxSceneDesc::maxNbContactDataBlocks.");
+			}
+			else
+			{
+				PX_WARN_ONCE(
+					"Attempting to allocate more than 16K of contact data for a single contact pair in constraint prep. "
+					"Either accept dropped contacts or simplify collision geometry.");
+				constraintBlock=NULL;
+			}
+		}
+		PX_ASSERT((size_t(constraintBlock) & 0xF) == 0);
+	}
+
+	FrictionPatch* frictionPatches = NULL;
+	//If the constraint block reservation didn't fail then reserve the friction buffer too.
+	if(frictionPatchByteSize >0 && (0==constraintBlockByteSize || constraintBlock))
+	{
+		frictionPatches = reinterpret_cast<FrictionPatch*>(constraintAllocator.reserveFrictionData(frictionPatchByteSize));
+
+		if(0==frictionPatches || (reinterpret_cast<FrictionPatch*>(-1))==frictionPatches)
+		{
+			if(0==frictionPatches)
+			{
+				PX_WARN_ONCE(
+					"Reached limit set by PxSceneDesc::maxNbContactDataBlocks - ran out of buffer space for constraint prep. "
+					"Either accept dropped contacts or increase buffer size allocated for narrow phase by increasing PxSceneDesc::maxNbContactDataBlocks.");
+			}
+			else
+			{
+				PX_WARN_ONCE(
+					"Attempting to allocate more than 16K of friction data for a single contact pair in constraint prep. "
+					"Either accept dropped contacts or simplify collision geometry.");
+				frictionPatches=NULL;
+			}
+		}
+	}
+
+	_frictionPatches = frictionPatches;
+
+	//Patch up the individual ptrs to the buffer returned by the constraint block reservation (assuming the reservation didn't fail).
+	if(0==constraintBlockByteSize || constraintBlock)
+	{
+		if(solverConstraintByteSize)
+		{
+			solverConstraint = constraintBlock;
+			PX_ASSERT(0==(uintptr_t(solverConstraint) & 0x0f));
+		}
+	}
+
+	//Return true if neither of the two block reservations failed.
+	return ((0==constraintBlockByteSize || constraintBlock) && (0==frictionPatchByteSize || frictionPatches));
+}
+
+
+bool createFinalizeSolverContacts(
+	PxSolverContactDesc& contactDesc,
+	CorrelationBuffer& c,
+	const PxReal invDtF32,
+	PxReal bounceThresholdF32,
+	PxReal frictionOffsetThreshold,
+	PxReal correlationDistance,
+	PxReal solverOffsetSlop,
+	PxConstraintAllocator& constraintAllocator,
+	Cm::SpatialVectorF* Z)
+{
+	Ps::prefetchLine(contactDesc.body0);
+	Ps::prefetchLine(contactDesc.body1);
+	Ps::prefetchLine(contactDesc.data0);
+	Ps::prefetchLine(contactDesc.data1);
+
+	c.frictionPatchCount = 0;
+	c.contactPatchCount = 0;
+
+	const bool hasForceThreshold = contactDesc.hasForceThresholds;
+	const bool staticOrKinematicBody = contactDesc.bodyState1 == PxSolverContactDesc::eKINEMATIC_BODY || contactDesc.bodyState1 == PxSolverContactDesc::eSTATIC_BODY;
+
+	const bool disableStrongFriction = contactDesc.disableStrongFriction;
+	const bool useExtContacts = ((contactDesc.bodyState0 | contactDesc.bodyState1) & PxSolverContactDesc::eARTICULATION) != 0;
+
+	PxSolverConstraintDesc& desc = *contactDesc.desc;
+
+	desc.constraintLengthOver16 = 0;
+
+
+	if (contactDesc.numContacts == 0)
+	{
+		contactDesc.frictionPtr = NULL;
+		contactDesc.frictionCount = 0;
+		desc.constraint = NULL;
+		return true;
+	}
+
+	if (!disableStrongFriction)
+	{
+		getFrictionPatches(c, contactDesc.frictionPtr, contactDesc.frictionCount, contactDesc.bodyFrame0, contactDesc.bodyFrame1, correlationDistance);
+	}
+
+	bool overflow = !createContactPatches(c, contactDesc.contacts, contactDesc.numContacts, PXC_SAME_NORMAL);
+	overflow = correlatePatches(c, contactDesc.contacts, contactDesc.bodyFrame0, contactDesc.bodyFrame1, PXC_SAME_NORMAL, 0, 0) || overflow;
+	PX_UNUSED(overflow);
+
+#if PX_CHECKED
+	if (overflow)
+	{
+		Ps::getFoundation().error(physx::PxErrorCode::eDEBUG_WARNING, __FILE__, __LINE__,
+			"Dropping contacts in solver because we exceeded limit of 32 friction patches.");
+	}
+#endif
+
+	growPatches(c, contactDesc.contacts, contactDesc.bodyFrame0, contactDesc.bodyFrame1, correlationDistance, 0, frictionOffsetThreshold + contactDesc.restDistance);
+
+	//PX_ASSERT(patchCount == c.frictionPatchCount);
+
+	FrictionPatch* frictionPatches = NULL;
+	PxU8* solverConstraint = NULL;
+	PxU32 numFrictionPatches = 0;
+	PxU32 solverConstraintByteSize = 0;
+	PxU32 axisConstraintCount = 0;
+
+	const bool successfulReserve = reserveBlockStreams(
+		useExtContacts, c,
+		solverConstraint, frictionPatches,
+		numFrictionPatches,
+		solverConstraintByteSize,
+		axisConstraintCount,
+		constraintAllocator);
+	// initialise the work unit's ptrs to the various buffers.
+
+	contactDesc.frictionPtr = NULL;
+	contactDesc.frictionCount = 0;
+	desc.constraint = NULL;
+	desc.constraintLengthOver16 = 0;
+	// patch up the work unit with the reserved buffers and set the reserved buffer data as appropriate.
+
+	if (successfulReserve)
+	{
+		PxU8* frictionDataPtr = reinterpret_cast<PxU8*>(frictionPatches);
+		contactDesc.frictionPtr = frictionDataPtr;
+		desc.constraint = solverConstraint;
+		//output.nbContacts = Ps::to8(numContacts);
+		contactDesc.frictionCount = Ps::to8(numFrictionPatches);
+		desc.constraintLengthOver16 = Ps::to16(solverConstraintByteSize / 16);
+		desc.writeBack = contactDesc.contactForces;
+		desc.writeBackLengthOver4 = PxU16(contactDesc.contactForces ? contactDesc.numContacts : 0);
+
+		//Initialise friction buffer.
+		if (frictionPatches)
+		{
+			// PT: TODO: revisit this... not very satisfying
+			//const PxU32 maxSize = numFrictionPatches*sizeof(FrictionPatch);
+			Ps::prefetchLine(frictionPatches);
+			Ps::prefetchLine(frictionPatches, 128);
+			Ps::prefetchLine(frictionPatches, 256);
+
+			for (PxU32 i = 0; i<c.frictionPatchCount; i++)
+			{
+				//if(c.correlationListHeads[i]!=CorrelationBuffer::LIST_END)
+				if (c.frictionPatchContactCounts[i])
+				{
+					*frictionPatches++ = c.frictionPatches[i];
+					Ps::prefetchLine(frictionPatches, 256);
+				}
+			}
+		}
+
+		//Initialise solverConstraint buffer.
+		if (solverConstraint)
+		{
+			const PxSolverBodyData& data0 = *contactDesc.data0;
+			const PxSolverBodyData& data1 = *contactDesc.data1;
+			if (useExtContacts)
+			{
+				const SolverExtBody b0(reinterpret_cast<const void*>(contactDesc.body0), reinterpret_cast<const void*>(&data0), desc.linkIndexA);
+				const SolverExtBody b1(reinterpret_cast<const void*>(contactDesc.body1), reinterpret_cast<const void*>(&data1), desc.linkIndexB);
+
+				setupFinalizeExtSolverContacts(contactDesc.contacts, c, contactDesc.bodyFrame0, contactDesc.bodyFrame1, solverConstraint,
+					b0, b1, invDtF32, bounceThresholdF32,
+					contactDesc.invMassScales.linear0, contactDesc.invMassScales.angular0, contactDesc.invMassScales.linear1, contactDesc.invMassScales.angular1, 
+					contactDesc.restDistance, frictionDataPtr, contactDesc.maxCCDSeparation, Z);
+			}
+			else
+			{
+				setupFinalizeSolverConstraints(contactDesc.shapeInteraction, contactDesc.contacts, c, contactDesc.bodyFrame0, contactDesc.bodyFrame1, solverConstraint,
+					data0, data1, invDtF32, bounceThresholdF32,
+					contactDesc.invMassScales.linear0, contactDesc.invMassScales.angular0, contactDesc.invMassScales.linear1, contactDesc.invMassScales.angular1, 
+					hasForceThreshold, staticOrKinematicBody, contactDesc.restDistance, frictionDataPtr, contactDesc.maxCCDSeparation, solverOffsetSlop);
+			}
+			//KS - set to 0 so we have a counter for the number of times we solved the constraint
+			//only going to be used on SPU but might as well set on all platforms because this code is shared
+			*(reinterpret_cast<PxU32*>(solverConstraint + solverConstraintByteSize)) = 0;
+		}
+	}
+
+	return successfulReserve;
+}
+
+FloatV setupExtSolverContact(const SolverExtBody& b0, const SolverExtBody& b1,
+	const FloatV& d0, const FloatV& d1, const FloatV& angD0, const FloatV& angD1, const Vec3V& bodyFrame0p, const Vec3V& bodyFrame1p,
+	const Vec3VArg normal, const FloatVArg invDt, const FloatVArg invDtp8, const FloatVArg restDistance, const FloatVArg maxPenBias, const FloatVArg restitution,
+	const FloatVArg bounceThreshold, const Gu::ContactPoint& contact, SolverContactPointExt& solverContact, const FloatVArg ccdMaxSeparation, Cm::SpatialVectorF* zVector,
+	const Cm::SpatialVectorV& v0, const Cm::SpatialVectorV& v1)
+{
+	const FloatV zero = FZero();
+	const FloatV separation = FLoad(contact.separation);
+
+	const FloatV penetration = FSub(separation, restDistance);
+
+	const Vec3V point = V3LoadA(contact.point);
+
+	const Vec3V ra = V3Sub(point, bodyFrame0p);
+	const Vec3V rb = V3Sub(point, bodyFrame1p);
+
+	const Vec3V raXn = V3Cross(ra, normal);
+	const Vec3V rbXn = V3Cross(rb, normal);
+
+	Cm::SpatialVectorV deltaV0, deltaV1;
+
+	const Cm::SpatialVectorV resp0 = createImpulseResponseVector(normal, raXn, b0);
+	const Cm::SpatialVectorV resp1 = createImpulseResponseVector(V3Neg(normal), V3Neg(rbXn), b1);
+
+	const FloatV unitResponse = getImpulseResponse(b0, resp0, deltaV0, d0, angD0,
+		b1, resp1, deltaV1, d1, angD1, reinterpret_cast<Cm::SpatialVectorV*>(zVector));
+
+
+
+	const Vec3V vel0 = V3Add(V3Mul(v0.linear, normal), V3Mul(v0.angular, raXn));
+	const Vec3V vel1 = V3Add(V3Mul(v1.linear, normal), V3Mul(v1.angular, rbXn));
+
+	const FloatV vrel = V3SumElems(V3Sub(vel0, vel1));
+
+	FloatV velMultiplier = FSel(FIsGrtr(FLoad(DY_ARTICULATION_MIN_RESPONSE), unitResponse), zero, FRecip(FAdd(unitResponse, FLoad(DY_ARTICULATION_CFM))));
+	//FloatV velMultiplier = FSel(FIsGrtr(FEps(), unitResponse), zero, FRecip(unitResponse));
+	FloatV scaledBias = FMul(velMultiplier, FMax(maxPenBias, FMul(penetration, invDtp8)));
+	const FloatV penetrationInvDt = FMul(penetration, invDt);
+
+	const BoolV isGreater2 = BAnd(BAnd(FIsGrtr(restitution, zero), FIsGrtr(bounceThreshold, vrel)), FIsGrtr(FNeg(vrel), penetrationInvDt));
+
+	const BoolV ccdSeparationCondition = FIsGrtrOrEq(ccdMaxSeparation, penetration);
+
+	scaledBias = FSel(BAnd(ccdSeparationCondition, isGreater2), zero, scaledBias);
+
+	FloatV targetVelocity = FSel(isGreater2, FMul(FNeg(vrel), restitution), zero);
+
+	//Get the rigid body's current velocity and embed into the constraint target velocities
+	if (b0.mLinkIndex == PxSolverConstraintDesc::NO_LINK)
+		targetVelocity = FSub(targetVelocity, V3SumElems(vel0));
+	else if (b1.mLinkIndex == PxSolverConstraintDesc::NO_LINK)
+		targetVelocity = FAdd(targetVelocity, V3SumElems(vel1));
+
+	targetVelocity = FAdd(targetVelocity, V3Dot(V3LoadA(contact.targetVel), normal));
+
+	const FloatV biasedErr = FScaleAdd(targetVelocity, velMultiplier, FNeg(scaledBias));
+	const FloatV unbiasedErr = FScaleAdd(targetVelocity, velMultiplier, FSel(isGreater2, zero, FNeg(FMax(scaledBias, zero))));
+
+	const FloatV deltaF = FMax(FNegScaleSub(vrel, velMultiplier, biasedErr), zero);
+
+
+	FStore(velMultiplier, &solverContact.velMultiplier);
+	FStore(biasedErr, &solverContact.biasedErr);
+	FStore(unbiasedErr, &solverContact.unbiasedErr);
+	solverContact.maxImpulse = contact.maxImpulse;
+
+	solverContact.raXn = resp0.angular;
+	solverContact.rbXn = V3Neg(resp1.angular);
+	solverContact.linDeltaVA = deltaV0.linear;
+	solverContact.angDeltaVA = deltaV0.angular;
+	solverContact.linDeltaVB = deltaV1.linear;
+	solverContact.angDeltaVB = deltaV1.angular;
+
+	return deltaF;
+}
+
+
+bool createFinalizeSolverContacts(PxSolverContactDesc& contactDesc,
+								  PxsContactManagerOutput& output,
+								 ThreadContext& threadContext,
+								 const PxReal invDtF32,
+								 PxReal bounceThresholdF32,
+								 PxReal frictionOffsetThreshold,
+								 PxReal correlationDistance,
+								 PxReal solverOffsetSlop,
+								 PxConstraintAllocator& constraintAllocator,
+								 Cm::SpatialVectorF* Z)
+{
+	ContactBuffer& buffer = threadContext.mContactBuffer;
+
+	
+
+	buffer.count = 0;
+
+	// We pull the friction patches out of the cache to remove the dependency on how
+	// the cache is organized. Remember original addrs so we can write them back 
+	// efficiently.
+
+	PxU32 numContacts = 0;
+	{
+		PxReal invMassScale0 = 1.f;
+		PxReal invMassScale1 = 1.f;
+		PxReal invInertiaScale0 = 1.f;
+		PxReal invInertiaScale1 = 1.f;
+
+		bool hasMaxImpulse = false, hasTargetVelocity = false;
+
+		numContacts = extractContacts(buffer, output, hasMaxImpulse, hasTargetVelocity, invMassScale0, invMassScale1,
+			invInertiaScale0, invInertiaScale1, PxMin(contactDesc.data0->maxContactImpulse, contactDesc.data1->maxContactImpulse));
+
+		contactDesc.contacts = buffer.contacts;
+		contactDesc.numContacts = numContacts;
+		contactDesc.disableStrongFriction = contactDesc.disableStrongFriction || hasTargetVelocity;
+		contactDesc.hasMaxImpulse = hasMaxImpulse;
+		contactDesc.invMassScales.linear0 *= invMassScale0;
+		contactDesc.invMassScales.linear1 *= invMassScale1;
+		contactDesc.invMassScales.angular0 *= invInertiaScale0;
+		contactDesc.invMassScales.angular1 *= invInertiaScale1;
+	}
+	
+	CorrelationBuffer& c = threadContext.mCorrelationBuffer;
+
+	return createFinalizeSolverContacts(contactDesc, c, invDtF32, bounceThresholdF32, frictionOffsetThreshold, 
+		correlationDistance, solverOffsetSlop, constraintAllocator, Z);
+}
+  
+PxU32 getContactManagerConstraintDesc(const PxsContactManagerOutput& cmOutput, const PxsContactManager& /*cm*/, PxSolverConstraintDesc& desc)
+{
+	desc.writeBackLengthOver4 = cmOutput.nbContacts;
+	desc.writeBack = cmOutput.contactForces;
+	return cmOutput.nbContacts;// cm.getWorkUnit().axisConstraintCount;
+}
+
+}
+
+}
+
--- a/physx/source/lowleveldynamics/src/DyContactPrep.h
+++ b/physx/source/lowleveldynamics/src/DyContactPrep.h
@ -0,0 +1,213 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef DY_CONTACTPREP_H
+#define DY_CONTACTPREP_H
+
+#include "DySolverConstraintDesc.h"
+#include "PxSceneDesc.h"
+#include "DySolverContact4.h"
+
+namespace physx
+{
+
+struct PxcNpWorkUnit;
+class PxsConstraintBlockManager;
+class PxcConstraintBlockStream;
+struct PxsContactManagerOutput;
+class FrictionPatchStreamPair;
+struct PxSolverBody;
+struct PxSolverBodyData;
+struct PxSolverConstraintDesc;
+class PxsContactManager;
+
+namespace Dy
+{
+	class ThreadContext;
+	struct CorrelationBuffer;
+
+#define CREATE_FINALIZE_SOLVER_CONTACT_METHOD_ARGS			\
+	PxSolverContactDesc& contactDesc,						\
+	PxsContactManagerOutput& output,						\
+	ThreadContext& threadContext,							\
+	const PxReal invDtF32,									\
+	PxReal bounceThresholdF32,								\
+	PxReal frictionOffsetThreshold,							\
+	PxReal correlationDistance,								\
+	PxReal solverOffsetSlop,								\
+	PxConstraintAllocator& constraintAllocator,				\
+	Cm::SpatialVectorF* Z
+
+#define CREATE_FINALIZE_SOVLER_CONTACT_METHOD_ARGS_4									\
+								 PxsContactManagerOutput** outputs,						\
+								 ThreadContext& threadContext,							\
+								 PxSolverContactDesc* blockDescs,						\
+								 const PxReal invDtF32,									\
+								 PxReal bounceThresholdF32,								\
+								 PxReal	frictionThresholdF32,							\
+								 PxReal	correlationDistanceF32,							\
+								 PxReal solverOffsetSlopF32,							\
+								 PxConstraintAllocator& constraintAllocator				
+
+	
+/*!
+Method prototype for create finalize solver contact
+*/
+
+typedef	bool (*PxcCreateFinalizeSolverContactMethod)(CREATE_FINALIZE_SOLVER_CONTACT_METHOD_ARGS);
+
+extern PxcCreateFinalizeSolverContactMethod createFinalizeMethods[3];
+
+typedef	SolverConstraintPrepState::Enum (*PxcCreateFinalizeSolverContactMethod4)(CREATE_FINALIZE_SOVLER_CONTACT_METHOD_ARGS_4);
+
+extern PxcCreateFinalizeSolverContactMethod4 createFinalizeMethods4[3];
+
+
+bool createFinalizeSolverContacts(	PxSolverContactDesc& contactDesc,
+									PxsContactManagerOutput& output,
+									ThreadContext& threadContext,
+									const PxReal invDtF32,
+									PxReal bounceThresholdF32,
+									PxReal frictionOffsetThreshold,
+									PxReal correlationDistance,
+									PxReal solverOffsetSlop,
+									PxConstraintAllocator& constraintAllocator,
+									Cm::SpatialVectorF* Z);
+
+bool createFinalizeSolverContacts(	PxSolverContactDesc& contactDesc,
+									CorrelationBuffer& c,
+									const PxReal invDtF32,
+									PxReal bounceThresholdF32,
+									PxReal frictionOffsetThreshold,
+									PxReal correlationDistance,
+									PxReal solverOffsetSlop,
+									PxConstraintAllocator& constraintAllocator,
+									Cm::SpatialVectorF* Z);
+
+SolverConstraintPrepState::Enum createFinalizeSolverContacts4(	PxsContactManagerOutput** outputs,
+																 ThreadContext& threadContext,
+																 PxSolverContactDesc* blockDescs,
+																 const PxReal invDtF32,
+																 PxReal bounceThresholdF32,
+																 PxReal frictionOffsetThreshold,
+																 PxReal correlationDistance,
+																 PxReal solverOffsetSlop,
+																 PxConstraintAllocator& constraintAllocator);
+
+SolverConstraintPrepState::Enum createFinalizeSolverContacts4(	Dy::CorrelationBuffer& c,
+																PxSolverContactDesc* blockDescs,
+																const PxReal invDtF32,
+																PxReal bounceThresholdF32,
+																PxReal	frictionOffsetThreshold,
+																PxReal correlationDistance,
+																PxReal solverOffsetSlop,
+																PxConstraintAllocator& constraintAllocator);
+
+
+
+bool createFinalizeSolverContactsCoulomb1D(PxSolverContactDesc& contactDesc,
+											 PxsContactManagerOutput& output,
+											 ThreadContext& threadContext,
+											 const PxReal invDtF32,
+											 PxReal bounceThresholdF32,
+											 PxReal frictionOffsetThreshold,
+											 PxReal correlationDistance,
+											 PxReal solverOffsetSlop,
+											 PxConstraintAllocator& constraintAllocator,
+											 Cm::SpatialVectorF* Z);
+
+bool createFinalizeSolverContactsCoulomb2D(PxSolverContactDesc& contactDesc,
+											PxsContactManagerOutput& output,
+											ThreadContext& threadContext,
+											const PxReal invDtF32,
+											PxReal bounceThresholdF32,
+											PxReal frictionOffsetThreshold,
+											PxReal correlationDistance,
+											PxReal solverOffsetSlop,
+											PxConstraintAllocator& constraintAllocator,
+											Cm::SpatialVectorF* Z);
+
+
+SolverConstraintPrepState::Enum createFinalizeSolverContacts4Coulomb1D(	PxsContactManagerOutput** outputs,
+																		ThreadContext& threadContext,
+																		 PxSolverContactDesc* blockDescs,
+																		 const PxReal invDtF32,
+																		 PxReal bounceThresholdF32,
+																		 PxReal frictionOffsetThreshold,
+																		 PxReal correlationDistance,
+																		 PxReal solverOffsetSlop,
+																		 PxConstraintAllocator& constraintAllocator);
+
+SolverConstraintPrepState::Enum createFinalizeSolverContacts4Coulomb2D(PxsContactManagerOutput** outputs,
+																		ThreadContext& threadContext,
+																		PxSolverContactDesc* blockDescs,
+																		const PxReal invDtF32,
+																		PxReal bounceThresholdF32,
+																		PxReal frictionOffsetThreshold,
+																		PxReal correlationDistance,
+																		PxReal solverOffsetSlop,
+																		PxConstraintAllocator& constraintAllocator);
+
+
+PxU32 getContactManagerConstraintDesc(const PxsContactManagerOutput& cmOutput, const PxsContactManager& cm, PxSolverConstraintDesc& desc);
+
+class BlockAllocator : public PxConstraintAllocator
+{
+	PxsConstraintBlockManager& mConstraintBlockManager;
+	PxcConstraintBlockStream& mConstraintBlockStream;
+	FrictionPatchStreamPair& mFrictionPatchStreamPair;
+	PxU32& mTotalConstraintByteSize;
+public:
+
+	BlockAllocator(PxsConstraintBlockManager& constraintBlockManager, PxcConstraintBlockStream& constraintBlockStream, FrictionPatchStreamPair& frictionPatchStreamPair,
+		PxU32& totalConstraintByteSize) :
+		mConstraintBlockManager(constraintBlockManager), mConstraintBlockStream(constraintBlockStream), mFrictionPatchStreamPair(frictionPatchStreamPair),
+		mTotalConstraintByteSize(totalConstraintByteSize)
+	{
+	}
+
+	virtual PxU8* reserveConstraintData(const PxU32 size);
+
+	virtual PxU8* reserveFrictionData(const PxU32 size);
+
+	virtual PxU8* findInputPatches(PxU8* frictionCookie)
+	{
+		return frictionCookie;
+	}
+
+	PX_NOCOPY(BlockAllocator)
+
+};
+
+}
+
+}
+
+#endif //DY_CONTACTPREP_H
--- a/physx/source/lowleveldynamics/src/DyContactPrep4.cpp
+++ b/physx/source/lowleveldynamics/src/DyContactPrep4.cpp
--- a/physx/source/lowleveldynamics/src/DyContactPrep4PF.cpp
+++ b/physx/source/lowleveldynamics/src/DyContactPrep4PF.cpp
--- a/physx/source/lowleveldynamics/src/DyContactPrepPF.cpp
+++ b/physx/source/lowleveldynamics/src/DyContactPrepPF.cpp
@ -0,0 +1,660 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+     
+
+#include "foundation/PxPreprocessor.h"
+#include "PsVecMath.h"
+#include "PsMathUtils.h"
+#include "DySolverContact.h"
+#include "DySolverContactPF.h"
+#include "DySolverConstraintTypes.h"
+#include "PxcNpWorkUnit.h"
+#include "DyThreadContext.h"
+#include "DyContactPrep.h"
+#include "PxcNpContactPrepShared.h"
+//#include "PxvGeometry.h"
+#include "PxvDynamics.h"
+#include "DyCorrelationBuffer.h"
+#include "DySolverConstraintDesc.h"
+#include "DySolverBody.h"
+#include "DySolverContact4.h"
+#include "DySolverContactPF4.h"
+
+
+#include "PsVecMath.h"
+#include "PxContactModifyCallback.h"
+#include "PxsMaterialManager.h"
+#include "PxsMaterialCombiner.h"
+#include "DySolverExt.h"
+#include "DyArticulationContactPrep.h"
+#include "DyContactPrepShared.h"
+
+#include "PsFoundation.h"
+
+using namespace physx::Gu;
+using namespace physx::shdfnd::aos;
+
+namespace physx
+{
+namespace Dy
+{
+
+bool createFinalizeSolverContactsCoulomb(PxSolverContactDesc& contactDesc,
+		PxsContactManagerOutput& output,
+		ThreadContext& threadContext,
+		const PxReal invDtF32,
+		PxReal bounceThresholdF32,
+		PxReal frictionOffsetThreshold,
+		PxReal correlationDistance,
+		PxReal solverOffsetSlop,
+		PxConstraintAllocator& constraintAllocator,
+		PxFrictionType::Enum frictionType,
+		Cm::SpatialVectorF* Z);
+
+static bool setupFinalizeSolverConstraintsCoulomb(
+												  Sc::ShapeInteraction* shapeInteraction,
+						    const ContactBuffer& buffer,
+							const CorrelationBuffer& c,
+							const PxTransform& bodyFrame0,
+							const PxTransform& bodyFrame1,
+							PxU8* workspace,
+							const PxSolverBodyData& data0,
+							const PxSolverBodyData& data1,
+							const PxReal invDtF32,
+							PxReal bounceThresholdF32,
+							PxU32 frictionPerPointCount,
+							const bool hasForceThresholds,
+							const bool staticBody,
+							PxReal invMassScale0, PxReal invInertiaScale0, 
+							PxReal invMassScale1, PxReal invInertiaScale1,
+							PxReal restDist,
+							const PxReal maxCCDSeparation,
+							const PxReal solverOffsetSlopF32)
+{   
+	const FloatV ccdMaxSeparation = FLoad(maxCCDSeparation);
+	const Vec3V solverOffsetSlop = V3Load(solverOffsetSlopF32);
+	PxU8* PX_RESTRICT ptr = workspace;
+	const FloatV zero=FZero();
+
+	PxU8 flags = PxU8(hasForceThresholds ? SolverContactHeader::eHAS_FORCE_THRESHOLDS : 0);
+
+	const FloatV restDistance = FLoad(restDist);
+
+	const Vec3V bodyFrame0p = V3LoadU(bodyFrame0.p);
+	const Vec3V bodyFrame1p = V3LoadU(bodyFrame1.p);
+
+	Ps::prefetchLine(c.contactID);
+	Ps::prefetchLine(c.contactID, 128);
+	
+	const PxU32 frictionPatchCount = c.frictionPatchCount;
+
+	const PxU32 pointStride = sizeof(SolverContactPoint);
+	const PxU32 frictionStride = sizeof(SolverContactFriction);
+	const PxU8 pointHeaderType = Ps::to8(staticBody ? DY_SC_TYPE_STATIC_CONTACT : DY_SC_TYPE_RB_CONTACT);
+	const PxU8 frictionHeaderType = Ps::to8(staticBody ? DY_SC_TYPE_STATIC_FRICTION : DY_SC_TYPE_FRICTION);
+
+
+	const Vec3V linVel0 = V3LoadU(data0.linearVelocity);
+	const Vec3V linVel1 = V3LoadU(data1.linearVelocity);
+	const Vec3V angVel0 = V3LoadU(data0.angularVelocity);
+	const Vec3V angVel1 = V3LoadU(data1.angularVelocity);
+
+
+	const FloatV invMass0 = FLoad(data0.invMass);
+	const FloatV invMass1 = FLoad(data1.invMass);
+
+	const FloatV maxPenBias = FMax(FLoad(data0.penBiasClamp), FLoad(data1.penBiasClamp));
+
+	// PT: the matrix is symmetric so we can read it as a PxMat33! Gets rid of 25000+ LHS.
+	const PxMat33& invIn0 = reinterpret_cast<const PxMat33&>(data0.sqrtInvInertia);
+	PX_ALIGN(16, const Mat33V invSqrtInertia0)
+	(
+		V3LoadU(invIn0.column0),
+		V3LoadU(invIn0.column1),
+		V3LoadU(invIn0.column2)
+	);
+	const PxMat33& invIn1 = reinterpret_cast<const PxMat33&>(data1.sqrtInvInertia);
+	PX_ALIGN(16, const Mat33V invSqrtInertia1)
+	(
+		V3LoadU(invIn1.column0),
+		V3LoadU(invIn1.column1),
+		V3LoadU(invIn1.column2)
+	);
+
+	const FloatV invDt = FLoad(invDtF32);
+	const FloatV p8 = FLoad(0.8f);
+	const FloatV bounceThreshold = FLoad(bounceThresholdF32);
+	const FloatV orthoThreshold = FLoad(0.70710678f);
+	const FloatV eps = FLoad(0.00001f);
+
+	const FloatV invDtp8 = FMul(invDt, p8);
+
+	const FloatV d0 = FLoad(invMassScale0);
+	const FloatV d1 = FLoad(invMassScale1);
+	const FloatV nDom1fV = FNeg(d1);
+	const FloatV angD0 = FLoad(invInertiaScale0);
+	const FloatV angD1 = FLoad(invInertiaScale1);
+
+	const FloatV invMass0_dom0fV = FMul(d0, invMass0);
+	const FloatV invMass1_dom1fV = FMul(nDom1fV, invMass1);
+
+
+	for(PxU32 i=0;i< frictionPatchCount;i++)
+	{
+		const PxU32 contactCount = c.frictionPatchContactCounts[i];
+		if(contactCount == 0)
+			continue;
+
+		const Gu::ContactPoint* contactBase0 = buffer.contacts + c.contactPatches[c.correlationListHeads[i]].start;
+
+		const Vec3V normal = Ps::aos::V3LoadA(contactBase0->normal);
+
+		const FloatV normalLenSq = V3LengthSq(normal);
+		const VecCrossV norCross = V3PrepareCross(normal);
+
+		const FloatV restitution = FLoad(contactBase0->restitution);
+
+		const FloatV norVel = V3SumElems(V3NegMulSub(normal, linVel1, V3Mul(normal, linVel0)));
+		/*const FloatV norVel0 = V3Dot(normal, linVel0);
+		const FloatV norVel1 = V3Dot(normal, linVel1);
+		const FloatV norVel = FSub(norVel0, norVel1);*/
+
+		const FloatV invMassNorLenSq0 = FMul(invMass0_dom0fV, normalLenSq);
+		const FloatV invMassNorLenSq1 = FMul(invMass1_dom1fV, normalLenSq);
+	
+		
+		SolverContactCoulombHeader* PX_RESTRICT header = reinterpret_cast<SolverContactCoulombHeader*>(ptr);
+		ptr += sizeof(SolverContactCoulombHeader);
+
+		Ps::prefetchLine(ptr, 128);
+		Ps::prefetchLine(ptr, 256);
+		Ps::prefetchLine(ptr, 384);
+
+
+		header->numNormalConstr		= PxU8(contactCount);
+		header->type				= pointHeaderType;
+		//header->setRestitution(n.restitution);
+		//header->setRestitution(contactBase0->restitution);
+		
+		header->setDominance0(invMass0_dom0fV);
+		header->setDominance1(FNeg(invMass1_dom1fV));
+		FStore(angD0, &header->angDom0);
+		FStore(angD1, &header->angDom1);
+		header->setNormal(normal);
+		header->flags = flags;
+		header->shapeInteraction = shapeInteraction;
+
+		
+		for(PxU32 patch=c.correlationListHeads[i]; 
+			patch!=CorrelationBuffer::LIST_END; 
+			patch = c.contactPatches[patch].next)
+		{
+			const PxU32 count = c.contactPatches[patch].count;
+			const Gu::ContactPoint* contactBase = buffer.contacts + c.contactPatches[patch].start;
+
+				
+			PxU8* p = ptr;
+			for(PxU32 j=0;j<count;j++)
+			{
+				const Gu::ContactPoint& contact = contactBase[j];
+
+				SolverContactPoint* PX_RESTRICT solverContact = reinterpret_cast<SolverContactPoint*>(p);
+				p += pointStride;
+
+				constructContactConstraint(invSqrtInertia0, invSqrtInertia1, invMassNorLenSq0, 
+					invMassNorLenSq1, angD0, angD1, bodyFrame0p, bodyFrame1p,
+					normal, norVel, norCross, angVel0, angVel1,
+					invDt, invDtp8, restDistance, maxPenBias,  restitution,
+					bounceThreshold, contact, *solverContact, ccdMaxSeparation, solverOffsetSlop);
+			}			
+			ptr = p;
+		}
+	}
+
+	//construct all the frictions
+
+	PxU8* PX_RESTRICT ptr2 = workspace;
+
+	bool hasFriction = false;
+	for(PxU32 i=0;i< frictionPatchCount;i++)
+	{
+		const PxU32 contactCount = c.frictionPatchContactCounts[i];
+		if(contactCount == 0)
+			continue;
+
+		const Gu::ContactPoint* contactBase0 = buffer.contacts + c.contactPatches[c.correlationListHeads[i]].start;
+
+		SolverContactCoulombHeader* header = reinterpret_cast<SolverContactCoulombHeader*>(ptr2); 
+		header->frictionOffset = PxU16(ptr - ptr2);// + sizeof(SolverFrictionHeader);
+		ptr2 += sizeof(SolverContactCoulombHeader) + header->numNormalConstr * pointStride;
+
+		const PxReal staticFriction = contactBase0->staticFriction;
+		const bool disableStrongFriction = !!(contactBase0->materialFlags & PxMaterialFlag::eDISABLE_FRICTION);
+		const bool haveFriction = (disableStrongFriction == 0);
+	
+		SolverFrictionHeader* frictionHeader = reinterpret_cast<SolverFrictionHeader*>(ptr);
+		frictionHeader->numNormalConstr = Ps::to8(c.frictionPatchContactCounts[i]);
+		frictionHeader->numFrictionConstr = Ps::to8(haveFriction ? c.frictionPatchContactCounts[i] * frictionPerPointCount : 0);
+		ptr += sizeof(SolverFrictionHeader);
+		PxF32* appliedForceBuffer = reinterpret_cast<PxF32*>(ptr);
+		ptr += frictionHeader->getAppliedForcePaddingSize(c.frictionPatchContactCounts[i]);
+		PxMemZero(appliedForceBuffer, sizeof(PxF32)*contactCount*frictionPerPointCount);
+		Ps::prefetchLine(ptr, 128);
+		Ps::prefetchLine(ptr, 256);
+		Ps::prefetchLine(ptr, 384);
+
+		const Vec3V normal = V3LoadU(buffer.contacts[c.contactPatches[c.correlationListHeads[i]].start].normal);
+
+		const FloatV normalX = V3GetX(normal);
+		const FloatV normalY = V3GetY(normal);
+		const FloatV normalZ = V3GetZ(normal);
+		
+		const Vec3V t0Fallback1 = V3Merge(zero, FNeg(normalZ), normalY);
+		const Vec3V t0Fallback2 = V3Merge(FNeg(normalY), normalX, zero) ;
+
+		const BoolV con = FIsGrtr(orthoThreshold, FAbs(normalX));
+		const Vec3V tFallback1 = V3Sel(con, t0Fallback1, t0Fallback2);
+
+		const Vec3V linVrel = V3Sub(linVel0, linVel1);
+		const Vec3V t0_ = V3Sub(linVrel, V3Scale(normal, V3Dot(normal, linVrel)));
+		const FloatV sqDist = V3Dot(t0_,t0_);
+		const BoolV con1 = FIsGrtr(sqDist, eps);
+		const Vec3V tDir0 =V3Normalize(V3Sel(con1, t0_, tFallback1));
+		const Vec3V tDir1 = V3Cross(tDir0, normal);
+
+		Vec3V tFallback = tDir0;
+		Vec3V tFallbackAlt = tDir1;
+
+		if(haveFriction)
+		{
+			//frictionHeader->setStaticFriction(n.staticFriction);
+			frictionHeader->setStaticFriction(staticFriction);
+			FStore(invMass0_dom0fV, &frictionHeader->invMass0D0);
+			FStore(FNeg(invMass1_dom1fV), &frictionHeader->invMass1D1);
+			FStore(angD0, &frictionHeader->angDom0);
+			FStore(angD1, &frictionHeader->angDom1);
+			frictionHeader->type			= frictionHeaderType;
+			
+			PxU32 totalPatchContactCount = 0;
+		
+			for(PxU32 patch=c.correlationListHeads[i]; 
+				patch!=CorrelationBuffer::LIST_END; 
+				patch = c.contactPatches[patch].next)
+			{
+				const PxU32 count = c.contactPatches[patch].count;
+				const PxU32 start = c.contactPatches[patch].start;
+				const Gu::ContactPoint* contactBase = buffer.contacts + start;
+					
+				PxU8* p = ptr;
+				for(PxU32 j =0; j < count; j++)
+				{
+					hasFriction = true;
+					const Gu::ContactPoint& contact = contactBase[j];
+					const Vec3V point = V3LoadU(contact.point);
+					Vec3V ra = V3Sub(point, bodyFrame0p);
+					Vec3V rb = V3Sub(point, bodyFrame1p);
+					ra = V3Sel(V3IsGrtr(solverOffsetSlop, V3Abs(ra)), V3Zero(), ra);
+					rb = V3Sel(V3IsGrtr(solverOffsetSlop, V3Abs(rb)), V3Zero(), rb);
+					const Vec3V targetVel = V3LoadU(contact.targetVel);
+
+					for(PxU32 k = 0; k < frictionPerPointCount; ++k)
+					{
+						const Vec3V t0 = tFallback;
+						tFallback = tFallbackAlt;
+						tFallbackAlt = t0;
+
+						SolverContactFriction* PX_RESTRICT f0 = reinterpret_cast<SolverContactFriction*>(p);
+						p += frictionStride;
+						//f0->brokenOrContactIndex = contactId;
+
+						const Vec3V raXn = V3Cross(ra, t0);
+						const Vec3V rbXn = V3Cross(rb, t0);
+
+						const Vec3V delAngVel0 = M33MulV3(invSqrtInertia0, raXn);
+						const Vec3V delAngVel1 = M33MulV3(invSqrtInertia1, rbXn);
+
+						const FloatV resp0 = FAdd(invMass0_dom0fV, FMul(angD0, V3Dot(delAngVel0, delAngVel0)));
+						const FloatV resp1 = FSub(FMul(angD1, V3Dot(delAngVel1, delAngVel1)), invMass1_dom1fV);
+						const FloatV resp = FAdd(resp0, resp1);
+
+						const FloatV velMultiplier = FNeg(FSel(FIsGrtr(resp, zero), FRecip(resp), zero));
+
+						const FloatV vrel1 = FAdd(V3Dot(t0, linVel0), V3Dot(raXn, angVel0));
+						const FloatV vrel2 = FAdd(V3Dot(t0, linVel1), V3Dot(rbXn, angVel1));
+						const FloatV vrel = FSub(vrel1, vrel2);
+
+
+						f0->normalXYZ_appliedForceW = V4SetW(Vec4V_From_Vec3V(t0), zero);
+						f0->raXnXYZ_velMultiplierW = V4SetW(Vec4V_From_Vec3V(delAngVel0), velMultiplier);
+						//f0->rbXnXYZ_targetVelocityW = V4SetW(Vec4V_From_Vec3V(delAngVel1), FSub(V3Dot(targetVel, t0), vrel));
+						f0->rbXnXYZ_biasW = Vec4V_From_Vec3V(delAngVel1);
+						FStore(FSub(V3Dot(targetVel, t0), vrel), &f0->targetVel);
+					}
+				}
+
+				totalPatchContactCount += c.contactPatches[patch].count;
+				
+				ptr = p;	
+			}
+		}
+	}
+	*ptr = 0;
+	return hasFriction;
+}
+
+
+
+static void computeBlockStreamByteSizesCoulomb(const CorrelationBuffer& c,
+													 const PxU32 frictionCountPerPoint, PxU32& _solverConstraintByteSize,
+													 PxU32& _axisConstraintCount,
+													 bool useExtContacts)
+{
+	PX_ASSERT(0 == _solverConstraintByteSize);
+	PX_ASSERT(0 == _axisConstraintCount);
+
+	// PT: use local vars to remove LHS
+	PxU32 solverConstraintByteSize = 0;
+	PxU32 numFrictionPatches = 0;
+	PxU32 axisConstraintCount = 0;
+
+	for(PxU32 i = 0; i < c.frictionPatchCount; i++)
+	{
+		//Friction patches.
+		if(c.correlationListHeads[i] != CorrelationBuffer::LIST_END)
+			numFrictionPatches++;
+
+
+		const FrictionPatch& frictionPatch = c.frictionPatches[i];
+		const bool haveFriction = (frictionPatch.materialFlags & PxMaterialFlag::eDISABLE_FRICTION) == 0;
+
+		//Solver constraint data.
+		if(c.frictionPatchContactCounts[i]!=0)
+		{
+			solverConstraintByteSize += sizeof(SolverContactCoulombHeader);
+			
+			solverConstraintByteSize += useExtContacts ? c.frictionPatchContactCounts[i] * sizeof(SolverContactPointExt) 
+				: c.frictionPatchContactCounts[i] * sizeof(SolverContactPoint);
+
+			axisConstraintCount += c.frictionPatchContactCounts[i];
+
+			//We always need the friction headers to write the accumulated 
+			if(haveFriction)
+			{
+				//4 bytes
+				solverConstraintByteSize += sizeof(SolverFrictionHeader);
+				//buffer to store applied forces in
+				solverConstraintByteSize += SolverFrictionHeader::getAppliedForcePaddingSize(c.frictionPatchContactCounts[i]);
+
+				const PxU32 nbFrictionConstraints = c.frictionPatchContactCounts[i] * frictionCountPerPoint;
+
+				solverConstraintByteSize += useExtContacts ? nbFrictionConstraints * sizeof(SolverContactFrictionExt)
+					: nbFrictionConstraints * sizeof(SolverContactFriction);
+				axisConstraintCount += c.frictionPatchContactCounts[i];
+			}
+			else
+			{
+				//reserve buffers for storing accumulated impulses
+				solverConstraintByteSize += sizeof(SolverFrictionHeader);
+				solverConstraintByteSize += SolverFrictionHeader::getAppliedForcePaddingSize(c.frictionPatchContactCounts[i]);
+			}
+		}
+	}  
+	_axisConstraintCount = axisConstraintCount;
+
+	//16-byte alignment.
+	_solverConstraintByteSize =  ((solverConstraintByteSize + 0x0f) & ~0x0f);
+	PX_ASSERT(0 == (_solverConstraintByteSize & 0x0f));
+}
+
+static bool reserveBlockStreamsCoulomb(const CorrelationBuffer& c,
+						PxU8*& solverConstraint, PxU32 frictionCountPerPoint,
+						PxU32& solverConstraintByteSize,
+						PxU32& axisConstraintCount, PxConstraintAllocator& constraintAllocator,
+						bool useExtContacts)
+{
+	PX_ASSERT(NULL == solverConstraint);
+	PX_ASSERT(0 == solverConstraintByteSize);
+	PX_ASSERT(0 == axisConstraintCount);
+	
+
+	//From constraintBlockStream we need to reserve contact points, contact forces, and a char buffer for the solver constraint data (already have a variable for this).
+	//From frictionPatchStream we just need to reserve a single buffer.
+
+	//Compute the sizes of all the buffers.
+	computeBlockStreamByteSizesCoulomb(
+		c,
+		frictionCountPerPoint, solverConstraintByteSize,
+		axisConstraintCount, useExtContacts);
+
+	//Reserve the buffers.
+
+	//First reserve the accumulated buffer size for the constraint block.
+	PxU8* constraintBlock = NULL;
+	const PxU32 constraintBlockByteSize = solverConstraintByteSize;
+	if(constraintBlockByteSize > 0)
+	{
+		constraintBlock = constraintAllocator.reserveConstraintData(constraintBlockByteSize + 16u);
+
+		if(0==constraintBlock || (reinterpret_cast<PxU8*>(-1))==constraintBlock)
+		{
+			if(0==constraintBlock)
+			{
+				PX_WARN_ONCE(
+					"Reached limit set by PxSceneDesc::maxNbContactDataBlocks - ran out of buffer space for constraint prep. "
+					"Either accept dropped contacts or increase buffer size allocated for narrow phase by increasing PxSceneDesc::maxNbContactDataBlocks.");
+			}
+			else
+			{
+				PX_WARN_ONCE(
+					"Attempting to allocate more than 16K of contact data for a single contact pair in constraint prep. "
+					"Either accept dropped contacts or simplify collision geometry.");
+				constraintBlock=NULL;
+			}
+		}
+	}
+
+	//Patch up the individual ptrs to the buffer returned by the constraint block reservation (assuming the reservation didn't fail).
+	if(0==constraintBlockByteSize || constraintBlock)
+	{
+		if(solverConstraintByteSize)
+		{
+			solverConstraint = constraintBlock;
+			PX_ASSERT(0==(uintptr_t(solverConstraint) & 0x0f));
+		}
+	}
+
+	//Return true if neither of the two block reservations failed.
+	return ((0==constraintBlockByteSize || constraintBlock));
+}
+
+bool createFinalizeSolverContactsCoulomb1D(PxSolverContactDesc& contactDesc,
+	PxsContactManagerOutput& output,
+	ThreadContext& threadContext,
+	const PxReal invDtF32,
+	PxReal bounceThresholdF32,
+	PxReal frictionOffsetThreshold,
+	PxReal correlationDistance,
+	PxReal solverOffsetSlop,
+	PxConstraintAllocator& constraintAllocator,
+	Cm::SpatialVectorF* Z)
+{
+	return createFinalizeSolverContactsCoulomb(contactDesc, output, threadContext, invDtF32, bounceThresholdF32, frictionOffsetThreshold, correlationDistance, solverOffsetSlop,
+		constraintAllocator, PxFrictionType::eONE_DIRECTIONAL, Z);
+}
+
+bool createFinalizeSolverContactsCoulomb2D(PxSolverContactDesc& contactDesc,
+	PxsContactManagerOutput& output,
+	ThreadContext& threadContext,
+	const PxReal invDtF32,
+	PxReal bounceThresholdF32,
+	PxReal frictionOffsetThreshold,
+	PxReal correlationDistance,
+	PxReal solverOffsetSlop,
+	PxConstraintAllocator& constraintAllocator,
+	Cm::SpatialVectorF* Z)
+
+{
+	return createFinalizeSolverContactsCoulomb(contactDesc, output, threadContext, invDtF32, bounceThresholdF32, frictionOffsetThreshold, correlationDistance, solverOffsetSlop,
+		constraintAllocator, PxFrictionType::eTWO_DIRECTIONAL, Z);
+}
+
+bool createFinalizeSolverContactsCoulomb(PxSolverContactDesc& contactDesc,
+									PxsContactManagerOutput& output,
+								 ThreadContext& threadContext,
+								 const PxReal invDtF32,
+								 PxReal bounceThresholdF32,
+								 PxReal frictionOffsetThreshold,
+								 PxReal correlationDistance,
+								 PxReal solverOffsetSlop,
+								 PxConstraintAllocator& constraintAllocator,
+								 PxFrictionType::Enum frictionType,
+								 Cm::SpatialVectorF* Z)
+{
+	PX_UNUSED(frictionOffsetThreshold);
+	PX_UNUSED(correlationDistance);
+
+	PxSolverConstraintDesc& desc = *contactDesc.desc;
+
+	desc.constraintLengthOver16 = 0;
+	
+	ContactBuffer& buffer = threadContext.mContactBuffer;
+
+	buffer.count = 0;
+
+	// We pull the friction patches out of the cache to remove the dependency on how
+	// the cache is organized. Remember original addrs so we can write them back 
+	// efficiently.
+
+	Ps::prefetchLine(contactDesc.frictionPtr);
+
+	PxReal invMassScale0 = 1.f;
+	PxReal invMassScale1 = 1.f;
+	PxReal invInertiaScale0 = 1.f;
+	PxReal invInertiaScale1 = 1.f;
+
+	bool hasMaxImpulse = false, hasTargetVelocity = false;
+	
+	PxU32 numContacts = extractContacts(buffer, output, hasMaxImpulse, hasTargetVelocity, invMassScale0, invMassScale1, 
+			invInertiaScale0, invInertiaScale1, PxMin(contactDesc.data0->maxContactImpulse, contactDesc.data1->maxContactImpulse));
+
+	if(numContacts == 0)
+	{
+		contactDesc.frictionPtr = NULL;
+		contactDesc.frictionCount = 0;
+		return true;
+	}
+
+	Ps::prefetchLine(contactDesc.body0);
+	Ps::prefetchLine(contactDesc.body1);
+	Ps::prefetchLine(contactDesc.data0);
+	Ps::prefetchLine(contactDesc.data1);
+
+	CorrelationBuffer& c = threadContext.mCorrelationBuffer;
+	c.frictionPatchCount = 0;
+	c.contactPatchCount = 0;
+
+	createContactPatches(c, buffer.contacts, buffer.count, PXC_SAME_NORMAL);	
+
+	PxU32 numFrictionPerPatch = PxU32(frictionType == PxFrictionType::eONE_DIRECTIONAL ? 1 : 2);
+	
+	bool overflow = correlatePatches(c, buffer.contacts, contactDesc.bodyFrame0, contactDesc.bodyFrame1, PXC_SAME_NORMAL, 0, 0);
+	PX_UNUSED(overflow);
+#if PX_CHECKED
+	if(overflow)
+	{
+		Ps::getFoundation().error(physx::PxErrorCode::eDEBUG_WARNING, __FILE__, __LINE__, 
+					"Dropping contacts in solver because we exceeded limit of 32 friction patches.");
+	}
+#endif
+
+
+	//PX_ASSERT(patchCount == c.frictionPatchCount);
+
+	PxU8* solverConstraint = NULL;
+	PxU32 solverConstraintByteSize = 0;
+	PxU32 axisConstraintCount = 0;
+
+	const bool useExtContacts = !!((contactDesc.bodyState0 | contactDesc.bodyState1) & PxSolverContactDesc::eARTICULATION);
+
+	const bool successfulReserve = reserveBlockStreamsCoulomb(
+		c,
+		solverConstraint, numFrictionPerPatch,
+		solverConstraintByteSize,
+		axisConstraintCount,
+		constraintAllocator,
+		useExtContacts);
+
+	// initialise the work unit's ptrs to the various buffers.
+
+	contactDesc.frictionPtr = NULL;
+	desc.constraint = NULL;
+	desc.constraintLengthOver16 = 0;
+	contactDesc.frictionCount = 0;
+	
+	// patch up the work unit with the reserved buffers and set the reserved buffer data as appropriate.
+
+	if(successfulReserve)
+	{
+		desc.constraint = solverConstraint;
+		output.nbContacts = Ps::to8(numContacts);
+		desc.constraintLengthOver16 = Ps::to16(solverConstraintByteSize/16);
+
+		//Initialise solverConstraint buffer.
+		if(solverConstraint)
+		{
+			bool hasFriction = false;
+			const PxSolverBodyData& data0 = *contactDesc.data0;
+			const PxSolverBodyData& data1 = *contactDesc.data1;
+			if(useExtContacts)
+			{
+				const SolverExtBody b0(reinterpret_cast<const void*>(contactDesc.body0), reinterpret_cast<const void*>(&data0), desc.linkIndexA);
+				const SolverExtBody b1(reinterpret_cast<const void*>(contactDesc.body1), reinterpret_cast<const void*>(&data1), desc.linkIndexB);
+
+				hasFriction = setupFinalizeExtSolverContactsCoulomb(buffer, c, contactDesc.bodyFrame0, contactDesc.bodyFrame1, solverConstraint,
+					invDtF32, bounceThresholdF32, b0, b1, numFrictionPerPatch,
+					invMassScale0, invInertiaScale0, invMassScale1, invInertiaScale1, contactDesc.restDistance, contactDesc.maxCCDSeparation, Z);
+			}
+			else
+			{
+				hasFriction = setupFinalizeSolverConstraintsCoulomb(contactDesc.shapeInteraction, buffer, c, contactDesc.bodyFrame0, contactDesc.bodyFrame1, solverConstraint,
+					data0, data1, invDtF32, bounceThresholdF32, numFrictionPerPatch, contactDesc.hasForceThresholds, contactDesc.bodyState1 == PxSolverContactDesc::eSTATIC_BODY,
+					invMassScale0, invInertiaScale0, invMassScale1, invInertiaScale1, contactDesc.restDistance, contactDesc.maxCCDSeparation, solverOffsetSlop);
+			}
+			*(reinterpret_cast<PxU32*>(solverConstraint + solverConstraintByteSize)) = 0;
+			*(reinterpret_cast<PxU32*>(solverConstraint + solverConstraintByteSize + 4)) = hasFriction ? 0xFFFFFFFF : 0;
+		}
+	}
+
+	return successfulReserve;
+}
+
+}
+}
+
+
--- a/physx/source/lowleveldynamics/src/DyContactPrepShared.h
+++ b/physx/source/lowleveldynamics/src/DyContactPrepShared.h
@ -0,0 +1,308 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+#ifndef DY_CONTACT_PREP_SHARED_H
+#define DY_CONTACT_PREP_SHARED_H
+     
+#include "foundation/PxPreprocessor.h"
+#include "PxSceneDesc.h"
+#include "PsVecMath.h"
+#include "PsMathUtils.h"
+#include "DyContactPrep.h"
+#include "DyCorrelationBuffer.h"
+#include "DyArticulationContactPrep.h"
+#include "PxsContactManager.h"
+#include "PxsContactManagerState.h"
+
+namespace physx
+{
+namespace Dy
+{
+
+
+PX_FORCE_INLINE bool pointsAreClose(const PxTransform& body1ToBody0,
+									const PxVec3& localAnchor0, const PxVec3& localAnchor1,
+									const PxVec3& axis, float correlDist)
+{
+	const PxVec3 body0PatchPoint1 = body1ToBody0.transform(localAnchor1);
+
+	return PxAbs((localAnchor0 - body0PatchPoint1).dot(axis))<correlDist;
+}
+
+PX_FORCE_INLINE bool isSeparated(const FrictionPatch& patch, const PxTransform& body1ToBody0, const PxReal correlationDistance)
+{
+	PX_ASSERT(patch.anchorCount <= 2);
+	for(PxU32 a = 0; a < patch.anchorCount; ++a)
+	{
+		if(!pointsAreClose(body1ToBody0, patch.body0Anchors[a], patch.body1Anchors[a], patch.body0Normal, correlationDistance))
+			return true;
+	}
+	return false;
+}
+
+
+inline bool getFrictionPatches(CorrelationBuffer& c,
+						const PxU8* frictionCookie,
+						PxU32 frictionPatchCount,
+						const PxTransform& bodyFrame0,
+						const PxTransform& bodyFrame1,
+						PxReal correlationDistance)
+{
+	PX_UNUSED(correlationDistance);
+	if(frictionCookie == NULL || frictionPatchCount == 0)
+		return true;
+
+	//KS - this is now DMA'd inside the shader so we don't need to immediate DMA it here
+	const FrictionPatch* patches = reinterpret_cast<const FrictionPatch*>(frictionCookie);
+
+	//Try working out relative transforms! TODO - can we compute this lazily for the first friction patch
+	bool evaluated = false;
+	PxTransform body1ToBody0;
+
+	while(frictionPatchCount--)
+	{
+		Ps::prefetchLine(patches,128);
+		const FrictionPatch& patch = *patches++;
+		PX_ASSERT (patch.broken == 0 || patch.broken == 1);
+		if(!patch.broken)
+		{
+			// if the eDISABLE_STRONG_FRICTION flag is there we need to blow away the previous frame's friction correlation, so
+			// that we can associate each friction anchor with a target velocity. So we lose strong friction.
+			if(patch.anchorCount != 0 && !(patch.materialFlags & PxMaterialFlag::eDISABLE_STRONG_FRICTION))
+			{
+				PX_ASSERT(patch.anchorCount <= 2);
+
+				
+				if(!evaluated)
+				{
+					body1ToBody0 = bodyFrame0.transformInv(bodyFrame1);
+					evaluated = true;
+				}
+
+
+				if(patch.body0Normal.dot(body1ToBody0.rotate(patch.body1Normal)) > PXC_SAME_NORMAL)
+				{
+					if(!isSeparated(patch, body1ToBody0, correlationDistance))
+					{
+						if(c.frictionPatchCount == CorrelationBuffer::MAX_FRICTION_PATCHES)
+							return false;
+						{
+							c.contactID[c.frictionPatchCount][0] = 0xffff;
+							c.contactID[c.frictionPatchCount][1] = 0xffff;
+							//Rotate the contact normal into world space
+							c.frictionPatchWorldNormal[c.frictionPatchCount] = bodyFrame0.rotate(patch.body0Normal);
+							c.frictionPatchContactCounts[c.frictionPatchCount] = 0;
+							c.patchBounds[c.frictionPatchCount].setEmpty();
+							c.correlationListHeads[c.frictionPatchCount] = CorrelationBuffer::LIST_END;
+							PxMemCopy(&c.frictionPatches[c.frictionPatchCount++], &patch, sizeof(FrictionPatch));
+						}
+					}
+				}
+			}
+		}
+	}
+	return true;
+}
+
+PX_FORCE_INLINE PxU32 extractContacts(Gu::ContactBuffer& buffer, PxsContactManagerOutput& npOutput, bool& hasMaxImpulse, bool& hasTargetVelocity,
+							 PxReal& invMassScale0, PxReal& invMassScale1, PxReal& invInertiaScale0, PxReal& invInertiaScale1, PxReal defaultMaxImpulse)
+{
+	PxContactStreamIterator iter(npOutput.contactPatches, npOutput.contactPoints, npOutput.getInternalFaceIndice(), npOutput.nbPatches, npOutput.nbContacts);	
+
+	PxU32 numContacts = buffer.count, origContactCount = buffer.count;
+	if(!iter.forceNoResponse)
+	{
+		invMassScale0 = iter.getInvMassScale0();
+		invMassScale1 = iter.getInvMassScale1();
+		invInertiaScale0 = iter.getInvInertiaScale0();
+		invInertiaScale1 = iter.getInvInertiaScale1();
+		hasMaxImpulse = (iter.patch->internalFlags & PxContactPatch::eHAS_MAX_IMPULSE) != 0;
+		hasTargetVelocity = (iter.patch->internalFlags & PxContactPatch::eHAS_TARGET_VELOCITY) != 0;
+
+		while(iter.hasNextPatch())
+		{
+			iter.nextPatch();
+			while(iter.hasNextContact())
+			{
+				iter.nextContact();
+				Ps::prefetchLine(iter.contact, 128);
+				Ps::prefetchLine(&buffer.contacts[numContacts], 128);
+				PxReal maxImpulse = hasMaxImpulse ? iter.getMaxImpulse() : defaultMaxImpulse;
+				if(maxImpulse != 0.f)
+				{
+					PX_ASSERT(numContacts < Gu::ContactBuffer::MAX_CONTACTS);
+					buffer.contacts[numContacts].normal = iter.getContactNormal();
+					buffer.contacts[numContacts].point = iter.getContactPoint();
+					buffer.contacts[numContacts].separation = iter.getSeparation();
+					//KS - we use the face indices to cache the material indices and flags - avoids bloating the PxContact structure
+					buffer.contacts[numContacts].materialFlags = PxU8(iter.getMaterialFlags());
+					buffer.contacts[numContacts].maxImpulse = maxImpulse;
+					buffer.contacts[numContacts].staticFriction = iter.getStaticFriction();
+					buffer.contacts[numContacts].dynamicFriction = iter.getDynamicFriction();
+					buffer.contacts[numContacts].restitution = iter.getRestitution();
+					const PxVec3& targetVel = iter.getTargetVel();
+					buffer.contacts[numContacts].targetVel = targetVel;
+					++numContacts;
+				}
+			}
+		}
+	}
+	const PxU32 contactCount = numContacts - origContactCount;
+	buffer.count = numContacts;
+	return contactCount;
+}
+
+struct CorrelationListIterator
+{
+	CorrelationBuffer& buffer;
+	PxU32 currPatch;
+	PxU32 currContact;
+
+	CorrelationListIterator(CorrelationBuffer& correlationBuffer, PxU32 startPatch) : buffer(correlationBuffer)
+	{
+		//We need to force us to advance the correlation buffer to the first available contact (if one exists)
+		PxU32 newPatch = startPatch, newContact = 0;
+
+		while(newPatch != CorrelationBuffer::LIST_END && newContact == buffer.contactPatches[newPatch].count)
+		{
+			newPatch = buffer.contactPatches[newPatch].next;
+			newContact = 0;
+		}
+
+		currPatch = newPatch;
+		currContact = newContact;
+	}
+
+	//Returns true if it has another contact pre-loaded. Returns false otherwise
+	PX_FORCE_INLINE bool hasNextContact()
+	{
+		return (currPatch != CorrelationBuffer::LIST_END && currContact < buffer.contactPatches[currPatch].count);
+	}
+
+	inline void nextContact(PxU32& patch, PxU32& contact)
+	{
+		PX_ASSERT(currPatch != CorrelationBuffer::LIST_END);
+		PX_ASSERT(currContact < buffer.contactPatches[currPatch].count);
+
+		patch = currPatch;
+		contact = currContact;
+		PxU32 newPatch = currPatch, newContact = currContact + 1;
+
+		while(newPatch != CorrelationBuffer::LIST_END && newContact == buffer.contactPatches[newPatch].count)
+		{
+			newPatch = buffer.contactPatches[newPatch].next;
+			newContact = 0;
+		}
+
+		currPatch = newPatch;
+		currContact = newContact;
+	}
+
+private:
+	CorrelationListIterator& operator=(const CorrelationListIterator&);
+
+};
+
+
+	PX_FORCE_INLINE void constructContactConstraint(const Mat33V& invSqrtInertia0, const Mat33V& invSqrtInertia1,  const FloatVArg invMassNorLenSq0, 
+		const FloatVArg invMassNorLenSq1, const FloatVArg angD0, const FloatVArg angD1, const Vec3VArg bodyFrame0p, const Vec3VArg bodyFrame1p,
+		const Vec3VArg normal, const FloatVArg norVel, const VecCrossV& norCross, const Vec3VArg angVel0, const Vec3VArg angVel1,
+		const FloatVArg invDt, const FloatVArg invDtp8, const FloatVArg restDistance, const FloatVArg maxPenBias,  const FloatVArg restitution,
+		const FloatVArg bounceThreshold, const Gu::ContactPoint& contact, SolverContactPoint& solverContact,
+		const FloatVArg ccdMaxSeparation, const Vec3VArg solverOffsetSlop)
+	{
+		const FloatV zero = FZero();
+		const Vec3V point = V3LoadA(contact.point);
+		const FloatV separation = FLoad(contact.separation);
+
+		const FloatV cTargetVel = V3Dot(normal, V3LoadA(contact.targetVel));
+
+		const Vec3V ra = V3Sub(point, bodyFrame0p);
+		const Vec3V rb = V3Sub(point, bodyFrame1p);
+
+		/*ra = V3Sel(V3IsGrtr(solverOffsetSlop, V3Abs(ra)), V3Zero(), ra);
+		rb = V3Sel(V3IsGrtr(solverOffsetSlop, V3Abs(rb)), V3Zero(), rb);*/
+
+		Vec3V raXn = V3Cross(ra, norCross);
+		Vec3V rbXn = V3Cross(rb, norCross);
+
+		raXn = V3Sel(V3IsGrtr(solverOffsetSlop, V3Abs(raXn)), V3Zero(), raXn);
+		rbXn = V3Sel(V3IsGrtr(solverOffsetSlop, V3Abs(rbXn)), V3Zero(), rbXn);
+
+		const Vec3V raXnSqrtInertia = M33MulV3(invSqrtInertia0, raXn);
+		const Vec3V rbXnSqrtInertia = M33MulV3(invSqrtInertia1, rbXn);				
+
+		const FloatV resp0 = FAdd(invMassNorLenSq0, FMul(V3Dot(raXnSqrtInertia, raXnSqrtInertia), angD0));
+		const FloatV resp1 = FSub(FMul(V3Dot(rbXnSqrtInertia, rbXnSqrtInertia), angD1), invMassNorLenSq1);
+
+		const FloatV unitResponse = FAdd(resp0, resp1);
+
+		const FloatV vrel1 = FAdd(norVel, V3Dot(raXn, angVel0));
+		const FloatV vrel2 = V3Dot(rbXn, angVel1);
+		const FloatV vrel = FSub(vrel1, vrel2);
+
+		const FloatV velMultiplier = FSel(FIsGrtr(unitResponse, zero), FRecip(unitResponse), zero);
+
+		const FloatV penetration = FSub(separation, restDistance);
+
+		const FloatV penetrationInvDt = FMul(penetration, invDt);
+
+		const FloatV penetrationInvDtPt8 = FMax(maxPenBias, FMul(penetration, invDtp8));
+
+		FloatV scaledBias = FMul(velMultiplier, penetrationInvDtPt8);
+
+		const BoolV isGreater2 = BAnd(BAnd(FIsGrtr(restitution, zero), FIsGrtr(bounceThreshold, vrel)), FIsGrtr(FNeg(vrel), penetrationInvDt));
+
+		const BoolV ccdSeparationCondition = FIsGrtrOrEq(ccdMaxSeparation, penetration);
+
+		scaledBias = FSel(BAnd(ccdSeparationCondition, isGreater2), zero, scaledBias);
+
+		const FloatV sumVRel(vrel);
+
+		FloatV targetVelocity = FAdd(cTargetVel, FSel(isGreater2, FMul(FNeg(sumVRel), restitution), zero));
+
+		//Note - we add on the initial target velocity
+		targetVelocity = FSub(targetVelocity, vrel);
+
+		const FloatV biasedErr = FScaleAdd(targetVelocity, velMultiplier, FNeg(scaledBias));
+		const FloatV unbiasedErr = FScaleAdd(targetVelocity, velMultiplier, FSel(isGreater2, zero, FNeg(FMax(scaledBias, zero))));
+		//const FloatV unbiasedErr = FScaleAdd(targetVelocity, velMultiplier, FNeg(FMax(scaledBias, zero)));
+
+		FStore(velMultiplier, &solverContact.velMultiplier);
+		FStore(biasedErr, &solverContact.biasedErr);
+		FStore(unbiasedErr, &solverContact.unbiasedErr);
+		solverContact.maxImpulse = contact.maxImpulse;
+
+		solverContact.raXn = raXnSqrtInertia;
+		solverContact.rbXn = rbXnSqrtInertia;
+	}
+}
+}
+
+#endif //DY_CONTACT_PREP_SHARED_H
--- a/physx/source/lowleveldynamics/src/DyContactReduction.h
+++ b/physx/source/lowleveldynamics/src/DyContactReduction.h
@ -0,0 +1,409 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+#ifndef DY_CONTACT_REDUCTION_H
+#define DY_CONTACT_REDUCTION_H
+
+#include "geomutils/GuContactPoint.h"
+#include "PxsMaterialManager.h"
+
+namespace physx
+{
+
+
+namespace Dy
+{
+
+//KS - might be OK with 4 but 5 guarantees the deepest + 4 contacts that contribute to largest surface area
+#define CONTACT_REDUCTION_MAX_CONTACTS 6
+#define CONTACT_REDUCTION_MAX_PATCHES 32
+#define PXS_NORMAL_TOLERANCE 0.995f
+#define PXS_SEPARATION_TOLERANCE 0.001f
+
+
+	//A patch contains a normal, pair of material indices and a list of indices. These indices are 
+	//used to index into the PxContact array that's passed by the user
+	struct ReducedContactPatch
+	{
+		PxU32 numContactPoints;
+		PxU32 contactPoints[CONTACT_REDUCTION_MAX_CONTACTS];	
+	};
+
+	struct ContactPatch
+	{	
+		PxVec3 rootNormal;
+		ContactPatch* mNextPatch;
+		PxReal maxPenetration;
+		PxU16 startIndex;
+		PxU16 stride;
+		PxU16 rootIndex;
+		PxU16 index;
+	};
+
+	struct SortBoundsPredicateManifold
+	{
+		bool operator()(const ContactPatch* idx1, const ContactPatch* idx2) const
+		{
+			return idx1->maxPenetration < idx2->maxPenetration;
+		}
+	};
+
+
+
+	template <PxU32 MaxPatches>
+	class ContactReduction
+	{
+	public:
+		ReducedContactPatch mPatches[MaxPatches];
+		PxU32 mNumPatches;
+		ContactPatch mIntermediatePatches[CONTACT_REDUCTION_MAX_PATCHES];
+		ContactPatch* mIntermediatePatchesPtrs[CONTACT_REDUCTION_MAX_PATCHES];
+		PxU32 mNumIntermediatePatches;
+		Gu::ContactPoint* PX_RESTRICT mOriginalContacts;
+		PxsMaterialInfo* PX_RESTRICT mMaterialInfo;
+		PxU32 mNumOriginalContacts;
+
+		ContactReduction(Gu::ContactPoint* PX_RESTRICT originalContacts, PxsMaterialInfo* PX_RESTRICT materialInfo, PxU32 numContacts) : 
+		mNumPatches(0), mNumIntermediatePatches(0),	mOriginalContacts(originalContacts), mMaterialInfo(materialInfo), mNumOriginalContacts(numContacts)
+		{
+		}
+
+		void reduceContacts()
+		{
+			//First pass, break up into contact patches, storing the start and stride of the patches
+			//We will need to have contact patches and then coallesce them
+			mIntermediatePatches[0].rootNormal = mOriginalContacts[0].normal;
+			mIntermediatePatches[0].mNextPatch = NULL;
+			mIntermediatePatches[0].startIndex = 0;
+			mIntermediatePatches[0].rootIndex = 0;
+			mIntermediatePatches[0].maxPenetration = mOriginalContacts[0].separation;
+			mIntermediatePatches[0].index = 0;
+			PxU16 numPatches = 1;
+			//PxU32 startIndex = 0;
+			PxU32 numUniquePatches = 1;
+			PxU16 m = 1;
+			for(; m < mNumOriginalContacts; ++m)
+			{
+				PxI32 index = -1;
+				for(PxU32 b = numPatches; b > 0; --b)
+				{
+					ContactPatch& patch = mIntermediatePatches[b-1];
+					if(mMaterialInfo[patch.startIndex].mMaterialIndex0 == mMaterialInfo[m].mMaterialIndex0 && mMaterialInfo[patch.startIndex].mMaterialIndex1 == mMaterialInfo[m].mMaterialIndex1 && 
+						patch.rootNormal.dot(mOriginalContacts[m].normal) >= PXS_NORMAL_TOLERANCE)
+					{
+						index = PxI32(b-1);
+						break;
+					}
+				}
+
+				if(index != numPatches - 1)
+				{
+					mIntermediatePatches[numPatches-1].stride = PxU16(m - mIntermediatePatches[numPatches - 1].startIndex);
+					//Create a new patch...
+					if(numPatches == CONTACT_REDUCTION_MAX_PATCHES)
+					{
+						break;
+					}
+					mIntermediatePatches[numPatches].startIndex = m;
+					mIntermediatePatches[numPatches].mNextPatch = NULL;
+					if(index == -1)
+					{
+						mIntermediatePatches[numPatches].rootIndex = numPatches;
+						mIntermediatePatches[numPatches].rootNormal = mOriginalContacts[m].normal;
+						mIntermediatePatches[numPatches].maxPenetration = mOriginalContacts[m].separation;
+						mIntermediatePatches[numPatches].index = numPatches;
+						++numUniquePatches;
+					}
+					else
+					{
+						//Find last element in the link
+						PxU16 rootIndex = mIntermediatePatches[index].rootIndex;
+						mIntermediatePatches[index].mNextPatch = &mIntermediatePatches[numPatches];
+						mIntermediatePatches[numPatches].rootNormal = mIntermediatePatches[index].rootNormal;
+						mIntermediatePatches[rootIndex].maxPenetration = mIntermediatePatches[numPatches].maxPenetration = PxMin(mIntermediatePatches[rootIndex].maxPenetration, mOriginalContacts[m].separation);
+						mIntermediatePatches[numPatches].rootIndex = rootIndex;
+						mIntermediatePatches[numPatches].index = numPatches;
+					}
+					++numPatches;
+				}
+			}
+			mIntermediatePatches[numPatches-1].stride = PxU16(m - mIntermediatePatches[numPatches-1].startIndex);
+
+			//OK, we have a list of contact patches so that we can start contact reduction per-patch
+
+			//OK, now we can go and reduce the contacts on a per-patch basis...
+
+			for(PxU32 a = 0; a < numPatches; ++a)
+			{
+				mIntermediatePatchesPtrs[a] = &mIntermediatePatches[a];
+			}
+
+
+			SortBoundsPredicateManifold predicate;
+			Ps::sort(mIntermediatePatchesPtrs, numPatches, predicate);
+
+			PxU32 numReducedPatches = 0;
+			for(PxU32 a = 0; a < numPatches; ++a)
+			{
+				if(mIntermediatePatchesPtrs[a]->rootIndex == mIntermediatePatchesPtrs[a]->index)
+				{
+					//Reduce this patch...
+					if(numReducedPatches == MaxPatches)
+						break;
+
+					ReducedContactPatch& reducedPatch = mPatches[numReducedPatches++];
+					//OK, now we need to work out if we have to reduce patches...
+					PxU32 contactCount = 0;
+					{
+						ContactPatch* tmpPatch = mIntermediatePatchesPtrs[a];
+
+						while(tmpPatch)
+						{
+							contactCount += tmpPatch->stride;
+							tmpPatch = tmpPatch->mNextPatch;
+						}
+					}
+
+					if(contactCount <= CONTACT_REDUCTION_MAX_CONTACTS)
+					{
+						//Just add the contacts...
+						ContactPatch* tmpPatch = mIntermediatePatchesPtrs[a];
+
+						PxU32 ind = 0;
+						while(tmpPatch)
+						{
+							for(PxU32 b = 0; b < tmpPatch->stride; ++b)
+							{
+								reducedPatch.contactPoints[ind++] = tmpPatch->startIndex + b;
+							}
+							tmpPatch = tmpPatch->mNextPatch;
+						}
+						reducedPatch.numContactPoints = contactCount;
+					}
+					else
+					{
+						//Iterate through and find the most extreme point
+						
+
+						PxU32 ind = 0;
+
+						{
+							PxReal dist = 0.f;
+							ContactPatch* tmpPatch = mIntermediatePatchesPtrs[a];
+							while(tmpPatch)
+							{
+								for(PxU32 b = 0; b < tmpPatch->stride; ++b)
+								{
+									PxReal magSq = mOriginalContacts[tmpPatch->startIndex + b].point.magnitudeSquared();
+									if(dist < magSq)
+									{
+										ind = tmpPatch->startIndex + b;
+										dist = magSq;
+									}
+								}
+								tmpPatch = tmpPatch->mNextPatch;
+							}
+						}	
+						reducedPatch.contactPoints[0] = ind;
+						const PxVec3 p0 = mOriginalContacts[ind].point;
+
+						//Now find the point farthest from this point...						
+						{
+							PxReal maxDist = 0.f;
+							ContactPatch* tmpPatch = mIntermediatePatchesPtrs[a];
+							while(tmpPatch)
+							{
+								for(PxU32 b = 0; b < tmpPatch->stride; ++b)
+								{
+									PxReal magSq = (p0 - mOriginalContacts[tmpPatch->startIndex + b].point).magnitudeSquared();
+									if(magSq > maxDist)
+									{
+										ind = tmpPatch->startIndex + b;
+										maxDist = magSq;
+									}
+								}
+								tmpPatch = tmpPatch->mNextPatch;
+							}
+						}
+						reducedPatch.contactPoints[1] = ind;
+						const PxVec3 p1 = mOriginalContacts[ind].point;
+
+						//Now find the point farthest from the segment
+
+						PxVec3 n = (p0 - p1).cross(mIntermediatePatchesPtrs[a]->rootNormal);
+
+						//PxReal tVal = 0.f;
+						{
+							PxReal maxDist = 0.f;
+							//PxReal tmpTVal;
+							
+							ContactPatch* tmpPatch = mIntermediatePatchesPtrs[a];
+							while(tmpPatch)
+							{
+								for(PxU32 b = 0; b < tmpPatch->stride; ++b)
+								{
+									
+									//PxReal magSq = tmpDistancePointSegmentSquared(p0, p1, mOriginalContacts[tmpPatch->startIndex + b].point, tmpTVal);
+									PxReal magSq = (mOriginalContacts[tmpPatch->startIndex + b].point - p0).dot(n);
+									if(magSq > maxDist)
+									{
+										ind = tmpPatch->startIndex + b;
+										//tVal = tmpTVal;
+										maxDist = magSq;
+									}
+								}
+								tmpPatch = tmpPatch->mNextPatch;
+							}
+						}
+						reducedPatch.contactPoints[2] = ind;
+
+						//const PxVec3 closest = (p0 + (p1 - p0) * tVal);
+
+						const PxVec3 dir = -n;//closest - p3;
+
+						{
+							PxReal maxDist = 0.f;
+							//PxReal tVal = 0.f;
+							ContactPatch* tmpPatch = mIntermediatePatchesPtrs[a];
+							while(tmpPatch)
+							{
+								for(PxU32 b = 0; b < tmpPatch->stride; ++b)
+								{
+									PxReal magSq =  (mOriginalContacts[tmpPatch->startIndex + b].point - p0).dot(dir);
+									if(magSq > maxDist)
+									{
+										ind = tmpPatch->startIndex + b;
+										maxDist = magSq;
+									}
+								}
+								tmpPatch = tmpPatch->mNextPatch;
+							}
+						}
+						reducedPatch.contactPoints[3] = ind;
+
+						//Now, we iterate through all the points, and cluster the points. From this, we establish the deepest point that's within a 
+						//tolerance of this point and keep that point
+
+						PxReal separation[CONTACT_REDUCTION_MAX_CONTACTS];
+						PxU32 deepestInd[CONTACT_REDUCTION_MAX_CONTACTS];
+						for(PxU32 i = 0; i < 4; ++i)
+						{
+							PxU32 index = reducedPatch.contactPoints[i];
+							separation[i] = mOriginalContacts[index].separation - PXS_SEPARATION_TOLERANCE;
+							deepestInd[i] = index;
+						}
+
+						ContactPatch* tmpPatch = mIntermediatePatchesPtrs[a];
+						while(tmpPatch)
+						{
+							for(PxU32 b = 0; b < tmpPatch->stride; ++b)
+							{
+								Gu::ContactPoint& point = mOriginalContacts[tmpPatch->startIndex + b];
+								
+								PxReal distance = PX_MAX_REAL;
+								PxU32 index = 0;
+								for(PxU32 c = 0; c < 4; ++c)
+								{
+									PxVec3 dif = mOriginalContacts[reducedPatch.contactPoints[c]].point - point.point;
+									PxReal d = dif.magnitudeSquared();
+									if(distance > d)
+									{
+										distance = d;
+										index = c;
+									}
+								}
+								if(separation[index] > point.separation)
+								{
+									deepestInd[index] = tmpPatch->startIndex+b;
+									separation[index] = point.separation;
+								}
+
+							}
+							tmpPatch = tmpPatch->mNextPatch;
+						}
+
+						bool chosen[64];
+						PxMemZero(chosen, sizeof(chosen));
+						for(PxU32 i = 0; i < 4; ++i)
+						{
+							reducedPatch.contactPoints[i] = deepestInd[i];
+							chosen[deepestInd[i]] = true;
+						}						
+						
+						for(PxU32 i = 4; i < CONTACT_REDUCTION_MAX_CONTACTS; ++i)
+						{
+							separation[i] = PX_MAX_REAL;
+							deepestInd[i] = 0;
+						}
+						tmpPatch = mIntermediatePatchesPtrs[a];
+						while(tmpPatch)
+						{
+							for(PxU32 b = 0; b < tmpPatch->stride; ++b)
+							{
+								if(!chosen[tmpPatch->startIndex+b])
+								{
+									Gu::ContactPoint& point = mOriginalContacts[tmpPatch->startIndex + b];	
+									for(PxU32 j = 4; j < CONTACT_REDUCTION_MAX_CONTACTS; ++j)
+									{
+										if(point.separation < separation[j])
+										{
+											for(PxU32 k = CONTACT_REDUCTION_MAX_CONTACTS-1; k > j; --k)
+											{
+												separation[k] = separation[k-1];
+												deepestInd[k] = deepestInd[k-1];
+											}
+											separation[j] = point.separation;
+											deepestInd[j] = tmpPatch->startIndex+b;
+											break;
+										}
+									}
+								}
+							}
+							tmpPatch = tmpPatch->mNextPatch;
+						}
+
+						for(PxU32 i = 4; i < CONTACT_REDUCTION_MAX_CONTACTS; ++i)
+						{
+							reducedPatch.contactPoints[i] = deepestInd[i];
+						}
+
+						reducedPatch.numContactPoints = CONTACT_REDUCTION_MAX_CONTACTS;
+					}
+				}
+			}
+			mNumPatches = numReducedPatches;
+		}
+
+	};
+}
+
+}
+
+
+#endif //DY_CONTACT_REDUCTION_H
--- a/physx/source/lowleveldynamics/src/DyCorrelationBuffer.h
+++ b/physx/source/lowleveldynamics/src/DyCorrelationBuffer.h
@ -0,0 +1,108 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+
+#ifndef DY_CORRELATIONBUFFER_H
+#define DY_CORRELATIONBUFFER_H
+
+#include "foundation/PxSimpleTypes.h"
+#include "foundation/PxVec3.h"
+#include "foundation/PxTransform.h"
+#include "foundation/PxBounds3.h"
+#include "geomutils/GuContactBuffer.h"
+
+#include "PxvConfig.h"
+#include "DyFrictionPatch.h"
+
+namespace physx
+{
+
+struct PxcNpWorkUnit;
+struct PxsMaterialInfo;
+
+namespace Dy
+{
+
+struct CorrelationBuffer
+{
+	static const PxU32 MAX_FRICTION_PATCHES = 32;
+	static const PxU16 LIST_END = 0xffff;
+
+	struct ContactPatchData
+	{
+		PxU16 start;
+		PxU16 next;
+		PxU8 flags;
+		PxU8 count;
+		PxReal staticFriction, dynamicFriction, restitution;
+		PxBounds3 patchBounds;
+	};
+
+	// we can have as many contact patches as contacts, unfortunately
+	ContactPatchData	contactPatches[Gu::ContactBuffer::MAX_CONTACTS];
+
+	FrictionPatch	PX_ALIGN(16, frictionPatches[MAX_FRICTION_PATCHES]);
+	PxVec3				PX_ALIGN(16, frictionPatchWorldNormal[MAX_FRICTION_PATCHES]);
+	PxBounds3		patchBounds[MAX_FRICTION_PATCHES];
+
+	PxU32				frictionPatchContactCounts[MAX_FRICTION_PATCHES];
+	PxU32				correlationListHeads[MAX_FRICTION_PATCHES+1];
+
+	// contact IDs are only used to identify auxiliary contact data when velocity
+	// targets have been set. 
+	PxU16				contactID[MAX_FRICTION_PATCHES][2];
+
+	PxU32 contactPatchCount, frictionPatchCount;
+
+};
+
+bool createContactPatches(CorrelationBuffer& fb, const Gu::ContactPoint* cb, PxU32 contactCount, PxReal normalTolerance);
+
+bool correlatePatches(CorrelationBuffer& fb, 
+					  const Gu::ContactPoint* cb,
+					  const PxTransform& bodyFrame0,
+					  const PxTransform& bodyFrame1,
+					  PxReal normalTolerance,
+					  PxU32 startContactPatchIndex,
+					  PxU32 startFrictionPatchIndex);
+
+void growPatches(CorrelationBuffer& fb,
+				 const Gu::ContactPoint* buffer,
+				 const PxTransform& bodyFrame0,
+				 const PxTransform& bodyFrame1,
+				 PxReal normalTolerance,
+				 PxU32 frictionPatchStartIndex,
+				 PxReal frictionOffsetThreshold);
+
+}
+
+}
+
+#endif //DY_CORRELATIONBUFFER_H
--- a/physx/source/lowleveldynamics/src/DyDynamics.cpp
+++ b/physx/source/lowleveldynamics/src/DyDynamics.cpp
--- a/physx/source/lowleveldynamics/src/DyDynamics.h
+++ b/physx/source/lowleveldynamics/src/DyDynamics.h
@ -0,0 +1,490 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef DY_DYNAMICS_H
+#define DY_DYNAMICS_H
+
+#include "PxvConfig.h"
+#include "CmSpatialVector.h"
+#include "CmTask.h"
+#include "CmPool.h"
+#include "PxcThreadCoherentCache.h"
+#include "DyThreadContext.h"
+#include "PxcConstraintBlockStream.h"
+#include "DySolverBody.h"
+#include "DyContext.h"
+#include "PxsIslandManagerTypes.h"
+#include "PxvNphaseImplementationContext.h"
+#include "solver/PxSolverDefs.h"
+
+namespace physx
+{
+
+namespace Cm
+{
+	class FlushPool;
+}
+
+namespace IG
+{
+	class SimpleIslandManager;
+	struct Edge;
+}
+
+class PxsRigidBody;
+
+class PxsStreamedThresholdTable;
+
+struct PxsBodyCore;
+struct PxsIslandObjects;
+class PxsIslandIndices;
+struct PxsIndexedInteraction;
+class PxsIslandManager;
+struct PxsIndexedConstraint;
+struct PxsIndexedContactManager;
+class PxsHeapMemoryAllocator;
+class PxsMemoryManager;
+class PxsDefaultMemoryManager;
+struct PxSolverConstraintDesc;
+
+namespace Cm
+{
+	class Bitmap;
+	class SpatialVector;
+}
+
+namespace Dy
+{
+	class SolverCore;
+	struct SolverIslandParams;
+	struct ArticulationSolverDesc;
+	class Articulation;
+	class DynamicsContext;
+
+
+
+
+#define SOLVER_PARALLEL_METHOD_ARGS									\
+	DynamicsContext&	context,									\
+	SolverIslandParams& params,										\
+	IG::IslandSim& islandSim
+
+//typedef	void (*PxsSolveParallelMethod)(SOLVER_PARALLEL_METHOD_ARGS);
+//extern PxsSolveParallelMethod solveParallel[3];
+
+void solveParallel(SOLVER_PARALLEL_METHOD_ARGS);
+void solveParallelCouloumFriction(SOLVER_PARALLEL_METHOD_ARGS);
+
+
+struct SolverIslandObjects;
+
+/**
+\brief Solver body pool (array) that enforces 128-byte alignment for base address of array.
+\note This reduces cache misses on platforms with 128-byte-size cache lines by aligning the start of the array to the beginning of a cache line.
+*/
+class SolverBodyPool : public Ps::Array<PxSolverBody, Ps::AlignedAllocator<128, Ps::ReflectionAllocator<PxSolverBody> > > 
+{ 
+	PX_NOCOPY(SolverBodyPool)
+public:
+	SolverBodyPool() {}
+};
+
+/**
+\brief Solver body data pool (array) that enforces 128-byte alignment for base address of array.
+\note This reduces cache misses on platforms with 128-byte-size cache lines by aligning the start of the array to the beginning of a cache line.
+*/
+class SolverBodyDataPool : public Ps::Array<PxSolverBodyData, Ps::AlignedAllocator<128, Ps::ReflectionAllocator<PxSolverBodyData> > >
+{
+	PX_NOCOPY(SolverBodyDataPool)
+public:
+	SolverBodyDataPool() {}
+};
+
+class SolverConstraintDescPool : public Ps::Array<PxSolverConstraintDesc, Ps::AlignedAllocator<128, Ps::ReflectionAllocator<PxSolverConstraintDesc> > >
+{
+	PX_NOCOPY(SolverConstraintDescPool)
+public:
+	SolverConstraintDescPool() { }
+};
+
+/**
+\brief Encapsulates an island's context
+*/
+
+struct IslandContext
+{
+	//The thread context for this island (set in in the island start task, released in the island end task)
+	ThreadContext* mThreadContext;
+	PxsIslandIndices		mCounts;
+};
+
+
+/**
+\brief Encapsules the data used by the constraint solver.
+*/
+
+#if PX_VC 
+    #pragma warning(push)
+	#pragma warning( disable : 4324 ) // Padding was added at the end of a structure because of a __declspec(align) value.
+#endif
+
+
+class DynamicsContext : public Context
+{
+	PX_NOCOPY(DynamicsContext)
+public:
+	
+	/**
+	\brief Creates a DynamicsContext associated with a PxsContext
+	\return A pointer to the newly-created DynamicsContext.
+	*/
+	static DynamicsContext*	create(	PxcNpMemBlockPool* memBlockPool,
+									PxcScratchAllocator& scratchAllocator,
+									Cm::FlushPool& taskPool,
+									PxvSimStats& simStats,
+									PxTaskManager* taskManager,
+									Ps::VirtualAllocatorCallback* allocator,
+									PxsMaterialManager* materialManager,
+									IG::IslandSim* accurateIslandSim,
+									PxU64 contextID,
+									const bool enableStabilization,
+									const bool useEnhancedDeterminism,
+									const bool useAdaptiveForce,
+									const PxReal maxBiasCoefficient,
+									const bool frictionEveryIteration
+									);
+	
+	/**
+	\brief Destroys this DynamicsContext
+	*/
+	void						destroy();
+
+	/**
+	\brief Returns the static world solver body
+	\return The static world solver body.
+	*/
+	PX_FORCE_INLINE PxSolverBody&		getWorldSolverBody()					{ return mWorldSolverBody;  }
+
+	PX_FORCE_INLINE Cm::FlushPool&			getTaskPool()						{ return mTaskPool;			}
+
+	PX_FORCE_INLINE ThresholdStream&		getThresholdStream()					{ return *mThresholdStream;	}
+
+	PX_FORCE_INLINE PxvSimStats&			getSimStats()							{ return mSimStats;			}
+
+#if PX_ENABLE_SIM_STATS
+	void									addThreadStats(const ThreadContext::ThreadSimStats& stats);
+#endif
+
+	/**
+	\brief The entry point for the constraint solver. 
+	\param[in]	dt	The simulation time-step
+	\param[in]	continuation The continuation task for the solver
+
+	This method is called after the island generation has completed. Its main responsibilities are:
+	(1) Reserving the solver body pools
+	(2) Initializing the static and kinematic solver bodies, which are shared resources between islands.
+	(3) Construct the solver task chains for each island
+
+	Each island is solved as an independent solver task chain in parallel.
+
+	*/
+
+	virtual void						update(IG::SimpleIslandManager& simpleIslandManager, PxBaseTask* continuation, PxBaseTask* lostTouchTask,
+		PxsContactManager** foundPatchManagers, PxU32 nbFoundPatchManagers, PxsContactManager** lostPatchManagers, PxU32 nbLostPatchManagers,
+		PxU32 maxPatchesPerCM, PxsContactManagerOutputIterator& iter, PxsContactManagerOutput* gpuOutputs, const PxReal dt, const PxVec3& gravity, const PxU32 bitMapWordCounts);
+
+
+	void updatePostKinematic(IG::SimpleIslandManager& simpleIslandManager, PxBaseTask* continuation, PxBaseTask* lostTouchTask);
+
+	virtual void						processLostPatches(IG::SimpleIslandManager& /*simpleIslandManager*/, PxsContactManager** /*lostPatchManagers*/, PxU32 /*nbLostPatchManagers*/, PxsContactManagerOutputIterator& /*iterator*/){}
+
+	virtual void						updateBodyCore(PxBaseTask* continuation);
+
+	virtual void						setSimulationController(PxsSimulationController* simulationController ){ mSimulationController = simulationController; }
+	/**
+	\brief This method combines the results of several islands, e.g. constructing scene-level simulation statistics and merging together threshold streams for contact notification.
+	*/
+	virtual void							mergeResults();
+
+	virtual void							getDataStreamBase(void*& /*contactStreamBase*/, void*& /*patchStreamBase*/, void*& /*forceAndIndicesStreamBase*/){}
+
+	/**
+	\brief Allocates and returns a thread context object.
+	\return A thread context.
+	*/
+	PX_FORCE_INLINE ThreadContext*					getThreadContext()
+	{
+		return mThreadContextPool.get();
+	}
+
+	/**
+	\brief Returns a thread context to the thread context pool.
+	\param[in] context The thread context to return to the thread context pool. 
+	*/
+	void								putThreadContext(ThreadContext* context)
+	{
+		mThreadContextPool.put(context);
+	}
+
+
+	PX_FORCE_INLINE	PxU32					getKinematicCount()		const	{ return mKinematicCount;	}
+	PX_FORCE_INLINE	PxU64					getContextId()			const	{ return mContextID;		}
+
+protected:
+
+	/**
+	\brief Constructor for DynamicsContext
+	*/
+										DynamicsContext(PxcNpMemBlockPool* memBlockPool,
+														PxcScratchAllocator& scratchAllocator,
+														Cm::FlushPool& taskPool,
+														PxvSimStats& simStats,
+														PxTaskManager* taskManager,
+														Ps::VirtualAllocatorCallback* allocator,
+														PxsMaterialManager* materialManager,
+														IG::IslandSim* accurateIslandSim,
+														PxU64 contextID,
+														const bool enableStabilization,
+														const bool useEnhancedDeterminism,
+														const bool useAdaptiveForce,
+														const PxReal maxBiasCoefficient,
+														const bool frictionEveryIteration
+														);
+	/**
+	\brief Destructor for DynamicsContext
+	*/
+	virtual								~DynamicsContext();
+
+
+	// Solver helper-methods
+	/**
+	\brief Computes the unconstrained velocity for a given PxsRigidBody
+	\param[in] atom The PxsRigidBody
+	*/
+	void								computeUnconstrainedVelocity(PxsRigidBody* atom)	const;
+
+	/**
+	\brief fills in a PxSolverConstraintDesc from an indexed interaction
+	\param[in,out] desc The PxSolverConstraintDesc
+	\param[in] constraint The PxsIndexedInteraction
+	*/
+	void								setDescFromIndices(PxSolverConstraintDesc& desc, 
+													  const PxsIndexedInteraction& constraint, const PxU32 solverBodyOffset);
+
+
+	void								setDescFromIndices(PxSolverConstraintDesc& desc, IG::EdgeIndex edgeIndex,
+											const IG::SimpleIslandManager& islandManager, PxU32* bodyRemapTable, const PxU32 solverBodyOffset);
+
+	/**
+	\brief Compute the unconstrained velocity for set of bodies in parallel. This function may spawn additional tasks.
+	\param[in] dt The timestep
+	\param[in] bodyArray The array of body cores
+	\param[in] originalBodyArray The array of PxsRigidBody
+	\param[in] nodeIndexArray The array of island node index
+	\param[in] bodyCount The number of bodies
+	\param[out] solverBodyPool The pool of solver bodies. These are synced with the corresponding body in bodyArray.
+	\param[out] solverBodyDataPool The pool of solver body data. These are synced with the corresponding body in bodyArray
+	\param[out] motionVelocityArray The motion velocities for the bodies
+	\param[out] maxSolverPositionIterations The maximum number of position iterations requested by any body in the island
+	\param[out] maxSolverVelocityIterations The maximum number of velocity iterations requested by any body in the island
+	\param[out] integrateTask The continuation task for any tasks spawned by this function.
+	*/
+	void								preIntegrationParallel(
+											   const PxF32 dt,
+											   PxsBodyCore*const* bodyArray,					// INOUT: core body attributes
+											   PxsRigidBody*const* originalBodyArray,			// IN: original body atom names (LEGACY - DON'T deref the ptrs!!)
+											   PxU32 const* nodeIndexArray,						// IN: island node index
+											   PxU32 bodyCount,									// IN: body count
+											   PxSolverBody* solverBodyPool,					// IN: solver atom pool (space preallocated)
+											   PxSolverBodyData* solverBodyDataPool,
+											   Cm::SpatialVector* motionVelocityArray,			// OUT: motion velocities
+											   PxU32& maxSolverPositionIterations,
+											   PxU32& maxSolverVelocityIterations,
+											   PxBaseTask& integrateTask
+											   );
+
+	/**
+	\brief Solves an island in parallel.
+
+	\param[in] params Solver parameter structure
+	*/
+
+	void								solveParallel(SolverIslandParams& params, IG::IslandSim& islandSim, Cm::SpatialVectorF* Z, Cm::SpatialVectorF* deltaV);
+
+	
+
+	void								integrateCoreParallel(SolverIslandParams& params, IG::IslandSim& islandSim);
+
+
+
+
+	/**
+	\brief Resets the thread contexts
+	*/
+	void									resetThreadContexts();
+
+	/**
+	\brief Returns the scratch memory allocator.
+	\return The scratch memory allocator.
+	*/
+	PX_FORCE_INLINE PxcScratchAllocator&	getScratchAllocator() { return mScratchAllocator; }
+
+	//Data
+
+	/**
+	\brief Body to represent the world static body.
+	*/
+	PX_ALIGN(16, PxSolverBody				mWorldSolverBody);
+	/**
+	\brief Body data to represent the world static body.
+	*/
+	PX_ALIGN(16, PxSolverBodyData			mWorldSolverBodyData);
+
+	/**
+	\brief A thread context pool
+	*/
+	PxcThreadCoherentCache<ThreadContext, PxcNpMemBlockPool> mThreadContextPool;
+
+	/**
+	\brief Solver constraint desc array
+	*/
+	SolverConstraintDescPool	mSolverConstraintDescPool;
+
+	/**
+	\brief Ordered sover constraint desc array (after partitioning)
+	*/
+	SolverConstraintDescPool	mOrderedSolverConstraintDescPool;
+
+	/**
+	\brief A temporary array of constraint descs used for partitioning
+	*/
+	SolverConstraintDescPool	mTempSolverConstraintDescPool;
+
+	/**
+	\brief An array of contact constraint batch headers
+	*/
+	Ps::Array<PxConstraintBatchHeader> mContactConstraintBatchHeaders;
+
+	/**
+	\brief Array of motion velocities for all bodies in the scene.
+	*/
+	Ps::Array<Cm::SpatialVector> mMotionVelocityArray;
+
+	/**
+	\brief Array of body core pointers for all bodies in the scene.
+	*/
+	Ps::Array<PxsBodyCore*>	mBodyCoreArray;
+
+	/**
+	\brief Array of rigid body pointers for all bodies in the scene.
+	*/
+	Ps::Array<PxsRigidBody*> mRigidBodyArray;
+
+	/**
+	\brief Array of articulationpointers for all articulations in the scene.
+	*/
+	Ps::Array<ArticulationV*> mArticulationArray;
+
+	/**
+	\brief Global pool for solver bodies. Kinematic bodies are at the start, and then dynamic bodies
+	*/
+	SolverBodyPool			mSolverBodyPool;
+	/**
+	\brief Global pool for solver body data. Kinematic bodies are at the start, and then dynamic bodies
+	*/
+	SolverBodyDataPool		mSolverBodyDataPool;
+
+
+	ThresholdStream*		mExceededForceThresholdStream[2]; //this store previous and current exceeded force thresholdStream	
+
+	Ps::Array<PxU32>		mExceededForceThresholdStreamMask;
+
+	/**
+	\brief Interface to the solver core.
+	\note We currently only support PxsSolverCoreSIMD. Other cores may be added in future releases.
+	*/
+	SolverCore*				mSolverCore[PxFrictionType::eFRICTION_COUNT];
+
+	Ps::Array<PxU32>		mSolverBodyRemapTable;				//Remaps from the "active island" index to the index within a solver island
+
+	Ps::Array<PxU32>		mNodeIndexArray;					//island node index
+
+	Ps::Array<PxsIndexedContactManager> mContactList;
+	
+	/**
+	\brief The total number of kinematic bodies in the scene
+	*/
+	PxU32						mKinematicCount;
+
+	/**
+	\brief Atomic counter for the number of threshold stream elements.
+	*/
+	PxI32						mThresholdStreamOut;
+
+	
+
+	PxsMaterialManager*			mMaterialManager;
+
+	PxsContactManagerOutputIterator mOutputIterator;
+	
+private:
+	//private:
+	PxcScratchAllocator&						mScratchAllocator;
+	Cm::FlushPool&								mTaskPool;
+	PxTaskManager*								mTaskManager;
+	PxU32										mCurrentIndex; // this is the index point to the current exceeded force threshold stream
+
+	PxU64										mContextID;
+
+	protected:
+
+	friend class PxsSolverStartTask;
+	friend class PxsSolverAticulationsTask;
+	friend class PxsSolverSetupConstraintsTask;
+	friend class PxsSolverCreateFinalizeConstraintsTask;	
+	friend class PxsSolverConstraintPartitionTask;
+	friend class PxsSolverSetupSolveTask;
+	friend class PxsSolverIntegrateTask;
+	friend class PxsSolverEndTask;
+	friend class PxsSolverConstraintPostProcessTask;
+	friend class PxsForceThresholdTask;
+	friend class SolverArticulationUpdateTask;
+
+	friend void solveParallel(SOLVER_PARALLEL_METHOD_ARGS);
+};
+
+#if PX_VC 
+    #pragma warning(pop)
+#endif
+
+}
+}
+
+#endif //DY_DYNAMICS_H
--- a/physx/source/lowleveldynamics/src/DyFeatherstoneArticulation.cpp
+++ b/physx/source/lowleveldynamics/src/DyFeatherstoneArticulation.cpp
--- a/physx/source/lowleveldynamics/src/DyFeatherstoneArticulationLink.h
+++ b/physx/source/lowleveldynamics/src/DyFeatherstoneArticulationLink.h
@ -0,0 +1,69 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef PXD_FEATHERSTONE_ARTICULATION_LINK_H
+#define PXD_FEATHERSTONE_ARTICULATION_LINK_H
+
+#include "foundation/PxVec3.h"
+#include "foundation/PxQuat.h"
+#include "foundation/PxTransform.h"
+#include "PsVecMath.h"
+#include "CmUtils.h"
+#include "CmSpatialVector.h"
+#include "DyVArticulation.h"
+#include "DyFeatherstoneArticulationUtils.h"
+
+namespace physx
+{
+	namespace Dy
+	{
+
+		class ArticulationLinkData
+		{
+			const static PxU32 MaxJointRows = 3;
+		public:
+			ArticulationLinkData()
+			{
+				maxPenBias = 0.f;
+			}
+
+			Cm::SpatialVectorF				IsW[MaxJointRows];//stI is the transpose of Is
+			PxVec3							childToBase;
+			PxVec3							r; //vector from parent com to child com
+			PxVec3							rw; //vector from parent com to child com
+			PxReal							qstZIc[MaxJointRows];//jointForce - stZIc
+			PxReal							maxPenBias;
+
+		};
+
+	}//namespace Dy
+}
+
+#endif
--- a/physx/source/lowleveldynamics/src/DyFeatherstoneForwardDynamic.cpp
+++ b/physx/source/lowleveldynamics/src/DyFeatherstoneForwardDynamic.cpp
--- a/physx/source/lowleveldynamics/src/DyFeatherstoneInverseDynamic.cpp
+++ b/physx/source/lowleveldynamics/src/DyFeatherstoneInverseDynamic.cpp
--- a/physx/source/lowleveldynamics/src/DyFrictionCorrelation.cpp
+++ b/physx/source/lowleveldynamics/src/DyFrictionCorrelation.cpp
@ -0,0 +1,298 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#include "PxvConfig.h"
+#include "DyCorrelationBuffer.h"
+#include "PxsMaterialManager.h"
+#include "PsUtilities.h"
+#include "foundation/PxBounds3.h"
+
+using namespace physx;
+using namespace Gu;
+
+namespace physx
+{
+
+namespace Dy
+{
+
+namespace 
+{
+PX_FORCE_INLINE void initContactPatch(CorrelationBuffer::ContactPatchData& patch, PxU16 index, PxReal restitution, PxReal staticFriction, PxReal dynamicFriction,
+	PxU8 flags)
+{
+	patch.start = index;
+	patch.count = 1;
+	patch.next = 0;
+	patch.flags = flags;
+	patch.restitution = restitution;
+	patch.staticFriction = staticFriction;
+	patch.dynamicFriction = dynamicFriction;
+}
+
+PX_FORCE_INLINE void initFrictionPatch(FrictionPatch& p, const PxVec3& worldNormal, const PxTransform& body0Pose, const PxTransform& body1Pose, 
+	PxReal restitution, PxReal staticFriction, PxReal dynamicFriction, PxU8 materialFlags)
+{
+	p.body0Normal = body0Pose.rotateInv(worldNormal);
+	p.body1Normal = body1Pose.rotateInv(worldNormal);
+	p.relativeQuat = body0Pose.q.getConjugate() * body1Pose.q;
+	p.anchorCount = 0;
+	p.broken = 0;
+	p.staticFriction = staticFriction;
+	p.dynamicFriction = dynamicFriction;
+	p.restitution = restitution;
+	p.materialFlags = materialFlags;
+}
+}
+
+
+bool createContactPatches(CorrelationBuffer& fb, const Gu::ContactPoint* cb, PxU32 contactCount, PxReal normalTolerance)
+{
+
+	// PT: this rewritten version below doesn't have LHS
+
+	PxU32 contactPatchCount = fb.contactPatchCount;
+	if(contactPatchCount == Gu::ContactBuffer::MAX_CONTACTS)
+		return false;
+	if(contactCount>0)
+	{
+		CorrelationBuffer::ContactPatchData* currentPatchData = fb.contactPatches + contactPatchCount;
+		const Gu::ContactPoint* PX_RESTRICT contacts = cb;
+
+		PxU8 count=1;
+
+		initContactPatch(fb.contactPatches[contactPatchCount++], Ps::to16(0), contacts[0].restitution, 
+			contacts[0].staticFriction, contacts[0].dynamicFriction, PxU8(contacts[0].materialFlags));
+
+		PxBounds3 bounds(contacts[0].point, contacts[0].point);
+
+		PxU32 patchIndex = 0;
+
+		for (PxU32 i = 1; i<contactCount; i++)
+		{
+			const Gu::ContactPoint& curContact = contacts[i];
+			const Gu::ContactPoint& preContact = contacts[patchIndex];
+
+			if(curContact.staticFriction == preContact.staticFriction
+				&& curContact.dynamicFriction == preContact.dynamicFriction
+				&& curContact.restitution == preContact.restitution
+				&& curContact.normal.dot(preContact.normal)>=normalTolerance)
+			{
+				bounds.include(curContact.point);
+				count++;
+			}
+			else
+			{
+				if(contactPatchCount == Gu::ContactBuffer::MAX_CONTACTS)
+					return false;
+				patchIndex = i;
+				currentPatchData->count = count;
+				count = 1;
+				currentPatchData->patchBounds = bounds;
+				currentPatchData = fb.contactPatches + contactPatchCount;
+
+				initContactPatch(fb.contactPatches[contactPatchCount++], Ps::to16(i), curContact.restitution,
+					curContact.staticFriction, curContact.dynamicFriction, PxU8(curContact.materialFlags));
+
+				bounds = PxBounds3(curContact.point, curContact.point);
+			}
+		}
+		if(count!=1)
+			currentPatchData->count = count;
+
+		currentPatchData->patchBounds = bounds;
+	}
+	fb.contactPatchCount = contactPatchCount;
+	return true;
+}
+
+bool correlatePatches(CorrelationBuffer& fb, 
+					  const Gu::ContactPoint* cb,
+					  const PxTransform& bodyFrame0,
+					  const PxTransform& bodyFrame1,
+					  PxReal normalTolerance,
+					  PxU32 startContactPatchIndex,
+					  PxU32 startFrictionPatchIndex)
+{
+	bool overflow = false;
+	PxU32 frictionPatchCount = fb.frictionPatchCount;
+
+	for(PxU32 i=startContactPatchIndex;i<fb.contactPatchCount;i++)
+	{
+		CorrelationBuffer::ContactPatchData &c = fb.contactPatches[i];
+		const PxVec3 patchNormal = cb[c.start].normal;
+
+		PxU32 j=startFrictionPatchIndex;
+		for(;j<frictionPatchCount && ((patchNormal.dot(fb.frictionPatchWorldNormal[j]) < normalTolerance) 
+			|| fb.frictionPatches[j].restitution != c.restitution|| fb.frictionPatches[j].staticFriction != c.staticFriction || 
+			fb.frictionPatches[j].dynamicFriction != c.dynamicFriction);j++)
+			;
+
+		if(j==frictionPatchCount)
+		{
+			overflow |= j==CorrelationBuffer::MAX_FRICTION_PATCHES;
+			if(overflow)
+				continue;
+
+			initFrictionPatch(fb.frictionPatches[frictionPatchCount], patchNormal, bodyFrame0, bodyFrame1, c.restitution, c.staticFriction, c.dynamicFriction, c.flags);
+			fb.frictionPatchWorldNormal[j] = patchNormal;
+			fb.frictionPatchContactCounts[frictionPatchCount] = c.count;
+			fb.patchBounds[frictionPatchCount] = c.patchBounds;
+			fb.contactID[frictionPatchCount][0] = 0xffff;
+			fb.contactID[frictionPatchCount++][1] = 0xffff;
+			c.next = CorrelationBuffer::LIST_END;
+		}
+		else
+		{
+			fb.patchBounds[j].include(c.patchBounds);
+			fb.frictionPatchContactCounts[j] += c.count;
+			c.next = Ps::to16(fb.correlationListHeads[j]);
+		}
+
+		fb.correlationListHeads[j] = i;
+	}
+
+	fb.frictionPatchCount = frictionPatchCount;
+
+	return overflow;
+}
+
+// run over the friction patches, trying to find two anchors per patch. If we already have
+// anchors that are close, we keep them, which gives us persistent spring behavior
+
+void growPatches(CorrelationBuffer& fb,
+				 const ContactPoint* cb,
+				 const PxTransform& bodyFrame0,
+				 const PxTransform& bodyFrame1,
+				 PxReal correlationDistance,
+				 PxU32 frictionPatchStartIndex,
+				 PxReal frictionOffsetThreshold)
+{
+	for(PxU32 i=frictionPatchStartIndex;i<fb.frictionPatchCount;i++)
+	{
+		FrictionPatch& fp = fb.frictionPatches[i];
+
+		if (fp.anchorCount == 2 || fb.correlationListHeads[i] == CorrelationBuffer::LIST_END)
+		{
+			const PxReal frictionPatchDiagonalSq = fb.patchBounds[i].getDimensions().magnitudeSquared();
+			const PxReal anchorSqDistance = (fp.body0Anchors[0] - fp.body0Anchors[1]).magnitudeSquared();
+
+			//If the squared distance between the anchors is more than a quarter of the patch diagonal, we can keep, 
+			//otherwise the anchors are potentially clustered around a corner so force a rebuild of the patch
+			if (fb.frictionPatchContactCounts[i] == 0 || (anchorSqDistance * 4.f) >= frictionPatchDiagonalSq)
+				continue;
+
+			fp.anchorCount = 0;
+		}
+
+		PxVec3 worldAnchors[2];
+		PxU16 anchorCount = 0;
+		PxReal pointDistSq = 0.0f, dist0, dist1;
+
+		// if we have an anchor already, keep it
+		if(fp.anchorCount == 1)
+		{
+			worldAnchors[anchorCount++] = bodyFrame0.transform(fp.body0Anchors[0]);
+		}
+
+		for(PxU32 patch = fb.correlationListHeads[i]; 
+			patch!=CorrelationBuffer::LIST_END; 
+			patch = fb.contactPatches[patch].next)
+		{
+			CorrelationBuffer::ContactPatchData& cp = fb.contactPatches[patch];
+			for(PxU16 j=0;j<cp.count;j++)
+			{
+				const PxVec3& worldPoint = cb[cp.start+j].point;
+
+				if(cb[cp.start+j].separation < frictionOffsetThreshold)
+				{
+
+					switch(anchorCount)
+					{
+					case 0:
+						fb.contactID[i][0] = PxU16(cp.start+j);
+						worldAnchors[0] = worldPoint;
+						anchorCount++;
+						break;
+					case 1:
+						pointDistSq = (worldPoint-worldAnchors[0]).magnitudeSquared(); 
+						if (pointDistSq > (correlationDistance * correlationDistance))
+						{
+							fb.contactID[i][1] = PxU16(cp.start+j);
+							worldAnchors[1] = worldPoint;
+							anchorCount++;
+						}
+						break;
+					default: //case 2
+						dist0 = (worldPoint-worldAnchors[0]).magnitudeSquared();
+						dist1 = (worldPoint-worldAnchors[1]).magnitudeSquared();
+						if (dist0 > dist1)
+						{
+							if(dist0 > pointDistSq)
+							{
+								fb.contactID[i][1] = PxU16(cp.start+j);
+								worldAnchors[1] = worldPoint;
+								pointDistSq = dist0;
+							}
+						}
+						else if (dist1 > pointDistSq)
+						{
+							fb.contactID[i][0] = PxU16(cp.start+j);
+							worldAnchors[0] = worldPoint;
+							pointDistSq = dist1;
+						}
+					}
+				}
+			}
+		}
+
+		//PX_ASSERT(anchorCount > 0);
+
+		// add the new anchor(s) to the patch
+		for(PxU32 j = fp.anchorCount; j < anchorCount; j++)
+		{
+			fp.body0Anchors[j] = bodyFrame0.transformInv(worldAnchors[j]);
+			fp.body1Anchors[j] = bodyFrame1.transformInv(worldAnchors[j]);
+		}
+
+		// the block contact solver always reads at least one anchor per patch for performance reasons even if there are no valid patches,
+		// so we need to initialize this in the unexpected case that we have no anchors
+
+		if(anchorCount==0)
+			fp.body0Anchors[0] = fp.body1Anchors[0] = PxVec3(0);
+
+		fp.anchorCount = anchorCount;
+	}
+}
+
+}
+
+}
+
--- a/physx/source/lowleveldynamics/src/DyFrictionPatch.h
+++ b/physx/source/lowleveldynamics/src/DyFrictionPatch.h
@ -0,0 +1,83 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+
+#ifndef PXC_FRICTIONPATCH_H
+#define PXC_FRICTIONPATCH_H
+
+#include "foundation/PxSimpleTypes.h"
+#include "foundation/PxVec3.h"
+#include "PxvConfig.h"
+
+namespace physx
+{
+
+namespace Dy
+{
+
+struct FrictionPatch
+{
+	PxU8				broken;				// PT: must be first byte of struct, see "frictionBrokenWritebackByte"
+	PxU8				materialFlags;
+	PxU16				anchorCount;
+	PxReal				restitution;
+	PxReal				staticFriction;
+	PxReal				dynamicFriction;
+	PxVec3				body0Normal;
+	PxVec3				body1Normal;
+	PxVec3				body0Anchors[2];
+	PxVec3				body1Anchors[2];
+	PxQuat				relativeQuat;
+
+	PX_FORCE_INLINE	void	operator = (const FrictionPatch& other)
+	{
+		broken = other.broken;
+		materialFlags = other.materialFlags;
+		anchorCount = other.anchorCount;
+		body0Normal = other.body0Normal;
+		body1Normal = other.body1Normal;
+		body0Anchors[0] = other.body0Anchors[0];   
+		body0Anchors[1] = other.body0Anchors[1];
+		body1Anchors[0] = other.body1Anchors[0];
+		body1Anchors[1] = other.body1Anchors[1];
+		relativeQuat = other.relativeQuat;
+		restitution = other.restitution;
+		staticFriction = other.staticFriction;
+		dynamicFriction = other.dynamicFriction;
+	}
+};  
+
+//PX_COMPILE_TIME_ASSERT(sizeof(FrictionPatch)==80);
+
+}
+
+}
+
+#endif
--- a/physx/source/lowleveldynamics/src/DyFrictionPatchStreamPair.h
+++ b/physx/source/lowleveldynamics/src/DyFrictionPatchStreamPair.h
@ -0,0 +1,128 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+
+#ifndef PXC_FRICTIONPATCHPOOL_H
+#define PXC_FRICTIONPATCHPOOL_H
+
+#include "foundation/PxSimpleTypes.h"
+#include "PxvConfig.h"
+#include "PsMutex.h"
+#include "PsArray.h"
+
+// Each narrow phase thread has an input stream of friction patches from the
+// previous frame and an output stream of friction patches which will be
+// saved for next frame. The patches persist for exactly one frame at which
+// point they get thrown away.
+
+
+// There is a stream pair per thread. A contact callback reserves space
+// for its friction patches and gets a cookie in return that can stash
+// for next frame. Cookies are valid for one frame only.
+//
+// note that all friction patches reserved are guaranteed to be contiguous;
+// this might turn out to be a bit inefficient if we often have a large
+// number of friction patches
+
+#include "PxcNpMemBlockPool.h"
+
+namespace physx
+{
+
+class FrictionPatchStreamPair
+{
+public:
+	FrictionPatchStreamPair(PxcNpMemBlockPool& blockPool);
+
+	// reserve can fail and return null. Read should never fail
+	template<class FrictionPatch>
+	FrictionPatch*		reserve(const PxU32 size);
+
+	template<class FrictionPatch>
+	const FrictionPatch* findInputPatches(const PxU8* ptr) const;
+	void					reset();
+
+	PxcNpMemBlockPool& getBlockPool() { return mBlockPool;}
+private:
+	PxcNpMemBlockPool&	mBlockPool;
+	PxcNpMemBlock*		mBlock;
+	PxU32				mUsed;
+
+	FrictionPatchStreamPair& operator=(const FrictionPatchStreamPair&);
+};
+
+PX_FORCE_INLINE FrictionPatchStreamPair::FrictionPatchStreamPair(PxcNpMemBlockPool& blockPool):
+  mBlockPool(blockPool), mBlock(NULL), mUsed(0)
+{
+}
+
+PX_FORCE_INLINE void FrictionPatchStreamPair::reset()
+{
+	mBlock = NULL;
+	mUsed = 0;
+}
+
+// reserve can fail and return null. Read should never fail
+template <class FrictionPatch>
+FrictionPatch* FrictionPatchStreamPair::reserve(const PxU32 size)
+{
+	if(size>PxcNpMemBlock::SIZE)
+	{
+		return reinterpret_cast<FrictionPatch*>(-1);
+	}
+
+	PX_ASSERT(size <= PxcNpMemBlock::SIZE);
+
+	FrictionPatch* ptr = NULL;
+
+	if(mBlock == NULL || mUsed + size > PxcNpMemBlock::SIZE)
+	{
+		mBlock = mBlockPool.acquireFrictionBlock();
+		mUsed = 0;
+	}
+
+	if(mBlock)
+	{
+		ptr = reinterpret_cast<FrictionPatch*>(mBlock->data+mUsed);
+		mUsed += size;
+	}
+
+	return ptr;
+}
+
+template <class FrictionPatch>
+const FrictionPatch* FrictionPatchStreamPair::findInputPatches(const PxU8* ptr) const
+{
+	return reinterpret_cast<const FrictionPatch*>(ptr);
+}
+
+}
+
+#endif
--- a/physx/source/lowleveldynamics/src/DyRigidBodyToSolverBody.cpp
+++ b/physx/source/lowleveldynamics/src/DyRigidBodyToSolverBody.cpp
@ -0,0 +1,94 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+#include "CmUtils.h"
+#include "DySolverBody.h"
+#include "PxsRigidBody.h"
+#include "PxvDynamics.h"
+
+using namespace physx;
+
+// PT: TODO: SIMDify all this...
+void Dy::copyToSolverBodyData(const PxVec3& linearVelocity, const PxVec3& angularVelocity, const PxReal invMass, const PxVec3& invInertia, const PxTransform& globalPose,
+	const PxReal maxDepenetrationVelocity, const PxReal maxContactImpulse, const PxU32 nodeIndex, const PxReal reportThreshold, PxSolverBodyData& data, PxU32 lockFlags)
+{
+	data.nodeIndex = nodeIndex;
+
+	const PxVec3 safeSqrtInvInertia = computeSafeSqrtInertia(invInertia);
+
+	// PT: TODO: re-SIMDify this one
+	const PxMat33 rotation(globalPose.q);
+
+	Cm::transformInertiaTensor(safeSqrtInvInertia, rotation, data.sqrtInvInertia);
+
+	// Copy simple properties
+	data.linearVelocity = linearVelocity;
+	data.angularVelocity = angularVelocity;
+
+	if (lockFlags)
+	{
+		if (lockFlags & PxRigidDynamicLockFlag::eLOCK_LINEAR_X)
+			data.linearVelocity.x = 0.f;
+		if (lockFlags & PxRigidDynamicLockFlag::eLOCK_LINEAR_Y)
+			data.linearVelocity.y = 0.f;
+		if (lockFlags & PxRigidDynamicLockFlag::eLOCK_LINEAR_Z)
+			data.linearVelocity.z = 0.f;
+
+		//KS - technically, we can zero the inertia columns and produce stiffer constraints. However, this can cause numerical issues with the 
+		//joint solver, which is fixed by disabling joint preprocessing and setting minResponseThreshold to some reasonable value > 0. However, until
+		//this is handled automatically, it's probably better not to zero these inertia rows
+		if (lockFlags & PxRigidDynamicLockFlag::eLOCK_ANGULAR_X)
+		{
+			data.angularVelocity.x = 0.f;
+			//data.sqrtInvInertia.column0 = PxVec3(0.f);
+		}
+		if (lockFlags & PxRigidDynamicLockFlag::eLOCK_ANGULAR_Y)
+		{
+			data.angularVelocity.y = 0.f;
+			//data.sqrtInvInertia.column1 = PxVec3(0.f);
+		}
+		if (lockFlags & PxRigidDynamicLockFlag::eLOCK_ANGULAR_Z)
+		{
+			data.angularVelocity.z = 0.f;
+			//data.sqrtInvInertia.column2 = PxVec3(0.f);
+		}
+	}
+
+	PX_ASSERT(linearVelocity.isFinite());
+	PX_ASSERT(angularVelocity.isFinite());
+
+	data.invMass = invMass;
+	data.penBiasClamp = maxDepenetrationVelocity;
+	data.maxContactImpulse = maxContactImpulse;
+	data.body2World = globalPose;
+
+	data.lockFlags = PxU16(lockFlags);
+
+	data.reportThreshold = reportThreshold;
+}
--- a/physx/source/lowleveldynamics/src/DySolverBody.h
+++ b/physx/source/lowleveldynamics/src/DySolverBody.h
@ -0,0 +1,72 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef DY_SOLVERBODY_H
+#define DY_SOLVERBODY_H
+
+#include "foundation/PxVec3.h"
+#include "foundation/PxTransform.h"
+#include "foundation/PxMat33.h"
+#include "CmPhysXCommon.h"
+#include "CmSpatialVector.h"
+#include "solver/PxSolverDefs.h"
+
+namespace physx
+{
+
+class PxsRigidBody;
+struct PxsBodyCore;
+
+namespace Dy
+{
+
+// PT: TODO: make sure this is still needed / replace with V4sqrt
+//This method returns values of 0 when the inertia is 0. This is a bit of a hack but allows us to 
+//represent kinematic objects' velocities in our new format
+PX_FORCE_INLINE PxVec3 computeSafeSqrtInertia(const PxVec3& v)
+{
+	return PxVec3(	v.x == 0.0f ? 0.0f : PxSqrt(v.x),
+					v.y == 0.0f ? 0.0f : PxSqrt(v.y),
+					v.z == 0.0f ? 0.0f : PxSqrt(v.z));
+}
+
+void copyToSolverBodyData(const PxVec3& linearVelocity, const PxVec3& angularVelocity, const PxReal invMass, const PxVec3& invInertia, const PxTransform& globalPose,
+	const PxReal maxDepenetrationVelocity, const PxReal maxContactImpulse, const PxU32 nodeIndex, const PxReal reportThreshold, PxSolverBodyData& solverBodyData, PxU32 lockFlags);
+
+// PT: TODO: using PxsBodyCore in the interface makes us write less data to the stack for passing arguments, and we can take advantage of the class layout
+// (we know what is aligned or not, we know if it is safe to V4Load vectors, etc). Note that this is what we previously had, which is why PxsBodyCore was still
+// forward-referenced above.
+//void copyToSolverBodyData(PxSolverBodyData& solverBodyData, const PxsBodyCore& core, const PxU32 nodeIndex);
+
+}
+
+}
+
+#endif //DY_SOLVERBODY_H
--- a/physx/source/lowleveldynamics/src/DySolverConstraint1D.h
+++ b/physx/source/lowleveldynamics/src/DySolverConstraint1D.h
@ -0,0 +1,204 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef DY_SOLVER_CONSTRAINT_1D_H
+#define DY_SOLVER_CONSTRAINT_1D_H
+
+#include "foundation/PxVec3.h"
+#include "PxvConfig.h"
+#include "DyArticulationUtils.h"
+#include "DySolverConstraintTypes.h"
+#include "DySolverBody.h"
+#include "PxConstraintDesc.h"
+#include "DySolverConstraintDesc.h"
+
+namespace physx
+{
+
+namespace Dy
+{
+
+// dsequeira: we should probably fork these structures for constraints and extended constraints,
+// since there's a few things that are used for one but not the other
+
+struct SolverConstraint1DHeader
+{
+	PxU8	type;			// enum SolverConstraintType - must be first byte
+	PxU8	count;			// count of following 1D constraints
+	PxU8	dominance;
+	PxU8	breakable;		// indicate whether this constraint is breakable or not						
+
+	PxReal	linBreakImpulse;
+	PxReal	angBreakImpulse;
+	PxReal	invMass0D0;
+	PxVec3	body0WorldOffset;
+	PxReal	invMass1D1;
+	PxReal	linearInvMassScale0;		// only used by articulations
+	PxReal	angularInvMassScale0;		// only used by articulations
+	PxReal	linearInvMassScale1;		// only used by articulations
+	PxReal	angularInvMassScale1;		// only used by articulations
+};
+
+PX_COMPILE_TIME_ASSERT(sizeof(SolverConstraint1DHeader) == 48);
+
+PX_ALIGN_PREFIX(16)
+struct SolverConstraint1D 
+{
+public:
+	PxVec3		lin0;					//!< linear velocity projection (body 0)	
+	PxReal		constant;				//!< constraint constant term
+
+	PxVec3		lin1;					//!< linear velocity projection (body 1)
+	PxReal		unbiasedConstant;		//!< constraint constant term without bias
+
+	PxVec3		ang0;					//!< angular velocity projection (body 0)
+	PxReal		velMultiplier;			//!< constraint velocity multiplier
+
+	PxVec3		ang1;					//!< angular velocity projection (body 1)
+	PxReal		impulseMultiplier;		//!< constraint impulse multiplier
+
+	PxVec3		ang0Writeback;			//!< unscaled angular velocity projection (body 0)
+	PxU32		pad;
+
+	PxReal		minImpulse;				//!< Lower bound on impulse magnitude	 
+	PxReal		maxImpulse;				//!< Upper bound on impulse magnitude
+	PxReal		appliedForce;			//!< applied force to correct velocity+bias
+	PxU32		flags;
+} PX_ALIGN_SUFFIX(16); 	
+
+PX_COMPILE_TIME_ASSERT(sizeof(SolverConstraint1D) == 96);
+
+
+struct SolverConstraint1DExt : public SolverConstraint1D
+{
+public:
+	Cm::SpatialVectorV deltaVA;
+	Cm::SpatialVectorV deltaVB;
+};
+
+PX_COMPILE_TIME_ASSERT(sizeof(SolverConstraint1DExt) == 160);
+
+
+PX_FORCE_INLINE void init(SolverConstraint1DHeader& h, 
+						  PxU8 count, 
+						  bool isExtended,
+						  const PxConstraintInvMassScale& ims)
+{
+	h.type			= PxU8(isExtended ? DY_SC_TYPE_EXT_1D : DY_SC_TYPE_RB_1D);
+	h.count			= count;
+	h.dominance		= 0;
+	h.linearInvMassScale0	= ims.linear0;
+	h.angularInvMassScale0	= ims.angular0;
+	h.linearInvMassScale1	= -ims.linear1;
+	h.angularInvMassScale1	= -ims.angular1;
+}
+
+PX_FORCE_INLINE void init(SolverConstraint1D& c,
+						  const PxVec3& _linear0, const PxVec3& _linear1, 
+						  const PxVec3& _angular0, const PxVec3& _angular1,
+						  PxReal _minImpulse, PxReal _maxImpulse)
+{
+	PX_ASSERT(_linear0.isFinite());
+	PX_ASSERT(_linear1.isFinite());
+	c.lin0					= _linear0;
+	c.lin1					= _linear1;
+	c.ang0					= _angular0;
+	c.ang1					= _angular1;
+	c.minImpulse			= _minImpulse;
+	c.maxImpulse			= _maxImpulse;
+	c.flags					= 0;
+	c.appliedForce			= 0;
+}
+
+PX_FORCE_INLINE bool needsNormalVel(const Px1DConstraint &c)
+{
+	return c.flags & Px1DConstraintFlag::eRESTITUTION
+		|| (c.flags & Px1DConstraintFlag::eSPRING && c.flags & Px1DConstraintFlag::eACCELERATION_SPRING);
+}
+
+PX_FORCE_INLINE void setSolverConstants(PxReal& constant,
+										PxReal& unbiasedConstant,
+										PxReal& velMultiplier,
+										PxReal& impulseMultiplier,
+										const Px1DConstraint& c,
+										PxReal normalVel,
+										PxReal unitResponse,
+										PxReal minRowResponse,
+										PxReal erp,
+										PxReal dt,
+										PxReal recipdt)
+{
+	PX_ASSERT(PxIsFinite(unitResponse));
+	PxReal recipResponse = unitResponse <= minRowResponse ? 0 : 1.0f/unitResponse;
+
+	PxReal geomError = c.geometricError * erp;
+
+	if(c.flags & Px1DConstraintFlag::eSPRING)
+	{
+		PxReal a = dt * dt * c.mods.spring.stiffness + dt * c.mods.spring.damping;
+		PxReal b = dt * (c.mods.spring.damping * c.velocityTarget - c.mods.spring.stiffness * geomError);
+
+		if(c.flags & Px1DConstraintFlag::eACCELERATION_SPRING)
+		{	
+			PxReal x = 1.0f/(1.0f+a);
+			constant = unbiasedConstant = x * recipResponse * b;
+			velMultiplier = -x * recipResponse * a;
+			impulseMultiplier = 1.0f-x;
+		}
+		else
+		{
+			PxReal x = unitResponse == 0.f ? 0.f : 1.0f/(1.0f+a*unitResponse);
+			constant = unbiasedConstant = x * b;
+			velMultiplier = -x*a;
+			impulseMultiplier = 1.0f-x;
+		}
+	}
+	else
+	{
+		velMultiplier = -recipResponse;
+		impulseMultiplier = 1.0f;
+
+		if(c.flags & Px1DConstraintFlag::eRESTITUTION && -normalVel>c.mods.bounce.velocityThreshold)
+		{
+			unbiasedConstant = constant = recipResponse * c.mods.bounce.restitution*-normalVel;
+		}
+		else
+		{
+			// see usage of 'for internal use' in preprocessRows()
+			constant = recipResponse * (c.velocityTarget - geomError*recipdt);
+			unbiasedConstant = recipResponse * (c.velocityTarget - c.forInternalUse*recipdt);
+		}
+	}
+}
+
+}
+}
+
+#endif //DY_SOLVER_CONSTRAINT_1D_H
--- a/physx/source/lowleveldynamics/src/DySolverConstraint1D4.h
+++ b/physx/source/lowleveldynamics/src/DySolverConstraint1D4.h
@ -0,0 +1,106 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+#ifndef DY_SOLVERCONSTRAINT1D4_H
+#define DY_SOLVERCONSTRAINT1D4_H
+
+#include "foundation/PxVec3.h"
+#include "PxvConfig.h"
+#include "DyArticulationUtils.h"
+#include "DySolverConstraint1D.h"
+
+namespace physx
+{
+
+namespace Dy
+{
+
+struct SolverConstraint1DHeader4
+{
+	PxU8	type;			// enum SolverConstraintType - must be first byte
+	PxU8	pad0[3];	
+	//These counts are the max of the 4 sets of data.
+	//When certain pairs have fewer constraints than others, they are padded with 0s so that no work is performed but 
+	//calculations are still shared (afterall, they're computationally free because we're doing 4 things at a time in SIMD)
+	PxU32	count;
+	PxU8	count0, count1, count2, count3;
+	PxU8	break0, break1, break2, break3;
+
+	Vec4V	linBreakImpulse;
+	Vec4V	angBreakImpulse;
+	Vec4V	invMass0D0;
+	Vec4V	invMass1D1;
+	Vec4V	angD0;
+	Vec4V	angD1;
+
+	Vec4V	body0WorkOffsetX;
+	Vec4V	body0WorkOffsetY;
+	Vec4V	body0WorkOffsetZ;
+};
+
+struct SolverConstraint1DBase4 
+{
+public:
+	Vec4V		lin0X;
+	Vec4V		lin0Y;
+	Vec4V		lin0Z;
+	Vec4V		ang0X;
+	Vec4V		ang0Y;
+	Vec4V		ang0Z;
+	Vec4V		ang0WritebackX;
+	Vec4V		ang0WritebackY;
+	Vec4V		ang0WritebackZ;
+	Vec4V		constant;
+	Vec4V		unbiasedConstant;
+	Vec4V		velMultiplier;
+	Vec4V		impulseMultiplier;
+	Vec4V		minImpulse;
+	Vec4V		maxImpulse;
+	Vec4V		appliedForce;
+	PxU32		flags[4];
+};
+
+PX_COMPILE_TIME_ASSERT(sizeof(SolverConstraint1DBase4) == 272);
+
+struct SolverConstraint1DDynamic4 : public SolverConstraint1DBase4
+{
+	Vec4V		lin1X;
+	Vec4V		lin1Y;
+	Vec4V		lin1Z;
+	Vec4V		ang1X;
+	Vec4V		ang1Y;
+	Vec4V		ang1Z;
+};
+PX_COMPILE_TIME_ASSERT(sizeof(SolverConstraint1DDynamic4) == 368);
+
+}
+
+}
+
+#endif //DY_SOLVERCONSTRAINT1D4_H
--- a/physx/source/lowleveldynamics/src/DySolverConstraint1DStep.h
+++ b/physx/source/lowleveldynamics/src/DySolverConstraint1DStep.h
@ -0,0 +1,224 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef DY_SOLVER_CONSTRAINT_1D_STEP_H
+#define DY_SOLVER_CONSTRAINT_1D_STEP_H
+
+#include "foundation/PxVec3.h"
+#include "PxvConfig.h"
+#include "DyArticulationUtils.h"
+#include "DySolverConstraintTypes.h"
+#include "DySolverBody.h"
+#include "PxConstraintDesc.h"
+#include "DySolverConstraintDesc.h"
+
+
+namespace physx
+{
+	namespace Dy
+	{
+		struct SolverContactHeaderStep
+		{
+			enum DySolverContactFlags
+			{
+				eHAS_FORCE_THRESHOLDS = 0x1
+			};
+
+			PxU8	type;					//Note: mType should be first as the solver expects a type in the first byte.
+			PxU8	flags;
+			PxU8	numNormalConstr;
+			PxU8	numFrictionConstr;					//4
+
+			PxReal	angDom0;							//8
+			PxReal	angDom1;							//12
+			PxReal	invMass0;							//16
+
+			Vec4V   staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W;		//32
+			PxVec3	normal;															//48
+			
+			PxReal	maxPenBias;														//52
+			PxReal	invMass1;														//56
+			PxReal	minNormalForce;													//60
+			PxU32 broken;															//64
+			PxU8* frictionBrokenWritebackByte;										//68	72
+			Sc::ShapeInteraction* shapeInteraction;									//72	80
+#if !PX_P64_FAMILY
+			PxU32 pad[2];															//80
+#endif		
+
+			PX_FORCE_INLINE FloatV getStaticFriction() const { return V4GetX(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W); }
+			PX_FORCE_INLINE FloatV getDynamicFriction() const { return V4GetY(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W); }
+			PX_FORCE_INLINE FloatV getDominance0() const { return V4GetZ(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W); }
+			PX_FORCE_INLINE FloatV getDominance1() const { return V4GetW(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W); }
+		};
+
+		struct SolverContactPointStep
+		{
+			PxVec3 raXnI;
+			PxF32 separation;
+			PxVec3 rbXnI;
+			PxF32 velMultiplier;
+			PxF32 targetVelocity;
+			PxF32 biasCoefficient;
+			//2 more slots here for extra data
+			PxU32 pad[2];
+		};
+
+		struct SolverContactPointStepExt : public SolverContactPointStep
+		{
+			Vec3V linDeltaVA;
+			Vec3V linDeltaVB;
+			Vec3V angDeltaVA;
+			Vec3V angDeltaVB;
+		};
+
+		struct SolverContactFrictionStep
+		{
+			Vec4V normalXYZ_ErrorW;		//16
+			Vec4V raXnI_targetVelW;
+			Vec4V rbXnI_velMultiplierW;
+			PxReal biasScale;
+			PxReal appliedForce;
+			PxReal frictionScale;
+			PxU32 pad[1];
+
+			PX_FORCE_INLINE void setAppliedForce(const FloatV f) { FStore(f, &appliedForce); }
+		};
+
+		struct SolverContactFrictionStepExt : public SolverContactFrictionStep
+		{
+			Vec3V linDeltaVA;
+			Vec3V linDeltaVB;
+			Vec3V angDeltaVA;
+			Vec3V angDeltaVB;
+		};
+
+		struct SolverConstraint1DHeaderStep
+		{
+			PxU8	type;			// enum SolverConstraintType - must be first byte
+			PxU8	count;			// count of following 1D constraints
+			PxU8	dominance;
+			PxU8	breakable;		// indicate whether this constraint is breakable or not						
+			PxReal	linBreakImpulse;
+			PxReal	angBreakImpulse;
+			PxReal	invMass0D0;
+
+			PxVec3	body0WorldOffset;
+			PxReal	invMass1D1;
+
+			PxVec3	rAWorld;
+			PxReal linearInvMassScale0;		// only used by articulations
+
+			PxVec3	rBWorld;
+			PxReal	angularInvMassScale0;
+
+			PxReal	linearInvMassScale1;		// only used by articulations
+			PxReal	angularInvMassScale1;
+			PxU32	pad[2];
+
+			//Ortho axes for body 0, recipResponse in W component
+			PxVec4    angOrthoAxis0_recipResponseW[3];
+			//Ortho axes for body 1, error of body in W component
+			PxVec4    angOrthoAxis1_Error[3];
+		};
+
+
+		PX_FORCE_INLINE void init(SolverConstraint1DHeaderStep& h,
+			PxU8 count,
+			bool isExtended,
+			const PxConstraintInvMassScale& ims)
+		{
+			h.type = PxU8(isExtended ? DY_SC_TYPE_EXT_1D : DY_SC_TYPE_RB_1D);
+			h.count = count;
+			h.dominance = 0;
+			h.linearInvMassScale0 = ims.linear0;
+			h.angularInvMassScale0 = ims.angular0;
+			h.linearInvMassScale1 = -ims.linear1;
+			h.angularInvMassScale1 = -ims.angular1;
+		}
+
+
+		PX_ALIGN_PREFIX(16)
+		struct SolverConstraint1DStep
+		{
+		public:
+			PxVec3		lin0;					//!< linear velocity projection (body 0)	
+			PxReal		error;					//!< constraint error term - must be scaled by biasScale. Can be adjusted at run-time
+
+			PxVec3		lin1;					//!< linear velocity projection (body 1)
+			PxReal		biasScale;				//!< constraint constant bias scale. Constant
+
+			PxVec3		ang0;					//!< angular velocity projection (body 0)
+			PxReal		velMultiplier;			//!< constraint velocity multiplier
+
+			PxVec3		ang1;					//!< angular velocity projection (body 1)
+			PxReal		impulseMultiplier;		//!< constraint impulse multiplier
+
+			PxReal		velTarget;				//!< Scaled target velocity of the constraint drive
+
+			PxReal		minImpulse;				//!< Lower bound on impulse magnitude	 
+			PxReal		maxImpulse;				//!< Upper bound on impulse magnitude
+			PxReal		appliedForce;			//!< applied force to correct velocity+bias
+
+			PxReal		maxBias;
+			PxU32		flags;
+			PxReal		recipResponse;			//Constant. Only used for articulations;
+			PxReal		angularErrorScale;		//Constant
+		} PX_ALIGN_SUFFIX(16);
+
+		struct SolverConstraint1DExtStep : public SolverConstraint1DStep
+		{
+		public:
+			Cm::SpatialVectorV deltaVA;
+			Cm::SpatialVectorV deltaVB;
+		};
+
+		PX_FORCE_INLINE void init(SolverConstraint1DStep& c,
+			const PxVec3& _linear0, const PxVec3& _linear1,
+			const PxVec3& _angular0, const PxVec3& _angular1,
+			PxReal _minImpulse, PxReal _maxImpulse)
+		{
+			PX_ASSERT(_linear0.isFinite());
+			PX_ASSERT(_linear1.isFinite());
+			c.lin0 = _linear0;
+			c.lin1 = _linear1;
+			c.ang0 = _angular0;
+			c.ang1 = _angular1;
+			c.minImpulse = _minImpulse;
+			c.maxImpulse = _maxImpulse;
+			c.flags = 0;
+			c.appliedForce = 0;
+			c.angularErrorScale = 1.f;
+		}
+
+	}//namespace Dy
+}
+
+#endif
--- a/physx/source/lowleveldynamics/src/DySolverConstraintDesc.h
+++ b/physx/source/lowleveldynamics/src/DySolverConstraintDesc.h
@ -0,0 +1,142 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef DY_SOLVERCONSTRAINTDESC_H
+#define DY_SOLVERCONSTRAINTDESC_H
+
+#include "PxvConfig.h"
+#include "DySolverConstraintTypes.h"
+#include "PsUtilities.h"
+#include "PxConstraintDesc.h"
+#include "solver/PxSolverDefs.h"
+
+namespace physx
+{
+
+struct PxcNpWorkUnit;
+
+struct PxsContactManagerOutput;
+
+namespace Cm
+{
+	class SpatialVector;
+}
+
+struct PxSolverBody;
+struct PxSolverBodyData;
+
+namespace Dy
+{
+
+struct FsData;
+
+
+
+
+// dsequeira: moved this articulation stuff here to sever a build dep on Articulation.h through DyThreadContext.h and onward
+
+struct SelfConstraintBlock
+{
+	PxU32	startId;				
+	PxU32	numSelfConstraints;	
+	PxU16	fsDataLength;		
+	PxU16	requiredSolverProgress;
+	uintptr_t eaFsData;
+};
+
+//This class rolls together multiple contact managers into a single contact manager.
+struct CompoundContactManager
+{
+	PxU32 mStartIndex;
+	PxU16 mStride;
+	PxU16 mReducedContactCount;
+
+	PxcNpWorkUnit* unit;			//This is a work unit but the contact buffer has been adjusted to contain all the contacts for all the subsequent pairs
+	PxsContactManagerOutput* cmOutput;
+	PxU8* originalContactPatches;	//This is the original contact buffer that we replaced with a combined buffer	
+	PxU8* originalContactPoints;
+	PxU8 originalContactCount;
+	PxU8 originalPatchCount;
+	PxU8 originalStatusFlags;
+	PxReal* originalForceBuffer;	//This is the original force buffer that we replaced with a combined force buffer
+	PxU16* forceBufferList;			//This is a list of indices from the reduced force buffer to the original force buffers - we need this to fix up the write-backs from the solver	
+};
+
+struct SolverConstraintPrepState
+{
+enum Enum 
+{
+	eOUT_OF_MEMORY,
+	eUNBATCHABLE,
+	eSUCCESS
+};
+};
+
+PX_FORCE_INLINE bool isArticulationConstraint(const PxSolverConstraintDesc& desc)
+{
+	return desc.linkIndexA != PxSolverConstraintDesc::NO_LINK || 
+		desc.linkIndexB != PxSolverConstraintDesc::NO_LINK;
+}
+
+
+PX_FORCE_INLINE void setConstraintLength(PxSolverConstraintDesc& desc, const PxU32 constraintLength)
+{
+	PX_ASSERT(0==(constraintLength & 0x0f));
+	PX_ASSERT(constraintLength <= PX_MAX_U16 * 16);
+	desc.constraintLengthOver16 = Ps::to16(constraintLength >> 4);
+}
+
+PX_FORCE_INLINE void setWritebackLength(PxSolverConstraintDesc& desc, const PxU32 writeBackLength)
+{
+	PX_ASSERT(0==(writeBackLength & 0x03));
+	PX_ASSERT(writeBackLength <= PX_MAX_U16 * 4);
+	desc.writeBackLengthOver4 = Ps::to16(writeBackLength >> 2);
+}
+
+PX_FORCE_INLINE PxU32 getConstraintLength(const PxSolverConstraintDesc& desc)
+{
+	return PxU32(desc.constraintLengthOver16 << 4);
+}
+
+
+PX_FORCE_INLINE PxU32 getWritebackLength(const PxSolverConstraintDesc& desc)
+{
+	return PxU32(desc.writeBackLengthOver4 << 2);
+}
+
+PX_COMPILE_TIME_ASSERT(0 == (0x0f & sizeof(PxSolverConstraintDesc)));
+
+#define MAX_PERMITTED_SOLVER_PROGRESS 0xFFFF
+
+}
+
+}
+
+#endif //DY_SOLVERCONSTRAINTDESC_H
--- a/physx/source/lowleveldynamics/src/DySolverConstraintExtShared.h
+++ b/physx/source/lowleveldynamics/src/DySolverConstraintExtShared.h
@ -0,0 +1,57 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+#ifndef DY_SOLVER_CONSTRAINT_EXT_SHARED_H
+#define DY_SOLVER_CONSTRAINT_EXT_SHARED_H
+
+#include "foundation/PxPreprocessor.h"
+#include "PsVecMath.h"
+#include "DyArticulationContactPrep.h"
+#include "DySolverConstraintDesc.h"
+#include "DySolverConstraint1D.h"
+#include "DySolverContact.h"
+#include "DySolverContactPF.h"
+#include "DyArticulationHelper.h"
+#include "PxcNpWorkUnit.h"
+#include "PxsMaterialManager.h"
+#include "PxsMaterialCombiner.h"
+
+namespace physx
+{
+	namespace Dy
+	{
+		FloatV setupExtSolverContact(const SolverExtBody& b0, const SolverExtBody& b1,
+			const FloatV& d0, const FloatV& d1, const FloatV& angD0, const FloatV& angD1, const Vec3V& bodyFrame0p, const Vec3V& bodyFrame1p,
+			const Vec3VArg normal, const FloatVArg invDt, const FloatVArg invDtp8, const FloatVArg restDistance, const FloatVArg maxPenBias, const FloatVArg restitution,
+			const FloatVArg bounceThreshold, const Gu::ContactPoint& contact, SolverContactPointExt& solverContact, const FloatVArg ccdMaxSeparation,
+			Cm::SpatialVectorF* Z, const Cm::SpatialVectorV& v0, const Cm::SpatialVectorV& v1);
+	} //namespace Dy
+}
+
+#endif //DY_SOLVER_CONSTRAINT_EXT_SHARED_H
--- a/physx/source/lowleveldynamics/src/DySolverConstraintTypes.h
+++ b/physx/source/lowleveldynamics/src/DySolverConstraintTypes.h
@ -0,0 +1,72 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef DY_SOLVERCONSTRAINTTYPES_H
+#define DY_SOLVERCONSTRAINTTYPES_H
+
+#include "foundation/PxSimpleTypes.h"
+#include "PxvConfig.h"
+
+namespace physx
+{
+
+enum SolverConstraintType
+{
+	DY_SC_TYPE_NONE = 0,
+	DY_SC_TYPE_RB_CONTACT,		// RB-only contact
+	DY_SC_TYPE_RB_1D,			// RB-only 1D constraint
+	DY_SC_TYPE_EXT_CONTACT,	// contact involving articulations
+	DY_SC_TYPE_EXT_1D,			// 1D constraint involving articulations
+	DY_SC_TYPE_STATIC_CONTACT,	// RB-only contact where body b is static
+	DY_SC_TYPE_NOFRICTION_RB_CONTACT, //RB-only contact with no friction patch
+	DY_SC_TYPE_BLOCK_RB_CONTACT,
+	DY_SC_TYPE_BLOCK_STATIC_RB_CONTACT,
+	DY_SC_TYPE_BLOCK_1D,
+	DY_SC_TYPE_FRICTION,
+	DY_SC_TYPE_STATIC_FRICTION,
+	DY_SC_TYPE_EXT_FRICTION,
+	DY_SC_TYPE_BLOCK_FRICTION,
+	DY_SC_TYPE_BLOCK_STATIC_FRICTION,
+	DY_SC_CONSTRAINT_TYPE_COUNT //Count of the number of different constraint types in the solver
+};
+
+enum SolverConstraintFlags
+{
+	DY_SC_FLAG_OUTPUT_FORCE		= (1<<1),
+	DY_SC_FLAG_KEEP_BIAS		= (1<<2),
+	DY_SC_FLAG_ROT_EQ			= (1<<3),
+	DY_SC_FLAG_ORTHO_TARGET		= (1<<4),
+	DY_SC_FLAG_SPRING			= (1<<5),
+	DY_SC_FLAG_INEQUALITY		= (1<<6)
+};
+
+}
+
+#endif //DY_SOLVERCONSTRAINTTYPES_H
--- a/physx/source/lowleveldynamics/src/DySolverConstraints.cpp
+++ b/physx/source/lowleveldynamics/src/DySolverConstraints.cpp
--- a/physx/source/lowleveldynamics/src/DySolverConstraintsBlock.cpp
+++ b/physx/source/lowleveldynamics/src/DySolverConstraintsBlock.cpp
--- a/physx/source/lowleveldynamics/src/DySolverConstraintsShared.h
+++ b/physx/source/lowleveldynamics/src/DySolverConstraintsShared.h
@ -0,0 +1,171 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+
+#ifndef DY_SOLVER_CORE_SHARED_H
+#define DY_SOLVER_CORE_SHARED_H
+
+#include "foundation/PxPreprocessor.h"
+#include "PsVecMath.h"
+
+#include "CmPhysXCommon.h"
+#include "DySolverBody.h"
+#include "DySolverContact.h"
+#include "DySolverConstraint1D.h"
+#include "DySolverConstraintDesc.h"
+#include "PsUtilities.h"
+#include "DyConstraint.h"
+#include "PsAtomic.h"
+
+
+namespace physx
+{
+
+namespace Dy
+{
+	PX_FORCE_INLINE static FloatV solveDynamicContacts(SolverContactPoint* contacts, const PxU32 nbContactPoints, const Vec3VArg contactNormal,
+	const FloatVArg invMassA, const FloatVArg invMassB, const FloatVArg angDom0, const FloatVArg angDom1, Vec3V& linVel0_, Vec3V& angState0_, 
+	Vec3V& linVel1_, Vec3V& angState1_, PxF32* PX_RESTRICT forceBuffer)
+{
+	Vec3V linVel0 = linVel0_;
+	Vec3V angState0 = angState0_;
+	Vec3V linVel1 = linVel1_;
+	Vec3V angState1 = angState1_;
+	FloatV accumulatedNormalImpulse = FZero();
+
+	const Vec3V delLinVel0 = V3Scale(contactNormal, invMassA);
+	const Vec3V delLinVel1 = V3Scale(contactNormal, invMassB);
+
+	for(PxU32 i=0;i<nbContactPoints;i++)
+	{
+		SolverContactPoint& c = contacts[i];
+		Ps::prefetchLine(&contacts[i], 128);
+
+		const Vec3V raXn = c.raXn;
+
+		const Vec3V rbXn = c.rbXn;
+
+		const FloatV appliedForce = FLoad(forceBuffer[i]);
+		const FloatV velMultiplier = c.getVelMultiplier();
+		
+		/*const FloatV targetVel = c.getTargetVelocity();
+		const FloatV nScaledBias = c.getScaledBias();*/
+		const FloatV maxImpulse = c.getMaxImpulse();
+
+		//Compute the normal velocity of the constraint.
+		const Vec3V v0 = V3MulAdd(linVel0, contactNormal, V3Mul(angState0, raXn));
+		const Vec3V v1 = V3MulAdd(linVel1, contactNormal, V3Mul(angState1, rbXn));
+		const FloatV normalVel = V3SumElems(V3Sub(v0, v1));
+
+		const FloatV biasedErr = c.getBiasedErr();//FScaleAdd(targetVel, velMultiplier, nScaledBias);
+
+		//KS - clamp the maximum force
+		const FloatV _deltaF = FMax(FNegScaleSub(normalVel, velMultiplier, biasedErr), FNeg(appliedForce));
+		const FloatV _newForce = FAdd(appliedForce, _deltaF);
+		const FloatV newForce = FMin(_newForce, maxImpulse);
+		const FloatV deltaF = FSub(newForce, appliedForce);
+
+		linVel0 = V3ScaleAdd(delLinVel0, deltaF, linVel0);
+		linVel1 = V3NegScaleSub(delLinVel1, deltaF, linVel1);
+		angState0 = V3ScaleAdd(raXn, FMul(deltaF, angDom0), angState0);
+		angState1 = V3NegScaleSub(rbXn, FMul(deltaF, angDom1), angState1);
+		
+		FStore(newForce, &forceBuffer[i]);
+
+		accumulatedNormalImpulse = FAdd(accumulatedNormalImpulse, newForce);
+	}
+
+	linVel0_ = linVel0;
+	angState0_ = angState0;
+	linVel1_ = linVel1;
+	angState1_ = angState1;
+	return accumulatedNormalImpulse;
+}
+
+PX_FORCE_INLINE static FloatV solveStaticContacts(SolverContactPoint* contacts, const PxU32 nbContactPoints, const Vec3VArg contactNormal,
+	const FloatVArg invMassA, const FloatVArg angDom0, Vec3V& linVel0_, Vec3V& angState0_, PxF32* PX_RESTRICT forceBuffer)
+{
+	Vec3V linVel0 = linVel0_;
+	Vec3V angState0 = angState0_;
+	FloatV accumulatedNormalImpulse = FZero();
+
+	const Vec3V delLinVel0 = V3Scale(contactNormal, invMassA);
+
+	for(PxU32 i=0;i<nbContactPoints;i++)
+	{
+		SolverContactPoint& c = contacts[i];
+		Ps::prefetchLine(&contacts[i],128);
+
+		const Vec3V raXn = c.raXn;
+		
+		const FloatV appliedForce = FLoad(forceBuffer[i]);
+		const FloatV velMultiplier = c.getVelMultiplier();
+
+		/*const FloatV targetVel = c.getTargetVelocity();
+		const FloatV nScaledBias = c.getScaledBias();*/
+		const FloatV maxImpulse = c.getMaxImpulse();
+		
+		const Vec3V v0 = V3MulAdd(linVel0, contactNormal, V3Mul(angState0, raXn));
+		const FloatV normalVel = V3SumElems(v0);
+
+
+		const FloatV biasedErr = c.getBiasedErr();//FScaleAdd(targetVel, velMultiplier, nScaledBias);
+
+		// still lots to do here: using loop pipelining we can interweave this code with the
+		// above - the code here has a lot of stalls that we would thereby eliminate
+		const FloatV _deltaF = FMax(FNegScaleSub(normalVel, velMultiplier, biasedErr), FNeg(appliedForce));
+		const FloatV _newForce = FAdd(appliedForce, _deltaF);
+		const FloatV newForce = FMin(_newForce, maxImpulse);
+		const FloatV deltaF = FSub(newForce, appliedForce);
+
+		linVel0 = V3ScaleAdd(delLinVel0, deltaF, linVel0);
+		angState0 = V3ScaleAdd(raXn, FMul(deltaF, angDom0), angState0);
+
+		FStore(newForce, &forceBuffer[i]);
+
+		accumulatedNormalImpulse = FAdd(accumulatedNormalImpulse, newForce);
+	}
+
+	linVel0_ = linVel0;
+	angState0_ = angState0;
+	return accumulatedNormalImpulse;
+}
+
+FloatV solveExtContacts(SolverContactPointExt* contacts, const PxU32 nbContactPoints, const Vec3VArg contactNormal,
+	Vec3V& linVel0, Vec3V& angVel0,
+	Vec3V& linVel1, Vec3V& angVel1,
+	Vec3V& li0, Vec3V& ai0,
+	Vec3V& li1, Vec3V& ai1,
+	PxF32* PX_RESTRICT appliedForceBuffer);
+
+}
+
+}
+
+#endif //DY_SOLVER_CORE_SHARED_H
+
--- a/physx/source/lowleveldynamics/src/DySolverContact.h
+++ b/physx/source/lowleveldynamics/src/DySolverContact.h
@ -0,0 +1,228 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+
+#ifndef DY_SOLVERCONTACT_H
+#define DY_SOLVERCONTACT_H
+
+#include "foundation/PxSimpleTypes.h"
+#include "foundation/PxVec3.h"
+#include "PxvConfig.h"
+#include "PsVecMath.h"
+
+namespace physx
+{
+
+using namespace Ps::aos;
+
+namespace Sc
+{
+	class ShapeInteraction;
+}
+/**
+\brief A header to represent a friction patch for the solver.
+*/
+
+namespace Dy
+{
+
+struct SolverContactHeader
+{
+	enum DySolverContactFlags
+	{
+		eHAS_FORCE_THRESHOLDS = 0x1
+	};
+
+	PxU8	type;					//Note: mType should be first as the solver expects a type in the first byte.
+	PxU8	flags;	
+	PxU8	numNormalConstr;
+	PxU8	numFrictionConstr;					//4
+
+	PxReal	angDom0;							//8
+	PxReal	angDom1;							//12
+	PxReal	invMass0;							//16
+
+	Vec4V   staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W;		//32
+	//KS - minAppliedImpulseForFrictionW is non-zero only for articulations. This is a workaround for a case in articulations where
+	//the impulse is propagated such that many links do not apply friction because their normal forces were corrected by the solver in a previous
+	//link. This results in some links sliding unnaturally. This occurs with prismatic or revolute joints where the impulse propagation one one link
+	//resolves the normal constraint on all links
+	Vec4V	normal_minAppliedImpulseForFrictionW;							//48 
+																			
+
+	PxReal	invMass1;														//52
+	PxU32 broken;															//56
+	PxU8* frictionBrokenWritebackByte;										//60	64
+	Sc::ShapeInteraction* shapeInteraction;									//64	72
+#if PX_P64_FAMILY
+	PxU32	pad[2];															//64	80
+#endif // PX_X64
+
+
+	PX_FORCE_INLINE void setStaticFriction(const FloatV f)	{staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W=V4SetX(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W,f);}
+	PX_FORCE_INLINE void setDynamicFriction(const FloatV f)	{staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W=V4SetY(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W,f);}
+	PX_FORCE_INLINE void setDominance0(const FloatV f)		{staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W=V4SetZ(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W,f);}
+	PX_FORCE_INLINE void setDominance1(const FloatV f)		{staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W=V4SetW(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W,f);}
+
+	PX_FORCE_INLINE FloatV getStaticFriction() const		{return V4GetX(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W);}
+	PX_FORCE_INLINE FloatV getDynamicFriction() const		{return V4GetY(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W);}
+	PX_FORCE_INLINE FloatV getDominance0() const			{return V4GetZ(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W);}
+	PX_FORCE_INLINE FloatV getDominance1() const			{return V4GetW(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W);}
+
+	PX_FORCE_INLINE void setStaticFriction(PxF32 f)			{V4WriteX(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W, f);}
+	PX_FORCE_INLINE void setDynamicFriction(PxF32 f)		{V4WriteY(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W, f);}
+	PX_FORCE_INLINE void setDominance0(PxF32 f)				{V4WriteZ(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W, f);}
+	PX_FORCE_INLINE void setDominance1(PxF32 f)				{V4WriteW(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W, f);}
+
+	PX_FORCE_INLINE PxF32 getStaticFrictionPxF32() const	{return V4ReadX(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W);}
+	PX_FORCE_INLINE PxF32 getDynamicFrictionPxF32() const	{return V4ReadY(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W);}
+	PX_FORCE_INLINE PxF32 getDominance0PxF32() const		{return V4ReadZ(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W);}
+	PX_FORCE_INLINE PxF32 getDominance1PxF32() const		{return V4ReadW(staticFrictionX_dynamicFrictionY_dominance0Z_dominance1W);}
+}; 
+
+#if !PX_P64_FAMILY
+PX_COMPILE_TIME_ASSERT(sizeof(SolverContactHeader) == 64);
+#else
+PX_COMPILE_TIME_ASSERT(sizeof(SolverContactHeader) == 80);
+#endif
+
+/**
+\brief A single rigid body contact point for the solver.
+*/
+struct SolverContactPoint
+{
+	Vec3V raXn;
+	Vec3V rbXn;
+
+	PxF32 velMultiplier;
+	PxF32 biasedErr;
+	PxF32 unbiasedErr;
+	PxF32 maxImpulse;
+
+	PX_FORCE_INLINE FloatV getVelMultiplier() const			{return FLoad(velMultiplier);}
+
+	PX_FORCE_INLINE FloatV getBiasedErr() const				{return FLoad(biasedErr);}
+	PX_FORCE_INLINE FloatV getMaxImpulse() const			{return FLoad(maxImpulse);}
+
+	PX_FORCE_INLINE Vec3V getRaXn() const					{return raXn;}
+	PX_FORCE_INLINE Vec3V getRbXn() const					{return rbXn;}
+
+	PX_FORCE_INLINE void setRaXn(const PxVec3& v)			{V3WriteXYZ(raXn, v);}
+	PX_FORCE_INLINE void setRbXn(const PxVec3& v)			{V3WriteXYZ(rbXn, v);}
+	PX_FORCE_INLINE void setVelMultiplier(PxF32 f)			{velMultiplier = f;}
+
+	PX_FORCE_INLINE void setBiasedErr(PxF32 f)				{biasedErr = f;}
+	PX_FORCE_INLINE void setUnbiasedErr(PxF32 f)			{unbiasedErr = f;}
+
+	PX_FORCE_INLINE PxF32 getVelMultiplierPxF32() const		{return velMultiplier;}
+	PX_FORCE_INLINE const PxVec3& getRaXnPxVec3() const		{return V3ReadXYZ(raXn);}
+	PX_FORCE_INLINE const PxVec3& getRbXnPxVec3() const		{return V3ReadXYZ(rbXn);}
+	PX_FORCE_INLINE PxF32 getBiasedErrPxF32() const			{return biasedErr;}
+}; 
+
+
+PX_COMPILE_TIME_ASSERT(sizeof(SolverContactPoint) == 48);
+
+/**
+\brief A single extended articulation contact point for the solver.
+*/
+struct SolverContactPointExt : public SolverContactPoint
+{
+	Vec3V linDeltaVA;
+	Vec3V angDeltaVA;
+	Vec3V linDeltaVB;
+	Vec3V angDeltaVB;
+};
+
+PX_COMPILE_TIME_ASSERT(sizeof(SolverContactPointExt) == 112);
+
+
+/**
+\brief A single friction constraint for the solver.
+*/
+struct SolverContactFriction
+{
+	Vec4V normalXYZ_appliedForceW;		//16
+	Vec4V raXnXYZ_velMultiplierW;		//32
+	Vec4V rbXnXYZ_biasW;				//48
+	PxReal targetVel;					//52
+	PxU32 mPad[3];						//64
+
+	PX_FORCE_INLINE void setAppliedForce(const FloatV f)	{normalXYZ_appliedForceW=V4SetW(normalXYZ_appliedForceW,f);}
+	PX_FORCE_INLINE void setVelMultiplier(const FloatV f)	{raXnXYZ_velMultiplierW=V4SetW(raXnXYZ_velMultiplierW,f);}
+	PX_FORCE_INLINE void setBias(const FloatV f)			{rbXnXYZ_biasW=V4SetW(rbXnXYZ_biasW,f);}
+
+	PX_FORCE_INLINE FloatV getAppliedForce() const			{return V4GetW(normalXYZ_appliedForceW);}
+	PX_FORCE_INLINE FloatV getVelMultiplier() const			{return V4GetW(raXnXYZ_velMultiplierW);}
+	PX_FORCE_INLINE FloatV getBias() const					{return V4GetW(rbXnXYZ_biasW);}
+
+	PX_FORCE_INLINE Vec3V getNormal() const					{return Vec3V_From_Vec4V(normalXYZ_appliedForceW);}
+	PX_FORCE_INLINE Vec3V getRaXn() const					{return Vec3V_From_Vec4V(raXnXYZ_velMultiplierW);}
+	PX_FORCE_INLINE Vec3V getRbXn() const					{return Vec3V_From_Vec4V(rbXnXYZ_biasW);}
+
+	PX_FORCE_INLINE void setNormal(const PxVec3& v)			{V4WriteXYZ(normalXYZ_appliedForceW, v);}
+	PX_FORCE_INLINE void setRaXn(const PxVec3& v)			{V4WriteXYZ(raXnXYZ_velMultiplierW, v);}
+	PX_FORCE_INLINE void setRbXn(const PxVec3& v)			{V4WriteXYZ(rbXnXYZ_biasW, v);}
+
+	PX_FORCE_INLINE const PxVec3& getNormalPxVec3() const	{return V4ReadXYZ(normalXYZ_appliedForceW);}
+	PX_FORCE_INLINE const PxVec3& getRaXnPxVec3() const		{return V4ReadXYZ(raXnXYZ_velMultiplierW);}
+	PX_FORCE_INLINE const PxVec3& getRbXnPxVec3() const		{return V4ReadXYZ(rbXnXYZ_biasW);}
+
+	PX_FORCE_INLINE void setAppliedForce(PxF32 f)			{V4WriteW(normalXYZ_appliedForceW, f);}
+	PX_FORCE_INLINE void setVelMultiplier(PxF32 f)			{V4WriteW(raXnXYZ_velMultiplierW, f);}
+	PX_FORCE_INLINE void setBias(PxF32 f)					{V4WriteW(rbXnXYZ_biasW, f);}
+	
+	PX_FORCE_INLINE PxF32 getAppliedForcePxF32() const		{return V4ReadW(normalXYZ_appliedForceW);}
+	PX_FORCE_INLINE PxF32 getVelMultiplierPxF32() const		{return V4ReadW(raXnXYZ_velMultiplierW);}
+	PX_FORCE_INLINE PxF32 getBiasPxF32() const				{return V4ReadW(rbXnXYZ_biasW);}
+
+}; 
+
+PX_COMPILE_TIME_ASSERT(sizeof(SolverContactFriction) == 64);
+
+/**
+\brief A single extended articulation friction constraint for the solver.
+*/
+struct SolverContactFrictionExt : public SolverContactFriction
+{
+	Vec3V linDeltaVA;
+	Vec3V angDeltaVA;
+	Vec3V linDeltaVB;
+	Vec3V angDeltaVB;
+};
+PX_COMPILE_TIME_ASSERT(sizeof(SolverContactFrictionExt) == 128);
+
+}
+
+}
+
+
+
+#endif //DY_SOLVERCONTACT_H
--- a/physx/source/lowleveldynamics/src/DySolverContact4.h
+++ b/physx/source/lowleveldynamics/src/DySolverContact4.h
@ -0,0 +1,179 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+#ifndef DY_SOLVERCONTACT4_H
+#define DY_SOLVERCONTACT4_H
+
+#include "foundation/PxSimpleTypes.h"
+#include "foundation/PxVec3.h"
+#include "PxvConfig.h"
+#include "PsVecMath.h"
+#include "DySolverContact.h"
+
+namespace physx
+{
+
+struct PxcNpWorkUnit;
+struct PxSolverBody;
+struct PxSolverBodyData;
+struct PxSolverConstraintDesc;
+
+namespace Sc
+{
+	class ShapeInteraction;
+}
+	
+namespace Dy
+{
+
+
+
+
+/**
+\brief Batched SOA contact data. Note, we don't support batching with extended contacts for the simple reason that handling multiple articulations would be complex.
+*/
+struct SolverContactHeader4
+{
+	enum
+	{
+		eHAS_MAX_IMPULSE = 1 << 0,
+		eHAS_TARGET_VELOCITY = 1 << 1
+	};
+
+	PxU8	type;					//Note: mType should be first as the solver expects a type in the first byte.
+	PxU8	numNormalConstr;
+	PxU8	numFrictionConstr;
+	PxU8	flag;
+
+	PxU8	flags[4];
+	//These counts are the max of the 4 sets of data.
+	//When certain pairs have fewer patches/contacts than others, they are padded with 0s so that no work is performed but 
+	//calculations are still shared (afterall, they're computationally free because we're doing 4 things at a time in SIMD)
+
+	//KS - used for write-back only
+	PxU8	numNormalConstr0, numNormalConstr1, numNormalConstr2, numNormalConstr3;
+	PxU8	numFrictionConstr0, numFrictionConstr1, numFrictionConstr2, numFrictionConstr3;			
+
+	Vec4V	restitution;																			
+	Vec4V   staticFriction;
+	Vec4V	dynamicFriction;
+	//Technically, these mass properties could be pulled out into a new structure and shared. For multi-manifold contacts,
+	//this would save 64 bytes per-manifold after the cost of the first manifold
+	Vec4V	invMass0D0;
+	Vec4V	invMass1D1;
+	Vec4V	angDom0;
+	Vec4V	angDom1;
+	//Normal is shared between all contacts in the batch. This will save some memory!
+	Vec4V normalX;
+	Vec4V normalY;
+	Vec4V normalZ;
+
+	Sc::ShapeInteraction* shapeInteraction[4];		//192 or 208
+}; 
+
+#if !PX_P64_FAMILY
+PX_COMPILE_TIME_ASSERT(sizeof(SolverContactHeader4) == 192);
+#else
+PX_COMPILE_TIME_ASSERT(sizeof(SolverContactHeader4) == 208);
+#endif
+
+
+/**
+\brief This represents a batch of 4 contacts with static rolled into a single structure
+*/
+struct SolverContactBatchPointBase4
+{
+	Vec4V raXnX;
+	Vec4V raXnY;
+	Vec4V raXnZ;
+	Vec4V velMultiplier;
+	Vec4V scaledBias;
+	Vec4V biasedErr;
+};
+PX_COMPILE_TIME_ASSERT(sizeof(SolverContactBatchPointBase4) == 96);
+
+/**
+\brief Contains the additional data required to represent 4 contacts between 2 dynamic bodies
+@see SolverContactBatchPointBase4
+*/
+struct SolverContactBatchPointDynamic4 : public SolverContactBatchPointBase4
+{	
+	Vec4V rbXnX;
+	Vec4V rbXnY;
+	Vec4V rbXnZ;
+}; 
+PX_COMPILE_TIME_ASSERT(sizeof(SolverContactBatchPointDynamic4) == 144);
+
+/**
+\brief This represents the shared information of a batch of 4 friction constraints
+*/
+struct SolverFrictionSharedData4
+{
+	BoolV broken;
+	PxU8* frictionBrokenWritebackByte[4];
+	Vec4V normalX[2];
+	Vec4V normalY[2];
+	Vec4V normalZ[2];
+};
+#if !PX_P64_FAMILY
+PX_COMPILE_TIME_ASSERT(sizeof(SolverFrictionSharedData4) == 128);
+#endif
+
+
+/**
+\brief This represents a batch of 4 friction constraints with static rolled into a single structure
+*/
+struct SolverContactFrictionBase4
+{
+	Vec4V raXnX;
+	Vec4V raXnY;
+	Vec4V raXnZ;
+	Vec4V scaledBias;
+	Vec4V velMultiplier;
+	Vec4V targetVelocity;
+};
+PX_COMPILE_TIME_ASSERT(sizeof(SolverContactFrictionBase4) == 96);
+
+/**
+\brief Contains the additional data required to represent 4 friction constraints between 2 dynamic bodies
+@see SolverContactFrictionBase4
+*/
+struct SolverContactFrictionDynamic4 : public SolverContactFrictionBase4
+{
+	Vec4V rbXnX;
+	Vec4V rbXnY;
+	Vec4V rbXnZ;
+}; 
+PX_COMPILE_TIME_ASSERT(sizeof(SolverContactFrictionDynamic4) == 144);
+
+}
+
+}
+
+#endif //DY_SOLVERCONTACT4_H
--- a/physx/source/lowleveldynamics/src/DySolverContactPF.h
+++ b/physx/source/lowleveldynamics/src/DySolverContactPF.h
@ -0,0 +1,123 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+
+#ifndef DY_SOLVERCONTACTPF_H
+#define DY_SOLVERCONTACTPF_H
+
+#include "foundation/PxSimpleTypes.h"
+#include "foundation/PxVec3.h"
+#include "PxvConfig.h"
+#include "PsVecMath.h"
+
+namespace physx
+{
+
+using namespace Ps::aos;
+
+namespace Dy
+{
+
+struct SolverContactCoulombHeader
+{
+	PxU8	type;					//Note: mType should be first as the solver expects a type in the first byte.
+	PxU8	numNormalConstr;
+	PxU16	frictionOffset;					//4
+	//PxF32	restitution;
+	PxF32	angDom0;						//8
+	PxF32	dominance0;						//12
+	PxF32	dominance1;						//16
+	PX_ALIGN(16, PxVec3	normalXYZ);			//28
+	PxF32	angDom1;						//32
+	
+	Sc::ShapeInteraction* shapeInteraction;		//36	40
+	PxU8	flags;								//37	41
+	PxU8	pad0[3];							//40	44
+#if !PX_P64_FAMILY
+	PxU32	pad1[2];							//48	
+#else
+	PxU32 pad1;									//		48
+#endif
+	
+	
+	
+	PX_FORCE_INLINE void setDominance0(const FloatV f)		{FStore(f, &dominance0);}
+	PX_FORCE_INLINE void setDominance1(const FloatV f)		{FStore(f, &dominance1);}
+	PX_FORCE_INLINE void setNormal(const Vec3V n)			{V3StoreA(n, normalXYZ);}
+	
+	PX_FORCE_INLINE FloatV getDominance0() const			{return FLoad(dominance0);}
+	PX_FORCE_INLINE FloatV getDominance1() const			{return FLoad(dominance1);}
+	//PX_FORCE_INLINE FloatV getRestitution() const			{return FLoad(restitution);}
+	PX_FORCE_INLINE	Vec3V getNormal()const					{return V3LoadA(normalXYZ);}
+
+  
+	PX_FORCE_INLINE void setDominance0(PxF32 f)				{ dominance0 = f; }
+	PX_FORCE_INLINE void setDominance1(PxF32 f)				{ dominance1 = f;}
+	//PX_FORCE_INLINE void setRestitution(PxF32 f)			{ restitution = f;}
+
+	PX_FORCE_INLINE PxF32 getDominance0PxF32() const		{return dominance0;}
+	PX_FORCE_INLINE PxF32 getDominance1PxF32() const		{return dominance1;}
+	//PX_FORCE_INLINE PxF32 getRestitutionPxF32() const		{return restitution;}
+
+}; 
+PX_COMPILE_TIME_ASSERT(sizeof(SolverContactCoulombHeader) == 48);
+
+struct SolverFrictionHeader
+{
+	PxU8	type;					//Note: mType should be first as the solver expects a type in the first byte.
+	PxU8	numNormalConstr;
+	PxU8	numFrictionConstr;
+	PxU8	flags;
+	PxF32   staticFriction;
+	PxF32   invMass0D0;
+	PxF32	invMass1D1;
+	PxF32	angDom0;
+	PxF32	angDom1;
+	PxU32	pad2[2];
+
+	PX_FORCE_INLINE void setStaticFriction(const FloatV f)	{FStore(f, &staticFriction);}
+	
+	PX_FORCE_INLINE FloatV getStaticFriction() const		{return FLoad(staticFriction);}
+	
+	PX_FORCE_INLINE void setStaticFriction(PxF32 f)			{staticFriction = f;}
+
+	PX_FORCE_INLINE PxF32 getStaticFrictionPxF32() const	{return staticFriction;}	
+
+	PX_FORCE_INLINE PxU32 getAppliedForcePaddingSize() const {return sizeof(PxU32)*((4 * ((numNormalConstr + 3)/4)));}
+	static PX_FORCE_INLINE PxU32 getAppliedForcePaddingSize(const PxU32 numConstr) {return sizeof(PxU32)*((4 * ((numConstr + 3)/4)));}
+}; 
+
+PX_COMPILE_TIME_ASSERT(sizeof(SolverFrictionHeader) == 32);
+
+}
+
+}
+
+#endif //DY_SOLVERCONTACTPF_H
--- a/physx/source/lowleveldynamics/src/DySolverContactPF4.h
+++ b/physx/source/lowleveldynamics/src/DySolverContactPF4.h
@ -0,0 +1,155 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+#ifndef DY_SOLVER_CONTACT_PF_4_H
+#define DY_SOLVER_CONTACT_PF_4_H
+
+#include "foundation/PxSimpleTypes.h"
+#include "foundation/PxVec3.h"
+#include "PxvConfig.h"
+#include "PsVecMath.h"
+
+namespace physx
+{
+
+using namespace Ps::aos;
+
+namespace Sc
+{
+	class ShapeInteraction;
+}
+
+namespace Dy
+{
+
+struct SolverContactCoulombHeader4
+{
+	PxU8	type;					//Note: mType should be first as the solver expects a type in the first byte.
+	PxU8	numNormalConstr;
+	PxU16	frictionOffset;
+	PxU8	numNormalConstr0, numNormalConstr1, numNormalConstr2, numNormalConstr3;
+	PxU8	flags[4];
+	PxU32	pad;					//16
+	Vec4V	restitution;			//32
+	Vec4V	normalX;				//48
+	Vec4V	normalY;				//64
+	Vec4V	normalZ;				//80
+	Vec4V	invMassADom;			//96
+	Vec4V	invMassBDom;			//112
+	Vec4V	angD0;					//128
+	Vec4V	angD1;					//144
+	Sc::ShapeInteraction* shapeInteraction[4];		//160	or 176
+}; 
+
+#if !PX_P64_FAMILY
+PX_COMPILE_TIME_ASSERT(sizeof(SolverContactCoulombHeader4) == 160);
+#else
+PX_COMPILE_TIME_ASSERT(sizeof(SolverContactCoulombHeader4) == 176);
+#endif
+
+struct SolverContact4Base
+{
+	Vec4V raXnX;
+	Vec4V raXnY;
+	Vec4V raXnZ;
+	Vec4V appliedForce;
+	Vec4V velMultiplier;
+	Vec4V targetVelocity;
+	Vec4V scaledBias;
+	Vec4V maxImpulse;
+};
+
+PX_COMPILE_TIME_ASSERT(sizeof(SolverContact4Base) == 128);
+
+struct SolverContact4Dynamic : public SolverContact4Base
+{
+	Vec4V rbXnX;
+	Vec4V rbXnY;
+	Vec4V rbXnZ;
+};
+
+PX_COMPILE_TIME_ASSERT(sizeof(SolverContact4Dynamic) == 176);
+
+struct SolverFrictionHeader4
+{
+	PxU8	type;					//Note: mType should be first as the solver expects a type in the first byte.
+	PxU8	numNormalConstr;
+	PxU8	numFrictionConstr;
+	PxU8	numNormalConstr0;
+	PxU8	numNormalConstr1;
+	PxU8	numNormalConstr2;
+	PxU8	numNormalConstr3;
+	PxU8	numFrictionConstr0;
+	PxU8	numFrictionConstr1;
+	PxU8	numFrictionConstr2;
+	PxU8	numFrictionConstr3;
+	PxU8	pad0;
+	PxU32	frictionPerContact;
+
+	Vec4V	staticFriction;
+	Vec4V   invMassADom;
+	Vec4V   invMassBDom;
+	Vec4V	angD0;
+	Vec4V	angD1;
+};
+
+PX_COMPILE_TIME_ASSERT(sizeof(SolverFrictionHeader4) == 96);
+
+struct SolverFriction4Base
+{
+	Vec4V normalX;
+	Vec4V normalY;
+	Vec4V normalZ;
+	Vec4V raXnX;
+	Vec4V raXnY;
+	Vec4V raXnZ;
+	Vec4V appliedForce;
+	Vec4V velMultiplier;
+	Vec4V targetVelocity;
+};
+
+PX_COMPILE_TIME_ASSERT(sizeof(SolverFriction4Base) == 144);
+
+struct SolverFriction4Dynamic : public SolverFriction4Base
+{
+	Vec4V rbXnX;
+	Vec4V rbXnY;
+	Vec4V rbXnZ;
+};
+
+PX_COMPILE_TIME_ASSERT(sizeof(SolverFriction4Dynamic) == 192);
+
+}
+
+}
+
+
+
+#endif //DY_SOLVER_CONTACT_PF_4_H
+
--- a/physx/source/lowleveldynamics/src/DySolverContext.h
+++ b/physx/source/lowleveldynamics/src/DySolverContext.h
@ -0,0 +1,66 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef DY_SOLVERCONTEXT_H
+#define DY_SOLVERCONTEXT_H
+
+namespace physx
+{
+	struct PxSolverBodyData;
+
+namespace Dy
+{
+	struct ThresholdStreamElement;
+	
+
+struct SolverContext
+{
+	bool doFriction;
+	bool writeBackIteration;
+
+	// for threshold stream output
+	ThresholdStreamElement*					mThresholdStream;
+	PxU32									mThresholdStreamIndex;
+	PxU32									mThresholdStreamLength;
+	PxSolverBodyData*						solverBodyArray;
+
+	ThresholdStreamElement* PX_RESTRICT		mSharedThresholdStream;
+	PxU32									mSharedThresholdStreamLength;
+	PxI32*									mSharedOutThresholdPairs;
+	Cm::SpatialVectorF*						Z;
+	Cm::SpatialVectorF*						deltaV;
+
+};
+
+}
+
+}
+
+#endif //DY_SOLVERCONTEXT_H
--- a/physx/source/lowleveldynamics/src/DySolverControl.cpp
+++ b/physx/source/lowleveldynamics/src/DySolverControl.cpp
@ -0,0 +1,753 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+#include "foundation/PxPreprocessor.h"
+
+#include "PsAllocator.h"
+#include <new>
+#include <stdio.h>
+#include "CmPhysXCommon.h"
+#include "DySolverBody.h"
+#include "DySolverConstraint1D.h"
+#include "DySolverContact.h"
+#include "DyThresholdTable.h"
+#include "DySolverControl.h"
+#include "DyArticulationHelper.h"
+#include "PsAtomic.h"
+#include "PsIntrinsics.h"
+#include "DyArticulationPImpl.h"
+#include "PsThread.h"
+#include "DySolverConstraintDesc.h"
+#include "DySolverContext.h"
+
+
+namespace physx
+{
+
+namespace Dy
+{
+
+//-----------------------------------
+
+void solve1DBlock					(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveContactBlock				(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveExtContactBlock			(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveExt1DBlock				(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveContact_BStaticBlock		(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveContactPreBlock			(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveContactPreBlock_Static	(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solve1D4_Block					(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+
+
+void solve1DConcludeBlock				(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveContactConcludeBlock			(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveExtContactConcludeBlock		(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveExt1DConcludeBlock			(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveContact_BStaticConcludeBlock	(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveContactPreBlock_Conclude		(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveContactPreBlock_ConcludeStatic(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solve1D4Block_Conclude				(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+
+void solve1DBlockWriteBack				(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveContactBlockWriteBack			(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveExtContactBlockWriteBack		(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveExt1DBlockWriteBack			(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveContact_BStaticBlockWriteBack	(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveContactPreBlock_WriteBack		(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveContactPreBlock_WriteBackStatic(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solve1D4Block_WriteBack			(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+
+void writeBack1DBlock				(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void contactBlockWriteBack			(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void extContactBlockWriteBack		(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void ext1DBlockWriteBack			(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void contactPreBlock_WriteBack		(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void writeBack1D4Block				(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+
+// could move this to PxPreprocessor.h but 
+// no implementation available for MSVC
+#if PX_GCC_FAMILY
+#define PX_UNUSED_ATTRIBUTE __attribute__((unused))
+#else
+#define PX_UNUSED_ATTRIBUTE 
+#endif
+ 
+#define DYNAMIC_ARTICULATION_REGISTRATION(x) 0
+
+static SolveBlockMethod gVTableSolveBlock[] PX_UNUSED_ATTRIBUTE = 
+{
+	0,
+	solveContactBlock,														// DY_SC_TYPE_RB_CONTACT
+	solve1DBlock,															// DY_SC_TYPE_RB_1D
+	DYNAMIC_ARTICULATION_REGISTRATION(solveExtContactBlock),				// DY_SC_TYPE_EXT_CONTACT
+	DYNAMIC_ARTICULATION_REGISTRATION(solveExt1DBlock),						// DY_SC_TYPE_EXT_1D
+	solveContact_BStaticBlock,												// DY_SC_TYPE_STATIC_CONTACT
+	solveContactBlock,														// DY_SC_TYPE_NOFRICTION_RB_CONTACT
+	solveContactPreBlock,													// DY_SC_TYPE_BLOCK_RB_CONTACT
+	solveContactPreBlock_Static,											// DY_SC_TYPE_BLOCK_STATIC_RB_CONTACT
+	solve1D4_Block,															// DY_SC_TYPE_BLOCK_1D,
+};
+
+static SolveWriteBackBlockMethod gVTableSolveWriteBackBlock[] PX_UNUSED_ATTRIBUTE = 
+{
+	0,
+	solveContactBlockWriteBack,												// DY_SC_TYPE_RB_CONTACT
+	solve1DBlockWriteBack,													// DY_SC_TYPE_RB_1D
+	DYNAMIC_ARTICULATION_REGISTRATION(solveExtContactBlockWriteBack),		// DY_SC_TYPE_EXT_CONTACT
+	DYNAMIC_ARTICULATION_REGISTRATION(solveExt1DBlockWriteBack),			// DY_SC_TYPE_EXT_1D
+	solveContact_BStaticBlockWriteBack,										// DY_SC_TYPE_STATIC_CONTACT
+	solveContactBlockWriteBack,												// DY_SC_TYPE_NOFRICTION_RB_CONTACT
+	solveContactPreBlock_WriteBack,											// DY_SC_TYPE_BLOCK_RB_CONTACT
+	solveContactPreBlock_WriteBackStatic,									// DY_SC_TYPE_BLOCK_STATIC_RB_CONTACT
+	solve1D4Block_WriteBack,												// DY_SC_TYPE_BLOCK_1D,
+};
+
+static SolveBlockMethod gVTableSolveConcludeBlock[] PX_UNUSED_ATTRIBUTE = 
+{
+	0,
+	solveContactConcludeBlock,												// DY_SC_TYPE_RB_CONTACT
+	solve1DConcludeBlock,													// DY_SC_TYPE_RB_1D
+	DYNAMIC_ARTICULATION_REGISTRATION(solveExtContactConcludeBlock),		// DY_SC_TYPE_EXT_CONTACT
+	DYNAMIC_ARTICULATION_REGISTRATION(solveExt1DConcludeBlock),				// DY_SC_TYPE_EXT_1D
+	solveContact_BStaticConcludeBlock,										// DY_SC_TYPE_STATIC_CONTACT
+	solveContactConcludeBlock,												// DY_SC_TYPE_NOFRICTION_RB_CONTACT
+	solveContactPreBlock_Conclude,											// DY_SC_TYPE_BLOCK_RB_CONTACT
+	solveContactPreBlock_ConcludeStatic,									// DY_SC_TYPE_BLOCK_STATIC_RB_CONTACT
+	solve1D4Block_Conclude,													// DY_SC_TYPE_BLOCK_1D,
+};
+
+void SolverCoreRegisterArticulationFns()
+{
+	gVTableSolveBlock[DY_SC_TYPE_EXT_CONTACT] = solveExtContactBlock;
+	gVTableSolveBlock[DY_SC_TYPE_EXT_1D] = solveExt1DBlock;
+
+	gVTableSolveWriteBackBlock[DY_SC_TYPE_EXT_CONTACT] = solveExtContactBlockWriteBack;
+	gVTableSolveWriteBackBlock[DY_SC_TYPE_EXT_1D] = solveExt1DBlockWriteBack;
+	gVTableSolveConcludeBlock[DY_SC_TYPE_EXT_CONTACT] = solveExtContactConcludeBlock;
+	gVTableSolveConcludeBlock[DY_SC_TYPE_EXT_1D] = solveExt1DConcludeBlock;
+}
+
+SolveBlockMethod* getSolveBlockTable()
+{
+	return gVTableSolveBlock;
+}
+
+SolveBlockMethod* getSolverConcludeBlockTable()
+{
+	return gVTableSolveConcludeBlock;
+}
+
+SolveWriteBackBlockMethod* getSolveWritebackBlockTable()
+{
+	return gVTableSolveWriteBackBlock;
+}
+
+SolverCoreGeneral* SolverCoreGeneral::create(bool fricEveryIteration)
+{
+	SolverCoreGeneral* scg = reinterpret_cast<SolverCoreGeneral*>(
+		PX_ALLOC(sizeof(SolverCoreGeneral), "SolverCoreGeneral"));
+
+	if (scg)
+	{
+		new (scg) SolverCoreGeneral;
+		scg->frictionEveryIteration = fricEveryIteration;
+	}
+
+	return scg;
+}
+
+void SolverCoreGeneral::destroyV()
+{
+	this->~SolverCoreGeneral();
+	PX_FREE(this);
+}
+
+void SolverCoreGeneral::solveV_Blocks(SolverIslandParams& params) const
+{
+	const PxI32 TempThresholdStreamSize = 32;
+	ThresholdStreamElement tempThresholdStream[TempThresholdStreamSize];
+
+	SolverContext cache;
+	cache.solverBodyArray			= params.bodyDataList;
+	cache.mThresholdStream			= tempThresholdStream;
+	cache.mThresholdStreamLength	= TempThresholdStreamSize;
+	cache.mThresholdStreamIndex		= 0;
+	cache.writeBackIteration		= false;
+	cache.Z							= params.Z;
+	cache.deltaV					= params.deltaV;
+
+	const PxI32 batchCount = PxI32(params.numConstraintHeaders);
+
+	PxSolverBody* PX_RESTRICT bodyListStart = params.bodyListStart;
+	const PxU32 bodyListSize = params.bodyListSize;
+
+	Cm::SpatialVector* PX_RESTRICT motionVelocityArray = params.motionVelocityArray;
+
+	const PxU32 velocityIterations = params.velocityIterations;
+	const PxU32 positionIterations = params.positionIterations;
+
+	const PxU32 numConstraintHeaders = params.numConstraintHeaders;
+	const PxU32 articulationListSize = params.articulationListSize;
+
+	ArticulationSolverDesc* PX_RESTRICT articulationListStart = params.articulationListStart;
+
+	PX_ASSERT(velocityIterations >= 1);
+	PX_ASSERT(positionIterations >= 1);
+
+	if(numConstraintHeaders == 0)
+	{
+		for (PxU32 baIdx = 0; baIdx < bodyListSize; baIdx++)
+		{
+			Cm::SpatialVector& motionVel = motionVelocityArray[baIdx];
+			const PxSolverBody& atom = bodyListStart[baIdx];
+
+			motionVel.linear = atom.linearVelocity;
+			motionVel.angular = atom.angularState;
+		}
+
+		//Even thought there are no external constraints, there may still be internal constraints in the articulations...
+		for(PxU32 i = 0; i < positionIterations; ++i)
+			for (PxU32 j = 0; j < articulationListSize; ++j)
+				articulationListStart[j].articulation->solveInternalConstraints(params.dt, params.invDt, cache.Z, cache.deltaV, false, false, 0.f);
+
+		for (PxU32 i = 0; i < articulationListSize; i++)
+			ArticulationPImpl::saveVelocity(articulationListStart[i], cache.deltaV);
+
+		for (PxU32 i = 0; i < velocityIterations; ++i)
+			for (PxU32 j = 0; j < articulationListSize; ++j)
+				articulationListStart[j].articulation->solveInternalConstraints(params.dt, params.invDt, cache.Z, cache.deltaV, true, false, 0.f);
+
+		for (PxU32 j = 0; j < articulationListSize; ++j)
+			articulationListStart[j].articulation->writebackInternalConstraints(false);
+
+		return;
+	}
+
+	BatchIterator contactIterator(params.constraintBatchHeaders, params.numConstraintHeaders);
+
+	PxSolverConstraintDesc* PX_RESTRICT constraintList = params.constraintList;
+
+	//0-(n-1) iterations
+	PxI32 normalIter = 0;
+
+	for (PxU32 iteration = positionIterations; iteration > 0; iteration--)	//decreasing positive numbers == position iters
+	{
+		cache.doFriction = this->frictionEveryIteration ? true : iteration <= 3;
+
+		SolveBlockParallel(constraintList, batchCount, normalIter * batchCount, batchCount, 
+			cache, contactIterator, iteration == 1 ? gVTableSolveConcludeBlock : gVTableSolveBlock, normalIter);
+
+		for (PxU32 i = 0; i < articulationListSize; ++i)
+			articulationListStart[i].articulation->solveInternalConstraints(params.dt, params.invDt, cache.Z, cache.deltaV, false, false, 0.f);
+
+		++normalIter;
+	}
+
+	for (PxU32 baIdx = 0; baIdx < bodyListSize; baIdx++)
+	{
+		const PxSolverBody& atom = bodyListStart[baIdx];
+		Cm::SpatialVector& motionVel = motionVelocityArray[baIdx];
+		motionVel.linear = atom.linearVelocity;
+		motionVel.angular = atom.angularState;
+	}
+	
+	for (PxU32 i = 0; i < articulationListSize; i++)
+		ArticulationPImpl::saveVelocity(articulationListStart[i], cache.deltaV);
+
+	const PxI32 velItersMinOne = (PxI32(velocityIterations)) - 1;
+
+	PxI32 iteration = 0;
+
+	for(; iteration < velItersMinOne; ++iteration)
+	{
+		SolveBlockParallel(constraintList, batchCount, normalIter * batchCount, batchCount, 
+			cache, contactIterator, gVTableSolveBlock, normalIter);
+
+		for (PxU32 i = 0; i < articulationListSize; ++i)
+			articulationListStart[i].articulation->solveInternalConstraints(params.dt, params.invDt, cache.Z, cache.deltaV, true, false, 0.f);
+		++normalIter;
+	}
+
+	PxI32* outThresholdPairs = params.outThresholdPairs;
+	ThresholdStreamElement* PX_RESTRICT thresholdStream = params.thresholdStream;
+	PxU32 thresholdStreamLength = params.thresholdStreamLength;
+
+	cache.writeBackIteration = true;
+	cache.mSharedThresholdStream = thresholdStream;
+	cache.mSharedThresholdStreamLength = thresholdStreamLength;
+	cache.mSharedOutThresholdPairs = outThresholdPairs;
+	//PGS solver always runs at least one velocity iteration (otherwise writeback won't happen)
+	{
+		SolveBlockParallel(constraintList, batchCount, normalIter * batchCount, batchCount, 
+			cache, contactIterator, gVTableSolveWriteBackBlock, normalIter);
+
+		for (PxU32 i = 0; i < articulationListSize; ++i)
+		{
+			articulationListStart[i].articulation->solveInternalConstraints(params.dt, params.invDt, cache.Z, cache.deltaV, true, false, 0.f);
+			articulationListStart[i].articulation->writebackInternalConstraints(false);
+		}
+
+		++normalIter;
+	}
+
+	//Write back remaining threshold streams
+	if(cache.mThresholdStreamIndex > 0)
+	{
+		//Write back to global buffer
+		PxI32 threshIndex = physx::shdfnd::atomicAdd(outThresholdPairs, PxI32(cache.mThresholdStreamIndex)) - PxI32(cache.mThresholdStreamIndex);
+		for(PxU32 b = 0; b < cache.mThresholdStreamIndex; ++b)
+		{
+			thresholdStream[b + threshIndex] = cache.mThresholdStream[b];
+		}
+		cache.mThresholdStreamIndex = 0;
+	}
+}
+
+PxI32 SolverCoreGeneral::solveVParallelAndWriteBack
+(SolverIslandParams& params, Cm::SpatialVectorF* Z, Cm::SpatialVectorF* deltaV) const
+{
+#if PX_PROFILE_SOLVE_STALLS
+	PxU64 startTime = readTimer();
+
+	PxU64 stallCount = 0;
+#endif
+
+	SolverContext cache;
+	cache.solverBodyArray = params.bodyDataList;
+	const PxU32 batchSize = params.batchSize;
+
+	const PxI32 UnrollCount = PxI32(batchSize);
+	const PxI32 ArticCount = 2;
+	const PxI32 SaveUnrollCount = 32;
+
+	const PxI32 TempThresholdStreamSize = 32;
+	ThresholdStreamElement tempThresholdStream[TempThresholdStreamSize];
+
+	const PxI32 bodyListSize = PxI32(params.bodyListSize);
+	const PxI32 articulationListSize = PxI32(params.articulationListSize);
+
+	const PxI32 batchCount = PxI32(params.numConstraintHeaders);
+	cache.mThresholdStream = tempThresholdStream;
+	cache.mThresholdStreamLength = TempThresholdStreamSize;
+	cache.mThresholdStreamIndex = 0;
+	cache.writeBackIteration = false;
+	cache.Z = Z;
+	cache.deltaV = deltaV;
+
+	const PxReal dt = params.dt;
+	const PxReal invDt = params.invDt;
+
+	const PxI32 positionIterations = PxI32(params.positionIterations);
+	const PxI32 velocityIterations = PxI32(params.velocityIterations);
+
+	PxI32* constraintIndex = &params.constraintIndex;
+	PxI32* constraintIndex2 = &params.constraintIndex2;
+
+	PxI32* articIndex = &params.articSolveIndex;
+	PxI32* articIndex2 = &params.articSolveIndex2;
+
+	PxSolverConstraintDesc* PX_RESTRICT constraintList = params.constraintList;
+
+	ArticulationSolverDesc* PX_RESTRICT articulationListStart = params.articulationListStart;
+
+	const PxU32 nbPartitions = params.nbPartitions;	
+
+	PxU32* headersPerPartition = params.headersPerPartition;
+
+	PX_UNUSED(velocityIterations);
+
+	PX_ASSERT(velocityIterations >= 1);
+	PX_ASSERT(positionIterations >= 1);
+
+	PxI32 endIndexCount = UnrollCount;
+	PxI32 index = physx::shdfnd::atomicAdd(constraintIndex, UnrollCount) - UnrollCount;
+
+	PxI32 articSolveStart = 0;
+	PxI32 articSolveEnd = 0;
+	PxI32 maxArticIndex = 0;
+	PxI32 articIndexCounter = 0;
+	
+	BatchIterator contactIter(params.constraintBatchHeaders, params.numConstraintHeaders);
+
+	PxI32 maxNormalIndex = 0;
+	PxI32 normalIteration = 0;
+	PxU32 a = 0;
+	PxI32 targetConstraintIndex = 0;
+	PxI32 targetArticIndex = 0;
+	
+	for(PxU32 i = 0; i < 2; ++i)
+	{
+		SolveBlockMethod* solveTable = i == 0 ? gVTableSolveBlock : gVTableSolveConcludeBlock;
+		for(; a < positionIterations - 1 + i; ++a)
+		{
+			WAIT_FOR_PROGRESS(articIndex2, targetArticIndex);
+
+			cache.doFriction = this->frictionEveryIteration ? true : (positionIterations - a) <= 3;
+			for(PxU32 b = 0; b < nbPartitions; ++b)
+			{
+				WAIT_FOR_PROGRESS(constraintIndex2, targetConstraintIndex);
+
+				maxNormalIndex += headersPerPartition[b];
+				
+				PxI32 nbSolved = 0;
+				while(index < maxNormalIndex)
+				{
+					const PxI32 remainder = PxMin(maxNormalIndex - index, endIndexCount);
+					SolveBlockParallel(constraintList, remainder, index, batchCount, cache, contactIter, solveTable, 
+						normalIteration);
+					index += remainder;
+					endIndexCount -= remainder;
+					nbSolved += remainder;
+					if(endIndexCount == 0)
+					{
+						endIndexCount = UnrollCount;
+						index = physx::shdfnd::atomicAdd(constraintIndex, UnrollCount) - UnrollCount;
+					}
+				}
+				if(nbSolved)
+				{
+					Ps::memoryBarrier();
+					physx::shdfnd::atomicAdd(constraintIndex2, nbSolved);
+				}
+				targetConstraintIndex += headersPerPartition[b]; //Increment target constraint index by batch count
+			}
+
+			WAIT_FOR_PROGRESS(constraintIndex2, targetConstraintIndex);
+
+			maxArticIndex += articulationListSize;
+			targetArticIndex += articulationListSize;
+
+			while (articSolveStart < maxArticIndex)
+			{
+				const PxI32 endIdx = PxMin(articSolveEnd, maxArticIndex);
+
+				PxI32 nbSolved = 0;
+				while (articSolveStart < endIdx)
+				{
+					articulationListStart[articSolveStart - articIndexCounter].articulation->solveInternalConstraints(dt, invDt, cache.Z, cache.deltaV, false, false, 0.f);
+					articSolveStart++;
+					nbSolved++;
+				}
+
+				if (nbSolved)
+				{
+					physx::shdfnd::atomicAdd(articIndex2, nbSolved);
+				}
+
+				const PxI32 remaining = articSolveEnd - articSolveStart;
+
+				if (remaining == 0)
+				{
+					articSolveStart = physx::shdfnd::atomicAdd(articIndex, ArticCount) - ArticCount;
+					articSolveEnd = articSolveStart + ArticCount;
+				}
+			}
+
+			articIndexCounter += articulationListSize;
+
+			++normalIteration;
+		}
+	}
+
+	PxI32* bodyListIndex = &params.bodyListIndex;
+	PxI32* bodyListIndex2 = &params.bodyListIndex2;
+
+	PxSolverBody* PX_RESTRICT bodyListStart = params.bodyListStart;
+	Cm::SpatialVector* PX_RESTRICT motionVelocityArray = params.motionVelocityArray;
+
+	//Save velocity - articulated
+	PxI32 endIndexCount2 = SaveUnrollCount;
+	PxI32 index2 = physx::shdfnd::atomicAdd(bodyListIndex, SaveUnrollCount) - SaveUnrollCount;
+	{
+		WAIT_FOR_PROGRESS(articIndex2, targetArticIndex);
+		WAIT_FOR_PROGRESS(constraintIndex2, targetConstraintIndex);
+		PxI32 nbConcluded = 0;
+		while(index2 < articulationListSize)
+		{
+			const PxI32 remainder = PxMin(SaveUnrollCount, (articulationListSize - index2));
+			endIndexCount2 -= remainder;
+			for(PxI32 b = 0; b < remainder; ++b, ++index2)
+			{
+				ArticulationPImpl::saveVelocity(articulationListStart[index2], cache.deltaV);
+			}
+			if(endIndexCount2 == 0)
+			{
+				index2 = physx::shdfnd::atomicAdd(bodyListIndex, SaveUnrollCount) - SaveUnrollCount;
+				endIndexCount2 = SaveUnrollCount;
+			}
+			nbConcluded += remainder;
+		}
+
+		index2 -= articulationListSize;
+
+		//save velocity
+		
+		while(index2 < bodyListSize)
+		{
+			const PxI32 remainder = PxMin(endIndexCount2, (bodyListSize - index2));
+			endIndexCount2 -= remainder;
+			for(PxI32 b = 0; b < remainder; ++b, ++index2)
+			{
+				Ps::prefetchLine(&bodyListStart[index2 + 8]);
+				Ps::prefetchLine(&motionVelocityArray[index2 + 8]);
+				PxSolverBody& body = bodyListStart[index2];
+				Cm::SpatialVector& motionVel = motionVelocityArray[index2];
+				motionVel.linear = body.linearVelocity;
+				motionVel.angular = body.angularState;
+				PX_ASSERT(motionVel.linear.isFinite());
+				PX_ASSERT(motionVel.angular.isFinite());
+			}
+
+			nbConcluded += remainder;
+			
+			//Branch not required because this is the last time we use this atomic variable
+			//if(index2 < articulationListSizePlusbodyListSize)
+			{
+				index2 = physx::shdfnd::atomicAdd(bodyListIndex, SaveUnrollCount) - SaveUnrollCount - articulationListSize;
+				endIndexCount2 = SaveUnrollCount;
+			}
+		}
+
+		if(nbConcluded)
+		{
+			Ps::memoryBarrier();
+			physx::shdfnd::atomicAdd(bodyListIndex2, nbConcluded);
+		}
+	}
+
+	WAIT_FOR_PROGRESS(bodyListIndex2, (bodyListSize + articulationListSize));
+
+	a = 1;
+	for(; a < params.velocityIterations; ++a)
+	{
+		WAIT_FOR_PROGRESS(articIndex2, targetArticIndex);
+		for(PxU32 b = 0; b < nbPartitions; ++b)
+		{
+			WAIT_FOR_PROGRESS(constraintIndex2, targetConstraintIndex);
+
+			maxNormalIndex += headersPerPartition[b];
+			
+			PxI32 nbSolved = 0;
+			while(index < maxNormalIndex)
+			{
+				const PxI32 remainder = PxMin(maxNormalIndex - index, endIndexCount);
+				SolveBlockParallel(constraintList, remainder, index, batchCount, cache, contactIter, gVTableSolveBlock, 
+					normalIteration);
+				index += remainder;
+				endIndexCount -= remainder;
+				nbSolved += remainder;
+				if(endIndexCount == 0)
+				{
+					endIndexCount = UnrollCount;
+					index = physx::shdfnd::atomicAdd(constraintIndex, UnrollCount) - UnrollCount;
+				}
+			}
+			if(nbSolved)
+			{
+				Ps::memoryBarrier();
+				physx::shdfnd::atomicAdd(constraintIndex2, nbSolved);
+			}
+			targetConstraintIndex += headersPerPartition[b]; //Increment target constraint index by batch count
+		}
+
+		WAIT_FOR_PROGRESS(constraintIndex2, targetConstraintIndex);
+
+		maxArticIndex += articulationListSize;
+		targetArticIndex += articulationListSize;
+
+		while (articSolveStart < maxArticIndex)
+		{
+			const PxI32 endIdx = PxMin(articSolveEnd, maxArticIndex);
+
+			PxI32 nbSolved = 0;
+			while (articSolveStart < endIdx)
+			{
+				articulationListStart[articSolveStart - articIndexCounter].articulation->solveInternalConstraints(dt, invDt, cache.Z, cache.deltaV, true, false, 0.f);
+				articSolveStart++;
+				nbSolved++;
+			}
+
+			if (nbSolved)
+			{
+				physx::shdfnd::atomicAdd(articIndex2, nbSolved);
+			}
+
+			const PxI32 remaining = articSolveEnd - articSolveStart;
+
+			if (remaining == 0)
+			{
+				articSolveStart = physx::shdfnd::atomicAdd(articIndex, ArticCount) - ArticCount;
+				articSolveEnd = articSolveStart + ArticCount;
+			}
+
+		}
+		++normalIteration;
+		articIndexCounter += articulationListSize;
+	}
+
+	ThresholdStreamElement* PX_RESTRICT thresholdStream = params.thresholdStream;
+	PxU32 thresholdStreamLength = params.thresholdStreamLength;
+	PxI32* outThresholdPairs = params.outThresholdPairs;
+
+	cache.mSharedOutThresholdPairs = outThresholdPairs;
+	cache.mSharedThresholdStream = thresholdStream;
+	cache.mSharedThresholdStreamLength = thresholdStreamLength;
+
+	//Last iteration - do writeback as well!
+	cache.writeBackIteration = true;
+	{
+		WAIT_FOR_PROGRESS(articIndex2, targetArticIndex);
+		for(PxU32 b = 0; b < nbPartitions; ++b)
+		{
+			WAIT_FOR_PROGRESS(constraintIndex2, targetConstraintIndex);
+
+			maxNormalIndex += headersPerPartition[b];
+			
+			PxI32 nbSolved = 0;
+			while(index < maxNormalIndex)
+			{
+				const PxI32 remainder = PxMin(maxNormalIndex - index, endIndexCount);
+
+				SolveBlockParallel(constraintList, remainder, index, batchCount, cache, contactIter, gVTableSolveWriteBackBlock, 
+					normalIteration);
+
+				index += remainder;
+				endIndexCount -= remainder;
+				nbSolved += remainder;
+				if(endIndexCount == 0)
+				{
+					endIndexCount = UnrollCount;
+					index = physx::shdfnd::atomicAdd(constraintIndex, UnrollCount) - UnrollCount;
+				}
+			}
+			if(nbSolved)
+			{
+				Ps::memoryBarrier();
+				physx::shdfnd::atomicAdd(constraintIndex2, nbSolved);
+			}
+			targetConstraintIndex += headersPerPartition[b]; //Increment target constraint index by batch count
+		}
+		{
+			WAIT_FOR_PROGRESS(constraintIndex2, targetConstraintIndex);
+
+			maxArticIndex += articulationListSize;
+			targetArticIndex += articulationListSize;
+
+			while (articSolveStart < maxArticIndex)
+			{
+				const PxI32 endIdx = PxMin(articSolveEnd, maxArticIndex);
+
+				PxI32 nbSolved = 0;
+				while (articSolveStart < endIdx)
+				{
+					articulationListStart[articSolveStart - articIndexCounter].articulation->solveInternalConstraints(dt, invDt, cache.Z, cache.deltaV, false, false, 0.f);
+					articulationListStart[articSolveStart - articIndexCounter].articulation->writebackInternalConstraints(false);
+					articSolveStart++;
+					nbSolved++;
+				}
+
+				if (nbSolved)
+				{
+					physx::shdfnd::atomicAdd(articIndex2, nbSolved);
+				}
+
+				PxI32 remaining = articSolveEnd - articSolveStart;
+
+				if (remaining == 0)
+				{
+					articSolveStart = physx::shdfnd::atomicAdd(articIndex, ArticCount) - ArticCount;
+					articSolveEnd = articSolveStart + ArticCount;
+				}
+			}
+		}
+
+		if(cache.mThresholdStreamIndex > 0)
+		{
+			//Write back to global buffer
+			PxI32 threshIndex = physx::shdfnd::atomicAdd(outThresholdPairs, PxI32(cache.mThresholdStreamIndex)) - PxI32(cache.mThresholdStreamIndex);
+			for(PxU32 b = 0; b < cache.mThresholdStreamIndex; ++b)
+			{
+				thresholdStream[b + threshIndex] = cache.mThresholdStream[b];
+			}
+			cache.mThresholdStreamIndex = 0;
+		}
+
+		++normalIteration;
+	}
+
+#if PX_PROFILE_SOLVE_STALLS
+
+	PxU64 endTime = readTimer();
+	PxReal totalTime = (PxReal)(endTime - startTime);
+	PxReal stallTime = (PxReal)stallCount;
+	PxReal stallRatio = stallTime/totalTime;
+	if(0)//stallRatio > 0.2f)
+	{
+		LARGE_INTEGER frequency;
+		QueryPerformanceFrequency( &frequency );
+		printf("Warning -- percentage time stalled = %f; stalled for %f seconds; total Time took %f seconds\n", 
+			stallRatio * 100.f, stallTime/(PxReal)frequency.QuadPart, totalTime/(PxReal)frequency.QuadPart);
+	}
+#endif
+
+	return normalIteration * batchCount;
+}
+
+void SolverCoreGeneral::writeBackV
+(const PxSolverConstraintDesc* PX_RESTRICT constraintList, const PxU32 /*constraintListSize*/, PxConstraintBatchHeader* batchHeaders, const PxU32 numBatches,
+ ThresholdStreamElement* PX_RESTRICT thresholdStream, const PxU32 thresholdStreamLength, PxU32& outThresholdPairs,
+ PxSolverBodyData* atomListData, WriteBackBlockMethod writeBackTable[]) const
+{
+	SolverContext cache;
+	cache.solverBodyArray			= atomListData;
+	cache.mThresholdStream			= thresholdStream;
+	cache.mThresholdStreamLength	= thresholdStreamLength;
+	cache.mThresholdStreamIndex		= 0;
+
+	PxI32 outThreshIndex = 0;
+	for(PxU32 j = 0; j < numBatches; ++j)
+	{
+		PxU8 type = *constraintList[batchHeaders[j].startIndex].constraint;
+		writeBackTable[type](constraintList + batchHeaders[j].startIndex,
+			batchHeaders[j].stride, cache);
+	}
+
+	outThresholdPairs = PxU32(outThreshIndex);
+}
+
+void solveVBlock(SOLVEV_BLOCK_METHOD_ARGS)
+{
+	solverCore->solveV_Blocks(params);
+}
+
+}
+}
+
+
+//#endif
--- a/physx/source/lowleveldynamics/src/DySolverControl.h
+++ b/physx/source/lowleveldynamics/src/DySolverControl.h
@ -0,0 +1,161 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef DY_SOLVERCOREGENERAL_H
+#define DY_SOLVERCOREGENERAL_H
+
+#include "DySolverCore.h"
+#include "DySolverConstraintDesc.h"
+
+namespace physx
+{
+
+namespace Dy
+{
+
+struct FsData;
+
+inline void BusyWaitState(volatile PxU32* state, const PxU32 requiredState)
+{
+	while(requiredState != *state );
+}
+
+inline void WaitBodyRequiredState(PxU32* state, const PxU32 requiredState)
+{
+	if(*state != requiredState)
+	{
+		BusyWaitState(state, requiredState);
+	}
+}
+
+inline void BusyWaitStates(volatile PxU32* stateA, volatile PxU32* stateB, const PxU32 requiredStateA, const PxU32 requiredStateB)
+{
+	while(*stateA != requiredStateA);
+	while(*stateB != requiredStateB);
+}
+
+
+class BatchIterator
+{
+public:
+	PxConstraintBatchHeader* constraintBatchHeaders;
+	PxU32 mSize;
+	PxU32 mCurrentIndex;
+
+	BatchIterator(PxConstraintBatchHeader* _constraintBatchHeaders, PxU32 size) : constraintBatchHeaders(_constraintBatchHeaders),
+		mSize(size), mCurrentIndex(0)
+	{
+	}
+
+	PX_FORCE_INLINE const PxConstraintBatchHeader& GetCurrentHeader(const PxU32 constraintIndex)
+	{
+		PxU32 currentIndex = mCurrentIndex;
+		while((constraintIndex - constraintBatchHeaders[currentIndex].startIndex) >= constraintBatchHeaders[currentIndex].stride)
+			currentIndex = (currentIndex + 1)%mSize;
+		Ps::prefetchLine(&constraintBatchHeaders[currentIndex], 128);
+		mCurrentIndex = currentIndex;
+		return constraintBatchHeaders[currentIndex];
+	}
+private:
+	BatchIterator& operator=(const BatchIterator&);
+};
+
+
+inline void SolveBlockParallel	(PxSolverConstraintDesc* PX_RESTRICT constraintList, const PxI32 batchCount, const PxI32 index,  
+						 const PxI32 headerCount, SolverContext& cache, BatchIterator& iterator,
+						 SolveBlockMethod solveTable[],
+						 const PxI32 iteration
+						)
+{
+	const PxI32 indA = index - (iteration * headerCount);
+
+	const PxConstraintBatchHeader* PX_RESTRICT headers = iterator.constraintBatchHeaders;
+
+	const PxI32 endIndex = indA + batchCount;
+	for(PxI32 i = indA; i < endIndex; ++i)
+	{
+		const PxConstraintBatchHeader& header = headers[i];
+
+		const PxI32 numToGrab = header.stride;
+		PxSolverConstraintDesc* PX_RESTRICT block = &constraintList[header.startIndex];
+
+		Ps::prefetch(block[0].constraint, 384);
+
+		for(PxI32 b = 0; b < numToGrab; ++b)
+		{
+			Ps::prefetchLine(block[b].bodyA);
+			Ps::prefetchLine(block[b].bodyB);
+		}
+
+		//OK. We have a number of constraints to run...
+		solveTable[header.constraintType](block, PxU32(numToGrab), cache);
+	}
+}
+
+class SolverCoreGeneral : public SolverCore
+{
+public:
+	bool frictionEveryIteration;
+	static SolverCoreGeneral* create(bool fricEveryIteration);
+
+	// Implements SolverCore
+	virtual void destroyV();
+
+	virtual PxI32 solveVParallelAndWriteBack
+		(SolverIslandParams& params, Cm::SpatialVectorF* Z, Cm::SpatialVectorF* deltaV) const;
+
+	virtual void solveV_Blocks
+		(SolverIslandParams& params) const;
+
+	virtual void writeBackV
+		(const PxSolverConstraintDesc* PX_RESTRICT constraintList, const PxU32 constraintListSize, PxConstraintBatchHeader* contactConstraintBatches, const PxU32 numBatches,
+		 ThresholdStreamElement* PX_RESTRICT thresholdStream, const PxU32 thresholdStreamLength, PxU32& outThresholdPairs,
+		 PxSolverBodyData* atomListData, WriteBackBlockMethod writeBackTable[]) const;
+
+private:
+
+	//~Implements SolverCore
+};
+
+#define SOLVEV_BLOCK_METHOD_ARGS	SolverCore*	solverCore, SolverIslandParams& params
+
+void solveVBlock(SOLVEV_BLOCK_METHOD_ARGS);
+
+SolveBlockMethod* getSolveBlockTable();
+
+SolveBlockMethod* getSolverConcludeBlockTable();
+
+SolveWriteBackBlockMethod* getSolveWritebackBlockTable();
+
+}
+
+}
+
+#endif //DY_SOLVERCOREGENERAL_H
--- a/physx/source/lowleveldynamics/src/DySolverControlPF.cpp
+++ b/physx/source/lowleveldynamics/src/DySolverControlPF.cpp
@ -0,0 +1,742 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+#include "foundation/PxPreprocessor.h"
+#include "PsAllocator.h"
+#include <new>
+#include <stdio.h>
+#include "CmPhysXCommon.h"
+#include "DySolverBody.h"
+#include "DySolverConstraint1D.h"
+#include "DySolverContact.h"
+#include "DyThresholdTable.h"
+#include "DySolverControl.h"
+#include "DyArticulationHelper.h"
+#include "PsAtomic.h"
+#include "PsIntrinsics.h"
+#include "DyArticulationPImpl.h"
+#include "PsThread.h"
+#include "DySolverConstraintDesc.h"
+#include "DySolverContext.h"
+#include "DySolverControlPF.h"
+
+namespace physx
+{
+
+namespace Dy
+{
+//-----------------------------------
+
+void solve1DBlock					(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveExt1DBlock				(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solve1D4_Block					(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+
+
+void solve1DConcludeBlock				(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveExt1DConcludeBlock			(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solve1D4Block_Conclude				(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+
+void solve1DBlockWriteBack				(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveExt1DBlockWriteBack			(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solve1D4Block_WriteBack			(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+
+void writeBack1DBlock				(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void ext1DBlockWriteBack			(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void writeBack1D4Block				(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+
+
+void solveFrictionBlock					(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveFriction_BStaticBlock			(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveExtFrictionBlock				(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveContactCoulombBlock			(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveExtContactCoulombBlock		(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveContactCoulomb_BStaticBlock	(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+
+
+void solveContactCoulombConcludeBlock			(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveExtContactCoulombConcludeBlock		(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveContactCoulomb_BStaticConcludeBlock	(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+
+void solveContactCoulombBlockWriteBack			(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveExtContactCoulombBlockWriteBack		(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveContactCoulomb_BStaticBlockWriteBack	(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveFrictionBlockWriteBack				(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveFriction_BStaticBlockWriteBack		(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveExtFrictionBlockWriteBack				(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+
+//Pre-block 1d/2d friction stuff...
+
+void solveContactCoulombPreBlock				(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveContactCoulombPreBlock_Static			(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveContactCoulombPreBlock_Conclude		(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveContactCoulombPreBlock_ConcludeStatic	(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveContactCoulombPreBlock_WriteBack		(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveContactCoulombPreBlock_WriteBackStatic(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveFrictionCoulombPreBlock				(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+
+void solveFrictionCoulombPreBlock_Static		(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveFrictionCoulombPreBlock_Conclude		(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+void solveFrictionCoulombPreBlock_ConcludeStatic(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+
+void solveFrictionCoulombPreBlock_WriteBack		(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+
+void solveFrictionCoulombPreBlock_WriteBackStatic(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache);
+
+
+// could move this to PxPreprocessor.h but 
+// no implementation available for MSVC
+#if PX_GCC_FAMILY
+#define PX_UNUSED_ATTRIBUTE __attribute__((unused))
+#else
+#define PX_UNUSED_ATTRIBUTE 
+#endif
+ 
+#define DYNAMIC_ARTICULATION_REGISTRATION(x) 0
+
+
+static SolveBlockMethod gVTableSolveBlockCoulomb[] PX_UNUSED_ATTRIBUTE = 
+{
+	0,
+	solveContactCoulombBlock,												// DY_SC_TYPE_RB_CONTACT
+	solve1DBlock,															// DY_SC_TYPE_RB_1D
+	DYNAMIC_ARTICULATION_REGISTRATION(solveExtContactCoulombBlock),			// DY_SC_TYPE_EXT_CONTACT
+	DYNAMIC_ARTICULATION_REGISTRATION(solveExt1DBlock),						// DY_SC_TYPE_EXT_1D
+	solveContactCoulomb_BStaticBlock,										// DY_SC_TYPE_STATIC_CONTACT
+	solveContactCoulombBlock,												// DY_SC_TYPE_NOFRICTION_RB_CONTACT
+	solveContactCoulombPreBlock,											// DY_SC_TYPE_BLOCK_RB_CONTACT
+	solveContactCoulombPreBlock_Static,										// DY_SC_TYPE_BLOCK_STATIC_RB_CONTACT
+	solve1D4_Block,															// DY_SC_TYPE_BLOCK_1D,
+	solveFrictionBlock,														// DY_SC_TYPE_FRICTION_CONSTRAINT
+	solveFriction_BStaticBlock,												// DY_SC_TYPE_STATIC_FRICTION_CONSTRAINT
+	DYNAMIC_ARTICULATION_REGISTRATION(solveExtFrictionBlock),				// DY_SC_TYPE_EXT_FRICTION_CONSTRAINT
+	solveFrictionCoulombPreBlock,											// DY_SC_TYPE_BLOCK_FRICTION					
+	solveFrictionCoulombPreBlock_Static										// DY_SC_TYPE_BLOCK_STATIC_FRICTION
+};
+
+static SolveWriteBackBlockMethod gVTableSolveWriteBackBlockCoulomb[] PX_UNUSED_ATTRIBUTE = 
+{
+	0,
+	solveContactCoulombBlockWriteBack,												// DY_SC_TYPE_RB_CONTACT
+	solve1DBlockWriteBack,															// DY_SC_TYPE_RB_1D
+	DYNAMIC_ARTICULATION_REGISTRATION(solveExtContactCoulombBlockWriteBack),		// DY_SC_TYPE_EXT_CONTACT
+	DYNAMIC_ARTICULATION_REGISTRATION(solveExt1DBlockWriteBack),					// DY_SC_TYPE_EXT_1D
+	solveContactCoulomb_BStaticBlockWriteBack,										// DY_SC_TYPE_STATIC_CONTACT
+	solveContactCoulombBlockWriteBack,												// DY_SC_TYPE_NOFRICTION_RB_CONTACT
+	solveContactCoulombPreBlock_WriteBack,											// DY_SC_TYPE_BLOCK_RB_CONTACT
+	solveContactCoulombPreBlock_WriteBackStatic,									// DY_SC_TYPE_BLOCK_STATIC_RB_CONTACT
+	solve1D4Block_WriteBack,														// DY_SC_TYPE_BLOCK_1D,
+	solveFrictionBlockWriteBack,													// DY_SC_TYPE_FRICTION_CONSTRAINT
+	solveFriction_BStaticBlockWriteBack,											// DY_SC_TYPE_STATIC_FRICTION_CONSTRAINT
+	DYNAMIC_ARTICULATION_REGISTRATION(solveExtFrictionBlockWriteBack),				// DY_SC_TYPE_EXT_FRICTION_CONSTRAINT
+	solveFrictionCoulombPreBlock_WriteBack,											// DY_SC_TYPE_BLOCK_FRICTION
+	solveFrictionCoulombPreBlock_WriteBackStatic									// DY_SC_TYPE_BLOCK_STATIC_FRICTION
+};
+
+
+static SolveBlockMethod gVTableSolveConcludeBlockCoulomb[] PX_UNUSED_ATTRIBUTE = 
+{
+	0,
+	solveContactCoulombConcludeBlock,												// DY_SC_TYPE_RB_CONTACT
+	solve1DConcludeBlock,															// DY_SC_TYPE_RB_1D
+	DYNAMIC_ARTICULATION_REGISTRATION(solveExtContactCoulombConcludeBlock),			// DY_SC_TYPE_EXT_CONTACT
+	DYNAMIC_ARTICULATION_REGISTRATION(solveExt1DConcludeBlock),						// DY_SC_TYPE_EXT_1D
+	solveContactCoulomb_BStaticConcludeBlock,										// DY_SC_TYPE_STATIC_CONTACT
+	solveContactCoulombConcludeBlock,												// DY_SC_TYPE_NOFRICTION_RB_CONTACT
+	solveContactCoulombPreBlock_Conclude,											// DY_SC_TYPE_BLOCK_RB_CONTACT
+	solveContactCoulombPreBlock_ConcludeStatic,										// DY_SC_TYPE_BLOCK_STATIC_RB_CONTACT
+	solve1D4Block_Conclude,															// DY_SC_TYPE_BLOCK_1D,
+	solveFrictionBlock,																// DY_SC_TYPE_FRICTION_CONSTRAINT
+	solveFriction_BStaticBlock,														// DY_SC_TYPE_STATIC_FRICTION_CONSTRAINT
+	DYNAMIC_ARTICULATION_REGISTRATION(solveExtFrictionBlock),						// DY_SC_TYPE_EXT_FRICTION_CONSTRAINT
+	solveFrictionCoulombPreBlock_Conclude,											// DY_SC_TYPE_BLOCK_FRICTION
+	solveFrictionCoulombPreBlock_ConcludeStatic										// DY_SC_TYPE_BLOCK_STATIC_FRICTION
+};
+
+
+void SolverCoreRegisterArticulationFnsCoulomb()
+{
+	gVTableSolveBlockCoulomb[DY_SC_TYPE_EXT_CONTACT] = solveExtContactCoulombBlock;
+	gVTableSolveBlockCoulomb[DY_SC_TYPE_EXT_1D] = solveExt1DBlock;
+
+	gVTableSolveWriteBackBlockCoulomb[DY_SC_TYPE_EXT_CONTACT] = solveExtContactCoulombBlockWriteBack;
+	gVTableSolveWriteBackBlockCoulomb[DY_SC_TYPE_EXT_1D] = solveExt1DBlockWriteBack;
+	gVTableSolveConcludeBlockCoulomb[DY_SC_TYPE_EXT_CONTACT] = solveExtContactCoulombConcludeBlock;
+	gVTableSolveConcludeBlockCoulomb[DY_SC_TYPE_EXT_1D] = solveExt1DConcludeBlock;
+
+	gVTableSolveBlockCoulomb[DY_SC_TYPE_EXT_FRICTION] = solveExtFrictionBlock;
+	gVTableSolveWriteBackBlockCoulomb[DY_SC_TYPE_EXT_FRICTION] = solveExtFrictionBlockWriteBack;
+	gVTableSolveConcludeBlockCoulomb[DY_SC_TYPE_EXT_FRICTION] = solveExtFrictionBlock;
+}
+
+SolverCoreGeneralPF* SolverCoreGeneralPF::create()
+{
+	SolverCoreGeneralPF* scg = reinterpret_cast<SolverCoreGeneralPF*>(
+		PX_ALLOC(sizeof(SolverCoreGeneralPF), "SolverCoreGeneral"));
+
+	if(scg)
+		new (scg) SolverCoreGeneralPF;
+
+	return scg;
+}
+
+void SolverCoreGeneralPF::destroyV()
+{
+	this->~SolverCoreGeneralPF();
+	PX_FREE(this);
+}
+
+void SolverCoreGeneralPF::solveV_Blocks(SolverIslandParams& params) const
+{
+	const PxI32 TempThresholdStreamSize = 32;
+	ThresholdStreamElement tempThresholdStream[TempThresholdStreamSize];
+
+	SolverContext cache;
+	cache.solverBodyArray			= params.bodyDataList;
+	cache.mThresholdStream			= tempThresholdStream;
+	cache.mThresholdStreamLength	= TempThresholdStreamSize;
+	cache.mThresholdStreamIndex		= 0;
+	cache.writeBackIteration		= false;
+	cache.deltaV					= params.deltaV;
+	cache.Z							= params.Z;
+
+	PxI32 batchCount = PxI32(params.numConstraintHeaders);
+
+	PxSolverBody* PX_RESTRICT bodyListStart = params.bodyListStart;
+	const PxU32 bodyListSize = params.bodyListSize;
+
+	Cm::SpatialVector* PX_RESTRICT motionVelocityArray = params.motionVelocityArray;
+
+	const PxU32 velocityIterations = params.velocityIterations;
+	const PxU32 positionIterations = params.positionIterations;
+
+	const PxU32 numConstraintHeaders = params.numConstraintHeaders;
+	const PxU32 articulationListSize = params.articulationListSize;
+
+	ArticulationSolverDesc* PX_RESTRICT articulationListStart = params.articulationListStart;
+
+	PX_ASSERT(velocityIterations >= 1);
+	PX_ASSERT(positionIterations >= 1);
+
+	if(numConstraintHeaders == 0)
+	{
+		for (PxU32 baIdx = 0; baIdx < bodyListSize; baIdx++)
+		{
+			Cm::SpatialVector& motionVel = motionVelocityArray[baIdx];
+			PxSolverBody& atom = bodyListStart[baIdx];
+			motionVel.linear = atom.linearVelocity;
+			motionVel.angular = atom.angularState;
+		}
+
+		for (PxU32 i = 0; i < articulationListSize; i++)
+			ArticulationPImpl::saveVelocity(articulationListStart[i], cache.deltaV);
+
+		return;
+	}
+
+	BatchIterator contactIterator(params.constraintBatchHeaders, params.numConstraintHeaders);
+	BatchIterator frictionIterator(params.frictionConstraintBatches, params.numFrictionConstraintHeaders);
+
+	const PxI32 frictionBatchCount = PxI32(params.numFrictionConstraintHeaders);
+
+	PxSolverConstraintDesc* PX_RESTRICT constraintList = params.constraintList;
+
+	PxSolverConstraintDesc* PX_RESTRICT frictionConstraintList = params.frictionConstraintList;
+
+	//0-(n-1) iterations
+	PxI32 normalIter = 0;
+	PxI32 frictionIter = 0;
+	for (PxU32 iteration = positionIterations; iteration > 0; iteration--)	//decreasing positive numbers == position iters
+	{
+
+		SolveBlockParallel(constraintList, batchCount, normalIter * batchCount, batchCount, 
+			cache, contactIterator, iteration == 1 ? gVTableSolveConcludeBlockCoulomb : gVTableSolveBlockCoulomb, normalIter);
+		++normalIter;
+	}
+
+	if(frictionBatchCount>0)
+	{
+		const PxU32 numIterations = positionIterations * 2;
+		for (PxU32 iteration = numIterations; iteration > 0; iteration--)	//decreasing positive numbers == position iters
+		{
+			SolveBlockParallel(frictionConstraintList, frictionBatchCount, frictionIter * frictionBatchCount, frictionBatchCount, 
+				cache, frictionIterator, iteration == 1 ? gVTableSolveConcludeBlockCoulomb : gVTableSolveBlockCoulomb, frictionIter);
+			++frictionIter;
+		}
+	}
+
+	for (PxU32 baIdx = 0; baIdx < bodyListSize; baIdx++)
+	{
+		const PxSolverBody& atom = bodyListStart[baIdx];
+		Cm::SpatialVector& motionVel = motionVelocityArray[baIdx];
+		motionVel.linear = atom.linearVelocity;
+		motionVel.angular = atom.angularState;
+	}
+	
+	for (PxU32 i = 0; i < articulationListSize; i++)
+		ArticulationPImpl::saveVelocity(articulationListStart[i], cache.deltaV);
+
+	const PxU32 velItersMinOne = velocityIterations - 1;
+
+	PxU32 iteration = 0;
+
+	for(; iteration < velItersMinOne; ++iteration)
+	{	
+		SolveBlockParallel(constraintList, batchCount, normalIter * batchCount, batchCount, 
+			cache, contactIterator, gVTableSolveBlockCoulomb, normalIter);
+		++normalIter;
+
+		if(frictionBatchCount > 0)
+		{
+			SolveBlockParallel(frictionConstraintList, frictionBatchCount, frictionIter * frictionBatchCount, frictionBatchCount, 
+				cache, frictionIterator, gVTableSolveBlockCoulomb, frictionIter);
+			++frictionIter;
+		}
+	}
+
+	PxI32* outThresholdPairs = params.outThresholdPairs;
+	ThresholdStreamElement* PX_RESTRICT thresholdStream = params.thresholdStream;
+	PxU32 thresholdStreamLength = params.thresholdStreamLength;
+
+	cache.writeBackIteration = true;
+
+	cache.mSharedOutThresholdPairs = outThresholdPairs;
+	cache.mSharedThresholdStreamLength = thresholdStreamLength;
+	cache.mSharedThresholdStream = thresholdStream;
+
+	//PGS always runs one velocity iteration
+	{
+		SolveBlockParallel(constraintList, batchCount, normalIter * batchCount, batchCount, 
+			cache, contactIterator, gVTableSolveWriteBackBlockCoulomb, normalIter);
+		++normalIter;
+
+		if(frictionBatchCount > 0)
+		{
+			SolveBlockParallel(frictionConstraintList, frictionBatchCount, frictionIter * frictionBatchCount, frictionBatchCount, 
+				cache, frictionIterator, gVTableSolveWriteBackBlockCoulomb, frictionIter);
+				++frictionIter;
+		}
+	}
+
+	//Write back remaining threshold streams
+	if(cache.mThresholdStreamIndex > 0)
+	{
+		//Write back to global buffer
+		const PxI32 threshIndex = physx::shdfnd::atomicAdd(reinterpret_cast<PxI32*>(&outThresholdPairs), PxI32(cache.mThresholdStreamIndex)) - PxI32(cache.mThresholdStreamIndex);
+		for(PxU32 b = 0; b < cache.mThresholdStreamIndex; ++b)
+		{
+			thresholdStream[b + threshIndex] = cache.mThresholdStream[b];
+		}
+		cache.mThresholdStreamIndex = 0;
+	}
+}
+
+PxI32 SolverCoreGeneralPF::solveVParallelAndWriteBack(SolverIslandParams& params,
+	Cm::SpatialVectorF* Z, Cm::SpatialVectorF* deltaV) const
+{
+	SolverContext cache;
+	cache.solverBodyArray = params.bodyDataList;
+
+	const PxI32 UnrollCount = PxI32(params.batchSize);
+	const PxI32 SaveUnrollCount = 64;
+
+	const PxI32 TempThresholdStreamSize = 32;
+	ThresholdStreamElement tempThresholdStream[TempThresholdStreamSize];
+
+	const PxI32 batchCount = PxI32(params.numConstraintHeaders);
+	const PxI32 frictionBatchCount = PxI32(params.numFrictionConstraintHeaders);//frictionConstraintBatches.size();
+	cache.mThresholdStream = tempThresholdStream;
+	cache.mThresholdStreamLength = TempThresholdStreamSize;
+	cache.mThresholdStreamIndex = 0;
+	cache.Z = Z;
+	cache.deltaV = deltaV;
+
+	const PxI32 positionIterations = PxI32(params.positionIterations);
+	const PxU32 velocityIterations = params.velocityIterations;
+
+	const PxI32 bodyListSize = PxI32(params.bodyListSize);
+	const PxI32 articulationListSize = PxI32(params.articulationListSize);
+
+	PX_ASSERT(velocityIterations >= 1);
+	PX_ASSERT(positionIterations >= 1);
+
+	PxI32* constraintIndex = &params.constraintIndex;
+	PxI32* constraintIndex2 = &params.constraintIndex2;
+	PxI32* frictionConstraintIndex = &params.frictionConstraintIndex;
+
+	PxI32 endIndexCount = UnrollCount;
+	PxI32 index = physx::shdfnd::atomicAdd(constraintIndex, UnrollCount) - UnrollCount;
+	PxI32 frictionIndex = physx::shdfnd::atomicAdd(frictionConstraintIndex, UnrollCount) - UnrollCount;
+	
+	BatchIterator contactIter(params.constraintBatchHeaders, params.numConstraintHeaders);
+	BatchIterator frictionIter(params.frictionConstraintBatches, params.numFrictionConstraintHeaders);
+
+	PxU32* headersPerPartition = params.headersPerPartition;
+	PxU32 nbPartitions = params.nbPartitions;
+
+	PxU32* frictionHeadersPerPartition = params.frictionHeadersPerPartition;
+	PxU32 nbFrictionPartitions = params.nbFrictionPartitions;
+
+	PxSolverConstraintDesc* PX_RESTRICT constraintList = params.constraintList;
+	PxSolverConstraintDesc* PX_RESTRICT frictionConstraintList = params.frictionConstraintList;
+
+	PxI32 maxNormalIndex = 0;
+	PxI32 maxProgress = 0;
+	PxI32 frictionEndIndexCount = UnrollCount;
+	PxI32 maxFrictionIndex = 0;
+
+	PxI32 normalIteration = 0;
+	PxI32 frictionIteration = 0;
+	PxU32 a = 0;
+	for(PxU32 i = 0; i < 2; ++i)
+	{
+		SolveBlockMethod* solveTable = i == 0 ? gVTableSolveBlockCoulomb : gVTableSolveConcludeBlockCoulomb;
+		for(; a < positionIterations - 1 + i; ++a)
+		{
+			for(PxU32 b = 0; b < nbPartitions; ++b)
+			{
+				WAIT_FOR_PROGRESS(constraintIndex2, maxProgress);
+				maxNormalIndex += headersPerPartition[b];
+				maxProgress += headersPerPartition[b];
+				PxI32 nbSolved = 0;
+				while(index < maxNormalIndex)
+				{
+					const PxI32 remainder = PxMin(maxNormalIndex - index, endIndexCount);
+					SolveBlockParallel(constraintList, remainder, index, batchCount, cache, contactIter, solveTable, 
+						normalIteration);
+					index += remainder;
+					endIndexCount -= remainder;
+					nbSolved += remainder;
+					if(endIndexCount == 0)
+					{
+						endIndexCount = UnrollCount;
+						index = physx::shdfnd::atomicAdd(constraintIndex, UnrollCount) - UnrollCount;
+					}
+				}
+				if(nbSolved)
+				{
+					Ps::memoryBarrier();
+					Ps::atomicAdd(constraintIndex2, nbSolved);
+				}
+			}
+			++normalIteration;
+		}
+	}
+
+	for(PxU32 i = 0; i < 2; ++i)
+	{
+		SolveBlockMethod* solveTable = i == 0 ? gVTableSolveBlockCoulomb : gVTableSolveConcludeBlockCoulomb;
+		const PxI32 numIterations = positionIterations *2;
+		for(; a <  numIterations - 1 + i; ++a)
+		{
+			for(PxU32 b = 0; b < nbFrictionPartitions; ++b)
+			{
+				WAIT_FOR_PROGRESS(constraintIndex2, maxProgress);
+				maxProgress += frictionHeadersPerPartition[b];
+				maxFrictionIndex += frictionHeadersPerPartition[b];
+				PxI32 nbSolved = 0;
+				while(frictionIndex < maxFrictionIndex)
+				{
+					const PxI32 remainder = PxMin(maxFrictionIndex - frictionIndex, frictionEndIndexCount);
+					SolveBlockParallel(frictionConstraintList, remainder, frictionIndex, frictionBatchCount, cache, frictionIter, 
+						solveTable, frictionIteration);
+					frictionIndex += remainder;
+					frictionEndIndexCount -= remainder;
+					nbSolved += remainder;
+					if(frictionEndIndexCount == 0)
+					{
+						frictionEndIndexCount = UnrollCount;
+						frictionIndex  = physx::shdfnd::atomicAdd(frictionConstraintIndex, UnrollCount) - UnrollCount;
+					}
+				}
+				if(nbSolved)
+				{
+					Ps::memoryBarrier();
+					Ps::atomicAdd(constraintIndex2, nbSolved);
+				}
+			}
+			++frictionIteration;
+		}
+	}
+
+	WAIT_FOR_PROGRESS(constraintIndex2, maxProgress);
+
+	PxI32* bodyListIndex = &params.bodyListIndex;
+
+	ArticulationSolverDesc* PX_RESTRICT articulationListStart = params.articulationListStart;
+
+	PxSolverBody* PX_RESTRICT bodyListStart = params.bodyListStart;
+
+	Cm::SpatialVector* PX_RESTRICT motionVelocityArray = params.motionVelocityArray;
+
+	PxI32* bodyListIndex2 = &params.bodyListIndex2;
+
+	PxI32 endIndexCount2 = SaveUnrollCount;
+	PxI32 index2 = physx::shdfnd::atomicAdd(bodyListIndex, SaveUnrollCount) - SaveUnrollCount;
+	{
+		PxI32 nbConcluded = 0;
+		while(index2 < articulationListSize)
+		{
+			const PxI32 remainder = PxMin(SaveUnrollCount, (articulationListSize - index2));
+			endIndexCount2 -= remainder;
+			for(PxI32 b = 0; b < remainder; ++b, ++index2)
+			{
+				ArticulationPImpl::saveVelocity(articulationListStart[index2], cache.deltaV);
+			}
+			nbConcluded += remainder;
+			if(endIndexCount2 == 0)
+			{
+				index2 = physx::shdfnd::atomicAdd(bodyListIndex, SaveUnrollCount) - SaveUnrollCount;
+				endIndexCount2 = SaveUnrollCount;
+			}
+			nbConcluded += remainder;
+		}
+
+		index2 -= articulationListSize;
+
+		//save velocity
+		
+		while(index2 < bodyListSize)
+		{
+			const PxI32 remainder = PxMin(endIndexCount2, (bodyListSize - index2));
+			endIndexCount2 -= remainder;
+			for(PxI32 b = 0; b < remainder; ++b, ++index2)
+			{
+				Ps::prefetchLine(&bodyListStart[index2 + 8]);
+				Ps::prefetchLine(&motionVelocityArray[index2 + 8]);
+				PxSolverBody& body = bodyListStart[index2];
+				Cm::SpatialVector& motionVel = motionVelocityArray[index2];
+				motionVel.linear = body.linearVelocity;
+				motionVel.angular = body.angularState;
+				PX_ASSERT(motionVel.linear.isFinite());
+				PX_ASSERT(motionVel.angular.isFinite());
+			}
+
+			nbConcluded += remainder;
+			
+			//Branch not required because this is the last time we use this atomic variable
+			//if(index2 < articulationListSizePlusbodyListSize)
+			{
+				index2 = physx::shdfnd::atomicAdd(bodyListIndex, SaveUnrollCount) - SaveUnrollCount - articulationListSize;
+				endIndexCount2 = SaveUnrollCount;
+			}
+		}
+
+		if(nbConcluded)
+		{
+			Ps::memoryBarrier();
+			physx::shdfnd::atomicAdd(bodyListIndex2, nbConcluded);
+		}
+	}
+
+	WAIT_FOR_PROGRESS(bodyListIndex2, (bodyListSize + articulationListSize));
+
+	a = 0;
+	for(; a < velocityIterations-1; ++a)
+	{
+		for(PxU32 b = 0; b < nbPartitions; ++b)
+		{
+			WAIT_FOR_PROGRESS(constraintIndex2, maxProgress);
+			maxNormalIndex += headersPerPartition[b];
+			maxProgress += headersPerPartition[b];
+			
+			PxI32 nbSolved = 0;
+			while(index < maxNormalIndex)
+			{
+				const PxI32 remainder = PxMin(maxNormalIndex - index, endIndexCount);
+				SolveBlockParallel(constraintList, remainder, index, batchCount, cache, contactIter, gVTableSolveBlockCoulomb, normalIteration);
+				index += remainder;
+				endIndexCount -= remainder;
+				nbSolved += remainder;
+				if(endIndexCount == 0)
+				{
+					endIndexCount = UnrollCount;
+					index = physx::shdfnd::atomicAdd(constraintIndex, UnrollCount) - UnrollCount;
+				}
+			}
+			if(nbSolved)
+			{
+				Ps::memoryBarrier();
+				Ps::atomicAdd(constraintIndex2, nbSolved);
+			}
+		}
+		++normalIteration;
+
+		for(PxU32 b = 0; b < nbFrictionPartitions; ++b)
+		{
+			WAIT_FOR_PROGRESS(constraintIndex2, maxProgress);
+			maxFrictionIndex += frictionHeadersPerPartition[b];
+			maxProgress += frictionHeadersPerPartition[b];
+
+			PxI32 nbSolved = 0;
+			while(frictionIndex < maxFrictionIndex)
+			{
+				const PxI32 remainder = PxMin(maxFrictionIndex - frictionIndex, frictionEndIndexCount);
+				SolveBlockParallel(constraintList, remainder, index, batchCount, cache, contactIter, gVTableSolveBlockCoulomb, 
+					normalIteration);
+
+				frictionIndex += remainder;
+				frictionEndIndexCount -= remainder;
+				nbSolved += remainder;
+				if(frictionEndIndexCount == 0)
+				{
+					frictionEndIndexCount = UnrollCount;
+					frictionIndex  = physx::shdfnd::atomicAdd(frictionConstraintIndex, UnrollCount) - UnrollCount;
+				}
+			}
+			if(nbSolved)
+			{
+				Ps::memoryBarrier();
+				Ps::atomicAdd(constraintIndex2, nbSolved);
+			}
+		}
+
+		++frictionIteration;
+	}
+
+	ThresholdStreamElement* PX_RESTRICT thresholdStream = params.thresholdStream;
+	const PxU32 thresholdStreamLength = params.thresholdStreamLength;
+	PxI32* outThresholdPairs = params.outThresholdPairs;
+
+	cache.mSharedThresholdStream = thresholdStream;
+	cache.mSharedOutThresholdPairs = outThresholdPairs;
+	cache.mSharedThresholdStreamLength = thresholdStreamLength;
+
+	{
+		for(PxU32 b = 0; b < nbPartitions; ++b)
+		{
+			WAIT_FOR_PROGRESS(constraintIndex2, maxProgress);
+			maxNormalIndex += headersPerPartition[b];
+			maxProgress += headersPerPartition[b];
+			
+			PxI32 nbSolved = 0;
+			while(index < maxNormalIndex)
+			{
+				const PxI32 remainder = PxMin(maxNormalIndex - index, endIndexCount);
+
+				SolveBlockParallel(constraintList, remainder, normalIteration * batchCount, batchCount, 
+					cache, contactIter, gVTableSolveWriteBackBlockCoulomb, normalIteration);
+
+				index += remainder;
+				endIndexCount -= remainder;
+				nbSolved += remainder;
+				if(endIndexCount == 0)
+				{
+					endIndexCount = UnrollCount;
+					index = physx::shdfnd::atomicAdd(constraintIndex, UnrollCount) - UnrollCount;
+				}
+			}
+			if(nbSolved)
+			{
+				Ps::memoryBarrier();
+				Ps::atomicAdd(constraintIndex2, nbSolved);
+			}
+		}
+
+		++normalIteration;
+
+		cache.mSharedOutThresholdPairs = outThresholdPairs;
+		cache.mSharedThresholdStream = thresholdStream;
+		cache.mSharedThresholdStreamLength = thresholdStreamLength;
+
+		for(PxU32 b = 0; b < nbFrictionPartitions; ++b)
+		{
+			WAIT_FOR_PROGRESS(constraintIndex2, maxProgress);
+			maxFrictionIndex += frictionHeadersPerPartition[b];
+			maxProgress += frictionHeadersPerPartition[b];
+
+			PxI32 nbSolved = 0;
+			while(frictionIndex < maxFrictionIndex)
+			{
+				const PxI32 remainder = PxMin(maxFrictionIndex - frictionIndex, frictionEndIndexCount);
+
+				SolveBlockParallel(frictionConstraintList, remainder, frictionIndex, frictionBatchCount, cache, frictionIter, 
+					gVTableSolveWriteBackBlockCoulomb, frictionIteration);
+
+				frictionIndex += remainder;
+				frictionEndIndexCount -= remainder;
+				nbSolved += remainder;
+				if(frictionEndIndexCount == 0)
+				{
+					frictionEndIndexCount = UnrollCount;
+					frictionIndex  = physx::shdfnd::atomicAdd(frictionConstraintIndex, UnrollCount) - UnrollCount;
+				}
+			}
+			if(nbSolved)
+			{
+				Ps::memoryBarrier();
+				Ps::atomicAdd(constraintIndex2, nbSolved);
+			}
+		}
+
+		if(cache.mThresholdStreamIndex > 0)
+		{
+			//Write back to global buffer
+			PxI32 threshIndex = physx::shdfnd::atomicAdd(outThresholdPairs, PxI32(cache.mThresholdStreamIndex)) - PxI32(cache.mThresholdStreamIndex);
+			for(PxU32 b = 0; b < cache.mThresholdStreamIndex; ++b)
+			{
+				thresholdStream[b + threshIndex] = cache.mThresholdStream[b];
+			}
+			cache.mThresholdStreamIndex = 0;
+		}
+
+		++frictionIteration;
+	}
+
+	return normalIteration * batchCount + frictionIteration * frictionBatchCount;
+}
+
+
+void SolverCoreGeneralPF::writeBackV
+(const PxSolverConstraintDesc* PX_RESTRICT constraintList, const PxU32 /*constraintListSize*/, PxConstraintBatchHeader* batchHeaders, const PxU32 numBatches,
+ ThresholdStreamElement* PX_RESTRICT thresholdStream, const PxU32 thresholdStreamLength, PxU32& outThresholdPairs,
+ PxSolverBodyData* atomListData, WriteBackBlockMethod writeBackTable[]) const
+{
+	SolverContext cache;
+	cache.solverBodyArray			= atomListData;
+	cache.mThresholdStream			= thresholdStream;
+	cache.mThresholdStreamLength	= thresholdStreamLength;
+	cache.mThresholdStreamIndex		= 0;
+
+	PxI32 outThreshIndex = 0;
+	for(PxU32 j = 0; j < numBatches; ++j)
+	{
+		PxU8 type = *constraintList[batchHeaders[j].startIndex].constraint;
+		writeBackTable[type](constraintList + batchHeaders[j].startIndex,
+			batchHeaders[j].stride, cache);
+	}
+
+	outThresholdPairs = PxU32(outThreshIndex);
+}
+
+}
+
+}
+
+
+//#endif
--- a/physx/source/lowleveldynamics/src/DySolverControlPF.h
+++ b/physx/source/lowleveldynamics/src/DySolverControlPF.h
@ -0,0 +1,71 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef DY_SOLVERCONTROLPF_H
+#define DY_SOLVERCONTROLPF_H
+
+#include "DySolverCore.h"
+#include "DySolverConstraintDesc.h"
+
+namespace physx
+{
+
+namespace Dy
+{
+
+class SolverCoreGeneralPF : public SolverCore
+{
+public:
+	static SolverCoreGeneralPF* create();
+
+	// Implements SolverCore
+	virtual void destroyV();
+
+	virtual PxI32 solveVParallelAndWriteBack
+		(SolverIslandParams& params, Cm::SpatialVectorF* Z, Cm::SpatialVectorF* deltaV) const;
+
+	virtual void solveV_Blocks
+		(SolverIslandParams& params) const;
+
+	virtual void writeBackV
+		(const PxSolverConstraintDesc* PX_RESTRICT constraintList, const PxU32 constraintListSize, PxConstraintBatchHeader* contactConstraintBatches, const PxU32 numBatches,
+		 ThresholdStreamElement* PX_RESTRICT thresholdStream, const PxU32 thresholdStreamLength, PxU32& outThresholdPairs,
+		 PxSolverBodyData* atomListData, WriteBackBlockMethod writeBackTable[]) const;
+
+private:
+
+	//~Implements SolverCore
+};
+
+}
+
+}
+
+#endif //DY_SOLVERCOREGENERALPF_H
--- a/physx/source/lowleveldynamics/src/DySolverCore.h
+++ b/physx/source/lowleveldynamics/src/DySolverCore.h
@ -0,0 +1,251 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef DY_SOLVERCORE_H
+#define DY_SOLVERCORE_H
+
+#include "PxvConfig.h"
+#include "PsArray.h"
+#include "PsThread.h"
+
+
+namespace physx
+{
+
+struct PxSolverBody;
+struct PxSolverBodyData;
+struct PxSolverConstraintDesc;
+struct PxConstraintBatchHeader;
+
+namespace Dy
+{
+struct ThresholdStreamElement;
+	
+
+struct ArticulationSolverDesc;
+class Articulation;
+struct SolverContext;
+
+typedef void (*WriteBackMethod)(const PxSolverConstraintDesc& desc, SolverContext& cache, PxSolverBodyData& sbd0, PxSolverBodyData& sbd1);
+typedef void (*SolveMethod)(const PxSolverConstraintDesc& desc, SolverContext& cache);
+typedef void (*SolveBlockMethod)(const PxSolverConstraintDesc* desc, const PxU32 constraintCount, SolverContext& cache);
+typedef void (*SolveWriteBackBlockMethod)(const PxSolverConstraintDesc* desc, const PxU32 constraintCount, SolverContext& cache);
+typedef void (*WriteBackBlockMethod)(const PxSolverConstraintDesc* desc, const PxU32 constraintCount, SolverContext& cache);
+
+#define PX_PROFILE_SOLVE_STALLS 0
+#if PX_PROFILE_SOLVE_STALLS
+#if PX_WINDOWS
+#include <windows.h>
+
+
+PX_FORCE_INLINE PxU64 readTimer()
+{
+	//return __rdtsc();
+
+	LARGE_INTEGER i;
+	QueryPerformanceCounter(&i);
+	return i.QuadPart;
+}
+
+#endif
+#endif
+
+
+#define YIELD_THREADS 1
+
+#if YIELD_THREADS
+
+#define ATTEMPTS_BEFORE_BACKOFF 30000
+#define ATTEMPTS_BEFORE_RETEST 10000
+
+#endif
+
+PX_INLINE void WaitForProgressCount(volatile PxI32* pGlobalIndex, const PxI32 targetIndex)
+{
+#if YIELD_THREADS
+	if(*pGlobalIndex < targetIndex)
+	{
+		bool satisfied = false;
+		PxU32 count = ATTEMPTS_BEFORE_BACKOFF;
+		do
+		{
+			satisfied = true;
+			while(*pGlobalIndex < targetIndex)
+			{
+				if(--count == 0)
+				{
+					satisfied = false;
+					break;
+				}
+			}
+			if(!satisfied)
+				Ps::Thread::yield();
+			count = ATTEMPTS_BEFORE_RETEST;
+		}
+		while(!satisfied);
+	}
+#else
+	while(*pGlobalIndex < targetIndex);
+#endif
+}
+
+
+#if PX_PROFILE_SOLVE_STALLS
+PX_INLINE void WaitForProgressCount(volatile PxI32* pGlobalIndex, const PxI32 targetIndex, PxU64& stallTime)
+{
+	if(*pGlobalIndex < targetIndex)
+	{
+		bool satisfied = false;
+		PxU32 count = ATTEMPTS_BEFORE_BACKOFF;
+		do
+		{
+			satisfied = true;
+			PxU64 startTime = readTimer();
+			while(*pGlobalIndex < targetIndex)
+			{
+				if(--count == 0)
+				{
+					satisfied = false;
+					break;
+				}
+			}
+			PxU64 endTime = readTimer();
+			stallTime += (endTime - startTime);
+			if(!satisfied)
+				Ps::Thread::yield();
+			count = ATTEMPTS_BEFORE_BACKOFF;
+		}
+		while(!satisfied);
+	}
+}
+
+#define WAIT_FOR_PROGRESS(pGlobalIndex, targetIndex) if(*pGlobalIndex < targetIndex) WaitForProgressCount(pGlobalIndex, targetIndex, stallCount)
+#else
+#define WAIT_FOR_PROGRESS(pGlobalIndex, targetIndex) if(*pGlobalIndex < targetIndex) WaitForProgressCount(pGlobalIndex, targetIndex)
+#endif
+#define WAIT_FOR_PROGRESS_NO_TIMER(pGlobalIndex, targetIndex) if(*pGlobalIndex < targetIndex) WaitForProgressCount(pGlobalIndex, targetIndex)
+
+
+struct SolverIslandParams
+{
+	//Default friction model params
+	PxU32 positionIterations;
+	PxU32 velocityIterations;
+	PxSolverBody* PX_RESTRICT bodyListStart;
+	PxSolverBodyData* PX_RESTRICT bodyDataList;
+	PxU32 bodyListSize;
+	PxU32 solverBodyOffset;
+	ArticulationSolverDesc* PX_RESTRICT articulationListStart; 
+	PxU32 articulationListSize;
+	PxSolverConstraintDesc* PX_RESTRICT constraintList;
+	PxConstraintBatchHeader* constraintBatchHeaders;
+	PxU32 numConstraintHeaders;
+	PxU32* headersPerPartition;
+	PxU32 nbPartitions;
+	Cm::SpatialVector* PX_RESTRICT motionVelocityArray;
+	PxU32 batchSize;
+	PxsBodyCore*const* bodyArray;
+	PxsRigidBody** PX_RESTRICT rigidBodies;
+
+	//Shared state progress counters
+	PxI32 constraintIndex;
+	PxI32 constraintIndex2;
+	PxI32 bodyListIndex;
+	PxI32 bodyListIndex2;
+	PxI32 articSolveIndex;
+	PxI32 articSolveIndex2;
+	PxI32 bodyIntegrationListIndex;
+	PxI32 numObjectsIntegrated;
+
+	PxReal dt;
+	PxReal invDt;
+
+
+	//Additional 1d/2d friction model params
+	PxSolverConstraintDesc* PX_RESTRICT frictionConstraintList;
+	
+	PxConstraintBatchHeader* frictionConstraintBatches;
+	PxU32 numFrictionConstraintHeaders;
+	PxU32* frictionHeadersPerPartition;
+	PxU32 nbFrictionPartitions;
+
+	//Additional Shared state progress counters
+	PxI32 frictionConstraintIndex;
+
+	//Write-back threshold information
+	ThresholdStreamElement* PX_RESTRICT thresholdStream;
+	PxU32 thresholdStreamLength;
+
+	PxI32* outThresholdPairs;
+
+	PxU32 mMaxArticulationLinks;
+	Cm::SpatialVectorF* Z;
+	Cm::SpatialVectorF* deltaV;
+};
+
+
+/*!
+Interface to constraint solver cores
+
+*/    
+class SolverCore
+{
+public:
+	virtual void destroyV() = 0;
+    virtual ~SolverCore() {}
+	/*
+	solves dual problem exactly by GS-iterating until convergence stops
+	only uses regular velocity vector for storing results, and backs up initial state, which is restored.
+	the solution forces are saved in a vector.
+
+	state should not be stored, this function is safe to call from multiple threads.
+
+	Returns the total number of constraints that should be solved across all threads. Used for synchronization outside of this method
+	*/
+
+	virtual PxI32 solveVParallelAndWriteBack
+		(SolverIslandParams& params, Cm::SpatialVectorF* Z, Cm::SpatialVectorF* deltaV) const = 0;
+
+
+	virtual void solveV_Blocks
+		(SolverIslandParams& params) const = 0;
+
+
+	virtual void writeBackV
+		(const PxSolverConstraintDesc* PX_RESTRICT constraintList, const PxU32 constraintListSize, PxConstraintBatchHeader* contactConstraintBatches, const PxU32 numConstraintBatches,
+	 	 ThresholdStreamElement* PX_RESTRICT thresholdStream, const PxU32 thresholdStreamLength, PxU32& outThresholdPairs,
+		 PxSolverBodyData* atomListData, WriteBackBlockMethod writeBackTable[]) const = 0;
+};
+
+}
+
+}
+
+#endif //DY_SOLVERCORE_H
--- a/physx/source/lowleveldynamics/src/DySolverExt.h
+++ b/physx/source/lowleveldynamics/src/DySolverExt.h
@ -0,0 +1,95 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef DY_SOLVEREXTBODY_H
+#define DY_SOLVEREXTBODY_H
+
+#include "foundation/PxVec3.h"
+#include "foundation/PxTransform.h"
+#include "CmPhysXCommon.h"
+#include "CmSpatialVector.h"
+#include "PsVecMath.h"
+
+namespace physx
+{
+
+class PxsRigidBody;
+struct PxsBodyCore;
+struct PxSolverBody;
+struct PxSolverBodyData;
+
+
+namespace Dy
+{
+
+
+class ArticulationV;
+struct SolverConstraint1D;
+
+class SolverExtBody
+{
+public:
+	union
+	{
+		const ArticulationV* mArticulation;
+		const PxSolverBody* mBody;
+	};
+	const PxSolverBodyData* mBodyData;
+
+	PxU16 mLinkIndex;
+
+	SolverExtBody(const void* bodyOrArticulation, const void* bodyData, PxU16 linkIndex): 
+	  mBody(reinterpret_cast<const PxSolverBody*>(bodyOrArticulation)),
+	  mBodyData(reinterpret_cast<const PxSolverBodyData*>(bodyData)),
+		  mLinkIndex(linkIndex)
+	  {}
+
+	  void getResponse(const PxVec3& linImpulse, const PxVec3& angImpulse,
+					   PxVec3& linDeltaV, PxVec3& angDeltaV, PxReal dominance) const;
+
+	  void getResponse(const Ps::aos::Vec3V& linImpulse, const Ps::aos::Vec3V& angImpulse,
+		  Ps::aos::Vec3V& linDeltaV, Ps::aos::Vec3V& angDeltaV, Ps::aos::FloatV dominance) const;
+
+	  PxReal projectVelocity(const PxVec3& linear, const PxVec3& angular) const;
+	  Ps::aos::FloatV projectVelocity(const Ps::aos::Vec3V& linear, const Ps::aos::Vec3V& angular) const;
+	  PxVec3 getLinVel() const;
+	  PxVec3 getAngVel() const;
+
+	  Ps::aos::Vec3V getLinVelV() const;
+	  Ps::aos::Vec3V getAngVelV() const;
+
+	  Cm::SpatialVectorV getVelocity() const;
+};
+
+}
+
+}
+
+#endif //DY_SOLVEREXTBODY_H
--- a/physx/source/lowleveldynamics/src/DySolverPFConstraints.cpp
+++ b/physx/source/lowleveldynamics/src/DySolverPFConstraints.cpp
@ -0,0 +1,879 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#include "foundation/PxPreprocessor.h"
+#include "PsVecMath.h"
+
+#include "CmPhysXCommon.h"
+#include "DySolverBody.h"
+#include "DySolverContact.h"
+#include "DySolverContactPF.h"
+#include "DySolverConstraint1D.h"
+#include "DySolverConstraintDesc.h"
+#include "DyThresholdTable.h"
+#include "DySolverContext.h"
+#include "PsUtilities.h"
+#include "DyConstraint.h"
+#include "PsAtomic.h"
+#include "DyThresholdTable.h"
+#include "DySolverConstraintsShared.h"
+
+namespace physx
+{
+
+namespace Dy
+{
+
+void solveContactCoulomb(const PxSolverConstraintDesc& desc, SolverContext& /*cache*/)
+{
+	PxSolverBody& b0 = *desc.bodyA;
+	PxSolverBody& b1 = *desc.bodyB;
+
+	Vec3V linVel0 = V3LoadA(b0.linearVelocity);
+	Vec3V linVel1 = V3LoadA(b1.linearVelocity);
+	Vec3V angState0 = V3LoadA(b0.angularState);
+	Vec3V angState1 = V3LoadA(b1.angularState);
+
+	SolverContactCoulombHeader* PX_RESTRICT firstHeader = reinterpret_cast<SolverContactCoulombHeader*>(desc.constraint);
+	const PxU8* PX_RESTRICT last = desc.constraint + firstHeader->frictionOffset;//getConstraintLength(desc);
+
+	//hopefully pointer aliasing doesn't bite.
+	PxU8* PX_RESTRICT currPtr = desc.constraint;
+
+	
+	//const FloatV zero = FZero();
+
+	while(currPtr < last)
+	{
+		SolverContactCoulombHeader* PX_RESTRICT hdr = reinterpret_cast<SolverContactCoulombHeader*>(currPtr);
+		currPtr += sizeof(SolverContactCoulombHeader);
+
+		const PxU32 numNormalConstr = hdr->numNormalConstr;
+
+		const Vec3V normal = hdr->getNormal();
+		const FloatV invMassDom0 = FLoad(hdr->dominance0);
+		const FloatV invMassDom1 = FLoad(hdr->dominance1);
+		const FloatV angD0 = FLoad(hdr->angDom0);
+		const FloatV angD1 = FLoad(hdr->angDom1);
+		
+
+
+		SolverContactPoint* PX_RESTRICT contacts = reinterpret_cast<SolverContactPoint*>(currPtr);
+		currPtr += numNormalConstr * sizeof(SolverContactPoint);
+
+		PxF32* appliedImpulse = reinterpret_cast<PxF32*> ((reinterpret_cast<PxU8*>(hdr)) + hdr->frictionOffset + sizeof(SolverFrictionHeader));
+		Ps::prefetchLine(appliedImpulse);
+
+		solveDynamicContacts(contacts, numNormalConstr, normal, invMassDom0, invMassDom1, 
+			angD0, angD1, linVel0, angState0, linVel1, angState1, appliedImpulse); 
+	}
+
+	// Write back
+	V3StoreA(linVel0, b0.linearVelocity);
+	V3StoreA(linVel1, b1.linearVelocity);
+	V3StoreA(angState0, b0.angularState);
+	V3StoreA(angState1, b1.angularState);
+
+	PX_ASSERT(currPtr == last);
+}
+
+void solveFriction(const PxSolverConstraintDesc& desc, SolverContext& /*cache*/)
+{
+	PxSolverBody& b0 = *desc.bodyA;
+	PxSolverBody& b1 = *desc.bodyB;
+
+	Vec3V linVel0 = V3LoadA(b0.linearVelocity);
+	Vec3V linVel1 = V3LoadA(b1.linearVelocity);
+	Vec3V angState0 = V3LoadA(b0.angularState);
+	Vec3V angState1 = V3LoadA(b1.angularState);
+
+	PxU8* PX_RESTRICT ptr = desc.constraint;
+	PxU8* PX_RESTRICT currPtr = ptr;
+
+	const PxU8* PX_RESTRICT last = ptr + getConstraintLength(desc);
+
+
+	while(currPtr < last)
+	{
+		const SolverFrictionHeader* PX_RESTRICT frictionHeader = reinterpret_cast<SolverFrictionHeader*>(currPtr);
+		currPtr += sizeof(SolverFrictionHeader);
+		PxF32* appliedImpulse = reinterpret_cast<PxF32*>(currPtr);
+		currPtr += frictionHeader->getAppliedForcePaddingSize();
+
+		SolverContactFriction* PX_RESTRICT frictions = reinterpret_cast<SolverContactFriction*>(currPtr);
+		const PxU32 numFrictionConstr = frictionHeader->numFrictionConstr;
+		const PxU32 numNormalConstr = frictionHeader->numNormalConstr;
+
+		const PxU32 numFrictionPerPoint = numFrictionConstr/numNormalConstr;
+
+		currPtr += numFrictionConstr * sizeof(SolverContactFriction);
+		const FloatV staticFriction = frictionHeader->getStaticFriction();
+
+		const FloatV invMass0D0 = FLoad(frictionHeader->invMass0D0);
+		const FloatV invMass1D1 = FLoad(frictionHeader->invMass1D1);
+
+		
+		const FloatV angD0 = FLoad(frictionHeader->angDom0);
+		const FloatV angD1 = FLoad(frictionHeader->angDom1);
+
+		for(PxU32 i=0, j = 0;i<numFrictionConstr;j++)
+		{
+			for(PxU32 p = 0; p < numFrictionPerPoint; p++, i++)
+			{
+		
+				SolverContactFriction& f = frictions[i];
+				Ps::prefetchLine(&frictions[i], 128);
+
+				const Vec3V t0 = Vec3V_From_Vec4V(f.normalXYZ_appliedForceW);
+				const Vec3V raXt0 = Vec3V_From_Vec4V(f.raXnXYZ_velMultiplierW);
+				const Vec3V rbXt0 = Vec3V_From_Vec4V(f.rbXnXYZ_biasW);
+
+				const FloatV appliedForce = V4GetW(f.normalXYZ_appliedForceW);
+				const FloatV velMultiplier = V4GetW(f.raXnXYZ_velMultiplierW);
+
+				const FloatV targetVel = FLoad(f.targetVel);
+
+				const FloatV normalImpulse = FLoad(appliedImpulse[j]);
+				const FloatV maxFriction = FMul(staticFriction, normalImpulse);
+				const FloatV nMaxFriction = FNeg(maxFriction);
+
+				//Compute the normal velocity of the constraint.
+
+				const FloatV t0Vel1 = V3Dot(t0, linVel0);
+				const FloatV t0Vel2 = V3Dot(raXt0, angState0);
+				const FloatV t0Vel3 = V3Dot(t0, linVel1);
+				const FloatV t0Vel4 = V3Dot(rbXt0, angState1);
+
+
+				const FloatV t0Vel = FSub(FAdd(t0Vel1, t0Vel2), FAdd(t0Vel3, t0Vel4));
+
+				const Vec3V delLinVel0 = V3Scale(t0, invMass0D0);
+				const Vec3V delLinVel1 = V3Scale(t0, invMass1D1);
+
+				// still lots to do here: using loop pipelining we can interweave this code with the
+				// above - the code here has a lot of stalls that we would thereby eliminate
+				
+				const FloatV tmp = FNegScaleSub(targetVel,velMultiplier,appliedForce);
+				FloatV newForce = FScaleAdd(t0Vel, velMultiplier, tmp);
+				newForce = FClamp(newForce, nMaxFriction, maxFriction);
+				FloatV deltaF = FSub(newForce, appliedForce);
+
+				linVel0 = V3ScaleAdd(delLinVel0, deltaF, linVel0);
+				linVel1 = V3NegScaleSub(delLinVel1, deltaF, linVel1);
+				angState0 = V3ScaleAdd(raXt0, FMul(deltaF, angD0), angState0);
+				angState1 = V3NegScaleSub(rbXt0, FMul(deltaF, angD1), angState1);
+
+				f.setAppliedForce(newForce);
+			}
+		}
+	}
+
+	// Write back
+	V3StoreA(linVel0, b0.linearVelocity);
+	V3StoreA(linVel1, b1.linearVelocity);
+	V3StoreA(angState0, b0.angularState);
+	V3StoreA(angState1, b1.angularState);
+
+
+	PX_ASSERT(currPtr == last);
+}
+
+void solveContactCoulomb_BStatic(const PxSolverConstraintDesc& desc, SolverContext& /*cache*/)
+{
+	PxSolverBody& b0 = *desc.bodyA;
+
+
+	Vec3V linVel0 = V3LoadA(b0.linearVelocity);
+	Vec3V angState0 = V3LoadA(b0.angularState);
+
+	SolverContactCoulombHeader* firstHeader = reinterpret_cast<SolverContactCoulombHeader*>(desc.constraint);
+	const PxU8* PX_RESTRICT last = desc.constraint + firstHeader->frictionOffset;//getConstraintLength(desc);
+
+	//hopefully pointer aliasing doesn't bite.
+	PxU8* PX_RESTRICT currPtr = desc.constraint;
+
+	//const FloatV zero = FZero();
+
+	while(currPtr < last)
+	{
+		SolverContactCoulombHeader* PX_RESTRICT hdr = reinterpret_cast<SolverContactCoulombHeader*>(currPtr);
+		currPtr += sizeof(SolverContactCoulombHeader);
+
+		const PxU32 numNormalConstr = hdr->numNormalConstr;
+
+		SolverContactPoint* PX_RESTRICT contacts = reinterpret_cast<SolverContactPoint*>(currPtr);
+		Ps::prefetchLine(contacts);
+		currPtr += numNormalConstr * sizeof(SolverContactPoint);
+
+		PxF32* appliedImpulse = reinterpret_cast<PxF32*> ((reinterpret_cast<PxU8*>(hdr)) + hdr->frictionOffset + sizeof(SolverFrictionHeader));
+		Ps::prefetchLine(appliedImpulse);
+
+		const Vec3V normal = hdr->getNormal();
+
+		const FloatV invMassDom0 = FLoad(hdr->dominance0);
+
+		const FloatV angD0 = FLoad(hdr->angDom0);
+		
+		solveStaticContacts(contacts, numNormalConstr, normal, invMassDom0, 
+			angD0, linVel0, angState0, appliedImpulse); 
+	}
+
+	// Write back
+	V3StoreA(linVel0, b0.linearVelocity);
+	V3StoreA(angState0, b0.angularState);
+
+	PX_ASSERT(currPtr == last);
+}
+
+void solveFriction_BStatic(const PxSolverConstraintDesc& desc, SolverContext& /*cache*/)
+{
+	PxSolverBody& b0 = *desc.bodyA;
+
+	Vec3V linVel0 = V3LoadA(b0.linearVelocity);
+	Vec3V angState0 = V3LoadA(b0.angularState);
+
+	PxU8* PX_RESTRICT currPtr = desc.constraint;
+
+	const PxU8* PX_RESTRICT last = currPtr + getConstraintLength(desc);
+
+	while(currPtr < last)
+	{
+
+		const SolverFrictionHeader* PX_RESTRICT frictionHeader = reinterpret_cast<SolverFrictionHeader*>(currPtr);
+		const PxU32 numFrictionConstr = frictionHeader->numFrictionConstr;
+		const PxU32 numNormalConstr = frictionHeader->numNormalConstr;
+		const PxU32 numFrictionPerPoint = numFrictionConstr/numNormalConstr;
+		currPtr +=sizeof(SolverFrictionHeader);
+		PxF32* appliedImpulse = reinterpret_cast<PxF32*>(currPtr);
+		currPtr +=frictionHeader->getAppliedForcePaddingSize();
+
+		SolverContactFriction* PX_RESTRICT frictions = reinterpret_cast<SolverContactFriction*>(currPtr);
+		currPtr += numFrictionConstr * sizeof(SolverContactFriction);
+
+		const FloatV invMass0 = FLoad(frictionHeader->invMass0D0);
+		const FloatV angD0 = FLoad(frictionHeader->angDom0);
+		//const FloatV angD1 = FLoad(frictionHeader->angDom1);
+
+
+		const FloatV staticFriction = frictionHeader->getStaticFriction();
+
+		for(PxU32 i=0, j = 0;i<numFrictionConstr;j++)
+		{
+			for(PxU32 p = 0; p < numFrictionPerPoint; p++, i++)
+			{
+				SolverContactFriction& f = frictions[i];
+				Ps::prefetchLine(&frictions[i+1]);
+
+				const Vec3V t0 = Vec3V_From_Vec4V(f.normalXYZ_appliedForceW);
+				const Vec3V raXt0 = Vec3V_From_Vec4V(f.raXnXYZ_velMultiplierW);
+
+				const FloatV appliedForce = V4GetW(f.normalXYZ_appliedForceW);
+				const FloatV velMultiplier = V4GetW(f.raXnXYZ_velMultiplierW);
+
+				const FloatV targetVel = FLoad(f.targetVel);
+				
+				//const FloatV normalImpulse = contacts[f.contactIndex].getAppliedForce();
+				const FloatV normalImpulse = FLoad(appliedImpulse[j]);
+				const FloatV maxFriction = FMul(staticFriction, normalImpulse);
+				const FloatV nMaxFriction = FNeg(maxFriction);
+
+				//Compute the normal velocity of the constraint.
+
+				const FloatV t0Vel1 = V3Dot(t0, linVel0);
+				const FloatV t0Vel2 = V3Dot(raXt0, angState0);
+
+				const FloatV t0Vel = FAdd(t0Vel1, t0Vel2);
+
+				const Vec3V delangState0 = V3Scale(raXt0, angD0);
+				const Vec3V delLinVel0 = V3Scale(t0, invMass0);
+
+				// still lots to do here: using loop pipelining we can interweave this code with the
+				// above - the code here has a lot of stalls that we would thereby eliminate
+
+				const FloatV tmp = FNegScaleSub(targetVel,velMultiplier,appliedForce);
+				FloatV newForce = FScaleAdd(t0Vel, velMultiplier, tmp);
+				newForce = FClamp(newForce, nMaxFriction, maxFriction);
+				const FloatV deltaF = FSub(newForce, appliedForce);
+
+				linVel0 = V3ScaleAdd(delLinVel0, deltaF, linVel0);
+				angState0 = V3ScaleAdd(delangState0, deltaF, angState0);
+
+				f.setAppliedForce(newForce);
+			}
+		}
+	}
+
+	// Write back
+	V3StoreA(linVel0, b0.linearVelocity);
+	V3StoreA(angState0, b0.angularState);
+
+	PX_ASSERT(currPtr == last);
+}
+
+
+void concludeContactCoulomb(const PxSolverConstraintDesc& desc, SolverContext& /*cache*/)
+{
+	PxU8* PX_RESTRICT cPtr = desc.constraint;
+
+	const SolverContactCoulombHeader* PX_RESTRICT firstHeader = reinterpret_cast<const SolverContactCoulombHeader*>(cPtr);
+	PxU8* PX_RESTRICT last = desc.constraint + firstHeader->frictionOffset;//getConstraintLength(desc);
+	while(cPtr < last)
+	{
+		const SolverContactCoulombHeader* PX_RESTRICT hdr = reinterpret_cast<const SolverContactCoulombHeader*>(cPtr);
+		cPtr += sizeof(SolverContactCoulombHeader);
+
+		const PxU32 numNormalConstr = hdr->numNormalConstr;
+		
+		//if(cPtr < last)
+		//Ps::prefetchLine(cPtr, 512);
+		Ps::prefetchLine(cPtr,128);
+		Ps::prefetchLine(cPtr,256);
+		Ps::prefetchLine(cPtr,384);
+
+		const PxU32 pointStride = hdr->type == DY_SC_TYPE_EXT_CONTACT ? sizeof(SolverContactPointExt)
+																	   : sizeof(SolverContactPoint);
+		for(PxU32 i=0;i<numNormalConstr;i++)
+		{
+			SolverContactPoint *c = reinterpret_cast<SolverContactPoint*>(cPtr);
+			cPtr += pointStride;
+			//c->scaledBias = PxMin(c->scaledBias, 0.f);
+			c->biasedErr = c->unbiasedErr;
+		}
+	}
+	PX_ASSERT(cPtr == last);
+}
+
+void  writeBackContactCoulomb(const PxSolverConstraintDesc& desc, SolverContext& cache,
+					  PxSolverBodyData& bd0, PxSolverBodyData& bd1)
+{
+
+	PxReal normalForce = 0.f;
+
+	PxU8* PX_RESTRICT cPtr = desc.constraint;
+	PxReal* PX_RESTRICT vForceWriteback = reinterpret_cast<PxReal*>(desc.writeBack);
+	const SolverContactCoulombHeader* PX_RESTRICT firstHeader = reinterpret_cast<const SolverContactCoulombHeader*>(cPtr);
+	PxU8* PX_RESTRICT last = desc.constraint + firstHeader->frictionOffset;
+
+	const PxU32 pointStride = firstHeader->type == DY_SC_TYPE_EXT_CONTACT ? sizeof(SolverContactPointExt)
+																	   : sizeof(SolverContactPoint);
+
+	bool hasForceThresholds = false;
+	while(cPtr < last)
+	{
+		const SolverContactCoulombHeader* PX_RESTRICT hdr = reinterpret_cast<const SolverContactCoulombHeader*>(cPtr);
+		cPtr += sizeof(SolverContactCoulombHeader);
+
+		PxF32* appliedImpulse = reinterpret_cast<PxF32*> (const_cast<PxU8*>((reinterpret_cast<const PxU8*>(hdr)) + hdr->frictionOffset + sizeof(SolverFrictionHeader)));
+
+		hasForceThresholds = hdr->flags & SolverContactHeader::eHAS_FORCE_THRESHOLDS;
+
+		const PxU32 numNormalConstr = hdr->numNormalConstr;
+
+		Ps::prefetchLine(cPtr, 256);
+		Ps::prefetchLine(cPtr, 384);
+
+		if(vForceWriteback!=NULL)
+		{
+			for(PxU32 i=0; i<numNormalConstr; i++)
+			{
+				PxF32 imp = appliedImpulse[i];
+				*vForceWriteback = imp;
+				vForceWriteback++;
+				normalForce += imp;
+			}
+		}
+		cPtr += numNormalConstr * pointStride;
+	}
+	PX_ASSERT(cPtr == last);
+
+	if(hasForceThresholds && desc.linkIndexA == PxSolverConstraintDesc::NO_LINK && desc.linkIndexB == PxSolverConstraintDesc::NO_LINK &&
+		normalForce !=0 && (bd0.reportThreshold < PX_MAX_REAL  || bd1.reportThreshold < PX_MAX_REAL))
+	{
+		ThresholdStreamElement elt;
+		elt.normalForce = normalForce;
+		elt.threshold = PxMin<float>(bd0.reportThreshold, bd1.reportThreshold);
+		elt.nodeIndexA = IG::NodeIndex(bd0.nodeIndex);
+		elt.nodeIndexB = IG::NodeIndex(bd1.nodeIndex);
+		elt.shapeInteraction = (reinterpret_cast<SolverContactCoulombHeader*>(desc.constraint))->shapeInteraction;
+		Ps::order(elt.nodeIndexA, elt.nodeIndexB);
+		PX_ASSERT(elt.nodeIndexA < elt.nodeIndexB);
+
+		PX_ASSERT(cache.mThresholdStreamIndex<cache.mThresholdStreamLength);
+		cache.mThresholdStream[cache.mThresholdStreamIndex++] = elt;
+	}
+
+}
+
+
+void solveFrictionBlock(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	for(PxU32 a = 0; a < constraintCount; ++a)
+	{
+		solveFriction(desc[a], cache);
+	}
+}
+
+
+void solveFrictionBlockWriteBack(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	for(PxU32 a = 0; a < constraintCount; ++a)
+	{
+		solveFriction(desc[a], cache);
+	}
+}
+
+void solveFriction_BStaticBlock(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	for(PxU32 a = 0; a < constraintCount; ++a)
+	{
+		solveFriction_BStatic(desc[a], cache);
+	}
+}
+
+
+void solveFriction_BStaticConcludeBlock(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	for(PxU32 a = 0; a < constraintCount; ++a)
+	{
+		solveFriction_BStatic(desc[a], cache);
+	}
+}
+
+void solveFriction_BStaticBlockWriteBack(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	for(PxU32 a = 0; a < constraintCount; ++a)
+	{
+		solveFriction_BStatic(desc[a], cache);
+	}
+}
+
+
+void solveContactCoulombBlock(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	for(PxU32 a = 0; a < constraintCount; ++a)
+	{
+		solveContactCoulomb(desc[a], cache);
+	}
+}
+
+void solveContactCoulombConcludeBlock(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	for(PxU32 a = 0; a < constraintCount; ++a)
+	{
+		solveContactCoulomb(desc[a], cache);
+		concludeContactCoulomb(desc[a], cache);
+	}
+}
+
+void solveContactCoulombBlockWriteBack(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	for(PxU32 a = 0; a < constraintCount; ++a)
+	{
+		PxSolverBodyData& bd0 = cache.solverBodyArray[desc[a].bodyADataIndex];
+		PxSolverBodyData& bd1 = cache.solverBodyArray[desc[a].bodyBDataIndex];
+		solveContactCoulomb(desc[a], cache);
+		writeBackContactCoulomb(desc[a], cache, bd0, bd1);
+	}
+
+	if(cache.mThresholdStreamIndex > (cache.mThresholdStreamLength - 4))
+	{
+		//Write back to global buffer
+		PxI32 threshIndex = physx::shdfnd::atomicAdd(cache.mSharedOutThresholdPairs, PxI32(cache.mThresholdStreamIndex)) - PxI32(cache.mThresholdStreamIndex);
+		for(PxU32 a = 0; a < cache.mThresholdStreamIndex; ++a)
+		{
+			cache.mSharedThresholdStream[a + threshIndex] = cache.mThresholdStream[a];
+		}
+		cache.mThresholdStreamIndex = 0;
+	}
+}
+
+void solveContactCoulomb_BStaticBlock(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	for(PxU32 a = 0; a < constraintCount; ++a)
+	{
+		solveContactCoulomb_BStatic(desc[a], cache);
+	}
+}
+
+void solveContactCoulomb_BStaticConcludeBlock(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	for(PxU32 a = 0; a < constraintCount; ++a)
+	{
+		solveContactCoulomb_BStatic(desc[a], cache);
+		concludeContactCoulomb(desc[a], cache);
+	}
+}
+
+void solveContactCoulomb_BStaticBlockWriteBack(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	for(PxU32 a = 0; a < constraintCount; ++a)
+	{
+		PxSolverBodyData& bd0 = cache.solverBodyArray[desc[a].bodyADataIndex];
+		PxSolverBodyData& bd1 = cache.solverBodyArray[desc[a].bodyBDataIndex];
+		solveContactCoulomb_BStatic(desc[a], cache);
+		writeBackContactCoulomb(desc[a], cache, bd0, bd1);
+	}
+
+	if(cache.mThresholdStreamIndex > (cache.mThresholdStreamLength - 4))
+	{
+		//Not enough space to write 4 more thresholds back!
+		//Write back to global buffer
+		PxI32 threshIndex = physx::shdfnd::atomicAdd(cache.mSharedOutThresholdPairs, PxI32(cache.mThresholdStreamIndex)) - PxI32(cache.mThresholdStreamIndex);
+		for(PxU32 a = 0; a < cache.mThresholdStreamIndex; ++a)
+		{
+			cache.mSharedThresholdStream[a + threshIndex] = cache.mThresholdStream[a];
+		}
+		cache.mThresholdStreamIndex = 0;
+	}
+}
+
+void solveExtContactCoulomb(const PxSolverConstraintDesc& desc, SolverContext& cache)
+{
+	//We'll need this.
+//	const FloatV zero	= FZero();
+//	const FloatV one	= FOne();
+
+	Vec3V linVel0, angVel0, linVel1, angVel1;
+
+	if(desc.linkIndexA == PxSolverConstraintDesc::NO_LINK)
+	{
+		linVel0 = V3LoadA(desc.bodyA->linearVelocity);
+		angVel0 = V3LoadA(desc.bodyA->angularState);
+	}
+	else
+	{
+		Cm::SpatialVectorV v = desc.articulationA->pxcFsGetVelocity(desc.linkIndexA);
+		linVel0 = v.linear;
+		angVel0 = v.angular;
+	}
+
+	if(desc.linkIndexB == PxSolverConstraintDesc::NO_LINK)
+	{
+		linVel1 = V3LoadA(desc.bodyB->linearVelocity);
+		angVel1 = V3LoadA(desc.bodyB->angularState);
+	}
+	else
+	{
+		Cm::SpatialVectorV v = desc.articulationB->pxcFsGetVelocity(desc.linkIndexB);
+		linVel1 = v.linear;
+		angVel1 = v.angular;
+	}
+
+	//const PxU8* PX_RESTRICT last = desc.constraint + desc.constraintLengthOver16*16;
+
+	PxU8* PX_RESTRICT currPtr = desc.constraint;
+
+	const SolverContactCoulombHeader* PX_RESTRICT firstHeader = reinterpret_cast<SolverContactCoulombHeader*>(currPtr);
+
+	const PxU8* PX_RESTRICT last = desc.constraint + firstHeader->frictionOffset;
+
+	//hopefully pointer aliasing doesn't bite.
+
+	Vec3V linImpulse0 = V3Zero(), linImpulse1 = V3Zero(), angImpulse0 = V3Zero(), angImpulse1 = V3Zero();
+
+	while(currPtr < last)
+	{
+		const SolverContactCoulombHeader* PX_RESTRICT hdr = reinterpret_cast<SolverContactCoulombHeader*>(currPtr);
+		currPtr += sizeof(SolverContactCoulombHeader);
+
+		const PxU32 numNormalConstr = hdr->numNormalConstr;
+
+		PxF32* appliedImpulse = reinterpret_cast<PxF32*>(const_cast<PxU8*>(((reinterpret_cast<const PxU8*>(hdr)) + hdr->frictionOffset + sizeof(SolverFrictionHeader))));
+		Ps::prefetchLine(appliedImpulse);
+		
+		SolverContactPointExt* PX_RESTRICT contacts = reinterpret_cast<SolverContactPointExt*>(currPtr);
+		Ps::prefetchLine(contacts);
+		currPtr += numNormalConstr * sizeof(SolverContactPointExt);
+
+		Vec3V li0 = V3Zero(), li1 = V3Zero(), ai0 = V3Zero(), ai1 = V3Zero();
+
+		const Vec3V normal = hdr->getNormal();
+
+		solveExtContacts(contacts, numNormalConstr, normal, linVel0, angVel0, linVel1, angVel1, li0, ai0, li1, ai1, appliedImpulse);
+
+		linImpulse0 = V3ScaleAdd(li0, FLoad(hdr->dominance0), linImpulse0);		
+		angImpulse0 = V3ScaleAdd(ai0, FLoad(hdr->angDom0), angImpulse0);
+		linImpulse1 = V3NegScaleSub(li1, FLoad(hdr->dominance1), linImpulse1);	
+		angImpulse1 = V3NegScaleSub(ai1, FLoad(hdr->angDom1), angImpulse1);
+	}
+
+	if(desc.linkIndexA == PxSolverConstraintDesc::NO_LINK)
+	{
+		V3StoreA(linVel0, desc.bodyA->linearVelocity);
+		V3StoreA(angVel0, desc.bodyA->angularState);
+	}
+	else
+	{
+		Cm::SpatialVectorF Z[64];
+		Cm::SpatialVectorF deltaV[64];
+		desc.articulationA->pxcFsApplyImpulse(desc.linkIndexA, linImpulse0, 
+			angImpulse0, Z, deltaV);
+	}
+
+	if(desc.linkIndexB == PxSolverConstraintDesc::NO_LINK)
+	{
+		V3StoreA(linVel1, desc.bodyB->linearVelocity);
+		V3StoreA(angVel1, desc.bodyB->angularState);
+	}
+	else
+	{
+		desc.articulationB->pxcFsApplyImpulse(desc.linkIndexB, linImpulse1,
+			angImpulse1, cache.Z, cache.deltaV);
+	}
+
+	PX_ASSERT(currPtr == last);
+}
+
+void solveExtFriction(const PxSolverConstraintDesc& desc, SolverContext& cache)
+{
+	Vec3V linVel0, angVel0, linVel1, angVel1;
+
+	if(desc.linkIndexA == PxSolverConstraintDesc::NO_LINK)
+	{
+		linVel0 = V3LoadA(desc.bodyA->linearVelocity);
+		angVel0 = V3LoadA(desc.bodyA->angularState);
+	}
+	else
+	{
+		Cm::SpatialVectorV v = desc.articulationA->pxcFsGetVelocity(desc.linkIndexA);
+		linVel0 = v.linear;
+		angVel0 = v.angular;
+	}
+
+	if(desc.linkIndexB == PxSolverConstraintDesc::NO_LINK)
+	{
+		linVel1 = V3LoadA(desc.bodyB->linearVelocity);
+		angVel1 = V3LoadA(desc.bodyB->angularState);
+	}
+	else
+	{
+		Cm::SpatialVectorV v = desc.articulationB->pxcFsGetVelocity(desc.linkIndexB);
+		linVel1 = v.linear;
+		angVel1 = v.angular;
+	}
+
+
+	//hopefully pointer aliasing doesn't bite.
+	PxU8* PX_RESTRICT currPtr = desc.constraint;
+
+	const PxU8* PX_RESTRICT last = currPtr + desc.constraintLengthOver16*16;
+
+	Vec3V linImpulse0 = V3Zero(), linImpulse1 = V3Zero(), angImpulse0 = V3Zero(), angImpulse1 = V3Zero();
+
+	while(currPtr < last)
+	{
+	
+		const SolverFrictionHeader* PX_RESTRICT frictionHeader = reinterpret_cast<SolverFrictionHeader*>(currPtr);
+		currPtr += sizeof(SolverFrictionHeader);
+		PxF32* appliedImpulse = reinterpret_cast<PxF32*>(currPtr);
+		currPtr += frictionHeader->getAppliedForcePaddingSize();
+
+		SolverContactFrictionExt* PX_RESTRICT frictions = reinterpret_cast<SolverContactFrictionExt*>(currPtr);
+		const PxU32 numFrictionConstr = frictionHeader->numFrictionConstr;
+
+		currPtr += numFrictionConstr * sizeof(SolverContactFrictionExt);
+		const FloatV staticFriction = frictionHeader->getStaticFriction();
+	
+	
+		Vec3V li0 = V3Zero(), li1 = V3Zero(), ai0 = V3Zero(), ai1 = V3Zero();
+
+		PxU32 numNormalConstr = frictionHeader->numNormalConstr;
+		PxU32 nbFrictionsPerPoint = numFrictionConstr/numNormalConstr;
+
+
+
+
+		for(PxU32 i = 0, j = 0; i < numFrictionConstr; j++)
+		{
+			for(PxU32 p=0;p<nbFrictionsPerPoint;p++, i++)
+			{
+				SolverContactFrictionExt& f = frictions[i];
+				Ps::prefetchLine(&frictions[i+1]);
+			
+
+				const Vec3V t0 = Vec3V_From_Vec4V(f.normalXYZ_appliedForceW);
+				const Vec3V raXt0 = Vec3V_From_Vec4V(f.raXnXYZ_velMultiplierW);
+				const Vec3V rbXt0 = Vec3V_From_Vec4V(f.rbXnXYZ_biasW);
+
+				const FloatV appliedForce = V4GetW(f.normalXYZ_appliedForceW);
+				const FloatV velMultiplier = V4GetW(f.raXnXYZ_velMultiplierW);
+				const FloatV targetVel = FLoad(f.targetVel);
+
+				const FloatV normalImpulse = FLoad(appliedImpulse[j]);//contacts[f.contactIndex].getAppliedForce();
+				const FloatV maxFriction = FMul(staticFriction, normalImpulse);
+				const FloatV nMaxFriction = FNeg(maxFriction);
+
+				//Compute the normal velocity of the constraint.
+
+				Vec3V rVel = V3MulAdd(linVel0, t0, V3Mul(angVel0, raXt0));
+				rVel = V3Sub(rVel, V3MulAdd(linVel1, t0, V3Mul(angVel1, rbXt0)));
+				const FloatV t0Vel = FAdd(V3SumElems(rVel), targetVel);
+
+				FloatV deltaF = FNeg(FMul(t0Vel, velMultiplier));
+				FloatV newForce = FAdd(appliedForce, deltaF);
+				newForce = FClamp(newForce, nMaxFriction, maxFriction);
+				deltaF = FSub(newForce, appliedForce);
+
+				linVel0 = V3ScaleAdd(f.linDeltaVA, deltaF, linVel0);	
+				angVel0 = V3ScaleAdd(f.angDeltaVA, deltaF, angVel0);
+				linVel1 = V3ScaleAdd(f.linDeltaVB, deltaF, linVel1);	
+				angVel1 = V3ScaleAdd(f.angDeltaVB, deltaF, angVel1);
+
+				li0 = V3ScaleAdd(t0, deltaF, li0);	ai0 = V3ScaleAdd(raXt0, deltaF, ai0);
+				li1 = V3ScaleAdd(t0, deltaF, li1);	ai1 = V3ScaleAdd(rbXt0, deltaF, ai1);
+
+				f.setAppliedForce(newForce);
+			}
+		}
+
+
+		linImpulse0 = V3ScaleAdd(li0, FLoad(frictionHeader->invMass0D0), linImpulse0);		
+		angImpulse0 = V3ScaleAdd(ai0, FLoad(frictionHeader->angDom0), angImpulse0);
+		linImpulse1 = V3NegScaleSub(li1, FLoad(frictionHeader->invMass1D1), linImpulse1);	
+		angImpulse1 = V3NegScaleSub(ai1, FLoad(frictionHeader->angDom1), angImpulse1);
+	}
+
+	if(desc.linkIndexA == PxSolverConstraintDesc::NO_LINK)
+	{
+		V3StoreA(linVel0, desc.bodyA->linearVelocity);
+		V3StoreA(angVel0, desc.bodyA->angularState);
+	}
+	else
+	{
+		desc.articulationA->pxcFsApplyImpulse(desc.linkIndexA, 
+			linImpulse0, angImpulse0, cache.Z, cache.deltaV);
+	}
+
+	if(desc.linkIndexB == PxSolverConstraintDesc::NO_LINK)
+	{
+		V3StoreA(linVel1, desc.bodyB->linearVelocity);
+		V3StoreA(angVel1, desc.bodyB->angularState);
+	}
+	else
+	{
+		desc.articulationB->pxcFsApplyImpulse(desc.linkIndexB, 
+			linImpulse1, angImpulse1, cache.Z, cache.deltaV);
+	}
+
+	PX_ASSERT(currPtr == last);
+
+}
+
+void solveExtFrictionBlock(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	for(PxU32 a = 0; a < constraintCount; ++a)
+	{
+		solveExtFriction(desc[a], cache);
+	}
+}
+
+void solveExtFrictionConcludeBlock(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	for(PxU32 a = 0; a < constraintCount; ++a)
+	{
+		solveExtFriction(desc[a], cache);
+	}
+}
+
+void solveExtFrictionBlockWriteBack(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	for(PxU32 a = 0; a < constraintCount; ++a)
+	{
+		solveExtFriction(desc[a], cache);
+	}
+}
+
+
+void solveConcludeExtContactCoulomb		(const PxSolverConstraintDesc& desc, SolverContext& cache)
+{
+	solveExtContactCoulomb(desc, cache);
+	concludeContactCoulomb(desc, cache);
+}
+
+void solveExtContactCoulombBlock(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	for(PxU32 a = 0; a < constraintCount; ++a)
+	{
+		solveExtContactCoulomb(desc[a], cache);
+	}
+}
+
+void solveExtContactCoulombConcludeBlock(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	for(PxU32 a = 0; a < constraintCount; ++a)
+	{
+		solveExtContactCoulomb(desc[a], cache);
+		concludeContactCoulomb(desc[a], cache);
+	}
+}
+
+void solveExtContactCoulombBlockWriteBack(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 constraintCount, SolverContext& cache)
+{
+	for(PxU32 a = 0; a < constraintCount; ++a)
+	{
+		PxSolverBodyData& bd0 = cache.solverBodyArray[desc[a].linkIndexA != PxSolverConstraintDesc::NO_LINK ? 0 : desc[a].bodyADataIndex];
+		PxSolverBodyData& bd1 = cache.solverBodyArray[desc[a].linkIndexB != PxSolverConstraintDesc::NO_LINK ? 0 : desc[a].bodyBDataIndex];
+
+		solveExtContactCoulomb(desc[a], cache);
+		writeBackContactCoulomb(desc[a], cache, bd0, bd1);
+	}
+	if(cache.mThresholdStreamIndex > 0)
+	{
+		//Not enough space to write 4 more thresholds back!
+		//Write back to global buffer
+		PxI32 threshIndex = physx::shdfnd::atomicAdd(cache.mSharedOutThresholdPairs, PxI32(cache.mThresholdStreamIndex)) - PxI32(cache.mThresholdStreamIndex);
+		for(PxU32 a = 0; a < cache.mThresholdStreamIndex; ++a)
+		{
+			cache.mSharedThresholdStream[a + threshIndex] = cache.mThresholdStream[a];
+		}
+		cache.mThresholdStreamIndex = 0;
+	}
+}
+
+
+void solveConcludeContactCoulomb			(const PxSolverConstraintDesc& desc, SolverContext& cache)
+{
+	solveContactCoulomb(desc, cache);
+	concludeContactCoulomb(desc, cache);
+}
+
+
+void solveConcludeContactCoulomb_BStatic	(const PxSolverConstraintDesc& desc, SolverContext& cache)
+{
+	solveContactCoulomb_BStatic(desc, cache);
+	concludeContactCoulomb(desc, cache);
+}
+
+
+
+}
+
+}
+
--- a/physx/source/lowleveldynamics/src/DySolverPFConstraintsBlock.cpp
+++ b/physx/source/lowleveldynamics/src/DySolverPFConstraintsBlock.cpp
@ -0,0 +1,985 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#include "foundation/PxPreprocessor.h"
+#include "PsVecMath.h"
+#include "PsFPU.h"
+#include "CmPhysXCommon.h"
+#include "DySolverBody.h"
+#include "DySolverContactPF4.h"
+#include "DySolverConstraint1D.h"
+#include "DySolverConstraintDesc.h"
+#include "DyThresholdTable.h"
+#include "DySolverContext.h"
+#include "PsUtilities.h"
+#include "DyConstraint.h"
+#include "PsAtomic.h"
+#include "DySolverContact.h"
+
+namespace physx
+{
+
+namespace Dy
+{
+
+static void solveContactCoulomb4_Block(const PxSolverConstraintDesc* PX_RESTRICT desc, SolverContext& /*cache*/)
+{
+	PxSolverBody& b00 = *desc[0].bodyA;
+	PxSolverBody& b01 = *desc[0].bodyB;
+	PxSolverBody& b10 = *desc[1].bodyA;
+	PxSolverBody& b11 = *desc[1].bodyB;
+	PxSolverBody& b20 = *desc[2].bodyA;
+	PxSolverBody& b21 = *desc[2].bodyB;
+	PxSolverBody& b30 = *desc[3].bodyA;
+	PxSolverBody& b31 = *desc[3].bodyB;
+
+	//We'll need this.
+	const Vec4V vZero	= V4Zero();
+		
+	Vec4V linVel00 = V4LoadA(&b00.linearVelocity.x);
+	Vec4V linVel01 = V4LoadA(&b01.linearVelocity.x);
+	Vec4V angState00 = V4LoadA(&b00.angularState.x);
+	Vec4V angState01 = V4LoadA(&b01.angularState.x);
+
+	Vec4V linVel10 = V4LoadA(&b10.linearVelocity.x);
+	Vec4V linVel11 = V4LoadA(&b11.linearVelocity.x);
+	Vec4V angState10 = V4LoadA(&b10.angularState.x);
+	Vec4V angState11 = V4LoadA(&b11.angularState.x);
+
+	Vec4V linVel20 = V4LoadA(&b20.linearVelocity.x);
+	Vec4V linVel21 = V4LoadA(&b21.linearVelocity.x);
+	Vec4V angState20 = V4LoadA(&b20.angularState.x);
+	Vec4V angState21 = V4LoadA(&b21.angularState.x);
+
+	Vec4V linVel30 = V4LoadA(&b30.linearVelocity.x);
+	Vec4V linVel31 = V4LoadA(&b31.linearVelocity.x);
+	Vec4V angState30 = V4LoadA(&b30.angularState.x);
+	Vec4V angState31 = V4LoadA(&b31.angularState.x);
+
+
+	Vec4V linVel0T0, linVel0T1, linVel0T2, linVel0T3;
+	Vec4V linVel1T0, linVel1T1, linVel1T2, linVel1T3;
+	Vec4V angState0T0, angState0T1, angState0T2, angState0T3;
+	Vec4V angState1T0, angState1T1, angState1T2, angState1T3;
+
+
+	PX_TRANSPOSE_44(linVel00, linVel10, linVel20, linVel30, linVel0T0, linVel0T1, linVel0T2, linVel0T3);
+	PX_TRANSPOSE_44(linVel01, linVel11, linVel21, linVel31, linVel1T0, linVel1T1, linVel1T2, linVel1T3);
+	PX_TRANSPOSE_44(angState00, angState10, angState20, angState30, angState0T0, angState0T1, angState0T2, angState0T3);
+	PX_TRANSPOSE_44(angState01, angState11, angState21, angState31, angState1T0, angState1T1, angState1T2, angState1T3);
+
+
+	
+
+	//hopefully pointer aliasing doesn't bite.
+	PxU8* PX_RESTRICT currPtr = desc[0].constraint;
+
+	SolverContactCoulombHeader4* PX_RESTRICT firstHeader = reinterpret_cast<SolverContactCoulombHeader4*>(currPtr);
+
+	const PxU8* PX_RESTRICT last = desc[0].constraint + firstHeader->frictionOffset;
+
+	//const PxU8* PX_RESTRICT endPtr = desc[0].constraint + getConstraintLength(desc[0]);
+
+
+	//TODO - can I avoid this many tests???
+	while(currPtr < last)
+	{
+
+		SolverContactCoulombHeader4* PX_RESTRICT hdr = reinterpret_cast<SolverContactCoulombHeader4*>(currPtr);
+
+		Vec4V* appliedForceBuffer = reinterpret_cast<Vec4V*>(currPtr + hdr->frictionOffset + sizeof(SolverFrictionHeader4));
+
+		//PX_ASSERT((PxU8*)appliedForceBuffer < endPtr);
+
+		currPtr = reinterpret_cast<PxU8*>(hdr + 1);
+
+		const PxU32 numNormalConstr = hdr->numNormalConstr;
+
+		SolverContact4Dynamic* PX_RESTRICT contacts = reinterpret_cast<SolverContact4Dynamic*>(currPtr);
+		//const Vec4V dominance1 = V4Neg(__dominance1);
+
+		currPtr = reinterpret_cast<PxU8*>(contacts + numNormalConstr);
+
+		const Vec4V invMass0D0 = hdr->invMassADom;
+		const Vec4V invMass1D1 = hdr->invMassBDom;
+		const Vec4V angD0 = hdr->angD0;
+		const Vec4V angD1 = hdr->angD1;
+
+		const Vec4V normalT0 = hdr->normalX;
+		const Vec4V normalT1 = hdr->normalY;
+		const Vec4V normalT2 = hdr->normalZ;
+
+		const Vec4V __normalVel1 = V4Mul(linVel0T0, normalT0);
+		const Vec4V __normalVel3 = V4Mul(linVel1T0, normalT0);
+		const Vec4V _normalVel1 = V4MulAdd(linVel0T1, normalT1, __normalVel1);
+		const Vec4V _normalVel3 = V4MulAdd(linVel1T1, normalT1, __normalVel3);
+
+		Vec4V normalVel1 = V4MulAdd(linVel0T2, normalT2, _normalVel1);
+		Vec4V normalVel3 = V4MulAdd(linVel1T2, normalT2, _normalVel3);
+
+		Vec4V accumDeltaF = vZero;
+
+		for(PxU32 i=0;i<numNormalConstr;i++)
+		{
+			SolverContact4Dynamic& c = contacts[i];
+			Ps::prefetchLine((&contacts[i+1]));
+			Ps::prefetchLine((&contacts[i+1]), 128);
+			Ps::prefetchLine((&contacts[i+1]), 256);
+			Ps::prefetchLine((&contacts[i+1]), 384);
+
+			const Vec4V appliedForce = c.appliedForce;
+			const Vec4V velMultiplier = c.velMultiplier;
+			
+			const Vec4V targetVel = c.targetVelocity;
+			const Vec4V scaledBias = c.scaledBias;
+			const Vec4V maxImpulse = c.maxImpulse;
+
+			const Vec4V raXnT0 = c.raXnX;
+			const Vec4V raXnT1 = c.raXnY;
+			const Vec4V raXnT2 = c.raXnZ;
+			const Vec4V rbXnT0 = c.rbXnX;
+			const Vec4V rbXnT1 = c.rbXnY;
+			const Vec4V rbXnT2 = c.rbXnZ;
+
+			
+			const Vec4V __normalVel2 = V4Mul(raXnT0, angState0T0);
+			const Vec4V __normalVel4 = V4Mul(rbXnT0, angState1T0);
+
+			
+			const Vec4V _normalVel2 = V4MulAdd(raXnT1, angState0T1, __normalVel2);
+			const Vec4V _normalVel4 = V4MulAdd(rbXnT1, angState1T1, __normalVel4);
+
+			
+			const Vec4V normalVel2 = V4MulAdd(raXnT2, angState0T2, _normalVel2);
+			const Vec4V normalVel4 = V4MulAdd(rbXnT2, angState1T2, _normalVel4);
+
+			const Vec4V biasedErr = V4MulAdd(targetVel, velMultiplier, V4Neg(scaledBias));
+
+			//Linear component - normal * invMass_dom
+
+			const Vec4V _normalVel(V4Add(normalVel1, normalVel2));
+			const Vec4V __normalVel(V4Add(normalVel3, normalVel4));
+		
+			const Vec4V normalVel = V4Sub(_normalVel, __normalVel );
+
+			const Vec4V _deltaF = V4NegMulSub(normalVel, velMultiplier, biasedErr);
+			const Vec4V nAppliedForce = V4Neg(appliedForce);
+			const Vec4V _deltaF2 = V4Max(_deltaF, nAppliedForce);
+			const Vec4V _newAppliedForce(V4Add(appliedForce, _deltaF2));
+			const Vec4V newAppliedForce = V4Min(_newAppliedForce, maxImpulse);
+			const Vec4V deltaF = V4Sub(newAppliedForce, appliedForce);
+
+			normalVel1 = V4MulAdd(invMass0D0, deltaF, normalVel1);
+			normalVel3 = V4NegMulSub(invMass1D1, deltaF, normalVel3);
+
+			accumDeltaF = V4Add(deltaF, accumDeltaF);
+
+			const Vec4V deltaFAng0 = V4Mul(angD0, deltaF);
+			const Vec4V deltaFAng1 = V4Mul(angD1, deltaF);
+
+			angState0T0 = V4MulAdd(raXnT0, deltaFAng0, angState0T0);
+			angState1T0 = V4NegMulSub(rbXnT0, deltaFAng1, angState1T0);
+
+			angState0T1 = V4MulAdd(raXnT1, deltaFAng0, angState0T1);
+			angState1T1 = V4NegMulSub(rbXnT1, deltaFAng1, angState1T1);
+
+			angState0T2 = V4MulAdd(raXnT2, deltaFAng0, angState0T2);
+			angState1T2 = V4NegMulSub(rbXnT2, deltaFAng1, angState1T2);
+
+			c.appliedForce = newAppliedForce;
+			appliedForceBuffer[i] = newAppliedForce;
+		}
+
+		const Vec4V accumDeltaF0 = V4Mul(accumDeltaF, invMass0D0);
+		const Vec4V accumDeltaF1 = V4Mul(accumDeltaF, invMass1D1);
+
+		linVel0T0 = V4MulAdd(normalT0, accumDeltaF0, linVel0T0);
+		linVel1T0 = V4NegMulSub(normalT0, accumDeltaF1, linVel1T0);
+		linVel0T1 = V4MulAdd(normalT1, accumDeltaF0, linVel0T1);
+		linVel1T1 = V4NegMulSub(normalT1, accumDeltaF1, linVel1T1);
+		linVel0T2 = V4MulAdd(normalT2, accumDeltaF0, linVel0T2);
+		linVel1T2 = V4NegMulSub(normalT2, accumDeltaF1, linVel1T2);
+	}
+
+	PX_ASSERT(currPtr == last);
+	
+
+	//KS - we need to use PX_TRANSPOSE_44 here instead of the 34_43 variants because the W components are being used to 
+	//store the bodies' progress counters.
+
+	PX_TRANSPOSE_44(linVel0T0, linVel0T1, linVel0T2, linVel0T3, linVel00, linVel10, linVel20, linVel30);
+	PX_TRANSPOSE_44(linVel1T0, linVel1T1, linVel1T2, linVel1T3, linVel01, linVel11, linVel21, linVel31);
+	PX_TRANSPOSE_44(angState0T0, angState0T1, angState0T2, angState0T3, angState00, angState10, angState20, angState30);
+	PX_TRANSPOSE_44(angState1T0, angState1T1, angState1T2, angState1T3, angState01, angState11, angState21, angState31);
+
+
+	// Write back
+	V4StoreA(linVel00, &b00.linearVelocity.x);
+	V4StoreA(linVel10, &b10.linearVelocity.x);
+	V4StoreA(linVel20, &b20.linearVelocity.x);
+	V4StoreA(linVel30, &b30.linearVelocity.x);
+
+	V4StoreA(linVel01, &b01.linearVelocity.x);
+	V4StoreA(linVel11, &b11.linearVelocity.x);
+	V4StoreA(linVel21, &b21.linearVelocity.x);
+	V4StoreA(linVel31, &b31.linearVelocity.x);
+
+	V4StoreA(angState00, &b00.angularState.x);
+	V4StoreA(angState10, &b10.angularState.x);
+	V4StoreA(angState20, &b20.angularState.x);
+	V4StoreA(angState30, &b30.angularState.x);
+
+	V4StoreA(angState01, &b01.angularState.x);
+	V4StoreA(angState11, &b11.angularState.x);
+	V4StoreA(angState21, &b21.angularState.x);
+	V4StoreA(angState31, &b31.angularState.x);
+}
+
+
+static void solveContactCoulomb4_StaticBlock(const PxSolverConstraintDesc* PX_RESTRICT desc, SolverContext& /*cache*/)
+{
+	PxSolverBody& b00 = *desc[0].bodyA;
+	PxSolverBody& b10 = *desc[1].bodyA;
+	PxSolverBody& b20 = *desc[2].bodyA;
+	PxSolverBody& b30 = *desc[3].bodyA;
+
+	//We'll need this.
+	const Vec4V vZero	= V4Zero();
+		
+	Vec4V linVel00 = V4LoadA(&b00.linearVelocity.x);
+	Vec4V angState00 = V4LoadA(&b00.angularState.x);
+
+	Vec4V linVel10 = V4LoadA(&b10.linearVelocity.x);
+	Vec4V angState10 = V4LoadA(&b10.angularState.x);
+
+	Vec4V linVel20 = V4LoadA(&b20.linearVelocity.x);
+	Vec4V angState20 = V4LoadA(&b20.angularState.x);
+
+	Vec4V linVel30 = V4LoadA(&b30.linearVelocity.x);
+	Vec4V angState30 = V4LoadA(&b30.angularState.x);
+
+
+	Vec4V linVel0T0, linVel0T1, linVel0T2, linVel0T3;
+	Vec4V angState0T0, angState0T1, angState0T2, angState0T3;
+
+
+	PX_TRANSPOSE_44(linVel00, linVel10, linVel20, linVel30, linVel0T0, linVel0T1, linVel0T2, linVel0T3);
+	PX_TRANSPOSE_44(angState00, angState10, angState20, angState30, angState0T0, angState0T1, angState0T2, angState0T3);
+	
+
+	//hopefully pointer aliasing doesn't bite.
+	PxU8* PX_RESTRICT currPtr = desc[0].constraint;
+
+	SolverContactCoulombHeader4* PX_RESTRICT firstHeader = reinterpret_cast<SolverContactCoulombHeader4*>(currPtr);
+
+	const PxU8* PX_RESTRICT last = desc[0].constraint + firstHeader->frictionOffset;
+
+
+	//TODO - can I avoid this many tests???
+	while(currPtr < last)
+	{
+
+		SolverContactCoulombHeader4* PX_RESTRICT hdr = reinterpret_cast<SolverContactCoulombHeader4*>(currPtr);
+
+		Vec4V* appliedForceBuffer = reinterpret_cast<Vec4V*>(currPtr + hdr->frictionOffset + sizeof(SolverFrictionHeader4));
+
+		currPtr = reinterpret_cast<PxU8*>(hdr + 1);
+
+		const PxU32 numNormalConstr = hdr->numNormalConstr;
+
+		SolverContact4Base* PX_RESTRICT contacts = reinterpret_cast<SolverContact4Base*>(currPtr);
+	
+		currPtr = reinterpret_cast<PxU8*>(contacts + numNormalConstr);
+
+		const Vec4V invMass0D0 = hdr->invMassADom;
+		const Vec4V angD0 = hdr->angD0;
+
+		const Vec4V normalT0 = hdr->normalX;
+		const Vec4V normalT1 = hdr->normalY;
+		const Vec4V normalT2 = hdr->normalZ;
+
+		const Vec4V __normalVel1 = V4Mul(linVel0T0, normalT0);
+		const Vec4V _normalVel1 = V4MulAdd(linVel0T1, normalT1, __normalVel1);
+
+		Vec4V normalVel1 = V4MulAdd(linVel0T2, normalT2, _normalVel1);
+
+		Vec4V accumDeltaF = vZero;
+
+		for(PxU32 i=0;i<numNormalConstr;i++)
+		{
+			SolverContact4Base& c = contacts[i];
+			Ps::prefetchLine((&contacts[i+1]));
+			Ps::prefetchLine((&contacts[i+1]), 128);
+			Ps::prefetchLine((&contacts[i+1]), 256);
+
+			const Vec4V appliedForce = c.appliedForce;
+			const Vec4V velMultiplier = c.velMultiplier;
+			
+			const Vec4V targetVel = c.targetVelocity;
+			const Vec4V scaledBias = c.scaledBias;
+			const Vec4V maxImpulse = c.maxImpulse;
+
+			const Vec4V raXnT0 = c.raXnX;
+			const Vec4V raXnT1 = c.raXnY;
+			const Vec4V raXnT2 = c.raXnZ;
+
+			
+			const Vec4V __normalVel2 = V4Mul(raXnT0, angState0T0);
+			
+			const Vec4V _normalVel2 = V4MulAdd(raXnT1, angState0T1, __normalVel2);
+			
+			const Vec4V normalVel2 = V4MulAdd(raXnT2, angState0T2, _normalVel2);
+
+			const Vec4V biasedErr = V4MulAdd(targetVel, velMultiplier, V4Neg(scaledBias));
+
+			//Linear component - normal * invMass_dom
+
+			const Vec4V normalVel(V4Add(normalVel1, normalVel2));
+
+			const Vec4V _deltaF = V4NegMulSub(normalVel, velMultiplier, biasedErr);
+			const Vec4V nAppliedForce = V4Neg(appliedForce);
+
+			const Vec4V _deltaF2 = V4Max(_deltaF, nAppliedForce);
+
+			const Vec4V _newAppliedForce(V4Add(appliedForce, _deltaF2));
+			const Vec4V newAppliedForce = V4Min(_newAppliedForce, maxImpulse);
+			const Vec4V deltaF = V4Sub(newAppliedForce, appliedForce);
+			const Vec4V deltaAngF = V4Mul(deltaF, angD0);
+
+			normalVel1 = V4MulAdd(invMass0D0, deltaF, normalVel1);
+
+			accumDeltaF = V4Add(deltaF, accumDeltaF);
+
+			angState0T0 = V4MulAdd(raXnT0, deltaAngF, angState0T0);
+			angState0T1 = V4MulAdd(raXnT1, deltaAngF, angState0T1);
+			angState0T2 = V4MulAdd(raXnT2, deltaAngF, angState0T2);
+
+			c.appliedForce = newAppliedForce;
+			appliedForceBuffer[i] = newAppliedForce;
+		}
+		const Vec4V scaledAccumDeltaF = V4Mul(accumDeltaF, invMass0D0);
+		linVel0T0 = V4MulAdd(normalT0, scaledAccumDeltaF, linVel0T0);
+		linVel0T1 = V4MulAdd(normalT1, scaledAccumDeltaF, linVel0T1);
+		linVel0T2 = V4MulAdd(normalT2, scaledAccumDeltaF, linVel0T2);
+	}
+
+	PX_ASSERT(currPtr == last);
+	
+	//KS - we need to use PX_TRANSPOSE_44 here instead of the 34_43 variants because the W components are being used to 
+	//store the bodies' progress counters.
+
+	PX_TRANSPOSE_44(linVel0T0, linVel0T1, linVel0T2, linVel0T3, linVel00, linVel10, linVel20, linVel30);
+	PX_TRANSPOSE_44(angState0T0, angState0T1, angState0T2, angState0T3, angState00, angState10, angState20, angState30);
+
+	// Write back
+	// Write back
+	V4StoreA(linVel00, &b00.linearVelocity.x);
+	V4StoreA(linVel10, &b10.linearVelocity.x);
+	V4StoreA(linVel20, &b20.linearVelocity.x);
+	V4StoreA(linVel30, &b30.linearVelocity.x);
+
+	V4StoreA(angState00, &b00.angularState.x);
+	V4StoreA(angState10, &b10.angularState.x);
+	V4StoreA(angState20, &b20.angularState.x);
+	V4StoreA(angState30, &b30.angularState.x);
+}
+
+static void solveFriction4_Block(const PxSolverConstraintDesc* PX_RESTRICT desc, SolverContext& /*cache*/)
+{
+	PxSolverBody& b00 = *desc[0].bodyA;
+	PxSolverBody& b01 = *desc[0].bodyB;
+	PxSolverBody& b10 = *desc[1].bodyA;
+	PxSolverBody& b11 = *desc[1].bodyB;
+	PxSolverBody& b20 = *desc[2].bodyA;
+	PxSolverBody& b21 = *desc[2].bodyB;
+	PxSolverBody& b30 = *desc[3].bodyA;
+	PxSolverBody& b31 = *desc[3].bodyB;
+
+
+	Vec4V linVel00 = V4LoadA(&b00.linearVelocity.x);
+	Vec4V linVel01 = V4LoadA(&b01.linearVelocity.x);
+	Vec4V angState00 = V4LoadA(&b00.angularState.x);
+	Vec4V angState01 = V4LoadA(&b01.angularState.x);
+
+	Vec4V linVel10 = V4LoadA(&b10.linearVelocity.x);
+	Vec4V linVel11 = V4LoadA(&b11.linearVelocity.x);
+	Vec4V angState10 = V4LoadA(&b10.angularState.x);
+	Vec4V angState11 = V4LoadA(&b11.angularState.x);
+
+	Vec4V linVel20 = V4LoadA(&b20.linearVelocity.x);
+	Vec4V linVel21 = V4LoadA(&b21.linearVelocity.x);
+	Vec4V angState20 = V4LoadA(&b20.angularState.x);
+	Vec4V angState21 = V4LoadA(&b21.angularState.x);
+
+	Vec4V linVel30 = V4LoadA(&b30.linearVelocity.x);
+	Vec4V linVel31 = V4LoadA(&b31.linearVelocity.x);
+	Vec4V angState30 = V4LoadA(&b30.angularState.x);
+	Vec4V angState31 = V4LoadA(&b31.angularState.x);
+
+
+	Vec4V linVel0T0, linVel0T1, linVel0T2, linVel0T3;
+	Vec4V linVel1T0, linVel1T1, linVel1T2, linVel1T3;
+	Vec4V angState0T0, angState0T1, angState0T2, angState0T3;
+	Vec4V angState1T0, angState1T1, angState1T2, angState1T3;
+
+
+	PX_TRANSPOSE_44(linVel00, linVel10, linVel20, linVel30, linVel0T0, linVel0T1, linVel0T2, linVel0T3);
+	PX_TRANSPOSE_44(linVel01, linVel11, linVel21, linVel31, linVel1T0, linVel1T1, linVel1T2, linVel1T3);
+	PX_TRANSPOSE_44(angState00, angState10, angState20, angState30, angState0T0, angState0T1, angState0T2, angState0T3);
+	PX_TRANSPOSE_44(angState01, angState11, angState21, angState31, angState1T0, angState1T1, angState1T2, angState1T3);
+
+	PxU8* PX_RESTRICT currPtr = desc[0].constraint;
+	PxU8* PX_RESTRICT endPtr = desc[0].constraint + getConstraintLength(desc[0]);
+	
+
+	while(currPtr < endPtr)
+	{
+		SolverFrictionHeader4* PX_RESTRICT hdr = reinterpret_cast<SolverFrictionHeader4*>(currPtr);
+
+		currPtr = reinterpret_cast<PxU8*>(hdr + 1);
+
+		Vec4V* appliedImpulses = reinterpret_cast<Vec4V*>(currPtr);
+
+		currPtr += hdr->numNormalConstr * sizeof(Vec4V);
+
+		Ps::prefetchLine(currPtr, 128);
+		Ps::prefetchLine(currPtr,256);
+		Ps::prefetchLine(currPtr,384);
+		
+		const PxU32	numFrictionConstr = hdr->numFrictionConstr;
+
+		SolverFriction4Dynamic* PX_RESTRICT frictions = reinterpret_cast<SolverFriction4Dynamic*>(currPtr);
+
+		currPtr = reinterpret_cast<PxU8*>(frictions + hdr->numFrictionConstr);
+
+		const PxU32 maxFrictionConstr = numFrictionConstr;
+	
+		const Vec4V staticFric = hdr->staticFriction;
+
+		const Vec4V invMass0D0 = hdr->invMassADom;
+		const Vec4V invMass1D1 = hdr->invMassBDom;
+
+		const Vec4V angD0 = hdr->angD0;
+		const Vec4V angD1 = hdr->angD1;
+
+		for(PxU32 i=0;i<maxFrictionConstr;i++)
+		{
+			SolverFriction4Dynamic& f = frictions[i];
+			Ps::prefetchLine((&f)+1);
+			Ps::prefetchLine((&f)+1,128);
+			Ps::prefetchLine((&f)+1,256);
+			Ps::prefetchLine((&f)+1,384);
+
+			const Vec4V appliedImpulse = appliedImpulses[i>>hdr->frictionPerContact];
+
+			const Vec4V maxFriction =  V4Mul(staticFric, appliedImpulse);
+
+			const Vec4V nMaxFriction = V4Neg(maxFriction); 
+
+			const Vec4V normalX = f.normalX;
+			const Vec4V normalY = f.normalY;
+			const Vec4V normalZ = f.normalZ;
+
+			const Vec4V raXnX = f.raXnX;
+			const Vec4V raXnY = f.raXnY;
+			const Vec4V raXnZ = f.raXnZ;
+
+			const Vec4V rbXnX = f.rbXnX;
+			const Vec4V rbXnY = f.rbXnY;
+			const Vec4V rbXnZ = f.rbXnZ;
+
+			const Vec4V appliedForce(f.appliedForce);
+			const Vec4V velMultiplier(f.velMultiplier);
+			const Vec4V targetVel(f.targetVelocity);
+	
+			//4 x 4 Dot3 products encoded as 8 M44 transposes, 4 MulV and 8 MulAdd ops
+
+			const Vec4V __normalVel1 = V4Mul(linVel0T0, normalX);
+			const Vec4V __normalVel2 = V4Mul(raXnX, angState0T0);
+			const Vec4V __normalVel3 = V4Mul(linVel1T0, normalX);
+			const Vec4V __normalVel4 = V4Mul(rbXnX, angState1T0);
+
+			const Vec4V _normalVel1 = V4MulAdd(linVel0T1, normalY, __normalVel1);
+			const Vec4V _normalVel2 = V4MulAdd(raXnY, angState0T1, __normalVel2);
+			const Vec4V _normalVel3 = V4MulAdd(linVel1T1, normalY, __normalVel3);
+			const Vec4V _normalVel4 = V4MulAdd(rbXnY, angState1T1, __normalVel4);
+
+			const Vec4V normalVel1 = V4MulAdd(linVel0T2, normalZ, _normalVel1);
+			const Vec4V normalVel2 = V4MulAdd(raXnZ, angState0T2, _normalVel2);
+			const Vec4V normalVel3 = V4MulAdd(linVel1T2, normalZ, _normalVel3);
+			const Vec4V normalVel4 = V4MulAdd(rbXnZ, angState1T2, _normalVel4);
+
+
+			const Vec4V _normalVel = V4Add(normalVel1, normalVel2);
+			const Vec4V __normalVel = V4Add(normalVel3, normalVel4);
+
+			const Vec4V normalVel = V4Sub(_normalVel, __normalVel );
+
+			const Vec4V tmp = V4NegMulSub(targetVel, velMultiplier, appliedForce);
+			Vec4V newAppliedForce = V4MulAdd(normalVel, velMultiplier, tmp);
+			newAppliedForce = V4Clamp(newAppliedForce,nMaxFriction,  maxFriction);
+			const Vec4V deltaF = V4Sub(newAppliedForce, appliedForce);
+
+			const Vec4V deltaLinF0 = V4Mul(invMass0D0, deltaF);
+			const Vec4V deltaLinF1 = V4Mul(invMass1D1, deltaF);
+
+			const Vec4V deltaAngF0 = V4Mul(angD0, deltaF);
+			const Vec4V deltaAngF1 = V4Mul(angD1, deltaF);
+
+
+			linVel0T0 = V4MulAdd(normalX, deltaLinF0, linVel0T0);
+			linVel1T0 = V4NegMulSub(normalX, deltaLinF1, linVel1T0);
+			angState0T0 = V4MulAdd(raXnX, deltaAngF0, angState0T0);
+			angState1T0 = V4NegMulSub(rbXnX, deltaAngF1, angState1T0);
+
+			linVel0T1 = V4MulAdd(normalY, deltaLinF0, linVel0T1);
+			linVel1T1 = V4NegMulSub(normalY, deltaLinF1, linVel1T1);
+			angState0T1 = V4MulAdd(raXnY, deltaAngF0, angState0T1);
+			angState1T1 = V4NegMulSub(rbXnY, deltaAngF1, angState1T1);
+
+			linVel0T2 = V4MulAdd(normalZ, deltaLinF0, linVel0T2);
+			linVel1T2 = V4NegMulSub(normalZ, deltaLinF1, linVel1T2);
+			angState0T2 = V4MulAdd(raXnZ, deltaAngF0, angState0T2);
+			angState1T2 = V4NegMulSub(rbXnZ, deltaAngF1, angState1T2);
+
+			f.appliedForce = newAppliedForce;
+		}
+	}
+
+	PX_ASSERT(currPtr == endPtr);
+
+	//KS - we need to use PX_TRANSPOSE_44 here instead of the 34_43 variants because the W components are being used to 
+	//store the bodies' progress counters.
+
+	PX_TRANSPOSE_44(linVel0T0, linVel0T1, linVel0T2, linVel0T3, linVel00, linVel10, linVel20, linVel30);
+	PX_TRANSPOSE_44(linVel1T0, linVel1T1, linVel1T2, linVel1T3, linVel01, linVel11, linVel21, linVel31);
+	PX_TRANSPOSE_44(angState0T0, angState0T1, angState0T2, angState0T3, angState00, angState10, angState20, angState30);
+	PX_TRANSPOSE_44(angState1T0, angState1T1, angState1T2, angState1T3, angState01, angState11, angState21, angState31);
+
+
+	// Write back
+	// Write back
+	V4StoreA(linVel00, &b00.linearVelocity.x);
+	V4StoreA(linVel10, &b10.linearVelocity.x);
+	V4StoreA(linVel20, &b20.linearVelocity.x);
+	V4StoreA(linVel30, &b30.linearVelocity.x);
+
+	V4StoreA(linVel01, &b01.linearVelocity.x);
+	V4StoreA(linVel11, &b11.linearVelocity.x);
+	V4StoreA(linVel21, &b21.linearVelocity.x);
+	V4StoreA(linVel31, &b31.linearVelocity.x);
+
+	V4StoreA(angState00, &b00.angularState.x);
+	V4StoreA(angState10, &b10.angularState.x);
+	V4StoreA(angState20, &b20.angularState.x);
+	V4StoreA(angState30, &b30.angularState.x);
+
+	V4StoreA(angState01, &b01.angularState.x);
+	V4StoreA(angState11, &b11.angularState.x);
+	V4StoreA(angState21, &b21.angularState.x);
+	V4StoreA(angState31, &b31.angularState.x);
+
+}
+
+
+static void solveFriction4_StaticBlock(const PxSolverConstraintDesc* PX_RESTRICT desc, SolverContext& /*cache*/)
+{
+
+	PxSolverBody& b00 = *desc[0].bodyA;
+	PxSolverBody& b10 = *desc[1].bodyA;
+	PxSolverBody& b20 = *desc[2].bodyA;
+	PxSolverBody& b30 = *desc[3].bodyA;
+
+
+	Vec4V linVel00 = V4LoadA(&b00.linearVelocity.x);
+	Vec4V angState00 = V4LoadA(&b00.angularState.x);
+
+	Vec4V linVel10 = V4LoadA(&b10.linearVelocity.x);
+	Vec4V angState10 = V4LoadA(&b10.angularState.x);
+
+	Vec4V linVel20 = V4LoadA(&b20.linearVelocity.x);
+	Vec4V angState20 = V4LoadA(&b20.angularState.x);
+
+	Vec4V linVel30 = V4LoadA(&b30.linearVelocity.x);
+	Vec4V angState30 = V4LoadA(&b30.angularState.x);
+
+
+	Vec4V linVel0T0, linVel0T1, linVel0T2, linVel0T3;
+	Vec4V angState0T0, angState0T1, angState0T2, angState0T3;
+
+
+	PX_TRANSPOSE_44(linVel00, linVel10, linVel20, linVel30, linVel0T0, linVel0T1, linVel0T2, linVel0T3);
+	PX_TRANSPOSE_44(angState00, angState10, angState20, angState30, angState0T0, angState0T1, angState0T2, angState0T3);
+
+	PxU8* PX_RESTRICT currPtr = desc[0].constraint;
+	PxU8* PX_RESTRICT endPtr = desc[0].constraint + getConstraintLength(desc[0]);
+	
+
+	while(currPtr < endPtr)
+	{
+		SolverFrictionHeader4* PX_RESTRICT hdr = reinterpret_cast<SolverFrictionHeader4*>(currPtr);
+
+		currPtr = reinterpret_cast<PxU8*>(hdr + 1);
+
+		Vec4V* appliedImpulses = reinterpret_cast<Vec4V*>(currPtr);
+
+		currPtr += hdr->numNormalConstr * sizeof(Vec4V);
+
+		Ps::prefetchLine(currPtr, 128);
+		Ps::prefetchLine(currPtr,256);
+		Ps::prefetchLine(currPtr,384);
+		
+		const PxU32	numFrictionConstr = hdr->numFrictionConstr;
+
+		SolverFriction4Base* PX_RESTRICT frictions = reinterpret_cast<SolverFriction4Base*>(currPtr);
+
+		currPtr = reinterpret_cast<PxU8*>(frictions + hdr->numFrictionConstr);
+
+		const PxU32 maxFrictionConstr = numFrictionConstr;
+	
+		const Vec4V staticFric = hdr->staticFriction;
+
+		const Vec4V invMass0D0 = hdr->invMassADom;
+		const Vec4V angD0 = hdr->angD0;
+
+		for(PxU32 i=0;i<maxFrictionConstr;i++)
+		{
+			SolverFriction4Base& f = frictions[i];
+			Ps::prefetchLine((&f)+1);
+			Ps::prefetchLine((&f)+1,128);
+			Ps::prefetchLine((&f)+1,256);
+
+			const Vec4V appliedImpulse = appliedImpulses[i>>hdr->frictionPerContact];
+
+			const Vec4V maxFriction =  V4Mul(staticFric, appliedImpulse);
+
+			const Vec4V nMaxFriction = V4Neg(maxFriction); 
+
+			const Vec4V normalX = f.normalX;
+			const Vec4V normalY = f.normalY;
+			const Vec4V normalZ = f.normalZ;
+
+			const Vec4V raXnX = f.raXnX;
+			const Vec4V raXnY = f.raXnY;
+			const Vec4V raXnZ = f.raXnZ;
+
+			const Vec4V appliedForce(f.appliedForce);
+			const Vec4V velMultiplier(f.velMultiplier);
+			const Vec4V targetVel(f.targetVelocity);
+	
+			//4 x 4 Dot3 products encoded as 8 M44 transposes, 4 MulV and 8 MulAdd ops
+
+			const Vec4V __normalVel1 = V4Mul(linVel0T0, normalX);
+			const Vec4V __normalVel2 = V4Mul(raXnX, angState0T0);
+
+			const Vec4V _normalVel1 = V4MulAdd(linVel0T1, normalY, __normalVel1);
+			const Vec4V _normalVel2 = V4MulAdd(raXnY, angState0T1, __normalVel2);
+
+			const Vec4V normalVel1 = V4MulAdd(linVel0T2, normalZ, _normalVel1);
+			const Vec4V normalVel2 = V4MulAdd(raXnZ, angState0T2, _normalVel2);
+
+			const Vec4V delLinVel00 = V4Mul(normalX, invMass0D0);
+
+			const Vec4V delLinVel10 = V4Mul(normalY, invMass0D0);
+
+			const Vec4V normalVel = V4Add(normalVel1, normalVel2);
+
+			const Vec4V delLinVel20 = V4Mul(normalZ, invMass0D0);
+
+			const Vec4V tmp = V4NegMulSub(targetVel, velMultiplier, appliedForce);
+
+			Vec4V newAppliedForce = V4MulAdd(normalVel, velMultiplier, tmp);
+			newAppliedForce = V4Clamp(newAppliedForce,nMaxFriction,  maxFriction);
+			const Vec4V deltaF = V4Sub(newAppliedForce, appliedForce);
+
+			const Vec4V deltaAngF0 = V4Mul(angD0, deltaF);
+
+			linVel0T0 = V4MulAdd(delLinVel00, deltaF, linVel0T0);
+			angState0T0 = V4MulAdd(raXnX, deltaAngF0, angState0T0);
+
+			linVel0T1 = V4MulAdd(delLinVel10, deltaF, linVel0T1);
+			angState0T1 = V4MulAdd(raXnY, deltaAngF0, angState0T1);
+
+			linVel0T2 = V4MulAdd(delLinVel20, deltaF, linVel0T2);
+			angState0T2 = V4MulAdd(raXnZ, deltaAngF0, angState0T2);
+
+			f.appliedForce = newAppliedForce;
+		}
+	}
+
+	PX_ASSERT(currPtr == endPtr);
+
+	//KS - we need to use PX_TRANSPOSE_44 here instead of the 34_43 variants because the W components are being used to 
+	//store the bodies' progress counters.
+
+	PX_TRANSPOSE_44(linVel0T0, linVel0T1, linVel0T2, linVel0T3, linVel00, linVel10, linVel20, linVel30);
+	PX_TRANSPOSE_44(angState0T0, angState0T1, angState0T2, angState0T3, angState00, angState10, angState20, angState30);
+
+	// Write back
+	// Write back
+	V4StoreA(linVel00, &b00.linearVelocity.x);
+	V4StoreA(linVel10, &b10.linearVelocity.x);
+	V4StoreA(linVel20, &b20.linearVelocity.x);
+	V4StoreA(linVel30, &b30.linearVelocity.x);
+
+	V4StoreA(angState00, &b00.angularState.x);
+	V4StoreA(angState10, &b10.angularState.x);
+	V4StoreA(angState20, &b20.angularState.x);
+	V4StoreA(angState30, &b30.angularState.x);
+}
+
+static void concludeContactCoulomb4(const PxSolverConstraintDesc* desc, SolverContext& /*cache*/)
+{
+	PxU8* PX_RESTRICT cPtr = desc[0].constraint;
+
+	const Vec4V zero = V4Zero();
+
+	const SolverContactCoulombHeader4* PX_RESTRICT firstHeader = reinterpret_cast<const SolverContactCoulombHeader4*>(cPtr);
+	PxU8* PX_RESTRICT last = desc[0].constraint + firstHeader->frictionOffset;
+
+	PxU32 pointStride = firstHeader->type == DY_SC_TYPE_BLOCK_RB_CONTACT ? sizeof(SolverContact4Dynamic) : sizeof(SolverContact4Base);
+
+	while(cPtr < last)
+	{
+		const SolverContactCoulombHeader4* PX_RESTRICT hdr = reinterpret_cast<const SolverContactCoulombHeader4*>(cPtr);
+		cPtr += sizeof(SolverContactCoulombHeader4);
+
+		const PxU32 numNormalConstr = hdr->numNormalConstr;
+		
+		//if(cPtr < last)
+		//Ps::prefetchLine(cPtr, 512);
+		Ps::prefetchLine(cPtr,128);
+		Ps::prefetchLine(cPtr,256);
+		Ps::prefetchLine(cPtr,384);
+
+		for(PxU32 i=0;i<numNormalConstr;i++)
+		{
+			SolverContact4Base *c = reinterpret_cast<SolverContact4Base*>(cPtr);
+			cPtr += pointStride;
+			c->scaledBias = V4Max(c->scaledBias, zero);
+		}
+	}
+	PX_ASSERT(cPtr == last);
+}
+
+void  writeBackContactCoulomb4(const PxSolverConstraintDesc* desc, SolverContext& cache,
+					  const PxSolverBodyData** PX_RESTRICT bd0, const PxSolverBodyData** PX_RESTRICT bd1)
+{
+	Vec4V normalForceV = V4Zero();
+	PxU8* PX_RESTRICT cPtr = desc[0].constraint;
+	PxReal* PX_RESTRICT vForceWriteback0 = reinterpret_cast<PxReal*>(desc[0].writeBack);
+	PxReal* PX_RESTRICT vForceWriteback1 = reinterpret_cast<PxReal*>(desc[1].writeBack);
+	PxReal* PX_RESTRICT vForceWriteback2 = reinterpret_cast<PxReal*>(desc[2].writeBack);
+	PxReal* PX_RESTRICT vForceWriteback3 = reinterpret_cast<PxReal*>(desc[3].writeBack);
+
+	const SolverContactCoulombHeader4* PX_RESTRICT firstHeader = reinterpret_cast<const SolverContactCoulombHeader4*>(cPtr);
+	PxU8* PX_RESTRICT last = desc[0].constraint + firstHeader->frictionOffset;
+
+	const PxU32 pointStride = firstHeader->type == DY_SC_TYPE_BLOCK_RB_CONTACT ? sizeof(SolverContact4Dynamic)
+																	   : sizeof(SolverContact4Base);
+
+	bool writeBackThresholds[4] = {false, false, false, false};
+
+
+	while(cPtr < last)
+	{
+		const SolverContactCoulombHeader4* PX_RESTRICT hdr = reinterpret_cast<const SolverContactCoulombHeader4*>(cPtr);
+		cPtr += sizeof(SolverContactCoulombHeader4);
+
+		writeBackThresholds[0] = hdr->flags[0] & SolverContactHeader::eHAS_FORCE_THRESHOLDS;
+		writeBackThresholds[1] = hdr->flags[1] & SolverContactHeader::eHAS_FORCE_THRESHOLDS;
+		writeBackThresholds[2] = hdr->flags[2] & SolverContactHeader::eHAS_FORCE_THRESHOLDS;
+		writeBackThresholds[3] = hdr->flags[3] & SolverContactHeader::eHAS_FORCE_THRESHOLDS;
+
+		const PxU32 numNormalConstr = hdr->numNormalConstr;
+
+		Ps::prefetchLine(cPtr, 256);
+		Ps::prefetchLine(cPtr, 384);
+
+		
+		for(PxU32 i=0; i<numNormalConstr; i++)
+		{
+			SolverContact4Base* c = reinterpret_cast<SolverContact4Base*>(cPtr);
+			cPtr += pointStride;
+
+			const Vec4V appliedForce = c->appliedForce;
+			if(vForceWriteback0 && i < hdr->numNormalConstr0)
+				FStore(V4GetX(appliedForce), vForceWriteback0++);
+			if(vForceWriteback1 && i < hdr->numNormalConstr1)
+				FStore(V4GetY(appliedForce), vForceWriteback1++);
+			if(vForceWriteback2 && i < hdr->numNormalConstr2)
+				FStore(V4GetZ(appliedForce), vForceWriteback2++);
+			if(vForceWriteback3 && i < hdr->numNormalConstr3)
+				FStore(V4GetW(appliedForce), vForceWriteback3++);
+			
+			normalForceV = V4Add(normalForceV, appliedForce);
+		}
+	}
+	PX_ASSERT(cPtr == last);
+
+	PX_ALIGN(16, PxReal nf[4]);
+	V4StoreA(normalForceV, nf);
+
+	//all constraint pointer in descs are the same constraint
+	Sc::ShapeInteraction** shapeInteractions = reinterpret_cast<SolverContactCoulombHeader4*>(desc[0].constraint)->shapeInteraction;
+
+	for(PxU32 a = 0; a < 4; ++a)
+	{
+		if(writeBackThresholds[a] && desc[a].linkIndexA == PxSolverConstraintDesc::NO_LINK && desc[a].linkIndexB == PxSolverConstraintDesc::NO_LINK &&
+			nf[a] !=0.f && (bd0[a]->reportThreshold < PX_MAX_REAL  || bd1[a]->reportThreshold < PX_MAX_REAL))
+		{
+			ThresholdStreamElement elt;
+			elt.normalForce = nf[a];
+			elt.threshold = PxMin<float>(bd0[a]->reportThreshold, bd1[a]->reportThreshold);
+			elt.nodeIndexA = IG::NodeIndex(bd0[a]->nodeIndex);
+			elt.nodeIndexB = IG::NodeIndex(bd1[a]->nodeIndex);
+			elt.shapeInteraction = shapeInteractions[a];
+			Ps::order(elt.nodeIndexA, elt.nodeIndexB);
+			PX_ASSERT(elt.nodeIndexA < elt.nodeIndexB);
+			PX_ASSERT(cache.mThresholdStreamIndex<cache.mThresholdStreamLength);
+			cache.mThresholdStream[cache.mThresholdStreamIndex++] = elt;
+		}
+	}
+}
+
+void solveContactCoulombPreBlock(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 /*constraintCount*/, SolverContext& cache)
+{
+	solveContactCoulomb4_Block(desc, cache);
+}
+
+void solveContactCoulombPreBlock_Static(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32  /*constraintCount*/, SolverContext& cache)
+{
+	solveContactCoulomb4_StaticBlock(desc, cache);
+}
+
+void solveContactCoulombPreBlock_Conclude(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32  /*constraintCount*/, SolverContext& cache)
+{
+	solveContactCoulomb4_Block(desc, cache);
+	concludeContactCoulomb4(desc, cache);
+}
+
+void solveContactCoulombPreBlock_ConcludeStatic(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32  /*constraintCount*/, SolverContext& cache)
+{
+	solveContactCoulomb4_StaticBlock(desc, cache);
+	concludeContactCoulomb4(desc, cache);
+}
+
+void solveContactCoulombPreBlock_WriteBack(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32  /*constraintCount*/, SolverContext& cache)
+{
+	solveContactCoulomb4_Block(desc, cache);
+
+	const PxSolverBodyData* bd0[4] = {	&cache.solverBodyArray[desc[0].bodyADataIndex], 
+										&cache.solverBodyArray[desc[1].bodyADataIndex],
+										&cache.solverBodyArray[desc[2].bodyADataIndex],
+										&cache.solverBodyArray[desc[3].bodyADataIndex]};
+
+	const PxSolverBodyData* bd1[4] = {	&cache.solverBodyArray[desc[0].bodyBDataIndex], 
+										&cache.solverBodyArray[desc[1].bodyBDataIndex],
+										&cache.solverBodyArray[desc[2].bodyBDataIndex],
+										&cache.solverBodyArray[desc[3].bodyBDataIndex]};
+
+
+
+	writeBackContactCoulomb4(desc, cache, bd0, bd1);
+
+	if(cache.mThresholdStreamIndex > (cache.mThresholdStreamLength - 4))
+	{
+		//Write back to global buffer
+		PxI32 threshIndex = physx::shdfnd::atomicAdd(cache.mSharedOutThresholdPairs, PxI32(cache.mThresholdStreamIndex)) - PxI32(cache.mThresholdStreamIndex);
+		for(PxU32 a = 0; a < cache.mThresholdStreamIndex; ++a)
+		{
+			cache.mSharedThresholdStream[a + threshIndex] = cache.mThresholdStream[a];
+		}
+		cache.mThresholdStreamIndex = 0;
+	}
+}
+
+void solveContactCoulombPreBlock_WriteBackStatic(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32 /*constraintCount*/, SolverContext& cache)
+{
+	solveContactCoulomb4_StaticBlock(desc, cache);
+	const PxSolverBodyData* bd0[4] = {	&cache.solverBodyArray[desc[0].bodyADataIndex], 
+										&cache.solverBodyArray[desc[1].bodyADataIndex],
+										&cache.solverBodyArray[desc[2].bodyADataIndex],
+										&cache.solverBodyArray[desc[3].bodyADataIndex]};
+
+	const PxSolverBodyData* bd1[4] = {	&cache.solverBodyArray[desc[0].bodyBDataIndex], 
+										&cache.solverBodyArray[desc[1].bodyBDataIndex],
+										&cache.solverBodyArray[desc[2].bodyBDataIndex],
+										&cache.solverBodyArray[desc[3].bodyBDataIndex]};
+
+	writeBackContactCoulomb4(desc, cache, bd0, bd1);
+
+	if(cache.mThresholdStreamIndex > (cache.mThresholdStreamLength - 4))
+	{
+		//Write back to global buffer
+		PxI32 threshIndex = physx::shdfnd::atomicAdd(cache.mSharedOutThresholdPairs, PxI32(cache.mThresholdStreamIndex)) - PxI32(cache.mThresholdStreamIndex);
+		for(PxU32 a = 0; a < cache.mThresholdStreamIndex; ++a)
+		{
+			cache.mSharedThresholdStream[a + threshIndex] = cache.mThresholdStream[a];
+		}
+		cache.mThresholdStreamIndex = 0;
+	}
+}
+
+void solveFrictionCoulombPreBlock(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32  /*constraintCount*/, SolverContext& cache)
+{
+	solveFriction4_Block(desc, cache);
+}
+
+void solveFrictionCoulombPreBlock_Static(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32  /*constraintCount*/, SolverContext& cache)
+{
+	solveFriction4_StaticBlock(desc, cache);
+}
+
+void solveFrictionCoulombPreBlock_Conclude(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32  /*constraintCount*/, SolverContext& cache)
+{
+	solveFriction4_Block(desc, cache);
+}
+
+void solveFrictionCoulombPreBlock_ConcludeStatic(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32  /*constraintCount*/, SolverContext& cache)
+{
+	solveFriction4_StaticBlock(desc, cache);
+}
+
+void solveFrictionCoulombPreBlock_WriteBack(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32  /*constraintCount*/, SolverContext& cache)
+{
+	solveFriction4_Block(desc, cache);
+}
+
+void solveFrictionCoulombPreBlock_WriteBackStatic(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxU32  /*constraintCount*/, SolverContext& cache)
+{
+	solveFriction4_StaticBlock(desc, cache);
+}
+
+
+}
+
+}
+
--- a/physx/source/lowleveldynamics/src/DySpatial.h
+++ b/physx/source/lowleveldynamics/src/DySpatial.h
@ -0,0 +1,142 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef DY_SPATIAL_H
+#define DY_SPATIAL_H
+
+#include "foundation/PxVec3.h"
+#include "foundation/PxTransform.h"
+#include "PsMathUtils.h"
+#include "CmSpatialVector.h"
+
+namespace physx
+{
+namespace Dy
+{
+// translate a motion resolved at position p to the origin
+
+
+// should have a 'from' frame and a 'to' frame
+class SpInertia
+{
+public:
+	SpInertia() {}
+
+	SpInertia(const PxMat33& ll, const PxMat33& la, const PxMat33& aa): mLL(ll), mLA(la), mAA(aa)
+	{
+	}
+
+	static SpInertia getZero()
+	{
+		return SpInertia(PxMat33(PxZero), PxMat33(PxZero), 
+							     PxMat33(PxZero));
+	}
+
+	static SpInertia dyad(const Cm::SpatialVector& column, const Cm::SpatialVector& row) 
+	{
+		return SpInertia(dyad(column.linear, row.linear),  
+						 dyad(column.linear, row.angular),  
+					     dyad(column.angular, row.angular));
+	}
+
+
+	static SpInertia inertia(PxReal mass, const PxVec3& inertia)
+	{
+		return SpInertia(PxMat33::createDiagonal(PxVec3(mass,mass,mass)), PxMat33(PxZero),			 
+							     PxMat33::createDiagonal(inertia));
+	}
+
+
+	SpInertia operator+(const SpInertia& m) const
+	{
+		return SpInertia(mLL+m.mLL, mLA+m.mLA, mAA+m.mAA);
+	}
+
+	SpInertia operator-(const SpInertia& m) const
+	{
+		return SpInertia(mLL-m.mLL, mLA-m.mLA, mAA-m.mAA);
+	}
+
+	SpInertia operator*(PxReal r) const
+	{
+		return SpInertia(mLL*r, mLA*r, mAA*r);
+	}
+
+	void operator+=(const SpInertia& m)
+	{
+		mLL+=m.mLL; 
+		mLA+=m.mLA;		
+		mAA+=m.mAA;
+	}
+
+	void operator-=(const SpInertia& m)
+	{
+		mLL-=m.mLL; 
+		mLA-=m.mLA;		
+		mAA-=m.mAA;
+	}
+
+
+	PX_FORCE_INLINE Cm::SpatialVector operator *(const Cm::SpatialVector& v) const
+	{
+		return Cm::SpatialVector(mLL*v.linear            +mLA*v.angular,
+		 					    mLA.transformTranspose(v.linear)+mAA*v.angular);
+	}
+
+	SpInertia operator *(const SpInertia& v) const
+	{
+		return SpInertia(mLL*v.mLL             + mLA * v.mLA.getTranspose(), 
+						 mLL*v.mLA             + mLA * v.mAA,
+						 mLA.getTranspose()*v.mLA + mAA * v.mAA);
+	}
+
+
+	bool isFinite() const
+	{
+		return true;
+//		return mLL.isFinite() && mLA.isFinite() && mAA.isFinite(); 
+	}
+
+	PxMat33 mLL, mLA;		// linear force from angular motion, linear force from linear motion
+	PxMat33 mAA;		    // angular force from angular motion, mAL = mLA.transpose()
+
+private:
+	static PxMat33 dyad(PxVec3 col, PxVec3 row)	
+	{ 
+		return PxMat33(col*row.x, col*row.y, col*row.z); 
+	}
+
+
+};
+
+}
+}
+
+#endif //DY_SPATIAL_H
--- a/physx/source/lowleveldynamics/src/DyTGSContactPrep.cpp
+++ b/physx/source/lowleveldynamics/src/DyTGSContactPrep.cpp
--- a/physx/source/lowleveldynamics/src/DyTGSContactPrep.h
+++ b/physx/source/lowleveldynamics/src/DyTGSContactPrep.h
@ -0,0 +1,142 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef DY_TGS_CONTACTPREP_H
+#define DY_TGS_CONTACTPREP_H
+
+#include "foundation/PxPreprocessor.h"
+#include "DySolverConstraintDesc.h"
+#include "PxSceneDesc.h"
+#include "DySolverContact4.h"
+
+namespace physx
+{
+
+	struct PxcNpWorkUnit;
+	class PxsConstraintBlockManager;
+	class PxcConstraintBlockStream;
+	struct PxsContactManagerOutput;
+	class FrictionPatchStreamPair;
+	struct PxSolverBody;
+	struct PxSolverBodyData;
+	struct PxSolverConstraintDesc;
+	class PxsContactManager;
+
+	namespace Dy
+	{
+		class ThreadContext;
+		struct CorrelationBuffer;
+
+		bool createFinalizeSolverContactsStep(PxTGSSolverContactDesc& contactDesc,
+			PxsContactManagerOutput& output,
+			ThreadContext& threadContext,
+			const PxReal invDtF32,
+			const PxReal invTotalDtF32,
+			PxReal bounceThresholdF32,
+			PxReal frictionOffsetThreshold,
+			PxReal correlationDistance,
+			PxConstraintAllocator& constraintAllocator);
+
+		bool createFinalizeSolverContactsStep(
+			PxTGSSolverContactDesc& contactDesc,
+			CorrelationBuffer& c,
+			const PxReal invDtF32,
+			const PxReal invTotalDtF32,
+			PxReal bounceThresholdF32,
+			PxReal frictionOffsetThreshold,
+			PxReal correlationDistance,
+			PxConstraintAllocator& constraintAllocator);
+
+		SolverConstraintPrepState::Enum setupSolverConstraintStep4
+		(PxTGSSolverConstraintPrepDesc* PX_RESTRICT constraintDescs,
+			const PxReal dt, const PxReal totalDt, const PxReal recipdt, const PxReal recipTotalDt, PxU32& totalRows,
+			PxConstraintAllocator& allocator, PxU32 maxRows,
+			const PxReal lengthScale);
+
+		PxU32 SetupSolverConstraintStep(SolverConstraintShaderPrepDesc& shaderDesc,
+			PxTGSSolverConstraintPrepDesc& prepDesc,
+			PxConstraintAllocator& allocator,
+			const PxReal dt, const PxReal totalDt, const PxReal invdt, const PxReal invTotalDt,
+			const PxReal lengthScale);
+
+		PxU32 setupSolverConstraintStep(
+			const PxTGSSolverConstraintPrepDesc& prepDesc,
+			PxConstraintAllocator& allocator,
+			const PxReal dt, const PxReal totalDt, const PxReal invdt, const PxReal invTotalDt,
+			const PxReal lengthScale);
+
+		SolverConstraintPrepState::Enum setupSolverConstraintStep4
+		(SolverConstraintShaderPrepDesc* PX_RESTRICT constraintShaderDescs,
+			PxTGSSolverConstraintPrepDesc* PX_RESTRICT constraintDescs,
+			const PxReal dt, const PxReal totalDt, const PxReal recipdt, const PxReal recipTotalDt, PxU32& totalRows,
+			PxConstraintAllocator& allocator, const PxReal lengthScale);
+
+		SolverConstraintPrepState::Enum createFinalizeSolverContacts4Step(
+			PxsContactManagerOutput** cmOutputs,
+			ThreadContext& threadContext,
+			PxTGSSolverContactDesc* blockDescs,
+			const PxReal invDtF32,
+			const PxReal invTotalDtF32,
+			PxReal bounceThresholdF32,
+			PxReal	frictionOffsetThreshold,
+			PxReal correlationDistance,
+			PxReal solverOffsetSlop,
+			PxConstraintAllocator& constraintAllocator);
+
+		SolverConstraintPrepState::Enum createFinalizeSolverContacts4Step(
+			Dy::CorrelationBuffer& c,
+			PxTGSSolverContactDesc* blockDescs,
+			const PxReal invDtF32,
+			const PxReal invTotalDtF32,
+			PxReal bounceThresholdF32,
+			PxReal	frictionOffsetThreshold,
+			PxReal correlationDistance,
+			PxReal solverOffsetSlop,
+			PxConstraintAllocator& constraintAllocator);
+
+		typedef void(*TGSSolveBlockMethod) (const PxConstraintBatchHeader& hdr, const PxSolverConstraintDesc* desc,
+			const PxTGSSolverBodyTxInertia* const txInertias, const PxReal minPenetration, const PxReal elapsedTime, SolverContext& cache);
+
+		typedef void (*TGSWriteBackMethod) (const PxConstraintBatchHeader& hdr, const PxSolverConstraintDesc* desc, SolverContext* cache);
+		
+		typedef void (*TGSSolveConcludeMethod) (const PxConstraintBatchHeader& hdr, const PxSolverConstraintDesc* desc,
+			const PxTGSSolverBodyTxInertia* const txInertias, const PxReal elapsedTime, SolverContext& cache);
+
+		extern TGSSolveBlockMethod g_SolveTGSMethods[];
+
+		extern TGSWriteBackMethod g_WritebackTGSMethods[];
+
+		extern TGSSolveConcludeMethod g_SolveConcludeTGSMethods[];
+
+	}
+
+}
+
+#endif
--- a/physx/source/lowleveldynamics/src/DyTGSContactPrepBlock.cpp
+++ b/physx/source/lowleveldynamics/src/DyTGSContactPrepBlock.cpp
--- a/physx/source/lowleveldynamics/src/DyTGSDynamics.cpp
+++ b/physx/source/lowleveldynamics/src/DyTGSDynamics.cpp
--- a/physx/source/lowleveldynamics/src/DyTGSDynamics.h
+++ b/physx/source/lowleveldynamics/src/DyTGSDynamics.h
@ -0,0 +1,517 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef DY_TGS_DYNAMICS_H
+#define DY_TGS_DYNAMICS_H
+
+#include "PxvConfig.h"
+#include "CmSpatialVector.h"
+#include "CmTask.h"
+#include "CmPool.h"
+#include "PxcThreadCoherentCache.h"
+#include "DyThreadContext.h"
+#include "PxcConstraintBlockStream.h"
+#include "DySolverBody.h"
+#include "DyContext.h"
+#include "PxsIslandManagerTypes.h"
+#include "PxvNphaseImplementationContext.h"
+#include "solver/PxSolverDefs.h"
+#include "PxsIslandSim.h"
+
+namespace physx
+{
+
+	namespace Cm
+	{
+		class FlushPool;
+	}
+
+	namespace IG
+	{
+		class SimpleIslandManager;
+		struct Edge;
+	}
+
+	class PxsRigidBody;
+
+	class PxsStreamedThresholdTable;
+
+	struct PxsBodyCore;
+	struct PxsIslandObjects;
+	class PxsIslandIndices;
+	struct PxsIndexedInteraction;
+	class PxsIslandManager;
+	struct PxsIndexedConstraint;
+	struct PxsIndexedContactManager;
+	class PxsHeapMemoryAllocator;
+	class PxsMemoryManager;
+	class PxsDefaultMemoryManager;
+	
+
+	namespace Cm
+	{
+		class Bitmap;
+		class SpatialVector;
+	}
+
+	namespace Dy
+	{
+		class SolverCore;
+		struct SolverIslandParams;
+		struct ArticulationSolverDesc;
+		class ArticulationV;
+		class DynamicsContext;
+		struct SolverContext;
+
+		struct SolverIslandObjectsStep
+		{
+			PxsRigidBody**				bodies;
+			ArticulationV**				articulations;
+			ArticulationV**				articulationOwners;
+			PxsIndexedContactManager*	contactManagers;
+
+			const IG::IslandId*			islandIds;
+			PxU32						numIslands;
+			PxU32*						bodyRemapTable;
+			PxU32*						nodeIndexArray;
+
+			PxSolverConstraintDesc*	constraintDescs;
+			PxSolverConstraintDesc*	orderedConstraintDescs;
+			PxSolverConstraintDesc*	tempConstraintDescs;
+			PxConstraintBatchHeader*	constraintBatchHeaders;
+			Cm::SpatialVector*			motionVelocities;
+			PxsBodyCore**				bodyCoreArray;
+
+			PxU32						solverBodyOffset;
+
+			SolverIslandObjectsStep() : bodies(NULL), articulations(NULL), articulationOwners(NULL),
+				contactManagers(NULL), islandIds(NULL), numIslands(0), nodeIndexArray(NULL), constraintDescs(NULL), motionVelocities(NULL), bodyCoreArray(NULL),
+				solverBodyOffset(0)
+			{
+			}
+		};
+
+		struct IslandContextStep
+		{
+			//The thread context for this island (set in in the island start task, released in the island end task)
+			ThreadContext*		mThreadContext;
+			PxsIslandIndices	mCounts;
+			SolverIslandObjectsStep mObjects;
+			PxU32				mPosIters;
+			PxU32				mVelIters;
+			PxU32				mArticulationOffset;
+			PxReal				mStepDt;
+			PxReal				mInvStepDt;
+			PxI32				mSharedSolverIndex;
+			PxI32				mSolvedCount;
+			PxI32				mSharedRigidBodyIndex;
+			PxI32				mRigidBodyIntegratedCount;
+			PxI32				mSharedArticulationIndex;
+			PxI32				mArticulationIntegratedCount;
+		};
+
+		struct SolverIslandObjectsStep;
+
+		class SolverBodyVelDataPool : public Ps::Array<PxTGSSolverBodyVel, Ps::AlignedAllocator<128, Ps::ReflectionAllocator<PxTGSSolverBodyVel> > >
+		{
+			PX_NOCOPY(SolverBodyVelDataPool)
+		public:
+			SolverBodyVelDataPool() {}
+		};
+
+		class SolverBodyTxInertiaPool : public Ps::Array<PxTGSSolverBodyTxInertia, Ps::AlignedAllocator<128, Ps::ReflectionAllocator<PxTGSSolverBodyTxInertia> > >
+		{
+			PX_NOCOPY(SolverBodyTxInertiaPool)
+		public:
+			SolverBodyTxInertiaPool() {}
+		};
+
+		class SolverBodyDataStepPool : public Ps::Array<PxTGSSolverBodyData, Ps::AlignedAllocator<128, Ps::ReflectionAllocator<PxTGSSolverBodyData> > >
+		{
+			PX_NOCOPY(SolverBodyDataStepPool)
+		public:
+			SolverBodyDataStepPool() {}
+		};
+
+		
+
+		class SolverStepConstraintDescPool : public Ps::Array<PxSolverConstraintDesc, Ps::AlignedAllocator<128, Ps::ReflectionAllocator<PxSolverConstraintDesc> > >
+		{
+			PX_NOCOPY(SolverStepConstraintDescPool)
+		public:
+			SolverStepConstraintDescPool() { }
+		};
+
+
+#if PX_VC 
+#pragma warning(push)
+#pragma warning( disable : 4324 ) // Padding was added at the end of a structure because of a __declspec(align) value.
+#endif
+
+
+		class DynamicsTGSContext : public Context
+		{
+			PX_NOCOPY(DynamicsTGSContext)
+		public:
+
+			/**PxBaseTask* continuation
+			\brief Creates a DynamicsContext associated with a PxsContext
+			\return A pointer to the newly-created DynamicsContext.
+			*/
+			static DynamicsTGSContext*	create(PxcNpMemBlockPool* memBlockPool,
+				PxcScratchAllocator& scratchAllocator,
+				Cm::FlushPool& taskPool,
+				PxvSimStats& simStats,
+				PxTaskManager* taskManager,
+				Ps::VirtualAllocatorCallback* allocator,
+				PxsMaterialManager* materialManager,
+				IG::IslandSim* accurateIslandSim,
+				PxU64 contextID,
+				const bool enableStabilization,
+				const bool useEnhancedDeterminism,
+				const bool useAdaptiveForce,
+				const PxReal lengthScale
+				);
+
+			/**
+			\brief Destroys this DynamicsContext
+			*/
+			void						destroy();
+
+			/**
+			\brief Returns the static world solver body
+			\return The static world solver body.
+			*/
+			//PX_FORCE_INLINE PxSolverBody&		getWorldSolverBody()					{ return mWorldSolverBody; }
+
+			PX_FORCE_INLINE Cm::FlushPool&			getTaskPool()						{ return mTaskPool; }
+
+			PX_FORCE_INLINE ThresholdStream&		getThresholdStream()					{ return *mThresholdStream; }
+
+			PX_FORCE_INLINE PxvSimStats&			getSimStats()							{ return mSimStats; }
+
+#if PX_ENABLE_SIM_STATS
+			void									addThreadStats(const ThreadContext::ThreadSimStats& stats);
+#endif
+
+			/**
+			\brief The entry point for the constraint solver.
+			\param[in]	dt	The simulation time-step
+			\param[in]	continuation The continuation task for the solver
+
+			This method is called after the island generation has completed. Its main responsibilities are:
+			(1) Reserving the solver body pools
+			(2) Initializing the static and kinematic solver bodies, which are shared resources between islands.
+			(3) Construct the solver task chains for each island
+
+			Each island is solved as an independent solver task chain in parallel.
+
+			*/
+
+			virtual void						update(IG::SimpleIslandManager& simpleIslandManager, PxBaseTask* continuation, PxBaseTask* lostTouchTask,
+				PxsContactManager** foundPatchManagers, PxU32 nbFoundPatchManagers, PxsContactManager** lostPatchManagers, PxU32 nbLostPatchManagers,
+				PxU32 maxPatchesPerCM, PxsContactManagerOutputIterator& iter, PxsContactManagerOutput* gpuOutputs, const PxReal dt, const PxVec3& gravity, const PxU32 bitMapWordCounts);
+
+			void updatePostKinematic(IG::SimpleIslandManager& simpleIslandManager, PxBaseTask* continuation, 
+				PxBaseTask* lostTouchTask);
+
+			virtual void						processLostPatches(IG::SimpleIslandManager& /*simpleIslandManager*/, PxsContactManager** /*lostPatchManagers*/, PxU32 /*nbLostPatchManagers*/, PxsContactManagerOutputIterator& /*iterator*/){}
+
+			virtual void						updateBodyCore(PxBaseTask* continuation);
+
+			virtual void						setSimulationController(PxsSimulationController* simulationController){ mSimulationController = simulationController; }
+			/**
+			\brief This method combines the results of several islands, e.g. constructing scene-level simulation statistics and merging together threshold streams for contact notification.
+			*/
+			virtual void							mergeResults();
+
+			virtual void							getDataStreamBase(void*& /*contactStreamBase*/, void*& /*patchStreamBase*/, void*& /*forceAndIndicesStreamBase*/){}
+
+			/**
+			\brief Allocates and returns a thread context object.
+			\return A thread context.
+			*/
+			PX_FORCE_INLINE ThreadContext*					getThreadContext()
+			{
+				return mThreadContextPool.get();
+			}
+
+			/**
+			\brief Returns a thread context to the thread context pool.
+			\param[in] context The thread context to return to the thread context pool.
+			*/
+			void								putThreadContext(ThreadContext* context)
+			{
+				mThreadContextPool.put(context);
+			}
+
+
+			PX_FORCE_INLINE	PxU32					getKinematicCount()		const	{ return mKinematicCount; }
+			PX_FORCE_INLINE	PxU64					getContextId()			const	{ return mContextID; }
+
+			PX_FORCE_INLINE PxReal					getLengthScale()		const	{ return mLengthScale; }
+
+		protected:
+
+			/**
+			\brief Constructor for DynamicsContext
+			*/
+			DynamicsTGSContext(PxcNpMemBlockPool* memBlockPool,
+				PxcScratchAllocator& scratchAllocator,
+				Cm::FlushPool& taskPool,
+				PxvSimStats& simStats,
+				PxTaskManager* taskManager,
+				Ps::VirtualAllocatorCallback* allocator,
+				PxsMaterialManager* materialManager,
+				IG::IslandSim* accurateIslandSim,
+				PxU64 contextID,
+				const bool enableStabilization,
+				const bool useEnhancedDeterminism,
+				const bool useAdaptiveForce,
+				const PxReal lengthScale
+				);
+			/**
+			\brief Destructor for DynamicsContext
+			*/
+			virtual								~DynamicsTGSContext();
+
+
+			// Solver helper-methods
+			/**
+			\brief Computes the unconstrained velocity for a given PxsRigidBody
+			\param[in] atom The PxsRigidBody
+			*/
+			void								computeUnconstrainedVelocity(PxsRigidBody* atom)	const;
+
+			/**
+			\brief fills in a PxSolverConstraintDesc from an indexed interaction
+			\param[in,out] desc The PxSolverConstraintDesc
+			\param[in] constraint The PxsIndexedInteraction
+			*/
+			void								setDescFromIndices(PxSolverConstraintDesc& desc,
+				const PxsIndexedInteraction& constraint, const PxU32 solverBodyOffset, PxTGSSolverBodyVel* solverBodies);
+
+
+			void								setDescFromIndices(PxSolverConstraintDesc& desc, IG::EdgeIndex edgeIndex,
+				const IG::SimpleIslandManager& islandManager, PxU32* bodyRemapTable, const PxU32 solverBodyOffset, PxTGSSolverBodyVel* solverBodies);
+
+
+			void solveIsland(const SolverIslandObjectsStep& objects,
+				const PxsIslandIndices& counts,
+				const PxU32 solverBodyOffset,
+				IG::SimpleIslandManager& islandManager,
+				PxU32* bodyRemapTable, PxsMaterialManager* materialManager,
+				PxsContactManagerOutputIterator& iterator,
+				PxBaseTask* continuation);
+
+			void prepareBodiesAndConstraints(const SolverIslandObjectsStep& objects,
+				IG::SimpleIslandManager& islandManager,
+				IslandContextStep& islandContext);
+
+			void setupDescs(IslandContextStep& islandContext, const SolverIslandObjectsStep& objects, IG::SimpleIslandManager& mIslandManager, PxU32* mBodyRemapTable, PxU32 mSolverBodyOffset,
+				PxsContactManagerOutputIterator& outputs);
+
+			void preIntegrateBodies(PxsBodyCore** bodyArray, PxsRigidBody** originalBodyArray,
+				PxTGSSolverBodyVel* solverBodyVelPool, PxTGSSolverBodyTxInertia* solverBodyTxInertia, PxTGSSolverBodyData* solverBodyDataPool2,
+				PxU32* nodeIndexArray, const PxU32 bodyCount, const PxVec3& gravity, const PxReal dt, PxU32& posIters, PxU32& velIters, PxU32 iteration);
+
+			void setupArticulations(IslandContextStep& islandContext, const PxVec3& gravity, const PxReal dt, PxU32& posIters, PxU32& velIters, PxBaseTask* continuation);
+
+			PxU32 setupArticulationInternalConstraints(IslandContextStep& islandContext, PxReal dt, PxReal invStepDt, PxSolverConstraintDesc* constraintDescs);
+
+			void createSolverConstraints(PxSolverConstraintDesc* contactDescPtr, PxConstraintBatchHeader* headers, const PxU32 nbHeaders,
+				PxsContactManagerOutputIterator& outputs, Dy::ThreadContext& islandThreadContext, Dy::ThreadContext& threadContext, PxReal stepDt, PxReal totalDt, 
+				PxReal invStepDt, PxU32 nbSubsteps);
+
+			void solveConstraintsIteration(const PxSolverConstraintDesc* const contactDescPtr, const PxConstraintBatchHeader* const batchHeaders, const PxU32 nbHeaders, PxReal invStepDt,
+				const PxTGSSolverBodyTxInertia* const solverTxInertia, const PxReal elapsedTime, const PxReal minPenetration, SolverContext& cache);
+
+			void solveConcludeConstraintsIteration(const PxSolverConstraintDesc* const contactDescPtr, const PxConstraintBatchHeader* const batchHeaders, const PxU32 nbHeaders,
+				PxTGSSolverBodyTxInertia* solverTxInertia, const PxReal elapsedTime, SolverContext& cache);
+
+			void parallelSolveConstraints(const PxSolverConstraintDesc* const contactDescPtr, const PxConstraintBatchHeader* const batchHeaders, const PxU32 nbHeaders, PxTGSSolverBodyTxInertia* solverTxInertia,
+				const PxReal elapsedTime, const PxReal minPenetration, SolverContext& cache);
+
+			void writebackConstraintsIteration(const PxConstraintBatchHeader* const hdrs, const PxSolverConstraintDesc* const contactDescPtr, const PxU32 nbHeaders);
+
+			void parallelWritebackConstraintsIteration(const PxSolverConstraintDesc* const contactDescPtr, const PxConstraintBatchHeader* const batchHeaders, const PxU32 nbHeaders);
+
+			void integrateBodies(const SolverIslandObjectsStep& objects,
+				const PxU32 count, PxTGSSolverBodyVel* vels,
+				PxTGSSolverBodyTxInertia* txInertias, const PxTGSSolverBodyData*const bodyDatas, PxReal dt);
+
+			void parallelIntegrateBodies(PxTGSSolverBodyVel* vels, PxTGSSolverBodyTxInertia* txInertias,
+				const PxTGSSolverBodyData* const bodyDatas, const PxU32 count, PxReal dt);
+
+			void copyBackBodies(const SolverIslandObjectsStep& objects,
+				PxTGSSolverBodyVel* vels, PxTGSSolverBodyTxInertia* txInertias,
+				PxTGSSolverBodyData* solverBodyData, PxReal invDt,	IG::IslandSim& islandSim,
+				PxU32 startIdx, PxU32 endIdx);
+
+			void updateArticulations(Dy::ThreadContext& threadContext, const PxU32 startIdx, const PxU32 endIdx, PxReal dt);
+
+			void stepArticulations(Dy::ThreadContext& threadContext, const PxsIslandIndices& counts, PxReal dt, PxReal stepInvDt);
+
+			void iterativeSolveIsland(const SolverIslandObjectsStep& objects, const PxsIslandIndices& counts, ThreadContext& mThreadContext,
+				const PxReal stepDt, const PxReal invStepDt, const PxU32 posIters, const PxU32 velIters, SolverContext& cache);
+
+			void iterativeSolveIslandParallel(const SolverIslandObjectsStep& objects, const PxsIslandIndices& counts, ThreadContext& mThreadContext,
+				const PxReal stepDt, const PxU32 posIters, const PxU32 velIters, PxI32* solverCounts, PxI32* integrationCounts, PxI32* articulationIntegrationCounts,
+				PxI32* solverProgressCount, PxI32* integrationProgressCount, PxI32* articulationProgressCount, PxU32 solverUnrollSize, PxU32 integrationUnrollSize);
+
+			void endIsland(ThreadContext& mThreadContext);
+
+			void finishSolveIsland(ThreadContext& mThreadContext, const SolverIslandObjectsStep& objects,
+				const PxsIslandIndices& counts, IG::SimpleIslandManager& islandManager, PxBaseTask* continuation);
+
+
+
+
+			/**
+			\brief Resets the thread contexts
+			*/
+			void									resetThreadContexts();
+
+			/**
+			\brief Returns the scratch memory allocator.
+			\return The scratch memory allocator.
+			*/
+			PX_FORCE_INLINE PxcScratchAllocator&	getScratchAllocator() { return mScratchAllocator; }
+
+			//Data
+
+			PxTGSSolverBodyVel						mWorldSolverBodyVel;
+			PxTGSSolverBodyTxInertia				mWorldSolverBodyTxInertia;
+			PxTGSSolverBodyData						mWorldSolverBodyData2;
+
+			/**
+			\brief A thread context pool
+			*/
+			PxcThreadCoherentCache<ThreadContext, PxcNpMemBlockPool> mThreadContextPool;
+
+			/**
+			\brief Solver constraint desc array
+			*/
+			SolverStepConstraintDescPool	mSolverConstraintDescPool;
+
+			SolverStepConstraintDescPool	mOrderedSolverConstraintDescPool;
+
+			SolverStepConstraintDescPool	mTempSolverConstraintDescPool;
+
+			Ps::Array<PxConstraintBatchHeader> mContactConstraintBatchHeaders;
+
+			/**
+			\brief Array of motion velocities for all bodies in the scene.
+			*/
+			Ps::Array<Cm::SpatialVector> mMotionVelocityArray;
+
+			/**
+			\brief Array of body core pointers for all bodies in the scene.
+			*/
+			Ps::Array<PxsBodyCore*>	mBodyCoreArray;
+
+			/**
+			\brief Array of rigid body pointers for all bodies in the scene.
+			*/
+			Ps::Array<PxsRigidBody*> mRigidBodyArray;
+
+			/**
+			\brief Array of articulationpointers for all articulations in the scene.
+			*/
+			Ps::Array<ArticulationV*> mArticulationArray;
+
+			SolverBodyVelDataPool		mSolverBodyVelPool;
+
+			SolverBodyTxInertiaPool		mSolverBodyTxInertiaPool;
+
+			SolverBodyDataStepPool		mSolverBodyDataPool2;
+
+
+			ThresholdStream*		mExceededForceThresholdStream[2]; //this store previous and current exceeded force thresholdStream	
+
+			Ps::Array<PxU32>		mExceededForceThresholdStreamMask;
+
+			Ps::Array<PxU32>		mSolverBodyRemapTable;				//Remaps from the "active island" index to the index within a solver island
+
+			Ps::Array<PxU32>		mNodeIndexArray;					//island node index
+
+			Ps::Array<PxsIndexedContactManager> mContactList;
+
+			/**
+			\brief The total number of kinematic bodies in the scene
+			*/
+			PxU32						mKinematicCount;
+
+			/**
+			\brief Atomic counter for the number of threshold stream elements.
+			*/
+			PxI32						mThresholdStreamOut;
+
+
+
+			PxsMaterialManager*			mMaterialManager;
+
+			PxsContactManagerOutputIterator mOutputIterator;
+
+			PxReal						mLengthScale;
+
+		private:
+			//private:
+			PxcScratchAllocator&						mScratchAllocator;
+			Cm::FlushPool&								mTaskPool;
+			PxTaskManager*								mTaskManager;
+			PxU32										mCurrentIndex; // this is the index point to the current exceeded force threshold stream
+
+			PxU64										mContextID;
+
+			friend class SetupDescsTask;
+			friend class PreIntegrateTask;
+			friend class SetupArticulationTask;
+			friend class SetupArticulationInternalConstraintsTask;
+			friend class SetupSolverConstraintsTask;
+			friend class SolveIslandTask;
+			friend class EndIslandTask;
+			friend class SetupSolverConstraintsSubTask;
+			friend class ParallelSolveTask;
+			friend class PreIntegrateParallelTask;
+			friend class CopyBackTask;
+			friend class UpdateArticTask;
+			friend class FinishSolveIslandTask;
+		};
+
+#if PX_VC 
+#pragma warning(pop)
+#endif
+
+	}
+}
+
+#endif
+
--- a/physx/source/lowleveldynamics/src/DyThreadContext.cpp
+++ b/physx/source/lowleveldynamics/src/DyThreadContext.cpp
@ -0,0 +1,110 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#include "DyThreadContext.h"
+#include "PsBitUtils.h"
+
+namespace physx
+{
+namespace Dy
+{
+
+ThreadContext::ThreadContext(PxcNpMemBlockPool* memBlockPool):
+	mFrictionPatchStreamPair(*memBlockPool),
+	mConstraintBlockManager	(*memBlockPool),
+	mConstraintBlockStream	(*memBlockPool),
+	mNumDifferentBodyConstraints(0),
+	mNumSelfConstraints(0),
+	mNumStaticConstraints(0),
+	mConstraintsPerPartition(PX_DEBUG_EXP("ThreadContext::mConstraintsPerPartition")),
+	mFrictionConstraintsPerPartition(PX_DEBUG_EXP("ThreadContext::frictionsConstraintsPerPartition")),
+	mPartitionNormalizationBitmap(PX_DEBUG_EXP("ThreadContext::mPartitionNormalizationBitmap")),
+	frictionConstraintDescArray(PX_DEBUG_EXP("ThreadContext::solverFrictionConstraintArray")),
+	frictionConstraintBatchHeaders(PX_DEBUG_EXP("ThreadContext::frictionConstraintBatchHeaders")),
+	compoundConstraints(PX_DEBUG_EXP("ThreadContext::compoundConstraints")),
+	orderedContactList(PX_DEBUG_EXP("ThreadContext::orderedContactList")),
+	tempContactList(PX_DEBUG_EXP("ThreadContext::tempContactList")),
+	sortIndexArray(PX_DEBUG_EXP("ThreadContext::sortIndexArray")),
+	mConstraintSize			(0),
+	mAxisConstraintCount(0),
+	mSelfConstraintBlocks(NULL),
+	mMaxPartitions(0),
+	mMaxSolverPositionIterations(0),
+	mMaxSolverVelocityIterations(0),
+	mMaxArticulationLength(0),
+	mContactDescPtr(NULL),
+	mFrictionDescPtr(NULL),
+	mArticulations(PX_DEBUG_EXP("ThreadContext::articulations"))
+	
+{
+#if PX_ENABLE_SIM_STATS
+	mThreadSimStats.clear();
+#endif
+	//Defaulted to have space for 16384 bodies
+	mPartitionNormalizationBitmap.reserve(512); 
+	//Defaulted to have space for 128 partitions (should be more-than-enough)
+	mConstraintsPerPartition.reserve(128);
+}
+
+void ThreadContext::resizeArrays(PxU32 frictionConstraintDescCount, PxU32 articulationCount)
+{
+	// resize resizes smaller arrays to the exact target size, which can generate a lot of churn
+	frictionConstraintDescArray.forceSize_Unsafe(0);
+	frictionConstraintDescArray.reserve((frictionConstraintDescCount+63)&~63);
+
+	mArticulations.forceSize_Unsafe(0);
+	mArticulations.reserve(PxMax<PxU32>(Ps::nextPowerOfTwo(articulationCount), 16));
+	mArticulations.forceSize_Unsafe(articulationCount);
+
+	mContactDescPtr = contactConstraintDescArray;
+	mFrictionDescPtr = frictionConstraintDescArray.begin();
+}
+
+void ThreadContext::reset()
+{
+	// TODO: move these to the PxcNpThreadContext
+	mFrictionPatchStreamPair.reset();
+	mConstraintBlockStream.reset();
+
+	mContactDescPtr = contactConstraintDescArray;
+	mFrictionDescPtr = frictionConstraintDescArray.begin();
+
+	mAxisConstraintCount = 0;
+	mMaxSolverPositionIterations = 0;
+	mMaxSolverVelocityIterations = 0;
+	mNumDifferentBodyConstraints = 0;
+	mNumSelfConstraints = 0;
+	mNumStaticConstraints = 0;
+	mSelfConstraintBlocks = NULL;
+	mConstraintSize = 0;
+}
+
+}
+} 
--- a/physx/source/lowleveldynamics/src/DyThreadContext.h
+++ b/physx/source/lowleveldynamics/src/DyThreadContext.h
@ -0,0 +1,208 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef DY_THREADCONTEXT_H
+#define DY_THREADCONTEXT_H
+
+#include "foundation/PxTransform.h"
+#include "geomutils/GuContactBuffer.h"
+
+#include "PxvConfig.h"
+#include "PxvDynamics.h"
+#include "PxcThreadCoherentCache.h"
+#include "PxcConstraintBlockStream.h"
+#include "CmBitMap.h"
+#include "CmMatrix34.h"
+#include "DyThresholdTable.h"
+#include "DyArticulation.h"
+#include "DyFrictionPatchStreamPair.h"
+#include "DySolverConstraintDesc.h"
+#include "DyCorrelationBuffer.h"
+#include "PsAllocator.h"
+
+namespace physx
+{
+struct PxsIndexedContactManager;
+
+namespace Dy
+{
+
+/*!
+Cache information specific to the software implementation(non common).
+
+See PxcgetThreadContext.
+
+Not thread-safe, so remember to have one object per thread!
+
+TODO! refactor this and rename(it is a general per thread cache). Move transform cache into its own class.
+*/
+class ThreadContext : 
+	public PxcThreadCoherentCache<ThreadContext, PxcNpMemBlockPool>::EntryBase
+{
+	PX_NOCOPY(ThreadContext)
+public:
+
+#if PX_ENABLE_SIM_STATS
+	struct ThreadSimStats
+	{
+		void clear()
+		{
+
+			numActiveConstraints = 0;
+			numActiveDynamicBodies = 0;
+			numActiveKinematicBodies = 0;
+			numAxisSolverConstraints = 0;
+
+		}
+
+		PxU32 numActiveConstraints;
+		PxU32 numActiveDynamicBodies;
+		PxU32 numActiveKinematicBodies;
+		PxU32 numAxisSolverConstraints;
+
+	};
+#endif
+
+	//TODO: tune cache size based on number of active objects.
+	ThreadContext(PxcNpMemBlockPool* memBlockPool);
+	void reset();
+	void resizeArrays(PxU32 frictionConstraintDescCount, PxU32 articulationCount);
+
+	PX_FORCE_INLINE	Ps::Array<ArticulationSolverDesc>&		getArticulations()								{ return mArticulations;					}
+
+
+#if PX_ENABLE_SIM_STATS
+	PX_FORCE_INLINE ThreadSimStats& getSimStats()
+	{
+		return mThreadSimStats;
+	}
+#endif
+
+	Gu::ContactBuffer mContactBuffer;
+
+		// temporary buffer for correlation
+	PX_ALIGN(16, CorrelationBuffer			mCorrelationBuffer); 
+
+	FrictionPatchStreamPair		mFrictionPatchStreamPair;	// patch streams
+
+	PxsConstraintBlockManager		mConstraintBlockManager;	// for when this thread context is "lead" on an island
+	PxcConstraintBlockStream 		mConstraintBlockStream;		// constraint block pool
+
+
+	// this stuff is just used for reformatting the solver data. Hopefully we should have a more
+	// sane format for this when the dust settles - so it's just temporary. If we keep this around
+	// here we should move these from public to private
+
+	PxU32 mNumDifferentBodyConstraints;
+	PxU32 mNumDifferentBodyFrictionConstraints;
+	PxU32 mNumSelfConstraints;
+	PxU32 mNumStaticConstraints;
+	PxU32 mNumSelfFrictionConstraints;
+	PxU32 mNumSelfConstraintFrictionBlocks;
+
+	Ps::Array<PxU32>					mConstraintsPerPartition;
+	Ps::Array<PxU32>					mFrictionConstraintsPerPartition;
+	Ps::Array<PxU32>					mPartitionNormalizationBitmap;
+	PxsBodyCore**						mBodyCoreArray;
+	PxsRigidBody**						mRigidBodyArray;
+	ArticulationV**						mArticulationArray;
+	Cm::SpatialVector*					motionVelocityArray;
+	PxU32*								bodyRemapTable;
+	PxU32*								mNodeIndexArray;
+
+	//Constraint info for normal constraint sovler
+	PxSolverConstraintDesc*				contactConstraintDescArray;
+	PxU32								contactDescArraySize;
+	PxSolverConstraintDesc*				orderedContactConstraints;
+	PxConstraintBatchHeader*			contactConstraintBatchHeaders;
+	PxU32								numContactConstraintBatches;
+
+	//Constraint info for partitioning
+	PxSolverConstraintDesc*				tempConstraintDescArray;
+
+	//Additional constraint info for 1d/2d friction model
+	Ps::Array<PxSolverConstraintDesc>	frictionConstraintDescArray;
+	Ps::Array<PxConstraintBatchHeader>	frictionConstraintBatchHeaders;
+
+	//Info for tracking compound contact managers (temporary data - could use scratch memory!)
+	Ps::Array<CompoundContactManager>	compoundConstraints;
+
+	//Used for sorting constraints. Temporary, could use scratch memory
+	Ps::Array<const PxsIndexedContactManager*>	orderedContactList;
+	Ps::Array<const PxsIndexedContactManager*>	tempContactList;
+	Ps::Array<PxU32>							sortIndexArray;
+	
+	Ps::Array<Cm::SpatialVectorF>				mZVector;
+	Ps::Array<Cm::SpatialVectorF>				mDeltaV;
+
+
+	PxU32								numDifferentBodyBatchHeaders;
+	PxU32								numSelfConstraintBatchHeaders;
+
+	
+	PxU32								mOrderedContactDescCount;
+	PxU32								mOrderedFrictionDescCount;
+
+	PxU32								mConstraintSize;
+
+	PxU32 mAxisConstraintCount;
+	SelfConstraintBlock* mSelfConstraintBlocks;
+	
+	SelfConstraintBlock* mSelfConstraintFrictionBlocks;
+
+	PxU32 mMaxPartitions;
+	PxU32 mMaxFrictionPartitions;
+	PxU32 mMaxSolverPositionIterations;
+	PxU32 mMaxSolverVelocityIterations;
+	PxU32 mMaxArticulationLength;
+	PxU32 mMaxArticulationSolverLength;
+	PxU32 mMaxArticulationLinks;
+	
+	PxSolverConstraintDesc* mContactDescPtr;
+	PxSolverConstraintDesc* mStartContactDescPtr;
+	PxSolverConstraintDesc* mFrictionDescPtr;
+
+private:
+
+	Ps::Array<ArticulationSolverDesc>	mArticulations;
+
+#if PX_ENABLE_SIM_STATS
+	ThreadSimStats				mThreadSimStats;
+#endif
+
+	public:
+
+};
+
+}
+
+}
+
+#endif //DY_THREADCONTEXT_H
--- a/physx/source/lowleveldynamics/src/DyThresholdTable.cpp
+++ b/physx/source/lowleveldynamics/src/DyThresholdTable.cpp
@ -0,0 +1,68 @@
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2021 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+#include "foundation/PxMemory.h"
+#include "DyThresholdTable.h"
+#include "PsHash.h"
+#include "PsUtilities.h"
+#include "PsAllocator.h"
+
+namespace physx
+{
+	namespace Dy
+	{
+		bool ThresholdTable::check(const ThresholdStream& stream, const PxU32 nodeIndexA, const PxU32 nodeIndexB, PxReal dt)
+		{
+			PxU32* PX_RESTRICT hashes = mHash;
+			PxU32* PX_RESTRICT nextIndices = mNexts;
+			Pair* PX_RESTRICT pairs = mPairs;
+
+			/*const PxsRigidBody* b0 = PxMin(body0, body1);
+			const PxsRigidBody* b1 = PxMax(body0, body1);*/
+
+			const PxU32 nA = PxMin(nodeIndexA, nodeIndexB);
+			const PxU32 nB = PxMax(nodeIndexA, nodeIndexB);
+
+			PxU32 hashKey = computeHashKey(nodeIndexA, nodeIndexB, mHashSize);
+
+			PxU32 pairIndex = hashes[hashKey];
+			while(NO_INDEX != pairIndex)
+			{
+				Pair& pair = pairs[pairIndex];
+				const PxU32 thresholdStreamIndex = pair.thresholdStreamIndex;
+				PX_ASSERT(thresholdStreamIndex < stream.size());
+				const ThresholdStreamElement& otherElement = stream[thresholdStreamIndex];
+				if(otherElement.nodeIndexA.index()==nA && otherElement.nodeIndexB.index()==nB)
+					return (pair.accumulatedForce > (otherElement.threshold * dt));
+				pairIndex = nextIndices[pairIndex];
+			}
+			return false;
+		}
+	}
+}