Solar Nebula Simulation

This astronomical simulation shows how certain conditions in a nebula can lead to the formation of a solar system. It starts with a cloud of rigid bodies with a small initial rotation and advances through time as a solar system forms. It uses DirectX Computational Shaders to simulate rigid bodies using the graphics card (GPU). Each of the 50,000 bodies can collide with every other body and these collisions are handled in a thread-safe manner on the GPU.

The most reusable feature of this program is its thread-safe binning algorithm that runs entirely on the GPU. What I learned while optimizing this project was that when implementing complex algorithms on the GPU, it is best to break them into a series of simpler stages that can be executed on a large number of operands.

View Code

// group size
#define NUM_THREAD_GROUPS_X 32
#define NUM_THREAD_GROUPS_Y 32
#define thread_group_size_x 8
#define thread_group_size_y 8

#define VISCOUS_DAMPING 0.001
// #define VISCOUS_DAMPING 0.0

//=========================================================================
//=========================================================================
struct BodyState
{
	float3 pos;
	float3 vel;

	float3x3 orientation;
	float3 angularMomentum;

	float size;
	uint isAlive;

	uint color;
};
// Update this if BodyState changes!
#define BODY_STATE_SIZE_BYTES 84
#define BODY_STATE_POS_OFFSET_BYTES 0
#define BODY_STATE_VEL_OFFSET_BYTES 12
#define BODY_STATE_ORIENTATION_OFFSET_BYTES 24
#define BODY_STATE_ANGULAR_OFFSET_BYTES 60
#define BODY_STATE_SIZE_OFFSET_BYTES 72
#define BODY_STATE_ISALIVE_OFFSET_BYTES 76
#define BODY_STATE_COLOR_OFFSET_BYTES 80

RWByteAddressBuffer g_stateBuffer;
RWByteAddressBuffer g_tentativeBuffer; // Unused
RWByteAddressBuffer g_octreeBuffer; // Unused
RWByteAddressBuffer g_stateAdjustedBuffer;
RWByteAddressBuffer g_glomBuffer; // Unused
//=========================================================================
//=========================================================================
cbuffer consts
{
	float delta; float3 padding;
};

//=========================================================================
//=========================================================================
BodyState DerriveState( BodyState a )
{
	BodyState result;

	result.pos = a.vel;
	result.vel = -a.vel * VISCOUS_DAMPING;
	float alen = length( a.pos );
	if ( alen >= 1 )
		result.vel -= 100 * a.pos / alen;
	else 
		result.vel -= 100 * a.pos;

	result.orientation = transpose( a.orientation );

	for ( uint i = 0; i < 3; ++i ) result.orientation[ i ] =
		cross( a.angularMomentum, result.orientation[ i ] );

	result.orientation = transpose( result.orientation );

	result.angularMomentum = float3( 0, 0, 0 ); // TODO - Forces

	result.size = 0;
	result.isAlive = 0;
	result.color = 0;

	return result;
}

//=========================================================================
//=========================================================================
BodyState StateAdd( BodyState a, BodyState b )
{
	BodyState result;

	result.pos = a.pos + b.pos;
	result.vel = a.vel + b.vel;
	result.orientation = a.orientation + b.orientation;
	result.angularMomentum = a.angularMomentum + b.angularMomentum;
	result.size = a.size + b.size;

	result.isAlive = a.isAlive;
	
	result.color = a.color + b.color;

	return result;
}

//=========================================================================
//=========================================================================
BodyState StateMult( BodyState a, float scalar )
{
	BodyState result;

	result.pos = a.pos * scalar;
	result.vel = a.vel * scalar;
	result.orientation = a.orientation * scalar;
	result.angularMomentum = a.angularMomentum * scalar;
	result.size = a.size * scalar;

	result.isAlive = a.isAlive;

	result.color = a.color * scalar;

	return result;
}

//=========================================================================
//=========================================================================
void GramSchmidt( inout float3x3 a )
{
	a = transpose( a );

	a[ 0 ] = normalize( a[ 0 ] );

	float err = dot( a[ 0 ], a[ 1 ] );
	a[ 1 ] = normalize( a[ 1 ] - a[ 0 ] * err );

	float errA = dot( a[ 0 ], a[ 2 ] );
	float errB = dot( a[ 1 ], a[ 2 ] );
	a[ 2 ] = normalize( a[ 2 ] - a[ 0 ] * errA - a[ 1 ] * errB );

	a = transpose( a );
}

//=========================================================================
//=========================================================================
float3x3 Get3x3MatrixByAddress( uint addr )
{
	float3x3 result;

	result[0] = asfloat( g_stateAdjustedBuffer.Load3( addr + 0 ) );
	result[1] = asfloat( g_stateAdjustedBuffer.Load3( addr + 12 ) );
	result[2] = asfloat( g_stateAdjustedBuffer.Load3( addr + 24 ) );

	return result;
}

//=========================================================================
//=========================================================================
void Write3x3MatrixByAddress( uint addr, float3x3 val )
{
	g_stateBuffer.Store3( addr + 0, asuint( val[0] ) );
	g_stateBuffer.Store3( addr + 12, asuint( val[1] ) );
	g_stateBuffer.Store3( addr + 24, asuint( val[2] ) );
}

//=========================================================================
//=========================================================================
BodyState GetStateByVirtualIndex( uint index )
{
	BodyState result;

	uint offset = BODY_STATE_SIZE_BYTES * index;

	result.pos = asfloat( g_stateAdjustedBuffer.Load3( offset + BODY_STATE_POS_OFFSET_BYTES ) );
	result.vel = asfloat( g_stateAdjustedBuffer.Load3( offset + BODY_STATE_VEL_OFFSET_BYTES ) );

	result.orientation =
		Get3x3MatrixByAddress( offset + BODY_STATE_ORIENTATION_OFFSET_BYTES );
	result.angularMomentum =
		asfloat( g_stateAdjustedBuffer.Load3( offset + BODY_STATE_ANGULAR_OFFSET_BYTES ) );

	result.size =
		asfloat( g_stateAdjustedBuffer.Load( offset + BODY_STATE_SIZE_OFFSET_BYTES ) );

	result.isAlive =
		g_stateAdjustedBuffer.Load( offset + BODY_STATE_ISALIVE_OFFSET_BYTES );

	result.color =
		g_stateAdjustedBuffer.Load( offset + BODY_STATE_COLOR_OFFSET_BYTES );

	return result;
}

//=========================================================================
//=========================================================================
void WriteStateByVirtualIndex( uint index, BodyState state )
{
	uint offset = BODY_STATE_SIZE_BYTES * index;

	g_stateBuffer.Store3( offset + BODY_STATE_POS_OFFSET_BYTES, asuint( state.pos ) );
	g_stateBuffer.Store3( offset + BODY_STATE_VEL_OFFSET_BYTES, asuint( state.vel ) );

	Write3x3MatrixByAddress( offset + BODY_STATE_ORIENTATION_OFFSET_BYTES,
		state.orientation );

	g_stateBuffer.Store3( offset + BODY_STATE_ANGULAR_OFFSET_BYTES,
		asuint( state.angularMomentum ) );

	g_stateBuffer.Store( offset + BODY_STATE_SIZE_OFFSET_BYTES,
		asuint( state.size ) );

	g_stateBuffer.Store( offset + BODY_STATE_ISALIVE_OFFSET_BYTES,
		state.isAlive );

	g_stateBuffer.Store( offset + BODY_STATE_COLOR_OFFSET_BYTES,
		state.color );
}

//=========================================================================
//=========================================================================
BodyState ForwardEulerStep( BodyState s, float deltaTime )
{
	BodyState result;
	
	result = StateAdd( s, StateMult( DerriveState( s ), deltaTime ) );
	GramSchmidt( result.orientation );

	return result;
}

//=========================================================================
//=========================================================================
BodyState RK4Step( BodyState s, float deltaTime )
{
	BodyState a, b, c, d;

	float halfDelta = deltaTime / 2;

	a = DerriveState( s );
	b = DerriveState( StateAdd( s, StateMult( a, halfDelta ) ) );
	c = DerriveState( StateAdd( s, StateMult( b, halfDelta ) ) );
	d = DerriveState( StateAdd( s, StateMult( c, deltaTime ) ) );

	BodyState result = StateAdd(
		s, StateMult(
			StateAdd(
				StateAdd( a, StateMult( b, 2 ) ), 
				StateAdd( StateMult( c, 2 ), d )
			), ( deltaTime / 6 )
		) );

	GramSchmidt( result.orientation );

	return result;
}

//=========================================================================
//=========================================================================
[numthreads( thread_group_size_x, thread_group_size_y, 1 )]

void main( uint3 threadIDInGroup : SV_GroupThreadID, uint3 groupID : SV_GroupID, 
         uint groupIndex : SV_GroupIndex, 
         uint3 dispatchThreadID : SV_DispatchThreadID )
{
	uint stride = thread_group_size_x * thread_group_size_y;
	uint curIndex = ( groupID.y * NUM_THREAD_GROUPS_X + groupID.x ) * stride +
		threadIDInGroup.y * thread_group_size_x + threadIDInGroup.x;

	BodyState curState = GetStateByVirtualIndex( curIndex );
	
	curState = RK4Step( curState, delta );
	
	WriteStateByVirtualIndex( curIndex, curState );
}

//=========================================================================
//=========================================================================
technique11 MyTechnique
{
    pass P0
    {          
		SetComputeShader( CompileShader( cs_5_0, main() ) );
    }
}
// group size
#define NUM_THREAD_GROUPS_X 32
#define NUM_THREAD_GROUPS_Y 32
#define thread_group_size_x 8
#define thread_group_size_y 8

#define OCTREE_WIDTH 64
#define OCTREE_BOUNDS 1200
#define OCTREE_ITEMS_PER_BUCKET 16

// Must conform to ( x / 2 + 1 ) % 4 == 0
#define MAX_COLLISION_PARTNERS 30

//=========================================================================
//=========================================================================
struct TentativeBodyState
{
	float3 pos;

	//float3x3 orientation;

	float size;
	uint isAlive;

	//uint nPartners;

	//uint partners[ MAX_COLLISION_PARTNERS / 2 ];
};
// Update this if BodyState changes!
#define TENTATIVE_STATE_POS_OFFSET_BYTES 0
#define TENTATIVE_STATE_ORIENTATION_OFFSET_BYTES 12
#define TENTATIVE_STATE_SIZE_OFFSET_BYTES 48
#define TENTATIVE_STATE_ISALIVE_OFFSET_BYTES 52
#define TENTATIVE_STATE_NUMPARTNERS_OFFSET_BYTES 56
#define TENTATIVE_STATE_PARTNER_ARRAY_OFFSET_BYTES 60
#define TENTATIVE_STATE_SIZE_BYTES ( TENTATIVE_STATE_PARTNER_ARRAY_OFFSET_BYTES + MAX_COLLISION_PARTNERS * 2 )

RWByteAddressBuffer g_stateBuffer; // Unused
RWByteAddressBuffer g_tentativeBuffer;
RWByteAddressBuffer g_octreeBuffer;
RWByteAddressBuffer g_stateAdjustedBuffer; // Unused
RWByteAddressBuffer g_glomBuffer; // Unused

//=========================================================================
//=========================================================================
TentativeBodyState GetStateByVirtualIndex( uint index )
{
	TentativeBodyState result;

	uint offset = TENTATIVE_STATE_SIZE_BYTES * index;

	result.pos = asfloat( g_tentativeBuffer.Load3( offset + TENTATIVE_STATE_POS_OFFSET_BYTES ) );
	result.size = asfloat( g_tentativeBuffer.Load( offset + TENTATIVE_STATE_SIZE_OFFSET_BYTES ) );
	result.isAlive = g_tentativeBuffer.Load( offset + TENTATIVE_STATE_ISALIVE_OFFSET_BYTES );

	return result;
}

//=========================================================================
//=========================================================================
bool AlreadyInBucket( uint myIdx, uint newPartner, uint nPartners )
{
	[allow_uav_condition]
	for ( uint i = 0; i < nPartners; ++i )
	{
		uint existingPartner = g_tentativeBuffer.Load( myIdx *
			TENTATIVE_STATE_SIZE_BYTES + TENTATIVE_STATE_PARTNER_ARRAY_OFFSET_BYTES +
			( i >> 1 ) * 4 );
		
		existingPartner = ( ( i & 0x1 ) == 0x1 ) ?
			existingPartner >> 16 : existingPartner & 0xFFFF;
		
		if ( existingPartner == newPartner ) return true;
	}

	return false;
}

//=========================================================================
//=========================================================================
bool OutsideBoundingSpheres( TentativeBodyState bodyA, uint idxB )
{
	TentativeBodyState bodyB = GetStateByVirtualIndex( idxB );

	if ( ( bodyB.isAlive & 0x1 ) == 0 ) return true;

	return ( dot( bodyB.pos - bodyA.pos, bodyB.pos - bodyA.pos ) >
		bodyA.size * bodyA.size + bodyA.size * bodyB.size + bodyB.size * bodyB.size );
}

//=========================================================================
// Behavior is undefined if myVirtualIndex > 65535
//=========================================================================
void FindPartnersIn( uint3 bucketCoord, uint myVirtualIndex,
	TentativeBodyState bodyA, inout uint nPartners )
{
	uint bucketIndex = bucketCoord.z * OCTREE_WIDTH * OCTREE_WIDTH +
		bucketCoord.y * OCTREE_WIDTH +
		bucketCoord.x;
	
	uint bucketByteAddress = bucketIndex * 4 * OCTREE_ITEMS_PER_BUCKET;
	
	uint nItemsInBucket = g_octreeBuffer.Load( bucketByteAddress );
	bucketByteAddress += 4;

	if ( nItemsInBucket > OCTREE_ITEMS_PER_BUCKET * 2 - 1 )
		nItemsInBucket = OCTREE_ITEMS_PER_BUCKET * 2 - 1;
	
	[allow_uav_condition]
	while( nItemsInBucket != 0 )
	{
		// Read the partner index from the octree
		--nItemsInBucket;

		uint packedBits = g_octreeBuffer.Load(
			bucketByteAddress + 4 * ( nItemsInBucket >> 1 ) );

		uint partnerIdx = ( ( nItemsInBucket & 0x1 ) == 0x1 ) ?
			packedBits >> 16 : packedBits & 0xFFFF;

		// Test the partner
		if ( partnerIdx == myVirtualIndex ) continue;
		if ( AlreadyInBucket( myVirtualIndex, partnerIdx, nPartners ) ) continue;
		if ( OutsideBoundingSpheres( bodyA, partnerIdx ) ) continue;
		
		// Write the partner to my partner list
		if ( nPartners < MAX_COLLISION_PARTNERS )
		{
			if ( ( nPartners & 0x1 ) == 0x1 ) partnerIdx <<= 16;

			// TODO - switch to load/store?
			g_tentativeBuffer.InterlockedOr( myVirtualIndex *
				TENTATIVE_STATE_SIZE_BYTES + TENTATIVE_STATE_PARTNER_ARRAY_OFFSET_BYTES +
				( nPartners >> 1 ) * 4, partnerIdx );
		}

		++nPartners;
	}
}

//=========================================================================
//=========================================================================
[numthreads( thread_group_size_x, thread_group_size_y, 1 )]

void main( uint3 threadIDInGroup : SV_GroupThreadID, uint3 groupID : SV_GroupID, 
         uint groupIndex : SV_GroupIndex, 
         uint3 dispatchThreadID : SV_DispatchThreadID )
{
	uint stride = thread_group_size_x * NUM_THREAD_GROUPS_X;
	uint curIndex = ( threadIDInGroup.y + groupID.y * thread_group_size_y ) * stride +
		( threadIDInGroup.x + groupID.x * thread_group_size_x );

	TentativeBodyState nextState = GetStateByVirtualIndex( curIndex );

	if ( ( nextState.isAlive & 0x1 ) == 0 ) return;
	
	float3 minBound = nextState.pos;
	float3 maxBound = nextState.pos;
	
	// Grow bounds by body size and possible velocity error
	minBound -= float3( nextState.size, nextState.size, nextState.size );
	maxBound += float3( nextState.size, nextState.size, nextState.size );

	// Move bounds from [ -1200 to 1200 ] to [ 0 to 63 ]
	minBound += float3( OCTREE_BOUNDS, OCTREE_BOUNDS, OCTREE_BOUNDS );
	maxBound += float3( OCTREE_BOUNDS, OCTREE_BOUNDS, OCTREE_BOUNDS );
	
	float invOctreeSpacialWidth = 1.0 / ( OCTREE_BOUNDS * 2.0 );

	minBound *= invOctreeSpacialWidth;
	maxBound *= invOctreeSpacialWidth;
	
	uint3 firstBucket = saturate( minBound ) * OCTREE_WIDTH * 0.999;
	uint3 lastBucket = saturate( maxBound ) * OCTREE_WIDTH * 0.999;

	uint nPartners = 0;
	
	[allow_uav_condition]
	for ( uint x = firstBucket.x; x <= lastBucket.x; ++ x )
		[allow_uav_condition]
		for ( uint y = firstBucket.y; y <= lastBucket.y; ++y )
			[allow_uav_condition]
			for ( uint z = firstBucket.z; z <= lastBucket.z; ++z )
				FindPartnersIn( uint3( x, y, z ), curIndex, nextState, nPartners );

	g_tentativeBuffer.Store( curIndex * TENTATIVE_STATE_SIZE_BYTES +
		TENTATIVE_STATE_NUMPARTNERS_OFFSET_BYTES, nPartners );
}

//=========================================================================
//=========================================================================
technique11 MyTechnique
{
    pass P0
    {          
		SetComputeShader( CompileShader( cs_5_0, main() ) );
    }
}
// group size
#define NUM_THREAD_GROUPS_X 32
#define NUM_THREAD_GROUPS_Y 32
#define thread_group_size_x 8
#define thread_group_size_y 8

// Must conform to ( x / 2 + 1 ) % 4 == 0
#define MAX_COLLISION_PARTNERS 30

#define MAX_VELOCITY 1500

#define ALWAYS_GLOM_MASS 1000
#define SLOW_GLOM_SPEED 1000

//=========================================================================
//=========================================================================
struct BodyState
{
	float3 pos;
	float3 vel;

	float3x3 orientation;
	float3 angularMomentum;

	float size;
	uint isAlive;

	uint color;
};
// Update this if BodyState changes!
#define BODY_STATE_SIZE_BYTES 84
#define BODY_STATE_POS_OFFSET_BYTES 0
#define BODY_STATE_VEL_OFFSET_BYTES 12
#define BODY_STATE_ORIENTATION_OFFSET_BYTES 24
#define BODY_STATE_ANGULAR_OFFSET_BYTES 60
#define BODY_STATE_SIZE_OFFSET_BYTES 72
#define BODY_STATE_ISALIVE_OFFSET_BYTES 76
#define BODY_STATE_COLOR_OFFSET_BYTES 80

//=========================================================================
//=========================================================================
struct TentativeBodyState
{
	float3 pos;

	//float3x3 orientation;

	float size;
	uint isAlive;

	//uint nPartners;

	//uint partners[ MAX_COLLISION_PARTNERS / 2 ];
};
// Update this if BodyState changes!
#define TENTATIVE_STATE_POS_OFFSET_BYTES 0
#define TENTATIVE_STATE_ORIENTATION_OFFSET_BYTES 12
#define TENTATIVE_STATE_SIZE_OFFSET_BYTES 48
#define TENTATIVE_STATE_ISALIVE_OFFSET_BYTES 52
#define TENTATIVE_STATE_NUMPARTNERS_OFFSET_BYTES 56
#define TENTATIVE_STATE_PARTNER_ARRAY_OFFSET_BYTES 60
#define TENTATIVE_STATE_SIZE_BYTES ( TENTATIVE_STATE_PARTNER_ARRAY_OFFSET_BYTES + MAX_COLLISION_PARTNERS * 2 )


//=========================================================================
//=========================================================================
RWByteAddressBuffer g_stateBuffer;
RWByteAddressBuffer g_tentativeBuffer;
RWByteAddressBuffer g_octreeBuffer; // Unused
RWByteAddressBuffer g_stateAdjustedBuffer;
RWByteAddressBuffer g_glomBuffer;

//=========================================================================
//=========================================================================
float3x3 Get3x3MatrixByAddress( uint addr )
{
	float3x3 result;

	result[0] = asfloat( g_stateBuffer.Load3( addr + 0 ) );
	result[1] = asfloat( g_stateBuffer.Load3( addr + 12 ) );
	result[2] = asfloat( g_stateBuffer.Load3( addr + 24 ) );

	return result;
}

//=========================================================================
//=========================================================================
void Write3x3MatrixByAddress( uint addr, float3x3 val )
{
	g_stateAdjustedBuffer.Store3( addr + 0, asuint( val[0] ) );
	g_stateAdjustedBuffer.Store3( addr + 12, asuint( val[1] ) );
	g_stateAdjustedBuffer.Store3( addr + 24, asuint( val[2] ) );
}

//=========================================================================
//=========================================================================
BodyState GetStateByVirtualIndex( uint index )
{
	BodyState result;

	uint offset = BODY_STATE_SIZE_BYTES * index;

	result.pos = asfloat( g_stateBuffer.Load3( offset + BODY_STATE_POS_OFFSET_BYTES ) );
	result.vel = asfloat( g_stateBuffer.Load3( offset + BODY_STATE_VEL_OFFSET_BYTES ) );

	result.orientation =
		Get3x3MatrixByAddress( offset + BODY_STATE_ORIENTATION_OFFSET_BYTES );
	result.angularMomentum =
		asfloat( g_stateBuffer.Load3( offset + BODY_STATE_ANGULAR_OFFSET_BYTES ) );

	result.size =
		asfloat( g_stateBuffer.Load( offset + BODY_STATE_SIZE_OFFSET_BYTES ) );

	result.isAlive =
		g_stateBuffer.Load( offset + BODY_STATE_ISALIVE_OFFSET_BYTES );

	result.color = 
		g_stateBuffer.Load( offset + BODY_STATE_COLOR_OFFSET_BYTES );

	return result;
}

//=========================================================================
//=========================================================================
void WriteStateByVirtualIndex( uint index, BodyState state )
{
	uint offset = BODY_STATE_SIZE_BYTES * index;

	g_stateAdjustedBuffer.Store3( offset + BODY_STATE_POS_OFFSET_BYTES, asuint( state.pos ) );
	g_stateAdjustedBuffer.Store3( offset + BODY_STATE_VEL_OFFSET_BYTES, asuint( state.vel ) );

	Write3x3MatrixByAddress( offset + BODY_STATE_ORIENTATION_OFFSET_BYTES,
		state.orientation );

	g_stateAdjustedBuffer.Store3( offset + BODY_STATE_ANGULAR_OFFSET_BYTES,
		asuint( state.angularMomentum ) );

	g_stateAdjustedBuffer.Store( offset + BODY_STATE_SIZE_OFFSET_BYTES,
		asuint( state.size ) );

	g_stateAdjustedBuffer.Store( offset + BODY_STATE_ISALIVE_OFFSET_BYTES,
		state.isAlive );

	g_stateAdjustedBuffer.Store( offset + BODY_STATE_COLOR_OFFSET_BYTES,
		state.color );
}

//=========================================================================
//=========================================================================
void AdjustToCollisionWith( BodyState curState, uint myIdx, uint partnerIdx,
	inout float3 deltaVel, inout float3 deltaOmega, inout uint glomPartner )
{
	float e = 1.0;

	BodyState partnerState = GetStateByVirtualIndex( partnerIdx );

	// For now just use particle-collision system

	float3 collisionNormal = normalize( curState.pos - partnerState.pos );

	float vA = dot( collisionNormal, curState.vel );
	float vB = dot( collisionNormal, partnerState.vel );

	float massA = curState.size * curState.size * curState.size;
	float massB = partnerState.size * partnerState.size * partnerState.size;

	deltaVel += ( 0.8 * collisionNormal * ( massB * e * ( vB - vA ) + massA * vA + massB * vB ) /
		( massA + massB ) - collisionNormal * vA ) + collisionNormal * 0.4;

	// Am I taking lead on this collision? If so give glomming hints
	if ( partnerIdx > myIdx )
	{
		if ( massA < ALWAYS_GLOM_MASS || massB < ALWAYS_GLOM_MASS )
			glomPartner = partnerIdx;

		if ( dot( deltaVel, deltaVel ) < SLOW_GLOM_SPEED )
			glomPartner = partnerIdx;
	}
}

//=========================================================================
//=========================================================================
[numthreads( thread_group_size_x, thread_group_size_y, 1 )]

void main( uint3 threadIDInGroup : SV_GroupThreadID, uint3 groupID : SV_GroupID, 
         uint groupIndex : SV_GroupIndex, 
         uint3 dispatchThreadID : SV_DispatchThreadID )
{
	// Get Index
	uint stride = thread_group_size_x * NUM_THREAD_GROUPS_X;
	uint curIndex = ( threadIDInGroup.y + groupID.y * thread_group_size_y ) * stride +
		( threadIDInGroup.x + groupID.x * thread_group_size_x );

	uint glomPartner = 0xFFFFFFFF;

	// Load state
	BodyState curState = GetStateByVirtualIndex( curIndex );
	if ( ( curState.isAlive & 0x1 ) == 1 ) 
	{
		float3 deltaVel = float3( 0, 0, 0 );
		float3 deltaOmega = float3( 0, 0, 0 );
	
		uint nPartners = g_tentativeBuffer.Load( curIndex * TENTATIVE_STATE_SIZE_BYTES +
			TENTATIVE_STATE_NUMPARTNERS_OFFSET_BYTES );

		if ( nPartners > MAX_COLLISION_PARTNERS )
			nPartners = MAX_COLLISION_PARTNERS;
	
		[allow_uav_condition]
		while ( nPartners != 0 )
		{
			--nPartners;
	
			uint packedBits = g_tentativeBuffer.Load( curIndex *
				TENTATIVE_STATE_SIZE_BYTES + TENTATIVE_STATE_PARTNER_ARRAY_OFFSET_BYTES +
				( nPartners >> 1 ) * 4 );
	
			uint partnerIdx = ( nPartners & 0x1 ) == 0x1 ?
				packedBits >> 16 : packedBits & 0xFFFF;
	
			// Run GJK collision test
	
			AdjustToCollisionWith( curState, curIndex, partnerIdx,
				deltaVel, deltaOmega, glomPartner );
		}
	
		curState.vel += deltaVel;
		curState.angularMomentum += deltaOmega;

		if ( length( curState.vel ) > MAX_VELOCITY )
			curState.vel = MAX_VELOCITY * normalize( curState.vel );

	}
	WriteStateByVirtualIndex( curIndex, curState );

	if ( glomPartner != 0xFFFFFFFF )
		g_glomBuffer.Store( glomPartner * 4, curIndex );
}

//=========================================================================
//=========================================================================
technique11 MyTechnique
{
    pass P0
    {          
		SetComputeShader( CompileShader( cs_5_0, main() ) );
    }
}