From 60d436b11cfcfe6efa130e8d5103042dd8df7846 Mon Sep 17 00:00:00 2001
From: Constantine Rudenko <kostiantynr@risa.com>
Date: Tue, 16 Jul 2019 04:15:40 +0300
Subject: [PATCH] fix scale

---
 Code/DQ skinning/DualQuaternionSkinner.cs     | 75 ++++++++-----------
 .../Shaders/Compute/ComputeBoneDQ.compute     | 32 +++-----
 Code/DQ skinning/Shaders/Compute/DQ.cginc     | 24 ++++++
 3 files changed, 67 insertions(+), 64 deletions(-)
diff --git a/Code/DQ skinning/DualQuaternionSkinner.cs b/Code/DQ skinning/DualQuaternionSkinner.cs
index 2fabfb1..e63c5b8 100644
--- a/Code/DQ skinning/DualQuaternionSkinner.cs	
+++ b/Code/DQ skinning/DualQuaternionSkinner.cs	
@@ -10,7 +10,8 @@
 /// <a class="bold" href="https://docs.unity3d.com/ScriptReference/SkinnedMeshRenderer.html">SkinnedMeshRenderer</a> is required to extract some information about the mesh during <b>Start()</b> and is destroyed immediately after. 
 /// </summary>
 [RequireComponent(typeof(MeshFilter))]
-public class DualQuaternionSkinner : MonoBehaviour {
+public class DualQuaternionSkinner : MonoBehaviour
+{
 
 	struct DualQuaternion
 	{
@@ -31,11 +32,9 @@ struct BoneWeightInfo
 		public float weight3;
 	}
 
-	DualQuaternion[] poseDualQuaternions;
-	
-	const int numthreads = 1024;	// must be same in compute shader code
+	const int numthreads = 1024;    // must be same in compute shader code
 	const int textureWidth = 1024;  // no need to adjust compute shaders
-	
+
 	public ComputeShader shaderComputeBoneDQ;
 	public ComputeShader shaderDQBlend;
 	public ComputeShader shaderApplyMorph;
@@ -46,7 +45,10 @@ struct BoneWeightInfo
 	/// </summary>
 	public bool started { get; private set; } = false;
 
-	ComputeBuffer bufPoseDq;
+	DualQuaternion[] poseDualQuaternions;
+	Matrix4x4[] poseMatrices;
+
+	ComputeBuffer bufPoseMatrices;
 	ComputeBuffer bufSkinnedDq;
 	ComputeBuffer bufOriginalVertices;
 	ComputeBuffer bufOriginalNormals;
@@ -232,7 +234,7 @@ void SetMesh(Mesh mesh)
 			{
 				// could use float3 instead of float4 but NVidia says structures not aligned to 128 bits are slow
 				// https://developer.nvidia.com/content/understanding-structured-buffer-performance
-				this.arrBufMorphDeltaTangents[i] = new ComputeBuffer(this.mf.mesh.vertexCount, sizeof(float) *  4);
+				this.arrBufMorphDeltaTangents[i] = new ComputeBuffer(this.mf.mesh.vertexCount, sizeof(float) * 4);
 				for (int k = 0; k < this.mf.mesh.vertexCount; k++)
 					tempVec4[k] = deltaTangents[k];
 				this.arrBufMorphDeltaTangents[i].SetData(tempVec4);
@@ -254,6 +256,7 @@ void SetMesh(Mesh mesh)
 		this.shaderDQBlend.SetInt("textureWidth", textureWidth);
 
 		this.poseDualQuaternions = new DualQuaternion[this.mf.mesh.bindposes.Length];
+		this.poseMatrices = new Matrix4x4[this.mf.mesh.bindposes.Length];
 
 		// initiate textures and buffers
 
@@ -281,8 +284,8 @@ void SetMesh(Mesh mesh)
 		this.rtSkinnedData_3.Create();
 		this.shaderDQBlend.SetTexture(this.kernelHandleComputeBoneDQ, "skinned_data_3", this.rtSkinnedData_3);
 
-		this.bufPoseDq = new ComputeBuffer(this.mf.mesh.bindposes.Length, sizeof(float) * 8);
-		this.shaderComputeBoneDQ.SetBuffer(this.kernelHandleComputeBoneDQ, "pose_dual_quaternions", this.bufPoseDq);
+		this.bufPoseMatrices = new ComputeBuffer(this.mf.mesh.bindposes.Length, sizeof(float) * 16);
+		this.shaderComputeBoneDQ.SetBuffer(this.kernelHandleComputeBoneDQ, "pose_matrices", this.bufPoseMatrices);
 
 		this.bufSkinnedDq = new ComputeBuffer(this.mf.mesh.bindposes.Length, sizeof(float) * 8);
 		this.shaderComputeBoneDQ.SetBuffer(this.kernelHandleComputeBoneDQ, "skinned_dual_quaternions", this.bufSkinnedDq);
@@ -442,7 +445,7 @@ void ReleaseBuffers()
 		this.bufOriginalNormals?.Release();
 		this.bufOriginalVertices?.Release();
 		this.bufOriginalTangents?.Release();
-		this.bufPoseDq?.Release();
+		this.bufPoseMatrices?.Release();
 		this.bufSkinnedDq?.Release();
 		this.bufMorphedNormals?.Release();
 		this.bufMorphedVertices?.Release();
@@ -464,13 +467,13 @@ void ReleaseBuffers()
 
 	void OnDestroy()
 	{
-		this.ReleaseBuffers();	
+		this.ReleaseBuffers();
 	}
 
 	// Use this for initialization
 	void Start()
 	{
-		this.shaderComputeBoneDQ = (ComputeShader)Instantiate(this.shaderComputeBoneDQ);	// bug workaround
+		this.shaderComputeBoneDQ = (ComputeShader)Instantiate(this.shaderComputeBoneDQ);    // bug workaround
 		this.shaderDQBlend = (ComputeShader)Instantiate(this.shaderDQBlend);                // bug workaround
 		this.shaderApplyMorph = (ComputeShader)Instantiate(this.shaderApplyMorph);          // bug workaround
 
@@ -502,35 +505,16 @@ void Start()
 	}
 
 	// Update is called once per frame
-	void Update () {
+	void Update()
+	{
 		this.ApplyAllMorphs();
 
-		this.mf.mesh.MarkDynamic ();    // once or every frame? idk.
-										// at least it does not affect performance
-
-		this.shaderComputeBoneDQ.SetVector(
-			"parent_rotation_quaternion",
-			Quaternion.Inverse(this.transform.parent.rotation).ToVector4()
-		);
-
-		this.shaderComputeBoneDQ.SetVector (
-			"parent_translation_quaternion", 
-			new Vector4(
-				- this.transform.parent.position.x,
-				- this.transform.parent.position.y,
-				- this.transform.parent.position.z,
-				1
-			)
-		);
+		this.mf.mesh.MarkDynamic();    // once or every frame? idk.
+									   // at least it does not affect performance
 
-		this.shaderComputeBoneDQ.SetVector(
-			"parent_scale",
-			new Vector4(
-				this.transform.parent.lossyScale.x,
-				this.transform.parent.lossyScale.y,
-				this.transform.parent.lossyScale.z,
-				1
-			)
+		this.shaderComputeBoneDQ.SetMatrix(
+			"self_matrix",
+			this.transform.worldToLocalMatrix
 		);
 
 		for (int i = 0; i < this.bones.Length; i++)
@@ -541,17 +525,24 @@ void Update () {
 
 			// could use float3 instead of float4 for position but NVidia says structures not aligned to 128 bits are slow
 			// https://developer.nvidia.com/content/understanding-structured-buffer-performance
-			this.poseDualQuaternions[i].position = new Vector4(pos.x, pos.y, pos.z, 1);
-		}
+			this.poseDualQuaternions[i].position = new Vector4(
+				pos.x,
+				pos.y,
+				pos.z,
+				0
+			);
 
-		this.bufPoseDq.SetData(this.poseDualQuaternions);
+			this.poseMatrices[i] = this.bones[i].localToWorldMatrix;
+		}
+		
+		this.bufPoseMatrices.SetData(this.poseMatrices);
 
 		// Calculate blended quaternions
 
 		int numThreadGroups = this.bones.Length / numthreads;
 		if (this.bones.Length % numthreads != 0)
 		{
-			numThreadGroups ++;
+			numThreadGroups++;
 		}
 
 		this.shaderComputeBoneDQ.Dispatch(this.kernelHandleDQBlend, numThreadGroups, 1, 1);
diff --git a/Code/DQ skinning/Shaders/Compute/ComputeBoneDQ.compute b/Code/DQ skinning/Shaders/Compute/ComputeBoneDQ.compute
index 796f97a..ec71a34 100644
--- a/Code/DQ skinning/Shaders/Compute/ComputeBoneDQ.compute	
+++ b/Code/DQ skinning/Shaders/Compute/ComputeBoneDQ.compute	
@@ -3,10 +3,6 @@
 
 #include "DQ.cginc"
 
-float4 parent_rotation_quaternion;
-float4 parent_translation_quaternion;
-float4 parent_scale;
-
 struct boneWeight
 {
 	int boneIndex0;
@@ -20,34 +16,26 @@ struct boneWeight
 	float boneWeight3;
 };
 
-RWStructuredBuffer<dual_quaternion> pose_dual_quaternions;
+RWStructuredBuffer<float4x4> pose_matrices;
+float4x4 self_matrix;
+
 RWStructuredBuffer<dual_quaternion> bind_dual_quaternions;
 RWStructuredBuffer<dual_quaternion> skinned_dual_quaternions;
 
 [numthreads(1024,1,1)]
 void CSMain (uint3 id : SV_DispatchThreadID)
 {
-	struct dual_quaternion dq_parent;
-	struct dual_quaternion dq_pose;
 	struct dual_quaternion dq_bind;
+	dq_bind.rotation_quaternion	= bind_dual_quaternions.Load(id.x).rotation_quaternion;
+	dq_bind.translation_quaternion = bind_dual_quaternions.Load(id.x).translation_quaternion;
+	dq_bind.translation_quaternion = QuaternionMultiply(dq_bind.translation_quaternion,	dq_bind.rotation_quaternion) * 0.5;
 
-	dq_parent	.rotation_quaternion = parent_rotation_quaternion;
-	dq_pose		.rotation_quaternion = pose_dual_quaternions.Load(id.x).rotation_quaternion;
-	dq_bind		.rotation_quaternion = bind_dual_quaternions.Load(id.x).rotation_quaternion;
-
-	dq_parent	.translation_quaternion = parent_translation_quaternion;
-	dq_pose		.translation_quaternion = pose_dual_quaternions.Load(id.x).translation_quaternion;
-	dq_bind		.translation_quaternion = bind_dual_quaternions.Load(id.x).translation_quaternion;
+	float4x4 pose_matrix = transpose(pose_matrices.Load(id.x));
+	pose_matrix = mul(self_matrix, pose_matrix);
 
-	dq_parent	.translation_quaternion = QuaternionMultiply(dq_parent	.rotation_quaternion,		dq_parent	.translation_quaternion	) * 0.5;
-	dq_pose		.translation_quaternion = QuaternionMultiply(dq_pose	.translation_quaternion,	dq_pose		.rotation_quaternion	) * 0.5;
-	dq_bind		.translation_quaternion = QuaternionMultiply(dq_bind	.translation_quaternion,	dq_bind		.rotation_quaternion	) * 0.5;
-
-	struct dual_quaternion dq_skinned = DualQuaternionMultiply(dq_parent, dq_pose);
-	
-	dq_skinned.translation_quaternion /= parent_scale;
+	struct dual_quaternion dq_pose = DualQuaternionFromMatrix4x4(pose_matrix);
 	
-	dq_skinned = DualQuaternionMultiply(dq_skinned, dq_bind);
+	struct dual_quaternion dq_skinned = DualQuaternionMultiply(dq_pose, dq_bind);
 
 	skinned_dual_quaternions[id.x].rotation_quaternion		= dq_skinned.rotation_quaternion;
 	skinned_dual_quaternions[id.x].translation_quaternion	= dq_skinned.translation_quaternion;
diff --git a/Code/DQ skinning/Shaders/Compute/DQ.cginc b/Code/DQ skinning/Shaders/Compute/DQ.cginc
index 9868013..7518592 100644
--- a/Code/DQ skinning/Shaders/Compute/DQ.cginc	
+++ b/Code/DQ skinning/Shaders/Compute/DQ.cginc	
@@ -40,4 +40,28 @@ struct dual_quaternion DualQuaternionShortestPath(struct dual_quaternion dq1, st
 	dq1.rotation_quaternion		= isBadPath ? -dq1.rotation_quaternion		: dq1.rotation_quaternion;
 	dq1.translation_quaternion	= isBadPath ? -dq1.translation_quaternion	: dq1.translation_quaternion;
 	return dq1;
+}
+// Rotates v by rotQ: multiplies rotQ with v, then multiplies that result with QuaternionInvert(rotQ).
+float4 QuaternionApplyRotation(float4 v, float4 rotQ)
+{
+	float4 rotated = QuaternionMultiply(rotQ, v);
+	return QuaternionMultiply(rotated, QuaternionInvert(rotQ));
+}
+// Builds a dual quaternion from m (rotation in the upper 3x3, translation in column 3; assumes no scale/shear and m[3][3] = 1.0).
+struct dual_quaternion DualQuaternionFromMatrix4x4(float4x4 m)
+{
+	struct dual_quaternion dq;
+	float tr = m[0][0] + m[1][1] + m[2][2];
+	float4 q;	// un-normalized rotation (x, y, z, w)
+
+	// Build q from its largest component: a w-only formula divides by 4*w, which is 0 (or NaN) once tr <= -1, i.e. at 180-degree rotations.
+	if (tr > 0.0)                                    q = float4(m[2][1] - m[1][2], m[0][2] - m[2][0], m[1][0] - m[0][1], tr + 1.0);
+	else if (m[0][0] > m[1][1] && m[0][0] > m[2][2]) q = float4(1.0 + m[0][0] - m[1][1] - m[2][2], m[0][1] + m[1][0], m[0][2] + m[2][0], m[2][1] - m[1][2]);
+	else if (m[1][1] > m[2][2])                      q = float4(m[0][1] + m[1][0], 1.0 + m[1][1] - m[0][0] - m[2][2], m[1][2] + m[2][1], m[0][2] - m[2][0]);
+	else                                             q = float4(m[0][2] + m[2][0], m[1][2] + m[2][1], 1.0 + m[2][2] - m[0][0] - m[1][1], m[1][0] - m[0][1]);
+
+	q = normalize(q);
+	dq.rotation_quaternion = (q.w < 0.0) ? -q : q;	// keep w >= 0, the sign the w-only formula always yields
+	dq.translation_quaternion = QuaternionMultiply(float4(m[0][3], m[1][3], m[2][3], 0), dq.rotation_quaternion) * 0.5;
+	return dq;
 }
\ No newline at end of file