diff --git a/example/sz_openmp.c b/example/sz_openmp.c
index 8ae132e4..873f9139 100755
--- a/example/sz_openmp.c
+++ b/example/sz_openmp.c
@@ -453,8 +453,25 @@ int main(int argc, char* argv[])
 				}
 				else if(parallelMode==1)//openMP
 				{
-					printf("Error: current openMP version supports only float datatype\n");
-					exit(0);
+					if(r5>0)
+						r3 = r5*r4*r3;
+					else if(r4>0)
+						r3 = r4*r3;
+					if(confparams_cpr->errorBoundMode!=ABS)
+					{
+						printf("Error: current version supports only absolute error bound (errorBoundMode=%d)\n", confparams_cpr->errorBoundMode);
+						exit(0);
+					}
+					
+					cost_start_omp();
+					if(r2==0)
+						bytes = SZ_compress_double_1D_MDQ_openmp(data, r1, confparams_cpr->absErrBound, &outSize);
+					else if(r3==0)
+						bytes = SZ_compress_double_2D_MDQ_openmp(data, r2, r1, confparams_cpr->absErrBound, &outSize);
+					else //3d
+						bytes = SZ_compress_double_3D_MDQ_openmp(data, r3, r2, r1, confparams_cpr->absErrBound, &outSize);
+					printf("outSize=%zu\n", outSize);
+					cost_end_omp();	
 				}
 
 				if(cmpPath == NULL)
diff --git a/sz/include/sz_double.h b/sz/include/sz_double.h
index bc87c1db..1c0cd92a 100644
--- a/sz/include/sz_double.h
+++ b/sz/include/sz_double.h
@@ -29,6 +29,8 @@ unsigned int optimize_intervals_double_3D_opt(double *oriData, size_t r1, size_t
 unsigned int optimize_intervals_double_2D_opt(double *oriData, size_t r1, size_t r2, double realPrecision);
 unsigned int optimize_intervals_double_1D_opt(double *oriData, size_t dataLength, double realPrecision);
 
+size_t SZ_compress_double_3D_MDQ_RA_block(double * block_ori_data, double * mean, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, double realPrecision, double * P0, double * P1, int * type, double * unpredictable_data);
+
 unsigned int optimize_intervals_double_1D_opt_MSST19(double *oriData, size_t dataLength, double realPrecision);
 unsigned int optimize_intervals_double_2D_opt_MSST19(double *oriData, size_t r1, size_t r2, double realPrecision);
 unsigned int optimize_intervals_double_3D_opt_MSST19(double *oriData, size_t r1, size_t r2, size_t r3, double realPrecision);
diff --git a/sz/include/sz_omp.h b/sz/include/sz_omp.h
index 9634de88..cb83acbe 100644
--- a/sz/include/sz_omp.h
+++ b/sz/include/sz_omp.h
@@ -22,17 +22,21 @@ extern "C" {
 #endif
 
 unsigned char * SZ_compress_float_1D_MDQ_openmp(float *oriData, size_t r1, double realPrecision, size_t * comp_size);
-
 unsigned char * SZ_compress_float_2D_MDQ_openmp(float *oriData, size_t r1, size_t r2, double realPrecision, size_t * comp_size);
-
 unsigned char * SZ_compress_float_3D_MDQ_openmp(float *oriData, size_t r1, size_t r2, size_t r3, float realPrecision, size_t * comp_size);
 
 void decompressDataSeries_float_1D_openmp(float** data, size_t r1, unsigned char* comp_data);
-
 void decompressDataSeries_float_3D_openmp(float** data, size_t r1, size_t r2, size_t r3, unsigned char* comp_data);
-
 void decompressDataSeries_float_2D_openmp(float** data, size_t r1, size_t r2, unsigned char* comp_data);
 
+unsigned char * SZ_compress_double_1D_MDQ_openmp(double *oriData, size_t r1, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_double_2D_MDQ_openmp(double *oriData, size_t r1, size_t r2, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_double_3D_MDQ_openmp(double *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size);
+
+void decompressDataSeries_double_1D_openmp(double** data, size_t r1, unsigned char* comp_data);
+void decompressDataSeries_double_2D_openmp(double** data, size_t r1, size_t r2, unsigned char* comp_data);
+void decompressDataSeries_double_3D_openmp(double** data, size_t r1, size_t r2, size_t r3, unsigned char* comp_data);
+
 //void Huffman_init_openmp(HuffmanTree* huffmanTree, int *s, size_t length, int thread_num);
 void Huffman_init_openmp(HuffmanTree* huffmanTree, int *s, size_t length, int thread_num, size_t * freq);
 
diff --git a/sz/include/szd_double.h b/sz/include/szd_double.h
index 1fda9553..3fcf48bc 100644
--- a/sz/include/szd_double.h
+++ b/sz/include/szd_double.h
@@ -32,6 +32,8 @@ void getSnapshotData_double_4D(double** data, size_t r1, size_t r2, size_t r3, s
 void decompressDataSeries_double_2D_nonblocked_with_blocked_regression(double** data, size_t r1, size_t r2, unsigned char* comp_data, double* hist_data);
 void decompressDataSeries_double_3D_nonblocked_with_blocked_regression(double** data, size_t r1, size_t r2, size_t r3, unsigned char* comp_data, double* hist_data);
 
+size_t decompressDataSeries_double_3D_RA_block(double * data, double mean, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, double realPrecision, int * type, double * unpredictable_data);
+
 int SZ_decompress_args_double(double** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize, int compressionType, double* hist_data);
 
 #ifdef __cplusplus
diff --git a/sz/src/sz_double.c b/sz/src/sz_double.c
index a920a4d5..4de4027d 100644
--- a/sz/src/sz_double.c
+++ b/sz/src/sz_double.c
@@ -330,6 +330,7 @@ size_t dataLength, double realPrecision, double valueRangeSize, double medianVal
 	checkRadius = (exe_params->intvCapacity-1)*realPrecision;
 	double interval = 2*realPrecision;
 
+	double recip_realPrecision = 1/realPrecision;
 	for(i=2;i<dataLength;i++)
 	{				
 		//printf("%.30G\n",last3CmprsData[0]);
@@ -339,7 +340,7 @@ size_t dataLength, double realPrecision, double valueRangeSize, double medianVal
 		predAbsErr = fabs(curData - pred);	
 		if(predAbsErr<checkRadius)
 		{
-			state = (predAbsErr/realPrecision+1)/2;
+			state = (predAbsErr*recip_realPrecision+1)*0.5;
 			if(curData>=pred)
 			{
 				type[i] = exe_params->intvRadius+state;
@@ -493,6 +494,7 @@ TightDataPointStorageD* SZ_compress_double_2D_MDQ(double *oriData, size_t r1, si
 		decData = (double*)(multisteps->hist_data);
 #endif	
 	
+	double recip_realPrecision = 1/realPrecision;
 	unsigned int quantization_intervals;
 	if(exe_params->optQuantMode==1)
 	{
@@ -560,7 +562,7 @@ TightDataPointStorageD* SZ_compress_double_2D_MDQ(double *oriData, size_t r1, si
 	pred1D = P1[0];
 	diff = spaceFillingValue[1] - pred1D;
 
-	itvNum =  fabs(diff)/realPrecision + 1;
+	itvNum =  fabs(diff)*recip_realPrecision + 1;
 
 	if (itvNum < exe_params->intvCapacity)
 	{
@@ -588,7 +590,7 @@ TightDataPointStorageD* SZ_compress_double_2D_MDQ(double *oriData, size_t r1, si
 		pred1D = 2*P1[j-1] - P1[j-2];
 		diff = spaceFillingValue[j] - pred1D;
 
-		itvNum = fabs(diff)/realPrecision + 1;
+		itvNum = fabs(diff)*recip_realPrecision + 1;
 
 		if (itvNum < exe_params->intvCapacity)
 		{
@@ -620,7 +622,7 @@ TightDataPointStorageD* SZ_compress_double_2D_MDQ(double *oriData, size_t r1, si
 		pred1D = P1[0];
 		diff = spaceFillingValue[index] - pred1D;
 
-		itvNum = fabs(diff)/realPrecision + 1;
+		itvNum = fabs(diff)*recip_realPrecision + 1;
 
 		if (itvNum < exe_params->intvCapacity)
 		{
@@ -650,7 +652,7 @@ TightDataPointStorageD* SZ_compress_double_2D_MDQ(double *oriData, size_t r1, si
 
 			diff = spaceFillingValue[index] - pred2D;
 
-			itvNum = fabs(diff)/realPrecision + 1;
+			itvNum = fabs(diff)*recip_realPrecision + 1;
 
 			if (itvNum < exe_params->intvCapacity)
 			{
@@ -780,6 +782,7 @@ TightDataPointStorageD* SZ_compress_double_3D_MDQ(double *oriData, size_t r1, si
 		decData = (double*)(multisteps->hist_data);
 #endif		
 
+	double recip_realPrecision = 1/realPrecision;
 	unsigned int quantization_intervals;
 	if(exe_params->optQuantMode==1)
 	{
@@ -849,7 +852,7 @@ TightDataPointStorageD* SZ_compress_double_3D_MDQ(double *oriData, size_t r1, si
 	pred1D = P1[0];
 	diff = spaceFillingValue[1] - pred1D;
 
-	itvNum = fabs(diff)/realPrecision + 1;
+	itvNum = fabs(diff)*recip_realPrecision + 1;
 
 	if (itvNum < exe_params->intvCapacity)
 	{
@@ -877,7 +880,7 @@ TightDataPointStorageD* SZ_compress_double_3D_MDQ(double *oriData, size_t r1, si
 		pred1D = 2*P1[j-1] - P1[j-2];
 		diff = spaceFillingValue[j] - pred1D;
 
-		itvNum = fabs(diff)/realPrecision + 1;
+		itvNum = fabs(diff)*recip_realPrecision + 1;
 
 		if (itvNum < exe_params->intvCapacity)
 		{
@@ -909,7 +912,7 @@ TightDataPointStorageD* SZ_compress_double_3D_MDQ(double *oriData, size_t r1, si
 		pred1D = P1[index-r3];
 		diff = spaceFillingValue[index] - pred1D;
 
-		itvNum = fabs(diff)/realPrecision + 1;
+		itvNum = fabs(diff)*recip_realPrecision + 1;
 
 		if (itvNum < exe_params->intvCapacity)
 		{
@@ -939,7 +942,7 @@ TightDataPointStorageD* SZ_compress_double_3D_MDQ(double *oriData, size_t r1, si
 
 			diff = spaceFillingValue[index] - pred2D;
 
-			itvNum = fabs(diff)/realPrecision + 1;
+			itvNum = fabs(diff)*recip_realPrecision + 1;
 
 			if (itvNum < exe_params->intvCapacity)
 			{
@@ -973,7 +976,7 @@ TightDataPointStorageD* SZ_compress_double_3D_MDQ(double *oriData, size_t r1, si
 		pred1D = P1[0];
 		diff = spaceFillingValue[index] - pred1D;
 
-		itvNum = fabs(diff)/realPrecision + 1;
+		itvNum = fabs(diff)*recip_realPrecision + 1;
 
 		if (itvNum < exe_params->intvCapacity)
 		{
@@ -1003,7 +1006,7 @@ TightDataPointStorageD* SZ_compress_double_3D_MDQ(double *oriData, size_t r1, si
 			pred2D = P0[j-1] + P1[j] - P1[j-1];
 			diff = spaceFillingValue[index] - pred2D;
 
-			itvNum = fabs(diff)/realPrecision + 1;
+			itvNum = fabs(diff)*recip_realPrecision + 1;
 
 			if (itvNum < exe_params->intvCapacity)
 			{
@@ -1036,7 +1039,7 @@ TightDataPointStorageD* SZ_compress_double_3D_MDQ(double *oriData, size_t r1, si
 			pred2D = P0[index2D-r3] + P1[index2D] - P1[index2D-r3];
 			diff = spaceFillingValue[index] - pred2D;
 
-			itvNum = fabs(diff)/realPrecision + 1;
+			itvNum = fabs(diff)*recip_realPrecision + 1;
 
 			if (itvNum < exe_params->intvCapacity)
 			{
@@ -1067,7 +1070,7 @@ TightDataPointStorageD* SZ_compress_double_3D_MDQ(double *oriData, size_t r1, si
 				pred3D = P0[index2D-1] + P0[index2D-r3]+ P1[index2D] - P0[index2D-r3-1] - P1[index2D-r3] - P1[index2D-1] + P1[index2D-r3-1];
 				diff = spaceFillingValue[index] - pred3D;
 
-				itvNum = fabs(diff)/realPrecision + 1;
+				itvNum = fabs(diff)*recip_realPrecision + 1;
 
 				if (itvNum < exe_params->intvCapacity)
 				{
@@ -1186,6 +1189,7 @@ char SZ_compress_args_double_NoCkRngeNoGzip_3D(int cmprType, unsigned char** new
 
 TightDataPointStorageD* SZ_compress_double_4D_MDQ(double *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, double valueRangeSize, double medianValue_d)
 {
+	double recip_realPrecision = 1/realPrecision;
 	unsigned int quantization_intervals;
 	if(exe_params->optQuantMode==1)
 	{
@@ -1287,7 +1291,7 @@ TightDataPointStorageD* SZ_compress_double_4D_MDQ(double *oriData, size_t r1, si
 			pred1D = 2*P1[index2D-1] - P1[index2D-2];
 			diff = spaceFillingValue[index] - pred1D;
 
-			itvNum = fabs(diff)/realPrecision + 1;
+			itvNum = fabs(diff)*recip_realPrecision + 1;
 
 			if (itvNum < exe_params->intvCapacity)
 			{
@@ -1316,7 +1320,7 @@ TightDataPointStorageD* SZ_compress_double_4D_MDQ(double *oriData, size_t r1, si
 			pred1D = P1[index2D-r4];
 			diff = spaceFillingValue[index] - pred1D;
 
-			itvNum = fabs(diff)/realPrecision + 1;
+			itvNum = fabs(diff)*recip_realPrecision + 1;
 
 			if (itvNum < exe_params->intvCapacity)
 			{
@@ -1344,7 +1348,7 @@ TightDataPointStorageD* SZ_compress_double_4D_MDQ(double *oriData, size_t r1, si
 
 				diff = spaceFillingValue[index] - pred2D;
 
-				itvNum = fabs(diff)/realPrecision + 1;
+				itvNum = fabs(diff)*recip_realPrecision + 1;
 
 				if (itvNum < exe_params->intvCapacity)
 				{
@@ -1376,7 +1380,7 @@ TightDataPointStorageD* SZ_compress_double_4D_MDQ(double *oriData, size_t r1, si
 			pred1D = P1[index2D];
 			diff = spaceFillingValue[index] - pred1D;
 
-			itvNum = fabs(diff)/realPrecision + 1;
+			itvNum = fabs(diff)*recip_realPrecision + 1;
 
 			if (itvNum < exe_params->intvCapacity)
 			{
@@ -1404,7 +1408,7 @@ TightDataPointStorageD* SZ_compress_double_4D_MDQ(double *oriData, size_t r1, si
 				pred2D = P0[index2D-1] + P1[index2D] - P1[index2D-1];
 				diff = spaceFillingValue[index] - pred2D;
 
-				itvNum = fabs(diff)/realPrecision + 1;
+				itvNum = fabs(diff)*recip_realPrecision + 1;
 
 				if (itvNum < exe_params->intvCapacity)
 				{
@@ -1433,7 +1437,7 @@ TightDataPointStorageD* SZ_compress_double_4D_MDQ(double *oriData, size_t r1, si
 				pred2D = P0[index2D-r4] + P1[index2D] - P1[index2D-r4];
 				diff = spaceFillingValue[index] - pred2D;
 
-				itvNum = fabs(diff)/realPrecision + 1;
+				itvNum = fabs(diff)*recip_realPrecision + 1;
 
 				if (itvNum < exe_params->intvCapacity)
 				{
@@ -1461,7 +1465,7 @@ TightDataPointStorageD* SZ_compress_double_4D_MDQ(double *oriData, size_t r1, si
 					diff = spaceFillingValue[index] - pred3D;
 
 
-					itvNum = fabs(diff)/realPrecision + 1;
+					itvNum = fabs(diff)*recip_realPrecision + 1;
 
 					if (itvNum < exe_params->intvCapacity)
 					{
@@ -4342,6 +4346,302 @@ unsigned int optimize_intervals_double_3D_opt(double *oriData, size_t r1, size_t
 	return powerOf2;
 }
 
+size_t SZ_compress_double_3D_MDQ_RA_block(double * block_ori_data, double * mean, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, double realPrecision, double * P0, double * P1, int * type, double * unpredictable_data)
+{
+	double recip_realPrecision = 1/realPrecision;
+	size_t dim0_offset = dim_1 * dim_2;
+	size_t dim1_offset = dim_2;
+
+	mean[0] = block_ori_data[0];
+
+	size_t unpredictable_count = 0;
+	size_t r1, r2, r3;
+	r1 = block_dim_0;
+	r2 = block_dim_1;
+	r3 = block_dim_2;
+
+	double * cur_data_pos = block_ori_data;
+	double curData;
+	double pred1D, pred2D, pred3D;
+	double itvNum;
+	double diff;
+	size_t i, j, k;
+	size_t r23 = r2*r3;
+	// Process Row-0 data 0
+	pred1D = mean[0];
+	curData = *cur_data_pos;
+	diff = curData - pred1D;
+	itvNum = fabs(diff)*recip_realPrecision + 1;
+	if (itvNum < exe_params->intvCapacity){
+		if (diff < 0) itvNum = -itvNum;
+		type[0] = (int) (itvNum/2) + exe_params->intvRadius;
+		P1[0] = pred1D + 2 * (type[0] - exe_params->intvRadius) * realPrecision;
+		//ganrantee comporession error against the case of machine-epsilon
+		if(fabs(curData-P1[0])>realPrecision){	
+			type[0] = 0;
+			P1[0] = curData;
+			unpredictable_data[unpredictable_count ++] = curData;
+		}		
+	}
+	else{
+		type[0] = 0;
+		P1[0] = curData;
+		unpredictable_data[unpredictable_count ++] = curData;
+	}
+
+	/* Process Row-0 data 1*/
+	pred1D = P1[0];
+	curData = cur_data_pos[1];
+	diff = curData - pred1D;
+	itvNum = fabs(diff)*recip_realPrecision + 1;
+	if (itvNum < exe_params->intvCapacity){
+		if (diff < 0) itvNum = -itvNum;
+		type[1] = (int) (itvNum/2) + exe_params->intvRadius;
+		P1[1] = pred1D + 2 * (type[1] - exe_params->intvRadius) * realPrecision;
+		//ganrantee comporession error against the case of machine-epsilon
+		if(fabs(curData-P1[1])>realPrecision){	
+			type[1] = 0;
+			P1[1] = curData;	
+			unpredictable_data[unpredictable_count ++] = curData;
+		}		
+	}
+	else{
+		type[1] = 0;
+		P1[1] = curData;
+		unpredictable_data[unpredictable_count ++] = curData;
+	}
+    /* Process Row-0 data 2 --> data r3-1 */
+	for (j = 2; j < r3; j++){
+		pred1D = 2*P1[j-1] - P1[j-2];
+		curData = cur_data_pos[j];
+		diff = curData - pred1D;
+		itvNum = fabs(diff)*recip_realPrecision + 1;
+		if (itvNum < exe_params->intvCapacity){
+			if (diff < 0) itvNum = -itvNum;
+			type[j] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[j] = pred1D + 2 * (type[j] - exe_params->intvRadius) * realPrecision;
+			//ganrantee comporession error against the case of machine-epsilon
+			if(fabs(curData-P1[j])>realPrecision){	
+				type[j] = 0;
+				P1[j] = curData;	
+				unpredictable_data[unpredictable_count ++] = curData;
+			}			
+		}
+		else{
+			type[j] = 0;
+			P1[j] = curData;
+			unpredictable_data[unpredictable_count ++] = curData;
+		}
+	}
+	cur_data_pos += dim1_offset;
+
+	/* Process Row-1 --> Row-r2-1 */
+	size_t index;
+	for (i = 1; i < r2; i++)
+	{
+		/* Process row-i data 0 */
+		index = i*r3;	
+		pred1D = P1[index-r3];
+		curData = *cur_data_pos;
+		diff = curData - pred1D;
+
+		itvNum = fabs(diff)*recip_realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[index] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			
+			//ganrantee comporession error against the case of machine-epsilon
+			if(fabs(curData-P1[index])>realPrecision)
+			{	
+				type[index] = 0;
+				P1[index] = curData;	
+				unpredictable_data[unpredictable_count ++] = curData;
+			}			
+		}
+		else
+		{
+			type[index] = 0;
+			P1[index] = curData;
+			unpredictable_data[unpredictable_count ++] = curData;
+		}
+
+		/* Process row-i data 1 --> data r3-1*/
+		for (j = 1; j < r3; j++)
+		{
+			index = i*r3+j;
+			pred2D = P1[index-1] + P1[index-r3] - P1[index-r3-1];
+
+			curData = cur_data_pos[j];
+			diff = curData - pred2D;
+
+			itvNum = fabs(diff)*recip_realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P1[index] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				
+				//ganrantee comporession error against the case of machine-epsilon
+				if(fabs(curData-P1[index])>realPrecision)
+				{	
+					type[index] = 0;
+					P1[index] = curData;	
+					unpredictable_data[unpredictable_count ++] = curData;
+				}				
+			}
+			else
+			{
+				type[index] = 0;
+				P1[index] = curData;
+				unpredictable_data[unpredictable_count ++] = curData;
+			}
+		}
+		cur_data_pos += dim1_offset;
+	}
+	cur_data_pos += dim0_offset - r2 * dim1_offset;
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (k = 1; k < r1; k++)
+	{
+		/* Process Row-0 data 0*/
+		index = k*r23;
+		pred1D = P1[0];
+		curData = *cur_data_pos;
+		diff = curData - pred1D;
+		itvNum = fabs(diff)*recip_realPrecision + 1;
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P0[0] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			//ganrantee comporession error against the case of machine-epsilon
+			if(fabs(curData-P0[0])>realPrecision)
+			{	
+				type[index] = 0;
+				P0[0] = curData;	
+				unpredictable_data[unpredictable_count ++] = curData;
+			}			
+		}
+		else
+		{
+			type[index] = 0;
+			P0[0] = curData;
+			unpredictable_data[unpredictable_count ++] = curData;
+		}
+	    /* Process Row-0 data 1 --> data r3-1 */
+		for (j = 1; j < r3; j++)
+		{
+			//index = k*r2*r3+j;
+			index ++;
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+			curData = cur_data_pos[j];
+			diff = curData - pred2D;
+			itvNum = fabs(diff)*recip_realPrecision + 1;
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[j] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				//ganrantee comporession error against the case of machine-epsilon
+				if(fabs(curData-P0[j])>realPrecision)
+				{	
+					type[index] = 0;
+					P0[j] = curData;	
+					unpredictable_data[unpredictable_count ++] = curData;
+				}
+			}
+			else
+			{
+				type[index] = 0;
+				P0[j] = curData;
+				unpredictable_data[unpredictable_count ++] = curData;
+			}
+		}
+
+		cur_data_pos += dim1_offset;
+	    /* Process Row-1 --> Row-r2-1 */
+		size_t index2D;
+		for (i = 1; i < r2; i++)
+		{
+			/* Process Row-i data 0 */
+			index = k*r23 + i*r3;
+			index2D = i*r3;		
+			pred2D = P0[index2D-r3] + P1[index2D] - P1[index2D-r3];
+			curData = *cur_data_pos;
+			diff = curData - pred2D;
+
+			itvNum = fabs(diff)*recip_realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[index2D] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				//ganrantee comporession error against the case of machine-epsilon
+				if(fabs(curData-P0[index2D])>realPrecision)
+				{	
+					type[index] = 0;
+					P0[index2D] = curData;	
+					unpredictable_data[unpredictable_count ++] = curData;
+				}				
+			}
+			else
+			{
+				type[index] = 0;
+				P0[index2D] = curData;
+				unpredictable_data[unpredictable_count ++] = curData;
+			}
+
+			/* Process Row-i data 1 --> data r3-1 */
+			for (j = 1; j < r3; j++)
+			{
+				//index = k*r2*r3 + i*r3 + j;			
+				index ++;
+				index2D = i*r3 + j;
+				pred3D = P0[index2D-1] + P0[index2D-r3]+ P1[index2D] - P0[index2D-r3-1] - P1[index2D-r3] - P1[index2D-1] + P1[index2D-r3-1];
+				curData = cur_data_pos[j];
+				diff = curData - pred3D;
+
+				itvNum = fabs(diff)*recip_realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					P0[index2D] = pred3D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+					
+					//ganrantee comporession error against the case of machine-epsilon
+					if(fabs(curData-P0[index2D])>realPrecision)
+					{	
+						type[index] = 0;
+						P0[index2D] = curData;	
+						unpredictable_data[unpredictable_count ++] = curData;
+					}					
+				}
+				else
+				{
+					type[index] = 0;
+					P0[index2D] = curData;
+					unpredictable_data[unpredictable_count ++] = curData;
+				}
+			}
+			cur_data_pos += dim1_offset;
+		}
+		cur_data_pos += dim0_offset - r2 * dim1_offset;
+		double *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+
+	return unpredictable_count;
+}
+
 unsigned int optimize_intervals_double_2D_opt(double *oriData, size_t r1, size_t r2, double realPrecision)
 {	
 	size_t i;
@@ -4791,9 +5091,6 @@ unsigned char * SZ_compress_double_2D_MDQ_nonblocked_with_blocked_regression(dou
 		if(mean_count > 0) mean = sum / mean_count;
 	}
 
-
-	double tmp_realPrecision = realPrecision;
-
 	// use two prediction buffers for higher performance
 	double * unpredictable_data = result_unpredictable_data;
 	unsigned char * indicator = (unsigned char *) malloc(num_blocks * sizeof(unsigned char));
@@ -4825,8 +5122,10 @@ unsigned char * SZ_compress_double_2D_MDQ_nonblocked_with_blocked_regression(dou
 	int * coeff_result_type = (int *) malloc(num_blocks*3*sizeof(int));
 	double * coeff_unpred_data[3];
 	double * coeff_unpredictable_data = (double *) malloc(num_blocks*3*sizeof(double));
-	double precision[3];
+	double precision[3], recip_precision[3];
 	precision[0] = precision_a, precision[1] = precision_b, precision[2] = precision_c;
+	recip_precision[0] = 1/precision_a, recip_precision[1] = 1/precision_b, recip_precision[2] = 1/precision_c;
+	double recip_realPrecision = 1/realPrecision;
 	for(int i=0; i<3; i++){
 		coeff_type[i] = coeff_result_type + i * num_blocks;
 		coeff_unpred_data[i] = coeff_unpredictable_data + i * num_blocks;
@@ -4888,7 +5187,7 @@ unsigned char * SZ_compress_double_2D_MDQ_nonblocked_with_blocked_regression(dou
 						for(int e=0; e<3; e++){
 							cur_coeff = reg_params_pos[e*num_blocks];
 							diff = cur_coeff - last_coeffcients[e];
-							itvNum = fabs(diff)/precision[e] + 1;
+							itvNum = fabs(diff)*recip_precision[e] + 1;
 							if (itvNum < coeff_intvCapacity_sz){
 								if (diff < 0) itvNum = -itvNum;
 								coeff_type[e][coeff_index] = (int) (itvNum/2) + coeff_intvRadius;
@@ -4920,7 +5219,7 @@ unsigned char * SZ_compress_double_2D_MDQ_nonblocked_with_blocked_regression(dou
 							curData = *cur_data_pos;
 							pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
 							diff = curData - pred;
-							itvNum = fabs(diff)/realPrecision + 1;
+							itvNum = fabs(diff)*recip_realPrecision + 1;
 							if (itvNum < intvCapacity){
 								if (diff < 0) itvNum = -itvNum;
 								type[index] = (int) (itvNum/2) + intvRadius;
@@ -4946,7 +5245,7 @@ unsigned char * SZ_compress_double_2D_MDQ_nonblocked_with_blocked_regression(dou
 							curData = *cur_data_pos;
 							pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
 							diff = curData - pred;
-							itvNum = fabs(diff)/realPrecision + 1;
+							itvNum = fabs(diff)*recip_realPrecision + 1;
 							if (itvNum < intvCapacity){
 								if (diff < 0) itvNum = -itvNum;
 								type[index] = (int) (itvNum/2) + intvRadius;
@@ -4978,7 +5277,7 @@ unsigned char * SZ_compress_double_2D_MDQ_nonblocked_with_blocked_regression(dou
 							curData = *cur_data_pos;
 							pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
 							diff = curData - pred;
-							itvNum = fabs(diff)/realPrecision + 1;
+							itvNum = fabs(diff)*recip_realPrecision + 1;
 							if (itvNum < intvCapacity){
 								if (diff < 0) itvNum = -itvNum;
 								type[index] = (int) (itvNum/2) + intvRadius;
@@ -5006,7 +5305,7 @@ unsigned char * SZ_compress_double_2D_MDQ_nonblocked_with_blocked_regression(dou
 							curData = *cur_data_pos;
 							pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
 							diff = curData - pred;
-							itvNum = fabs(diff)/realPrecision + 1;
+							itvNum = fabs(diff)*recip_realPrecision + 1;
 							if (itvNum < intvCapacity){
 								if (diff < 0) itvNum = -itvNum;
 								type[index] = (int) (itvNum/2) + intvRadius;
@@ -5060,14 +5359,14 @@ unsigned char * SZ_compress_double_2D_MDQ_nonblocked_with_blocked_regression(dou
 							{
 								pred2D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim0_offset - 1];
 								diff = curData - pred2D;
-								itvNum = fabs(diff)/realPrecision + 1;
+								itvNum = fabs(diff)*recip_realPrecision + 1;
 								if (itvNum < intvCapacity_sz){
 									if (diff < 0) itvNum = -itvNum;
 									type[index] = (int) (itvNum/2) + intvRadius;
-									*cur_pb_pos = pred2D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+									*cur_pb_pos = pred2D + 2 * (type[index] - intvRadius) * realPrecision;
 									if(type[index] <= intvRadius) type[index] -= 1;
 									//ganrantee comporession error against the case of machine-epsilon
-									if(fabs(curData - *cur_pb_pos)>tmp_realPrecision){	
+									if(fabs(curData - *cur_pb_pos)>realPrecision){	
 										type[index] = 0;
 										*cur_pb_pos = curData;	
 										unpredictable_data[unpredictable_count ++] = curData;
@@ -5100,14 +5399,14 @@ unsigned char * SZ_compress_double_2D_MDQ_nonblocked_with_blocked_regression(dou
 							{
 								pred2D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim0_offset - 1];
 								diff = curData - pred2D;
-								itvNum = fabs(diff)/realPrecision + 1;
+								itvNum = fabs(diff)*recip_realPrecision + 1;
 								if (itvNum < intvCapacity_sz){
 									if (diff < 0) itvNum = -itvNum;
 									type[index] = (int) (itvNum/2) + intvRadius;
-									*cur_pb_pos = pred2D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+									*cur_pb_pos = pred2D + 2 * (type[index] - intvRadius) * realPrecision;
 									if(type[index] <= intvRadius) type[index] -= 1;
 									//ganrantee comporession error against the case of machine-epsilon
-									if(fabs(curData - *cur_pb_pos)>tmp_realPrecision){	
+									if(fabs(curData - *cur_pb_pos)>realPrecision){	
 										type[index] = 0;
 										*cur_pb_pos = curData;	
 										unpredictable_data[unpredictable_count ++] = curData;
@@ -5197,7 +5496,7 @@ unsigned char * SZ_compress_double_2D_MDQ_nonblocked_with_blocked_regression(dou
 						for(int e=0; e<3; e++){
 							cur_coeff = reg_params_pos[e*num_blocks];
 							diff = cur_coeff - last_coeffcients[e];
-							itvNum = fabs(diff)/precision[e] + 1;
+							itvNum = fabs(diff)*recip_precision[e] + 1;
 							if (itvNum < coeff_intvCapacity_sz){
 								if (diff < 0) itvNum = -itvNum;
 								coeff_type[e][coeff_index] = (int) (itvNum/2) + coeff_intvRadius;
@@ -5229,7 +5528,7 @@ unsigned char * SZ_compress_double_2D_MDQ_nonblocked_with_blocked_regression(dou
 							curData = *cur_data_pos;
 							pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
 							diff = curData - pred;
-							itvNum = fabs(diff)/realPrecision + 1;
+							itvNum = fabs(diff)*recip_realPrecision + 1;
 							if (itvNum < intvCapacity){
 								if (diff < 0) itvNum = -itvNum;
 								type[index] = (int) (itvNum/2) + intvRadius;
@@ -5256,7 +5555,7 @@ unsigned char * SZ_compress_double_2D_MDQ_nonblocked_with_blocked_regression(dou
 							curData = *cur_data_pos;
 							pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
 							diff = curData - pred;
-							itvNum = fabs(diff)/realPrecision + 1;
+							itvNum = fabs(diff)*recip_realPrecision + 1;
 							if (itvNum < intvCapacity){
 								if (diff < 0) itvNum = -itvNum;
 								type[index] = (int) (itvNum/2) + intvRadius;
@@ -5288,7 +5587,7 @@ unsigned char * SZ_compress_double_2D_MDQ_nonblocked_with_blocked_regression(dou
 							curData = *cur_data_pos;
 							pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
 							diff = curData - pred;
-							itvNum = fabs(diff)/realPrecision + 1;
+							itvNum = fabs(diff)*recip_realPrecision + 1;
 							if (itvNum < intvCapacity){
 								if (diff < 0) itvNum = -itvNum;
 								type[index] = (int) (itvNum/2) + intvRadius;
@@ -5317,7 +5616,7 @@ unsigned char * SZ_compress_double_2D_MDQ_nonblocked_with_blocked_regression(dou
 							curData = *cur_data_pos;
 							pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2];
 							diff = curData - pred;
-							itvNum = fabs(diff)/realPrecision + 1;
+							itvNum = fabs(diff)*recip_realPrecision + 1;
 							if (itvNum < intvCapacity){
 								if (diff < 0) itvNum = -itvNum;
 								type[index] = (int) (itvNum/2) + intvRadius;
@@ -5365,13 +5664,13 @@ unsigned char * SZ_compress_double_2D_MDQ_nonblocked_with_blocked_regression(dou
 
 							pred2D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim0_offset - 1];
 							diff = curData - pred2D;
-							itvNum = fabs(diff)/realPrecision + 1;
+							itvNum = fabs(diff)*recip_realPrecision + 1;
 							if (itvNum < intvCapacity_sz){
 								if (diff < 0) itvNum = -itvNum;
 								type[index] = (int) (itvNum/2) + intvRadius;
-								*cur_pb_pos = pred2D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+								*cur_pb_pos = pred2D + 2 * (type[index] - intvRadius) * realPrecision;
 								//ganrantee comporession error against the case of machine-epsilon
-								if(fabs(curData - *cur_pb_pos)>tmp_realPrecision){	
+								if(fabs(curData - *cur_pb_pos)>realPrecision){	
 									type[index] = 0;
 									*cur_pb_pos = curData;	
 									unpredictable_data[unpredictable_count ++] = curData;
@@ -5398,13 +5697,13 @@ unsigned char * SZ_compress_double_2D_MDQ_nonblocked_with_blocked_regression(dou
 
 							pred2D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim0_offset - 1];
 							diff = curData - pred2D;
-							itvNum = fabs(diff)/realPrecision + 1;
+							itvNum = fabs(diff)*recip_realPrecision + 1;
 							if (itvNum < intvCapacity_sz){
 								if (diff < 0) itvNum = -itvNum;
 								type[index] = (int) (itvNum/2) + intvRadius;
-								*cur_pb_pos = pred2D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+								*cur_pb_pos = pred2D + 2 * (type[index] - intvRadius) * realPrecision;
 								//ganrantee comporession error against the case of machine-epsilon
-								if(fabs(curData - *cur_pb_pos)>tmp_realPrecision){	
+								if(fabs(curData - *cur_pb_pos)>realPrecision){	
 									type[index] = 0;
 									*cur_pb_pos = curData;	
 									unpredictable_data[unpredictable_count ++] = curData;
@@ -5684,8 +5983,6 @@ unsigned char * SZ_compress_double_3D_MDQ_nonblocked_with_blocked_regression(dou
 		if(mean_count > 0) mean = sum / mean_count;
 	}
 
-	double tmp_realPrecision = realPrecision;
-
 	// use two prediction buffers for higher performance
 	double * unpredictable_data = result_unpredictable_data;
 	unsigned char * indicator = (unsigned char *) malloc(num_blocks * sizeof(unsigned char));
@@ -5721,8 +6018,11 @@ unsigned char * SZ_compress_double_3D_MDQ_nonblocked_with_blocked_regression(dou
 	int * coeff_result_type = (int *) malloc(num_blocks*4*sizeof(int));
 	double * coeff_unpred_data[4];
 	double * coeff_unpredictable_data = (double *) malloc(num_blocks*4*sizeof(double));
-	double precision[4];
+	double precision[4], recip_precision[4];
 	precision[0] = precision_a, precision[1] = precision_b, precision[2] = precision_c, precision[3] = precision_d;
+	recip_precision[0] = 1/precision_a, recip_precision[1] = 1/precision_b, recip_precision[2] = 1/precision_c, recip_precision[3] = 1/precision_d;
+	double recip_realPrecision = 1/realPrecision;
+	
 	for(int i=0; i<4; i++){
 		coeff_type[i] = coeff_result_type + i * num_blocks;
 		coeff_unpred_data[i] = coeff_unpredictable_data + i * num_blocks;
@@ -5808,7 +6108,7 @@ unsigned char * SZ_compress_double_3D_MDQ_nonblocked_with_blocked_regression(dou
 							for(int e=0; e<4; e++){
 								cur_coeff = reg_params_pos[e*num_blocks];
 								diff = cur_coeff - last_coeffcients[e];
-								itvNum = fabs(diff)/precision[e] + 1;
+								itvNum = fabs(diff)*recip_precision[e] + 1;
 								if (itvNum < coeff_intvCapacity_sz){
 									if (diff < 0) itvNum = -itvNum;
 									coeff_type[e][coeff_index] = (int) (itvNum/2) + coeff_intvRadius;
@@ -5841,13 +6141,13 @@ unsigned char * SZ_compress_double_3D_MDQ_nonblocked_with_blocked_regression(dou
 									curData = *cur_data_pos;
 									pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2] * kk + last_coeffcients[3];									
 									diff = curData - pred;
-									itvNum = fabs(diff)/tmp_realPrecision + 1;
+									itvNum = fabs(diff)*recip_realPrecision + 1;
 									if (itvNum < intvCapacity){
 										if (diff < 0) itvNum = -itvNum;
 										type[index] = (int) (itvNum/2) + intvRadius;
-										pred = pred + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+										pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
 										//ganrantee comporession error against the case of machine-epsilon
-										if(fabs(curData - pred)>tmp_realPrecision){	
+										if(fabs(curData - pred)>realPrecision){	
 											type[index] = 0;
 											pred = curData;
 											unpredictable_data[block_unpredictable_count ++] = curData;
@@ -5885,13 +6185,13 @@ unsigned char * SZ_compress_double_3D_MDQ_nonblocked_with_blocked_regression(dou
 									curData = *cur_data_pos;
 									pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2] * kk + last_coeffcients[3];									
 									diff = curData - pred;
-									itvNum = fabs(diff)/tmp_realPrecision + 1;
+									itvNum = fabs(diff)*recip_realPrecision + 1;
 									if (itvNum < intvCapacity){
 										if (diff < 0) itvNum = -itvNum;
 										type[index] = (int) (itvNum/2) + intvRadius;
-										pred = pred + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+										pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
 										//ganrantee comporession error against the case of machine-epsilon
-										if(fabs(curData - pred)>tmp_realPrecision){	
+										if(fabs(curData - pred)>realPrecision){	
 											type[index] = 0;
 											pred = curData;
 											unpredictable_data[block_unpredictable_count ++] = curData;
@@ -5952,14 +6252,14 @@ unsigned char * SZ_compress_double_3D_MDQ_nonblocked_with_blocked_regression(dou
 										pred3D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim1_offset]+ cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim1_offset - 1]
 												 - cur_pb_pos[-strip_dim0_offset - 1] - cur_pb_pos[-strip_dim0_offset - strip_dim1_offset] + cur_pb_pos[-strip_dim0_offset - strip_dim1_offset - 1];
 										diff = curData - pred3D;
-										itvNum = fabs(diff)/realPrecision + 1;
+										itvNum = fabs(diff)*recip_realPrecision + 1;
 										if (itvNum < intvCapacity_sz){
 											if (diff < 0) itvNum = -itvNum;
 											type[index] = (int) (itvNum/2) + intvRadius;
-											*cur_pb_pos = pred3D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+											*cur_pb_pos = pred3D + 2 * (type[index] - intvRadius) * realPrecision;
 											if(type[index] <= intvRadius) type[index] -= 1;
 											//ganrantee comporession error against the case of machine-epsilon
-											if(fabs(curData - *cur_pb_pos)>tmp_realPrecision){	
+											if(fabs(curData - *cur_pb_pos)>realPrecision){	
 												type[index] = 0;
 												*cur_pb_pos = curData;	
 												unpredictable_data[unpredictable_count ++] = curData;
@@ -6005,14 +6305,14 @@ unsigned char * SZ_compress_double_3D_MDQ_nonblocked_with_blocked_regression(dou
 										pred3D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim1_offset]+ cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim1_offset - 1]
 												 - cur_pb_pos[-strip_dim0_offset - 1] - cur_pb_pos[-strip_dim0_offset - strip_dim1_offset] + cur_pb_pos[-strip_dim0_offset - strip_dim1_offset - 1];
 										diff = curData - pred3D;
-										itvNum = fabs(diff)/realPrecision + 1;
+										itvNum = fabs(diff)*recip_realPrecision + 1;
 										if (itvNum < intvCapacity_sz){
 											if (diff < 0) itvNum = -itvNum;
 											type[index] = (int) (itvNum/2) + intvRadius;
-											*cur_pb_pos = pred3D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+											*cur_pb_pos = pred3D + 2 * (type[index] - intvRadius) * realPrecision;
 											if(type[index] <= intvRadius) type[index] -= 1;
 											//ganrantee comporession error against the case of machine-epsilon
-											if(fabs(curData - *cur_pb_pos)>tmp_realPrecision){	
+											if(fabs(curData - *cur_pb_pos)>realPrecision){	
 												type[index] = 0;
 												*cur_pb_pos = curData;	
 												unpredictable_data[unpredictable_count ++] = curData;
@@ -6149,7 +6449,7 @@ unsigned char * SZ_compress_double_3D_MDQ_nonblocked_with_blocked_regression(dou
 							for(int e=0; e<4; e++){
 								cur_coeff = reg_params_pos[e*num_blocks];
 								diff = cur_coeff - last_coeffcients[e];
-								itvNum = fabs(diff)/precision[e] + 1;
+								itvNum = fabs(diff)*recip_precision[e] + 1;
 								if (itvNum < coeff_intvCapacity_sz){
 									if (diff < 0) itvNum = -itvNum;
 									coeff_type[e][coeff_index] = (int) (itvNum/2) + coeff_intvRadius;
@@ -6183,13 +6483,13 @@ unsigned char * SZ_compress_double_3D_MDQ_nonblocked_with_blocked_regression(dou
 									curData = *cur_data_pos;
 									pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2] * kk + last_coeffcients[3];									
 									diff = curData - pred;
-									itvNum = fabs(diff)/tmp_realPrecision + 1;
+									itvNum = fabs(diff)*recip_realPrecision + 1;
 									if (itvNum < intvCapacity){
 										if (diff < 0) itvNum = -itvNum;
 										type[index] = (int) (itvNum/2) + intvRadius;
-										pred = pred + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+										pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
 										//ganrantee comporession error against the case of machine-epsilon
-										if(fabs(curData - pred)>tmp_realPrecision){	
+										if(fabs(curData - pred)>realPrecision){	
 											type[index] = 0;
 											pred = curData;
 											unpredictable_data[block_unpredictable_count ++] = curData;
@@ -6227,13 +6527,13 @@ unsigned char * SZ_compress_double_3D_MDQ_nonblocked_with_blocked_regression(dou
 									curData = *cur_data_pos;
 									pred = last_coeffcients[0] * ii + last_coeffcients[1] * jj + last_coeffcients[2] * kk + last_coeffcients[3];									
 									diff = curData - pred;
-									itvNum = fabs(diff)/tmp_realPrecision + 1;
+									itvNum = fabs(diff)*recip_realPrecision + 1;
 									if (itvNum < intvCapacity){
 										if (diff < 0) itvNum = -itvNum;
 										type[index] = (int) (itvNum/2) + intvRadius;
-										pred = pred + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+										pred = pred + 2 * (type[index] - intvRadius) * realPrecision;
 										//ganrantee comporession error against the case of machine-epsilon
-										if(fabs(curData - pred)>tmp_realPrecision){	
+										if(fabs(curData - pred)>realPrecision){	
 											type[index] = 0;
 											pred = curData;
 											unpredictable_data[block_unpredictable_count ++] = curData;
@@ -6286,13 +6586,13 @@ unsigned char * SZ_compress_double_3D_MDQ_nonblocked_with_blocked_regression(dou
 									pred3D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim1_offset]+ cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim1_offset - 1]
 											 - cur_pb_pos[-strip_dim0_offset - 1] - cur_pb_pos[-strip_dim0_offset - strip_dim1_offset] + cur_pb_pos[-strip_dim0_offset - strip_dim1_offset - 1];
 									diff = curData - pred3D;
-									itvNum = fabs(diff)/realPrecision + 1;
+									itvNum = fabs(diff)*recip_realPrecision + 1;
 									if (itvNum < intvCapacity_sz){
 										if (diff < 0) itvNum = -itvNum;
 										type[index] = (int) (itvNum/2) + intvRadius;
-										*cur_pb_pos = pred3D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+										*cur_pb_pos = pred3D + 2 * (type[index] - intvRadius) * realPrecision;
 										//ganrantee comporession error against the case of machine-epsilon
-										if(fabs(curData - *cur_pb_pos)>tmp_realPrecision){	
+										if(fabs(curData - *cur_pb_pos)>realPrecision){	
 											type[index] = 0;
 											*cur_pb_pos = curData;	
 											unpredictable_data[unpredictable_count ++] = curData;
@@ -6330,13 +6630,13 @@ unsigned char * SZ_compress_double_3D_MDQ_nonblocked_with_blocked_regression(dou
 									pred3D = cur_pb_pos[-1] + cur_pb_pos[-strip_dim1_offset]+ cur_pb_pos[-strip_dim0_offset] - cur_pb_pos[-strip_dim1_offset - 1]
 											 - cur_pb_pos[-strip_dim0_offset - 1] - cur_pb_pos[-strip_dim0_offset - strip_dim1_offset] + cur_pb_pos[-strip_dim0_offset - strip_dim1_offset - 1];
 									diff = curData - pred3D;
-									itvNum = fabs(diff)/realPrecision + 1;
+									itvNum = fabs(diff)*recip_realPrecision + 1;
 									if (itvNum < intvCapacity_sz){
 										if (diff < 0) itvNum = -itvNum;
 										type[index] = (int) (itvNum/2) + intvRadius;
-										*cur_pb_pos = pred3D + 2 * (type[index] - intvRadius) * tmp_realPrecision;
+										*cur_pb_pos = pred3D + 2 * (type[index] - intvRadius) * realPrecision;
 										//ganrantee comporession error against the case of machine-epsilon
-										if(fabs(curData - *cur_pb_pos)>tmp_realPrecision){	
+										if(fabs(curData - *cur_pb_pos)>realPrecision){	
 											type[index] = 0;
 											*cur_pb_pos = curData;	
 											unpredictable_data[unpredictable_count ++] = curData;
diff --git a/sz/src/sz_omp.c b/sz/src/sz_omp.c
index c171de4c..555efe79 100644
--- a/sz/src/sz_omp.c
+++ b/sz/src/sz_omp.c
@@ -11,13 +11,6 @@
 #include <math.h>
 #include <time.h>
 
-unsigned char * SZ_compress_float_1D_MDQ_openmp(float *oriData, size_t r1, double realPrecision, size_t * comp_size){
-	return NULL;
-}
-unsigned char * SZ_compress_float_2D_MDQ_openmp(float *oriData, size_t r1, size_t r2, double realPrecision, size_t * comp_size){
-	return NULL;
-}
-
 double sz_wtime(){
 #ifdef _OPENMP
     return omp_get_wtime();
@@ -51,6 +44,13 @@ void sz_set_num_threads(int nthreads){
 #endif
 }
 
+unsigned char * SZ_compress_float_1D_MDQ_openmp(float *oriData, size_t r1, double realPrecision, size_t * comp_size){
+	return NULL;
+}
+unsigned char * SZ_compress_float_2D_MDQ_openmp(float *oriData, size_t r1, size_t r2, double realPrecision, size_t * comp_size){
+	return NULL;
+}
+
 unsigned char * SZ_compress_float_3D_MDQ_openmp(float *oriData, size_t r1, size_t r2, size_t r3, float realPrecision, size_t * comp_size){
 
 	float elapsed_time = 0.0;
@@ -319,10 +319,10 @@ unsigned char * SZ_compress_float_3D_MDQ_openmp(float *oriData, size_t r1, size_
 
 void decompressDataSeries_float_1D_openmp(float** data, size_t r1, unsigned char* comp_data){
 }
+
 void decompressDataSeries_float_2D_openmp(float** data, size_t r1, size_t r2, unsigned char* comp_data){
 }
 
-
 void decompressDataSeries_float_3D_openmp(float** data, size_t r1, size_t r2, size_t r3, unsigned char* comp_data){
 	
 	if(confparams_dec==NULL)
@@ -508,6 +508,438 @@ void decompressDataSeries_float_3D_openmp(float** data, size_t r1, size_t r2, si
 	SZ_ReleaseHuffman(huffmanTree);
 }
 
+//Double Precision
+
+unsigned char * SZ_compress_double_1D_MDQ_openmp(double *oriData, size_t r1, double realPrecision, size_t * comp_size){
+	return NULL;
+}
+
+unsigned char * SZ_compress_double_2D_MDQ_openmp(double *oriData, size_t r1, size_t r2, double realPrecision, size_t * comp_size){
+	return NULL;
+}
+
+unsigned char * SZ_compress_double_3D_MDQ_openmp(double *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size){
+
+	float elapsed_time = 0.0;
+
+	elapsed_time = -sz_wtime();
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		// quantization_intervals = optimize_intervals_float_3D(oriData, r1, realPrecision);
+		quantization_intervals = optimize_intervals_double_3D_opt(oriData, r1, r2, r3, realPrecision);
+		//quantization_intervals = 32768;
+		printf("3D number of bins: %d\nerror bound %.20f\n", quantization_intervals, realPrecision);
+		// exit(0);		
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else{
+		quantization_intervals = exe_params->intvCapacity;
+	}
+	elapsed_time += sz_wtime();
+	printf("opt interval time: %.4f\n", elapsed_time);
+
+	elapsed_time = -sz_wtime();
+	int thread_num = sz_get_max_threads();
+	int thread_order = (int)log2(thread_num);
+	size_t num_x = 0, num_y = 0, num_z = 0;
+	{
+		int block_thread_order = thread_order / 3;
+		switch(thread_order % 3){
+			case 0:{
+				num_x = 1 << block_thread_order;
+				num_y = 1 << block_thread_order;
+				num_z = 1 << block_thread_order;
+				break;
+			}
+			case 1:{
+				num_x = 1 << (block_thread_order + 1);
+				num_y = 1 << block_thread_order;
+				num_z = 1 << block_thread_order;
+				break;
+			}
+			case 2:{
+				num_x = 1 << (block_thread_order + 1);
+				num_y = 1 << (block_thread_order + 1);
+				num_z = 1 << block_thread_order;
+				break;
+			}
+		}
+		thread_num = num_x * num_y * num_z;
+	}
+	sz_set_num_threads(thread_num);
+	// calculate block dims
+	printf("number of blocks: %zu %zu %zu\n", num_x, num_y, num_z);
+
+	size_t split_index_x, split_index_y, split_index_z;
+	size_t early_blockcount_x, early_blockcount_y, early_blockcount_z;
+	size_t late_blockcount_x, late_blockcount_y, late_blockcount_z;
+	SZ_COMPUTE_BLOCKCOUNT(r1, num_x, split_index_x, early_blockcount_x, late_blockcount_x);
+	SZ_COMPUTE_BLOCKCOUNT(r2, num_y, split_index_y, early_blockcount_y, late_blockcount_y);
+	SZ_COMPUTE_BLOCKCOUNT(r3, num_z, split_index_z, early_blockcount_z, late_blockcount_z);
+
+	size_t max_num_block_elements = early_blockcount_x * early_blockcount_y * early_blockcount_z;
+	size_t num_blocks = num_x * num_y * num_z;
+	size_t num_elements = r1 * r2 * r3;
+	// printf("max_num_block_elements %d num_blocks %d\n", max_num_block_elements, num_blocks);
+
+	size_t dim0_offset = r2 * r3;
+	size_t dim1_offset = r3;
+	
+	// printf("malloc blockinfo array start\n");
+	// fflush(stdout);
+
+	size_t buffer_size = early_blockcount_y * early_blockcount_z * sizeof(double);
+	int * result_type = (int *) malloc(num_elements * sizeof(int));
+	size_t unpred_data_max_size = max_num_block_elements;
+	double * result_unpredictable_data = (double *) malloc(unpred_data_max_size * sizeof(double) * num_blocks);
+	unsigned int * unpredictable_count = (unsigned int *) malloc(num_blocks * sizeof(unsigned int));
+	double * mean = malloc(num_blocks * sizeof(double));
+	double * buffer0, * buffer1;
+	buffer0 = (double *) malloc(buffer_size * thread_num);
+	buffer1 = (double *) malloc(buffer_size * thread_num);
+	unsigned char * result = (unsigned char *) malloc(num_elements * (sizeof(int) + sizeof(double)));
+	size_t * unpred_offset = (size_t *) malloc(num_blocks * sizeof(size_t));
+	unsigned char * encoding_buffer = (unsigned char *) malloc(max_num_block_elements * sizeof(int) * num_blocks);
+	size_t * block_offset = (size_t *) malloc(num_blocks * sizeof(size_t));
+	size_t *freq = (size_t *)malloc(thread_num*quantization_intervals*4*sizeof(size_t));
+	memset(freq, 0, thread_num*quantization_intervals*4*sizeof(size_t));
+	
+	size_t stateNum = quantization_intervals*2;
+	HuffmanTree* huffmanTree = createHuffmanTree(stateNum);	
+	
+	int num_yz = num_y * num_z;
+	#pragma omp parallel for
+	for(int t=0; t<thread_num; t++){
+		int id = sz_get_thread_num();
+		int i = id/(num_yz);
+		int j = (id % num_yz) / num_z;
+		int k = id % num_z;
+		// printf("%d: %d %d %d\n", sz_get_thread_num(), i, j, k);
+		size_t offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+		size_t offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+		size_t offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+		double * data_pos = oriData + offset_x * dim0_offset + offset_y * dim1_offset + offset_z;
+
+		size_t current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+		size_t current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+		size_t current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+		size_t type_offset = offset_x * dim0_offset +  offset_y * current_blockcount_x * dim1_offset + offset_z * current_blockcount_x * current_blockcount_y;
+		int * type = result_type + type_offset;
+
+		double * unpredictable_data = result_unpredictable_data + id * unpred_data_max_size;
+		double *P0, *P1; // buffer
+
+		P0 = buffer0 + id * early_blockcount_y * early_blockcount_z;
+		P1 = buffer1 + id * early_blockcount_y * early_blockcount_z;
+		unpredictable_count[id] = SZ_compress_double_3D_MDQ_RA_block(data_pos, mean + id, r1, r2, r3, current_blockcount_x, current_blockcount_y, current_blockcount_z, realPrecision, P0, P1, type, unpredictable_data);
+	}
+	elapsed_time += sz_wtime();
+	printf("compression and quantization time: %.4f\n", elapsed_time);
+	elapsed_time = -sz_wtime();
+	// printf("unpred count:\n");
+	// for(int i=0; i<num_blocks; i++){
+	// 	printf("%d ", unpredictable_count[i]);
+	// }
+	// printf("\n");
+	// printf("total_unpred num: %d\n", total_unpred);
+	// printf("Block wise compression end, num_elements %ld\n", num_elements);
+	// huffman encode
+
+	size_t nodeCount = 0;
+	Huffman_init_openmp(huffmanTree, result_type, num_elements, thread_num, freq);
+	elapsed_time += sz_wtime();
+	printf("Build Huffman: %.4f\n", elapsed_time);
+	elapsed_time = -sz_wtime();
+	for (size_t i = 0; i < stateNum; i++)
+		if (huffmanTree->code[i]) nodeCount++;
+	nodeCount = nodeCount*2-1;
+	unsigned char *treeBytes;
+	unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree, nodeCount, &treeBytes);
+
+	unsigned int meta_data_offset = 3 + 1 + MetaDataByteLength;
+	size_t total_unpred = 0;
+	for(int i=0; i<num_blocks; i++){
+		total_unpred += unpredictable_count[i];
+		// printf("%d: %d mean %.2f\n", i, unpredictable_count[i], mean[i]);
+	}
+	unsigned char * result_pos = result;
+	initRandomAccessBytes(result_pos);
+	result_pos += meta_data_offset;
+
+	size_t enCodeSize = 0;
+
+	intToBytes_bigEndian(result_pos, thread_num);
+	result_pos += sizeof(int);
+	doubleToBytes(result_pos, realPrecision);
+	result_pos += sizeof(double);
+	intToBytes_bigEndian(result_pos, quantization_intervals);
+	result_pos += 4;
+	intToBytes_bigEndian(result_pos, treeByteSize);
+	result_pos += 4;
+	intToBytes_bigEndian(result_pos, nodeCount);
+	result_pos += 4;
+	memcpy(result_pos, treeBytes, treeByteSize);
+	result_pos += treeByteSize;
+
+	memcpy(result_pos, unpredictable_count, num_blocks * sizeof(unsigned int));
+	result_pos += num_blocks * sizeof(unsigned int);
+	memcpy(result_pos, mean, num_blocks * sizeof(double));
+	result_pos += num_blocks * sizeof(double);	
+
+	unpred_offset[0] = 0;
+	for(int t=1; t<thread_num; t++){
+		unpred_offset[t] = unpredictable_count[t-1] + unpred_offset[t-1];
+	}
+	
+	#pragma omp parallel for
+	for(int t=0; t<thread_num; t++){
+		int id = sz_get_thread_num();
+		double * unpredictable_data = result_unpredictable_data + id * unpred_data_max_size;
+		memcpy(result_pos + unpred_offset[id] * sizeof(double), unpredictable_data, unpredictable_count[id] * sizeof(double));		
+	}
+	result_pos += total_unpred * sizeof(double);
+
+	elapsed_time += sz_wtime();
+	printf("write misc time: %.4f\n", elapsed_time);
+	elapsed_time = -sz_wtime();
+
+	size_t * block_pos = (size_t *) result_pos;
+	result_pos += num_blocks * sizeof(size_t);
+	#pragma omp parallel for
+	for(int t=0; t<thread_num; t++){
+		int id = sz_get_thread_num();
+		int i = id/(num_yz);
+		int j = (id % num_yz) / num_z;
+		int k = id % num_z;
+		unsigned char * encoding_buffer_pos = encoding_buffer + id * max_num_block_elements * sizeof(int);
+		size_t enCodeSize = 0;
+		size_t offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+		size_t offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+		size_t offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+		size_t current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+		size_t current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+		size_t current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+		size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+		size_t type_offset = offset_x * dim0_offset +  offset_y * current_blockcount_x * dim1_offset + offset_z * current_blockcount_x * current_blockcount_y;
+		int * type = result_type + type_offset;
+		encode(huffmanTree, type, current_block_elements, encoding_buffer_pos, &enCodeSize);
+		block_pos[id] = enCodeSize;
+	}
+	elapsed_time += sz_wtime();
+	printf("Parallel Huffman encoding elapsed time: %.4f\n", elapsed_time);
+	elapsed_time = -sz_wtime();
+	// for(int t=0; t<thread_num; t++){
+	// 	memcpy(result_pos, encoding_buffer + t * max_num_block_elements * sizeof(int), block_pos[t]);
+	// 	result_pos += block_pos[t];
+	// }
+	block_offset[0] = 0;
+	for(int t=1; t<thread_num; t++){
+		block_offset[t] = block_pos[t-1] + block_offset[t-1];
+	}
+	#pragma omp parallel for
+	for(int t=0; t<thread_num; t++){
+		int id = sz_get_thread_num();
+		memcpy(result_pos + block_offset[id], encoding_buffer + t * max_num_block_elements * sizeof(int), block_pos[t]);		
+	}
+	result_pos += block_offset[thread_num - 1] + block_pos[thread_num - 1];
+
+	elapsed_time += sz_wtime();
+	printf("Final copy elapsed time: %.4f\n", elapsed_time);
+	// {
+	// 	int status;
+	// 	writeIntData_inBytes(result_type, num_elements, "/Users/LiangXin/github/SZ-develop/example/openmp/comp001_type.dat", &status);
+	// }
+
+	// int status;
+	// writeIntData_inBytes(result_type, num_elements, "/Users/LiangXin/github/SZ-develop/example/openmp/omp_type.dat", &status);
+	// printf("type array size: %ld\n", enCodeSize);
+	result_pos += enCodeSize;
+	size_t totalEncodeSize = 0;
+	totalEncodeSize = result_pos - result;
+	// printf("Total size %ld\n", totalEncodeSize);
+	free(freq);
+	free(buffer0);
+	free(buffer1);
+	free(treeBytes);
+	free(unpred_offset);
+	free(block_offset);
+	free(encoding_buffer);
+	free(mean);
+	free(result_unpredictable_data);
+	free(unpredictable_count);
+	free(result_type);
+	SZ_ReleaseHuffman(huffmanTree);
+
+	*comp_size = totalEncodeSize;
+	return result;
+}
+
+void decompressDataSeries_double_1D_openmp(double** data, size_t r1, unsigned char* comp_data){
+}
+
+void decompressDataSeries_double_2D_openmp(double** data, size_t r1, size_t r2, unsigned char* comp_data){
+}
+
+void decompressDataSeries_double_3D_openmp(double** data, size_t r1, size_t r2, size_t r3, unsigned char* comp_data)
+{
+	if(confparams_dec==NULL)
+		confparams_dec = (sz_params*)malloc(sizeof(sz_params));
+	memset(confparams_dec, 0, sizeof(sz_params));
+	if(exe_params==NULL)
+		exe_params = (sz_exedata*)malloc(sizeof(sz_exedata));
+	memset(exe_params, 0, sizeof(sz_exedata));	
+	
+	// printf("num_block_elements %d num_blocks %d\n", max_num_block_elements, num_blocks);
+	// fflush(stdout);
+	double elapsed_time = 0.0;
+	elapsed_time = -sz_wtime();
+
+	size_t dim0_offset = r2 * r3;
+	size_t dim1_offset = r3;
+	size_t num_elements = r1 * r2 * r3;
+	
+	unsigned char * comp_data_pos = comp_data;
+	//int meta_data_offset = 3 + 1 + MetaDataByteLength;
+	//comp_data_pos += meta_data_offset;
+
+	int thread_num = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += sizeof(int);
+	int thread_order = (int)log2(thread_num);
+	size_t num_x = 0, num_y = 0, num_z = 0;
+	{
+		int block_thread_order = thread_order / 3;
+		switch(thread_order % 3){
+			case 0:{
+				num_x = 1 << block_thread_order;
+				num_y = 1 << block_thread_order;
+				num_z = 1 << block_thread_order;
+				break;
+			}
+			case 1:{
+				num_x = 1 << (block_thread_order + 1);
+				num_y = 1 << block_thread_order;
+				num_z = 1 << block_thread_order;
+				break;
+			}
+			case 2:{
+				num_x = 1 << (block_thread_order + 1);
+				num_y = 1 << (block_thread_order + 1);
+				num_z = 1 << block_thread_order;
+				break;
+			}
+		}
+	}
+	
+	printf("number of blocks: %zu %zu %zu, thread_num %d\n", num_x, num_y, num_z, thread_num);
+	sz_set_num_threads(thread_num);
+	size_t split_index_x, split_index_y, split_index_z;
+	size_t early_blockcount_x, early_blockcount_y, early_blockcount_z;
+	size_t late_blockcount_x, late_blockcount_y, late_blockcount_z;
+	SZ_COMPUTE_BLOCKCOUNT(r1, num_x, split_index_x, early_blockcount_x, late_blockcount_x);
+	SZ_COMPUTE_BLOCKCOUNT(r2, num_y, split_index_y, early_blockcount_y, late_blockcount_y);
+	SZ_COMPUTE_BLOCKCOUNT(r3, num_z, split_index_z, early_blockcount_z, late_blockcount_z);
+
+	size_t num_blocks = num_x * num_y * num_z;
+	size_t * unpred_offset = (size_t *) malloc(num_blocks * sizeof(size_t));
+	*data = (double*)malloc(sizeof(double)*num_elements);
+	int * result_type = (int *) malloc(num_elements * sizeof(int));
+	size_t * block_offset = (size_t *) malloc(num_blocks * sizeof(size_t));
+
+	double realPrecision = bytesToDouble(comp_data_pos);
+	comp_data_pos += sizeof(double);
+	unsigned int intervals = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += sizeof(double);
+
+	size_t stateNum = intervals*2;
+	HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+
+	updateQuantizationInfo(intervals);
+	// exe_params->intvRadius = (int)((tdps->intervals - 1)/ 2);
+
+	unsigned int tree_size = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += sizeof(unsigned int);
+	size_t huffman_nodes = bytesToInt_bigEndian(comp_data_pos);
+	huffmanTree->allNodes = huffman_nodes;
+	// printf("Reconstruct huffman tree with node count %ld\n", nodeCount);
+	// fflush(stdout);
+	node root = reconstruct_HuffTree_from_bytes_anyStates(huffmanTree, comp_data_pos+4, huffmanTree->allNodes);
+
+	comp_data_pos += 4 + tree_size;
+	unsigned int * unpred_count = (unsigned int *) comp_data_pos;
+	comp_data_pos += num_blocks * sizeof(unsigned int);
+	double * mean_pos = (double *) comp_data_pos;
+	comp_data_pos += num_blocks * sizeof(double);
+	double * result_unpredictable_data = (double *) comp_data_pos;
+	size_t total_unpred = 0;
+	for(int i=0; i<num_blocks; i++){
+		unpred_offset[i] = total_unpred;
+		total_unpred += unpred_count[i];
+	}
+	comp_data_pos += total_unpred * sizeof(double);
+
+	size_t * block_pos = (size_t *) comp_data_pos;
+	comp_data_pos += num_blocks * sizeof(size_t);
+	block_offset[0] = 0;
+	for(int t=1; t<thread_num; t++){
+		block_offset[t] = block_pos[t-1] + block_offset[t-1];
+	}
+	int num_yz = num_y * num_z;
+	elapsed_time += sz_wtime();
+	printf("Read data info elapsed time: %.4f\n", elapsed_time);
+	elapsed_time = -sz_wtime();
+	#pragma omp parallel for
+	for(int t=0; t<thread_num; t++){
+		int id = sz_get_thread_num();
+		int i = id/(num_yz);
+		int j = (id % num_yz) / num_z;
+		int k = id % num_z;
+		size_t offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+		size_t offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+		size_t offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+		size_t current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+		size_t current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+		size_t current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+		size_t type_offset = offset_x * dim0_offset +  offset_y * current_blockcount_x * dim1_offset + offset_z * current_blockcount_x * current_blockcount_y;
+		int * type = result_type + type_offset;
+		decode(comp_data_pos + block_offset[id], current_blockcount_x*current_blockcount_y*current_blockcount_z, root, type);
+	}
+	elapsed_time += sz_wtime();
+	printf("Parallel Huffman decoding elapsed time: %.4f\n", elapsed_time);
+	elapsed_time = -sz_wtime();
+
+	#pragma omp parallel for
+	for(int t=0; t<thread_num; t++){
+		int id = sz_get_thread_num();
+		int i = id/(num_yz);
+		int j = (id % num_yz) / num_z;
+		int k = id % num_z;
+		// printf("%d: %d %d %d\n", sz_get_thread_num(), i, j, k);
+		size_t offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+		size_t offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+		size_t offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+		double * data_pos = *data + offset_x * dim0_offset + offset_y * dim1_offset + offset_z;
+
+		size_t current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+		size_t current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+		size_t current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+		size_t type_offset = offset_x * dim0_offset +  offset_y * current_blockcount_x * dim1_offset + offset_z * current_blockcount_x * current_blockcount_y;
+		int * type = result_type + type_offset;
+
+		double * unpredictable_data = result_unpredictable_data + unpred_offset[id];
+		double mean = mean_pos[id];
+
+		decompressDataSeries_double_3D_RA_block(data_pos, mean, r1, r2, r3, current_blockcount_x, current_blockcount_y, current_blockcount_z, realPrecision, type, unpredictable_data);
+	}	
+	elapsed_time += sz_wtime();
+	printf("Parallel decompress elapsed time: %.4f\n", elapsed_time);
+
+	free(block_offset);
+	free(result_type);
+	free(unpred_offset);
+	SZ_ReleaseHuffman(huffmanTree);
+}
+
 void Huffman_init_openmp(HuffmanTree* huffmanTree, int *s, size_t length, int thread_num, size_t * freq){
 
 	size_t i;
diff --git a/sz/src/szd_double.c b/sz/src/szd_double.c
index 14a194bd..7a10b342 100644
--- a/sz/src/szd_double.c
+++ b/sz/src/szd_double.c
@@ -2784,6 +2784,159 @@ void getSnapshotData_double_4D(double** data, size_t r1, size_t r2, size_t r3, s
 	}
 }
 
+size_t decompressDataSeries_double_3D_RA_block(double * data, double mean, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, double realPrecision, int * type, double * unpredictable_data)
+{
+	size_t dim0_offset = dim_1 * dim_2;
+	size_t dim1_offset = dim_2;
+
+	size_t unpredictable_count = 0;
+	size_t r1, r2, r3;
+	r1 = block_dim_0;
+	r2 = block_dim_1;
+	r3 = block_dim_2;
+
+	double * cur_data_pos = data;
+	double * last_row_pos;
+	double pred1D, pred2D, pred3D;
+	size_t i, j, k;
+	size_t r23 = r2*r3;
+	int type_;
+	// Process Row-0 data 0
+	pred1D = mean;
+	type_ = type[0];
+	// printf("Type 0 %d, mean %.4f\n", type_, mean);
+	if (type_ != 0){
+		cur_data_pos[0] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+	}
+	else{
+		cur_data_pos[0] = unpredictable_data[unpredictable_count ++];
+	}
+
+	/* Process Row-0 data 1*/
+	pred1D = cur_data_pos[0];
+	type_ = type[1];
+	if (type_ != 0){
+		cur_data_pos[1] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+	}
+	else{
+		cur_data_pos[1] = unpredictable_data[unpredictable_count ++];
+	}
+    /* Process Row-0 data 2 --> data r3-1 */
+	for (j = 2; j < r3; j++){
+		pred1D = 2*cur_data_pos[j-1] - cur_data_pos[j-2];
+		type_ = type[j];
+		if (type_ != 0){
+			cur_data_pos[j] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else{
+			cur_data_pos[j] = unpredictable_data[unpredictable_count ++];
+		}
+	}
+
+	last_row_pos = cur_data_pos;
+	cur_data_pos += dim1_offset;
+
+	/* Process Row-1 --> Row-r2-1 */
+	size_t index;
+	for (i = 1; i < r2; i++)
+	{
+		/* Process row-i data 0 */
+		index = i*r3;	
+		pred1D = last_row_pos[0];
+		type_ = type[index];
+		if (type_ != 0){
+			cur_data_pos[0] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else{
+			cur_data_pos[0] = unpredictable_data[unpredictable_count ++];
+		}
+		/* Process row-i data 1 --> data r3-1*/
+		for (j = 1; j < r3; j++)
+		{
+			index = i*r3+j;
+			pred2D = cur_data_pos[j-1] + last_row_pos[j] - last_row_pos[j-1];
+			type_ = type[index];
+			if (type_ != 0){
+				cur_data_pos[j] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else{
+				cur_data_pos[j] = unpredictable_data[unpredictable_count ++];
+			}
+			// printf("pred2D %.2f cur_data %.2f last_row_data %.2f %.2f, result %.2f\n", pred2D, cur_data_pos[j-1], last_row_pos[j], last_row_pos[j-1], cur_data_pos[j]);
+			// getchar();
+		}
+		last_row_pos = cur_data_pos;
+		cur_data_pos += dim1_offset;
+	}
+	cur_data_pos += dim0_offset - r2 * dim1_offset;
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (k = 1; k < r1; k++)
+	{
+		/* Process Row-0 data 0*/
+		index = k*r23;
+		pred1D = cur_data_pos[- dim0_offset];
+		type_ = type[index];
+		if (type_ != 0){
+			cur_data_pos[0] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else{
+			cur_data_pos[0] = unpredictable_data[unpredictable_count ++];
+		}
+	    /* Process Row-0 data 1 --> data r3-1 */
+		for (j = 1; j < r3; j++)
+		{
+			//index = k*r2*r3+j;
+			index ++;
+			pred2D = cur_data_pos[j-1] + cur_data_pos[j - dim0_offset] - cur_data_pos[j - 1 - dim0_offset];
+			type_ = type[index];
+			if (type_ != 0){
+				cur_data_pos[j] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else{
+				cur_data_pos[j] = unpredictable_data[unpredictable_count ++];
+			}
+		}
+		last_row_pos = cur_data_pos;
+		cur_data_pos += dim1_offset;
+
+	    /* Process Row-1 --> Row-r2-1 */
+		for (i = 1; i < r2; i++)
+		{
+			/* Process Row-i data 0 */
+			index = k*r23 + i*r3;
+			pred2D = last_row_pos[0] + cur_data_pos[- dim0_offset] - last_row_pos[- dim0_offset];
+			type_ = type[index];
+			if (type_ != 0){
+				cur_data_pos[0] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else{
+				cur_data_pos[0] = unpredictable_data[unpredictable_count ++];
+			}
+
+			/* Process Row-i data 1 --> data r3-1 */
+			for (j = 1; j < r3; j++)
+			{
+				//index = k*r2*r3 + i*r3 + j;			
+				index ++;
+				pred3D = cur_data_pos[j-1] + last_row_pos[j]+ cur_data_pos[j - dim0_offset] - last_row_pos[j-1] - last_row_pos[j - dim0_offset] - cur_data_pos[j-1 - dim0_offset] + last_row_pos[j-1 - dim0_offset];
+				type_ = type[index];
+				if (type_ != 0){
+					cur_data_pos[j] = pred3D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				}
+				else{
+					cur_data_pos[j] = unpredictable_data[unpredictable_count ++];
+				}
+			}
+			last_row_pos = cur_data_pos;
+			cur_data_pos += dim1_offset;
+		}
+		cur_data_pos += dim0_offset - r2 * dim1_offset;
+	}
+
+	return unpredictable_count;
+}
+
 void decompressDataSeries_double_2D_nonblocked_with_blocked_regression(double** data, size_t r1, size_t r2, unsigned char* comp_data, double* hist_data){
 
 	size_t dim0_offset = r2;