diff --git a/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.c.src b/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.c.src index ab770b14..69f6c2a6 100644 --- a/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.c.src +++ b/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.c.src @@ -296,8 +296,8 @@ int countpairs_mocks_DOUBLE(const int64_t ND1, DOUBLE *ra1, DOUBLE *dec1, DOUBLE double *rupp = bins->edges; int nrpbin = bins->nedges; double rpmin=bins->edges[0], rpmax=bins->edges[bins->nedges-1]; - if( ! (rpmin > 0.0 && rpmax > 0.0 && rpmin < rpmax && nrpbin > 0)) { - fprintf(stderr,"Error: Could not setup with R bins correctly. (rmin = %lf, rmax = %lf, with nbins = %d). Expected non-zero rmin/rmax with rmax > rmin and nbins >=1 \n", + if( ! (rpmin >= 0.0 && rpmax > 0.0 && rpmin < rpmax && nrpbin > 0)) { + fprintf(stderr,"Error: Could not setup with r bins correctly. (rmin = %lf, rmax = %lf, with nbins = %d). Expected positive rmin/rmax with rmax > rmin and nbins >=1 \n", rpmin, rpmax, nrpbin); return EXIT_FAILURE; } @@ -333,7 +333,7 @@ int countpairs_mocks_DOUBLE(const int64_t ND1, DOUBLE *ra1, DOUBLE *dec1, DOUBLE const double zmax = czmax * inv_speed_of_light + 0.01; const int workspace_size = 10000; - double *interp_redshift = my_calloc(sizeof(*interp_redshift), workspace_size);//the interpolation is done in 'z' and not in 'cz' + double *interp_redshift = my_calloc(sizeof(*interp_redshift), workspace_size);//the interpolation is done in 'z' and not in 'cz' double *interp_comoving_dist = my_calloc(sizeof(*interp_comoving_dist),workspace_size); int Nzdc = set_cosmo_dist(zmax, workspace_size, interp_redshift, interp_comoving_dist, cosmology); if(Nzdc < 0) { @@ -344,7 +344,7 @@ int countpairs_mocks_DOUBLE(const int64_t ND1, DOUBLE *ra1, DOUBLE *dec1, DOUBLE gsl_interp *interpolation; gsl_interp_accel *accelerator; accelerator = gsl_interp_accel_alloc(); - interpolation = gsl_interp_alloc (gsl_interp_linear,Nzdc); + interpolation = gsl_interp_alloc(gsl_interp_linear,Nzdc); gsl_interp_init(interpolation, interp_redshift, interp_comoving_dist, Nzdc); for(int64_t i=0;iweights0), (weight_struct_DOUBLE *) &(extra->weights0), &(extra->pair_weight)); weight_func_t_DOUBLE weight_func = get_weight_func_by_method_DOUBLE(extra->weight_method); - pair_struct_DOUBLE pair = {.num_weights = extra->weights0.num_weights, - .num_integer_weights = extra->weights0.num_integer_weights, - .dx.d=0., .dy.d=0., .dz.d=0., // always 0 separation - .parx.d=0., .pary.d=0., .parz.d=0.}; for(int64_t j = 0; j < ND1; j++){ for(int w = 0; w < pair.num_weights; w++){ pair.weights0[w].d = ((DOUBLE *) extra->weights0.weights[w])[j]; pair.weights1[w].d = ((DOUBLE *) extra->weights0.weights[w])[j]; } - weightavg[1] += weight_func(&pair); + weightavg[npibin+1] += weight_func(&pair); } } } diff --git a/mocks/DDsmu_mocks/countpairs_s_mu_mocks_impl.c.src b/mocks/DDsmu_mocks/countpairs_s_mu_mocks_impl.c.src index cae0a6db..93c44e46 100644 --- a/mocks/DDsmu_mocks/countpairs_s_mu_mocks_impl.c.src +++ b/mocks/DDsmu_mocks/countpairs_s_mu_mocks_impl.c.src @@ -297,8 +297,8 @@ int countpairs_mocks_s_mu_DOUBLE(const int64_t ND1, DOUBLE *ra1, DOUBLE *dec1, D double *supp = bins->edges; int nsbin = bins->nedges; double smin=bins->edges[0], smax=bins->edges[bins->nedges-1]; - if( ! (smin > 0.0 && smax > 0.0 && smin < smax && nsbin > 0)) { - fprintf(stderr,"Error: Could not setup with S bins correctly. (smin = %lf, smax = %lf, with nbins = %d). Expected non-zero smin/smax with smax > smin and nbins >=1 \n", + if( ! (smin >= 0.0 && smax > 0.0 && smin < smax && nsbin > 0)) { + fprintf(stderr,"Error: Could not setup with s bins correctly. (smin = %lf, smax = %lf, with nbins = %d). Expected non-zero smin/smax with smax > smin and nbins >=1 \n", smin, smax, nsbin); return EXIT_FAILURE; } @@ -761,24 +761,22 @@ int countpairs_mocks_s_mu_DOUBLE(const int64_t ND1, DOUBLE *ra1, DOUBLE *dec1, D /* Then, add all the self-pairs. This ensures that a cross-correlation with two identical datasets produces the same result as the auto-correlation */ - npairs[1] += ND1; //npairs[1] contains the first valid bin. + npairs[nmu_bins+1] += ND1; //npairs[1] contains the first valid bin. // Increasing npairs affects rpavg and weightavg. // We don't need to add anything to rpavg; all the self-pairs have 0 separation! // The self-pairs have non-zero weight, though. So, fix that here. if(need_weightavg){ // Keep in mind this is an autocorrelation (i.e. only one particle set to consider) + pair_struct_DOUBLE pair; + set_pair_struct_DOUBLE(&pair, (weight_struct_DOUBLE *) &(extra->weights0), (weight_struct_DOUBLE *) &(extra->weights0), &(extra->pair_weight)); weight_func_t_DOUBLE weight_func = get_weight_func_by_method_DOUBLE(extra->weight_method); - pair_struct_DOUBLE pair = {.num_weights = extra->weights0.num_weights, - .num_integer_weights = extra->weights0.num_integer_weights, - .dx.d=0., .dy.d=0., .dz.d=0., // always 0 separation - .parx.d=0., .pary.d=0., .parz.d=0.}; for(int64_t j = 0; j < ND1; j++){ for(int w = 0; w < pair.num_weights; w++){ pair.weights0[w].d = ((DOUBLE *) extra->weights0.weights[w])[j]; pair.weights1[w].d = ((DOUBLE *) extra->weights0.weights[w])[j]; } - weightavg[1] += weight_func(&pair); + weightavg[nmu_bins+1] += weight_func(&pair); } } } @@ -800,9 +798,9 @@ int countpairs_mocks_s_mu_DOUBLE(const int64_t ND1, DOUBLE *ra1, DOUBLE *dec1, D results->mu_max = max_mu;//NOTE max_mu which is double and not mu_max (which might be float) results->mu_min = ZERO; results->npairs = my_malloc(sizeof(*(results->npairs)), totnbins); - results->supp = my_malloc(sizeof(*(results->supp)) , nsbin); - results->savg = my_malloc(sizeof(*(results->savg)) , totnbins); - results->weightavg = my_calloc(sizeof(double) , totnbins); + results->supp = my_malloc(sizeof(*(results->supp)), nsbin); + results->savg = my_malloc(sizeof(*(results->savg)), totnbins); + results->weightavg = my_calloc(sizeof(double), totnbins); if(results->npairs == NULL || results->supp == NULL || results->savg == NULL || results->weightavg == NULL) { free_results_mocks_s_mu(results); return EXIT_FAILURE; diff --git a/mocks/DDsmu_mocks/countpairs_s_mu_mocks_kernels.c.src b/mocks/DDsmu_mocks/countpairs_s_mu_mocks_kernels.c.src index 23d8db2b..c4809425 100644 --- a/mocks/DDsmu_mocks/countpairs_s_mu_mocks_kernels.c.src +++ b/mocks/DDsmu_mocks/countpairs_s_mu_mocks_kernels.c.src @@ -268,14 +268,18 @@ static inline int countpairs_s_mu_mocks_avx512_intrinsics_DOUBLE(const int64_t N AVX512_SQUARE_FLOAT(m_parz))); // \mu^2 := cos^2(\theta_between_s_and_l) = |s.l|^2 / (|s|^2 * |l|^2) + const AVX512_FLOATS m_zero = AVX512_SETZERO_FLOAT(); + const AVX512_FLOATS m_one = AVX512_SET_FLOAT((DOUBLE) 1); const AVX512_FLOATS m_sqr_norm_l_norm_s = AVX512_MULTIPLY_FLOATS(m_sqr_norm_l, m_sqr_s); + const AVX512_FLOATS m_mask_szero = AVX512_COMPARE_FLOATS(m_zero, m_sqr_s, _CMP_LT_OQ); + m_sqr_norm_l_norm_s = AVX512_BLEND_FLOATS_WITH_MASK(m_one, m_sqr_norm_l_norm_s, m_mask_szero); // to avoid division by 0 AVX512_FLOATS m_sqr_mu = AVX512_SETZERO_FLOAT(); CHECK_AND_FAST_DIVIDE_AVX512(m_sqr_mu, m_sqr_s_dot_l, m_sqr_norm_l_norm_s, m_mask_left, fast_divide_and_NR_steps); const AVX512_FLOATS m_mu = AVX512_SQRT_FLOAT(m_sqr_mu); //Do the mask filters in a separate scope { - const AVX512_MASK m_mask_mumax = AVX512_MASK_COMPARE_FLOATS(m_mask_left, m_sqr_mu,m_sqr_mumax,_CMP_LT_OQ); + const AVX512_MASK m_mask_mumax = AVX512_MASK_COMPARE_FLOATS(m_mask_left, m_sqr_mu,m_sqr_mumax, _CMP_LT_OQ); const AVX512_MASK m_smax_mask = AVX512_MASK_COMPARE_FLOATS(m_mask_left, m_sqr_s, m_sqr_smax, _CMP_LT_OQ); const AVX512_MASK m_smin_mask = AVX512_MASK_COMPARE_FLOATS(m_mask_left, m_sqr_s, m_sqr_smin, _CMP_GE_OQ); const AVX512_MASK m_s_mask = AVX512_MASK_BITWISE_AND(m_smax_mask, m_smin_mask); @@ -611,8 +615,10 @@ static inline int countpairs_s_mu_mocks_avx_intrinsics_DOUBLE(const int64_t N0, AVX_SQUARE_FLOAT(m_parz))); // \mu^2 := cos^2(\theta_between_s_and_l) = |s.l|^2 / (|s|^2 * |l|^2) + AVX_FLOATS m_sqr_norm_l_norm_s = AVX_MULTIPLY_FLOATS(m_sqr_norm_l, m_sqr_s); + const AVX_FLOATS m_mask_szero = AVX_COMPARE_FLOATS(m_zero, m_sqr_s, _CMP_LT_OQ); + m_sqr_norm_l_norm_s = AVX_BLEND_FLOATS_WITH_MASK(m_one, m_sqr_norm_l_norm_s, m_mask_szero); // to avoid division by 0 AVX_FLOATS m_sqr_mu = AVX_SETZERO_FLOAT(); - const AVX_FLOATS m_sqr_norm_l_norm_s = AVX_MULTIPLY_FLOATS(m_sqr_norm_l, m_sqr_s); CHECK_AND_FAST_DIVIDE_AVX(m_sqr_mu, m_sqr_s_dot_l, m_sqr_norm_l_norm_s, fast_divide_and_NR_steps); const AVX_FLOATS m_mu = AVX_SQRT_FLOAT(m_sqr_mu); @@ -620,7 +626,7 @@ static inline int countpairs_s_mu_mocks_avx_intrinsics_DOUBLE(const int64_t N0, AVX_FLOATS m_mask_left; //Do the mask filters in a separate scope { - const AVX_FLOATS m_mask_mumax = AVX_COMPARE_FLOATS(m_sqr_mu,m_sqr_mumax,_CMP_LT_OQ); + const AVX_FLOATS m_mask_mumax = AVX_COMPARE_FLOATS(m_sqr_mu,m_sqr_mumax, _CMP_LT_OQ); const AVX_FLOATS m_smax_mask = AVX_COMPARE_FLOATS(m_sqr_s, m_sqr_smax, _CMP_LT_OQ); const AVX_FLOATS m_smin_mask = AVX_COMPARE_FLOATS(m_sqr_s, m_sqr_smin, _CMP_GE_OQ); const AVX_FLOATS m_s_mask = AVX_BITWISE_AND(m_smax_mask, m_smin_mask); @@ -630,7 +636,7 @@ static inline int countpairs_s_mu_mocks_avx_intrinsics_DOUBLE(const int64_t N0, continue; } m_sqr_s = AVX_BLEND_FLOATS_WITH_MASK(m_zero,m_sqr_s,m_mask_left); - m_sqr_mu = AVX_BLEND_FLOATS_WITH_MASK(m_sqr_mumax,m_sqr_mu,m_mask_left); + m_sqr_mu = AVX_BLEND_FLOATS_WITH_MASK(m_sqr_mumax,m_sqr_mu,m_mask_left); } if(need_savg || bin_type == BIN_LIN) { @@ -731,7 +737,8 @@ static inline int countpairs_s_mu_mocks_avx_intrinsics_DOUBLE(const int64_t N0, const DOUBLE s_dot_l = parx*perpx + pary*perpy + parz*perpz; const DOUBLE sqr_l = parx*parx + pary*pary + parz*parz;// := |l|^2 const DOUBLE sqr_s_dot_l = s_dot_l * s_dot_l; - const DOUBLE sqr_mu = sqr_s_dot_l/(sqr_l * sqr_s); + //const DOUBLE sqr_mu = sqr_s_dot_l/(sqr_l * sqr_s); + const DOUBLE sqr_mu = sqr_s > 0. ? sqr_s_dot_l/(sqr_l * sqr_s) : 0.; const int mubin = (sqr_mu >= sqr_mumax) ? nmu_bins:(int) (SQRT(sqr_mu)*inv_dmu); DOUBLE s = ZERO, pairweight = ZERO; if(need_savg || bin_type == BIN_LIN) { @@ -1005,7 +1012,6 @@ static inline int countpairs_s_mu_mocks_sse_intrinsics_DOUBLE(const int64_t N0, j = N1;/* but do not break out of the loop because this chunk might contain valid pairs */ } - SSE_FLOATS m_sqr_s, m_sqr_mu; { const SSE_FLOATS m_term1 = SSE_MULTIPLY_FLOATS(m_parx, m_perpx); @@ -1027,8 +1033,10 @@ static inline int countpairs_s_mu_mocks_sse_intrinsics_DOUBLE(const int64_t N0, } // \mu^2 = \pi^2 / s^2 - const SSE_FLOATS m_sqr_norm_l_norm_s = SSE_MULTIPLY_FLOATS(m_sqr_norm_l, m_sqr_s); - m_sqr_mu = SSE_DIVIDE_FLOATS(m_sqr_s_dot_l,m_sqr_norm_l_norm_s); + SSE_FLOATS m_sqr_norm_l_norm_s = SSE_MULTIPLY_FLOATS(m_sqr_norm_l, m_sqr_s); + const SSE_FLOATS m_mask_szero = SSE_COMPARE_FLOATS_LT(m_zero, m_sqr_s); + m_sqr_norm_l_norm_s = SSE_BLEND_FLOATS_WITH_MASK(m_one, m_sqr_norm_l_norm_s, m_mask_szero); // to avoid division by 0 + m_sqr_mu = SSE_DIVIDE_FLOATS(m_sqr_s_dot_l, m_sqr_norm_l_norm_s); } @@ -1037,18 +1045,18 @@ static inline int countpairs_s_mu_mocks_sse_intrinsics_DOUBLE(const int64_t N0, SSE_FLOATS m_mask_left; //Do the mask filters in a separate scope { - const SSE_FLOATS m_mask_mumax = SSE_COMPARE_FLOATS_LT(m_sqr_mu,m_sqr_mumax); + const SSE_FLOATS m_mask_mumax = SSE_COMPARE_FLOATS_LT(m_sqr_mu, m_sqr_mumax); const SSE_FLOATS m_smax_mask = SSE_COMPARE_FLOATS_LT(m_sqr_s, m_sqr_smax); const SSE_FLOATS m_smin_mask = SSE_COMPARE_FLOATS_GE(m_sqr_s, m_sqr_smin); - const SSE_FLOATS m_s_mask = SSE_BITWISE_AND(m_smax_mask,m_smin_mask); + const SSE_FLOATS m_s_mask = SSE_BITWISE_AND(m_smax_mask, m_smin_mask); m_mask_left = SSE_BITWISE_AND(m_mask_mumax, m_s_mask); - if(SSE_TEST_COMPARISON(m_mask_left)==0) { + if(SSE_TEST_COMPARISON(m_mask_left) == 0) { continue; } - m_sqr_s = SSE_BLEND_FLOATS_WITH_MASK(m_zero,m_sqr_s,m_mask_left); - m_sqr_mu = SSE_BLEND_FLOATS_WITH_MASK(m_sqr_mumax,m_sqr_mu,m_mask_left); + m_sqr_s = SSE_BLEND_FLOATS_WITH_MASK(m_zero, m_sqr_s, m_mask_left); + m_sqr_mu = SSE_BLEND_FLOATS_WITH_MASK(m_sqr_mumax, m_sqr_mu, m_mask_left); } if(need_savg || bin_type == BIN_LIN) { union_msep.m_Dperp = SSE_SQRT_FLOAT(m_sqr_s); @@ -1076,9 +1084,9 @@ static inline int countpairs_s_mu_mocks_sse_intrinsics_DOUBLE(const int64_t N0, } else { for(int kbin=nsbin-1;kbin>=1;kbin--) { - const SSE_FLOATS m_mask_low = SSE_COMPARE_FLOATS_GE(m_sqr_s,m_supp_sqr[kbin-1]); - const SSE_FLOATS m_bin_mask = SSE_BITWISE_AND(m_mask_low,m_mask_left); - m_sbin = SSE_BLEND_FLOATS_WITH_MASK(m_sbin,m_kbin[kbin], m_bin_mask); + const SSE_FLOATS m_mask_low = SSE_COMPARE_FLOATS_GE(m_sqr_s, m_supp_sqr[kbin-1]); + const SSE_FLOATS m_bin_mask = SSE_BITWISE_AND(m_mask_low, m_mask_left); + m_sbin = SSE_BLEND_FLOATS_WITH_MASK(m_sbin, m_kbin[kbin], m_bin_mask); m_mask_left = SSE_COMPARE_FLOATS_LT(m_sqr_s, m_supp_sqr[kbin-1]); if(SSE_TEST_COMPARISON(m_mask_left) == 0) { break; @@ -1139,7 +1147,8 @@ static inline int countpairs_s_mu_mocks_sse_intrinsics_DOUBLE(const int64_t N0, const DOUBLE s_dot_l = parx*perpx + pary*perpy + parz*perpz; const DOUBLE sqr_l = parx*parx + pary*pary + parz*parz; const DOUBLE sqr_s_dot_l = s_dot_l * s_dot_l; - const DOUBLE sqr_mu = sqr_s_dot_l/(sqr_l * sqr_s); + //const DOUBLE sqr_mu = sqr_s_dot_l/(sqr_l * sqr_s); + const DOUBLE sqr_mu = sqr_s > 0. ? sqr_s_dot_l/(sqr_l * sqr_s) : 0.; const int mubin = (sqr_mu >= sqr_mumax) ? nmu_bins:(int) (SQRT(sqr_mu)*inv_dmu); DOUBLE s = ZERO, pairweight = ZERO; if(need_savg || bin_type == BIN_LIN) { @@ -1374,7 +1383,8 @@ static inline int countpairs_s_mu_mocks_fallback_DOUBLE(const int64_t N0, DOUBLE const DOUBLE s_dot_l = parx*perpx + pary*perpy + parz*perpz; const DOUBLE sqr_l = parx*parx + pary*pary + parz*parz; const DOUBLE sqr_s_dot_l = s_dot_l * s_dot_l; - const DOUBLE sqr_mu = sqr_s_dot_l/(sqr_l * sqr_s); + //const DOUBLE sqr_mu = sqr_s_dot_l/(sqr_l * sqr_s); + const DOUBLE sqr_mu = sqr_s > 0. ? sqr_s_dot_l/(sqr_l * sqr_s) : 0.; const int mubin = (sqr_mu >= sqr_mumax) ? nmu_bins:(int) (SQRT(sqr_mu)*inv_dmu); DOUBLE s = ZERO, pairweight = ZERO; if(need_savg || bin_type == BIN_LIN) { diff --git a/mocks/DDtheta_mocks/countpairs_theta_mocks_impl.c.src b/mocks/DDtheta_mocks/countpairs_theta_mocks_impl.c.src index 99f1a293..fa23ec01 100644 --- a/mocks/DDtheta_mocks/countpairs_theta_mocks_impl.c.src +++ b/mocks/DDtheta_mocks/countpairs_theta_mocks_impl.c.src @@ -513,7 +513,7 @@ int countpairs_theta_mocks_DOUBLE(const int64_t ND1, DOUBLE *ra1, DOUBLE *dec1, } } - if(autocorr==0) { + if(autocorr == 0) { int status = check_ra_dec_DOUBLE(ND2, ra2, dec2); if(status != EXIT_SUCCESS) { return status; @@ -534,7 +534,7 @@ int countpairs_theta_mocks_DOUBLE(const int64_t ND1, DOUBLE *ra1, DOUBLE *dec1, int nthetabin = bins->nedges; double thetamin=bins->edges[0], thetamax=bins->edges[bins->nedges-1]; if( ! (thetamin >= 0.0 && thetamax > 0.0 && thetamin < thetamax && thetamax <= 180.0 && nthetabin >= 1) ) { - fprintf(stderr,"Error: Could not setup with theta bins correctly. (thetamin = %lf, thetamax = %lf, with nbins = %d). Expected non-zero rmin/rmax with thetamax > " + fprintf(stderr,"Error: Could not setup with theta bins correctly. (thetamin = %lf, thetamax = %lf, with nbins = %d). Expected positive rmin/rmax with thetamax > " "thetamin and nbins >=1 \n",thetamin, thetamax, nthetabin); return EXIT_FAILURE; } @@ -545,7 +545,7 @@ int countpairs_theta_mocks_DOUBLE(const int64_t ND1, DOUBLE *ra1, DOUBLE *dec1, if((options->link_in_ra == 0) && (options->bin_refine_factors[1] < numthreads) && (get_bin_refine_scheme(options) == BINNING_DFL)) { - options->bin_refine_factors[1]=numthreads; + options->bin_refine_factors[1] = numthreads; } #endif @@ -1148,11 +1148,9 @@ int countpairs_theta_mocks_DOUBLE(const int64_t ND1, DOUBLE *ra1, DOUBLE *dec1, // The self-pairs have non-zero weight, though. So, fix that here. if(need_weightavg){ // Keep in mind this is an autocorrelation (i.e. only one particle set to consider) + pair_struct_DOUBLE pair; + set_pair_struct_DOUBLE(&pair, (weight_struct_DOUBLE *) &(extra->weights0), (weight_struct_DOUBLE *) &(extra->weights0), &(extra->pair_weight)); weight_func_t_DOUBLE weight_func = get_weight_func_by_method_DOUBLE(extra->weight_method); - pair_struct_DOUBLE pair = {.num_weights = extra->weights0.num_weights, - .num_integer_weights = extra->weights0.num_integer_weights, - .dx.d=0., .dy.d=0., .dz.d=0., // always 0 separation - .parx.d=0., .pary.d=0., .parz.d=0.}; for(int64_t j = 0; j < ND1; j++){ for(int w = 0; w < pair.num_weights; w++){ pair.weights0[w].d = ((DOUBLE *) extra->weights0.weights[w])[j]; diff --git a/theory/DD/countpairs_impl.c.src b/theory/DD/countpairs_impl.c.src index e6bd239d..ddfc03c2 100644 --- a/theory/DD/countpairs_impl.c.src +++ b/theory/DD/countpairs_impl.c.src @@ -507,7 +507,7 @@ int countpairs_DOUBLE(const int64_t ND1, DOUBLE *X1, DOUBLE *Y1, DOUBLE *Z1, free_cellarray_DOUBLE(lattice1, totncells); - if(autocorr==0) { + if(autocorr == 0) { free_cellarray_DOUBLE(lattice2, totncells); } if(abort_status != EXIT_SUCCESS || interrupt_status_DOUBLE != EXIT_SUCCESS) { @@ -583,7 +583,7 @@ int countpairs_DOUBLE(const int64_t ND1, DOUBLE *X1, DOUBLE *Y1, DOUBLE *Z1, /* Is the min. requested separation 0.0 ?*/ /* The comparison is '<=' rather than '==' only to silence the compiler */ - if(rupp[0] <= 0.0) { + if(rupp[0] <= 0.) { /* Then, add all the self-pairs. This ensures that a cross-correlation with two identical datasets produces the same result as the auto-correlation */ @@ -594,11 +594,9 @@ int countpairs_DOUBLE(const int64_t ND1, DOUBLE *X1, DOUBLE *Y1, DOUBLE *Z1, // The self-pairs have non-zero weight, though. So, fix that here. if(need_weightavg){ // Keep in mind this is an autocorrelation (i.e. only one particle set to consider) + pair_struct_DOUBLE pair; + set_pair_struct_DOUBLE(&pair, (weight_struct_DOUBLE *) &(extra->weights0), (weight_struct_DOUBLE *) &(extra->weights0), &(extra->pair_weight)); weight_func_t_DOUBLE weight_func = get_weight_func_by_method_DOUBLE(extra->weight_method); - pair_struct_DOUBLE pair = {.num_weights = extra->weights0.num_weights, - .num_integer_weights = extra->weights0.num_integer_weights, - .dx.d=0., .dy.d=0., .dz.d=0., // always 0 separation - .parx.d=0., .pary.d=0., .parz.d=0.}; for(int64_t j = 0; j < ND1; j++){ for(int w = 0; w < pair.num_weights; w++){ pair.weights0[w].d = ((DOUBLE *) extra->weights0.weights[w])[j]; diff --git a/theory/DDrppi/countpairs_rp_pi_impl.c.src b/theory/DDrppi/countpairs_rp_pi_impl.c.src index 19c10a19..23b019ec 100644 --- a/theory/DDrppi/countpairs_rp_pi_impl.c.src +++ b/theory/DDrppi/countpairs_rp_pi_impl.c.src @@ -197,7 +197,7 @@ int countpairs_rp_pi_DOUBLE(const int64_t ND1, DOUBLE *X1, DOUBLE *Y1, DOUBLE *Z int nrpbin = bins->nedges; double rpmin=bins->edges[0], rpmax=bins->edges[bins->nedges-1]; if( ! (rpmin >= 0.0 && rpmax > 0.0 && rpmin < rpmax && nrpbin > 0)) { - fprintf(stderr,"Error: Could not setup with R bins correctly. (rmin = %lf, rmax = %lf, with nbins = %d). Expected non-zero rmin/rmax with rmax > rmin and nbins >=1 \n", + fprintf(stderr,"Error: Could not setup with r bins correctly. (rmin = %lf, rmax = %lf, with nbins = %d). Expected positive rmin/rmax with rmax > rmin and nbins >=1 \n", rpmin, rpmax, nrpbin); return EXIT_FAILURE; } @@ -576,28 +576,25 @@ int countpairs_rp_pi_DOUBLE(const int64_t ND1, DOUBLE *X1, DOUBLE *Y1, DOUBLE *Z /* The comparison is '<=' rather than '==' only to silence the compiler */ if(rupp[0] <= 0.0) { - int index = (npibin+1);//first valid rp bin (with 0-dpi depth in pi) /* Then, add all the self-pairs. This ensures that a cross-correlation with two identical datasets produces the same result as the auto-correlation */ - npairs[index] += ND1; + npairs[npibin+1] += ND1; // Increasing npairs affects rpavg and weightavg. // We don't need to add anything to rpavg; all the self-pairs have 0 separation! // The self-pairs have non-zero weight, though. So, fix that here. if(need_weightavg){ // Keep in mind this is an autocorrelation (i.e. only one particle set to consider) + pair_struct_DOUBLE pair; + set_pair_struct_DOUBLE(&pair, (weight_struct_DOUBLE *) &(extra->weights0), (weight_struct_DOUBLE *) &(extra->weights0), &(extra->pair_weight)); weight_func_t_DOUBLE weight_func = get_weight_func_by_method_DOUBLE(extra->weight_method); - pair_struct_DOUBLE pair = {.num_weights = extra->weights0.num_weights, - .num_integer_weights = extra->weights0.num_integer_weights, - .dx.d=0., .dy.d=0., .dz.d=0., // always 0 separation - .parx.d=0., .pary.d=0., .parz.d=0.}; for(int64_t j = 0; j < ND1; j++){ for(int w = 0; w < pair.num_weights; w++){ pair.weights0[w].d = ((DOUBLE *) extra->weights0.weights[w])[j]; pair.weights1[w].d = ((DOUBLE *) extra->weights0.weights[w])[j]; } - weightavg[1] += weight_func(&pair); + weightavg[npibin+1] += weight_func(&pair); } } } diff --git a/theory/DDsmu/countpairs_s_mu_impl.c.src b/theory/DDsmu/countpairs_s_mu_impl.c.src index e49cbee4..4e0b9e98 100644 --- a/theory/DDsmu/countpairs_s_mu_impl.c.src +++ b/theory/DDsmu/countpairs_s_mu_impl.c.src @@ -597,28 +597,25 @@ int countpairs_s_mu_DOUBLE(const int64_t ND1, DOUBLE *X1, DOUBLE *Y1, DOUBLE *Z1 /* The comparison is '<=' rather than '==' only to silence the compiler */ if(supp[0] <= 0.0) { - int index = (nmu_bins + 1);//first valid s bin (with 0-dpi depth in pi) /* Then, add all the self-pairs. This ensures that a cross-correlation with two identical datasets produces the same result as the auto-correlation */ - npairs[index] += ND1; + npairs[nmu_bins+1] += ND1; // Increasing npairs affects savg and weightavg. // We don't need to add anything to savg; all the self-pairs have 0 separation! // The self-pairs have non-zero weight, though. So, fix that here. if(need_weightavg){ // Keep in mind this is an autocorrelation (i.e. only one particle set to consider) + pair_struct_DOUBLE pair; + set_pair_struct_DOUBLE(&pair, (weight_struct_DOUBLE *) &(extra->weights0), (weight_struct_DOUBLE *) &(extra->weights0), &(extra->pair_weight)); weight_func_t_DOUBLE weight_func = get_weight_func_by_method_DOUBLE(extra->weight_method); - pair_struct_DOUBLE pair = {.num_weights = extra->weights0.num_weights, - .num_integer_weights = extra->weights0.num_integer_weights, - .dx.d=0., .dy.d=0., .dz.d=0., // always 0 separation - .parx.d=0., .pary.d=0., .parz.d=0.}; for(int64_t j = 0; j < ND1; j++){ for(int w = 0; w < pair.num_weights; w++){ pair.weights0[w].d = ((DOUBLE *) extra->weights0.weights[w])[j]; pair.weights1[w].d = ((DOUBLE *) extra->weights0.weights[w])[j]; } - weightavg[1] += weight_func(&pair); + weightavg[nmu_bins+1] += weight_func(&pair); } } } diff --git a/theory/DDsmu/countpairs_s_mu_kernels.c.src b/theory/DDsmu/countpairs_s_mu_kernels.c.src index 7374f87b..0538406c 100644 --- a/theory/DDsmu/countpairs_s_mu_kernels.c.src +++ b/theory/DDsmu/countpairs_s_mu_kernels.c.src @@ -232,6 +232,8 @@ static inline int countpairs_s_mu_avx512_intrinsics_DOUBLE(const int64_t N0, DOU const AVX512_FLOATS m_sqr_smax = m_supp_sqr[nsbin-1]; const AVX512_FLOATS m_sqr_smin = m_supp_sqr[0]; const AVX512_FLOATS m_sqr_mumax = AVX512_SET_FLOAT(sqr_mumax); + const AVX512_FLOATS m_zero = AVX_SETZERO_FLOAT(); + const AVX512_FLOATS m_one = AVX_SET_FLOAT((DOUBLE) 1); const AVX512_FLOATS m_xdiff = AVX512_SUBTRACT_FLOATS(m_x1, m_xpos); //(x1[j:j+NVEC-1] - x0) const AVX512_FLOATS m_ydiff = AVX512_SUBTRACT_FLOATS(m_y1, m_ypos); //(y1[j:j+NVEC-1] - y0) @@ -239,7 +241,7 @@ static inline int countpairs_s_mu_avx512_intrinsics_DOUBLE(const int64_t N0, DOU const AVX512_FLOATS m_sqr_zdiff = AVX512_SQUARE_FLOAT(m_zdiff); //(z[j] - z0)^2 const AVX512_FLOATS m_sqr_z_p_sqr_y = AVX512_FMA_ADD_FLOATS(m_ydiff, m_ydiff, m_sqr_zdiff); //dy*dy + dz^2 - const AVX512_FLOATS s2 = AVX512_FMA_ADD_FLOATS(m_xdiff, m_xdiff, m_sqr_z_p_sqr_y);//s^2 = dx*dx + (dz^2 + dy^2) + const AVX512_FLOATS s2 = AVX512_FMA_ADD_FLOATS(m_xdiff, m_xdiff, m_sqr_z_p_sqr_y);//s^2 = dx*dx + (dz^2 + dy^2) const AVX512_FLOATS max_sqr_dz = AVX512_MULTIPLY_FLOATS(s2, m_sqr_mumax); @@ -253,7 +255,7 @@ static inline int countpairs_s_mu_avx512_intrinsics_DOUBLE(const int64_t N0, DOU j=N1;//but do not break yet, there might be valid pairs in this chunk } - const AVX512_MASK m_mu_mask = AVX512_MASK_COMPARE_FLOATS(m_mask_left, m_sqr_zdiff, max_sqr_dz, _CMP_LT_OQ); + const AVX512_MASK m_mu_mask = AVX512_MASK_COMPARE_FLOATS(m_sqr_zdiff, m_mask_left, max_sqr_dz, _CMP_GE_OQ); const AVX512_MASK m_smax_mask = AVX512_MASK_COMPARE_FLOATS(m_mask_left, s2, m_sqr_smax, _CMP_LT_OQ);//check for s2 < sqr_smax const AVX512_MASK m_smin_mask = AVX512_MASK_COMPARE_FLOATS(m_mask_left, s2, m_sqr_smin, _CMP_GE_OQ);//check for s2 >= sqr_smin const AVX512_MASK m_s2_mask = AVX512_MASK_BITWISE_AND(m_smax_mask, m_smin_mask); @@ -270,7 +272,9 @@ static inline int countpairs_s_mu_avx512_intrinsics_DOUBLE(const int64_t N0, DOU /*m_mu := sqrt(dz^2/s2) (with masked elements set to mu_max */ AVX512_FLOATS m_sqr_mu = AVX512_SETZERO_FLOAT(); - CHECK_AND_FAST_DIVIDE_AVX512(m_sqr_mu, m_sqr_zdiff, s2, m_mask_left, fast_divide_and_NR_steps); + const AVX512_FLOATS m_mask_szero = AVX512_COMPARE_FLOATS(m_zero, s2, _CMP_LT_OQ); + const AVX512_FLOATS m_sqr_norm_s = AVX512_BLEND_FLOATS_WITH_MASK(m_one, s2, m_mask_szero); // to avoid division by 0 + CHECK_AND_FAST_DIVIDE_AVX512(m_sqr_mu, m_sqr_zdiff, m_sqr_norm_s, m_mask_left, fast_divide_and_NR_steps); const AVX512_FLOATS m_mu = AVX512_MASKZ_SQRT_FLOAT(m_mask_left, m_sqr_mu); if(need_savg || bin_type == BIN_LIN) { @@ -376,7 +380,6 @@ static inline int countpairs_s_mu_avx_intrinsics_DOUBLE(const int64_t N0, DOUBLE int32_t need_costheta = need_weightavg; const DOUBLE sqr_smin=smin*smin, sqr_smax=smax*smax; DOUBLE inv_sstep=0., smin_invstep=0.; - const AVX_FLOATS m_zero = AVX_SETZERO_FLOAT(); AVX_FLOATS m_inv_sstep = AVX_SETZERO_FLOAT(); AVX_FLOATS m_smin_invstep = AVX_SETZERO_FLOAT(); if (bin_type == BIN_LIN) { @@ -559,11 +562,12 @@ static inline int countpairs_s_mu_avx_intrinsics_DOUBLE(const int64_t N0, DOUBLE const AVX_FLOATS m_max_dz = AVX_SET_FLOAT(max_dz); const AVX_FLOATS m_sqr_smax = m_supp_sqr[nsbin-1]; const AVX_FLOATS m_sqr_smin = m_supp_sqr[0]; - const AVX_FLOATS m_inv_dmu = AVX_SET_FLOAT(inv_dmu); + const AVX_FLOATS m_inv_dmu = AVX_SET_FLOAT(inv_dmu); const AVX_FLOATS m_sqr_mumax = AVX_SET_FLOAT(sqr_mumax); - const AVX_FLOATS m_nmu_bins = AVX_SET_FLOAT((DOUBLE) nmu_bins); - const AVX_FLOATS m_one = AVX_SET_FLOAT((DOUBLE) 1); + const AVX_FLOATS m_nmu_bins = AVX_SET_FLOAT((DOUBLE) nmu_bins); + const AVX_FLOATS m_zero = AVX_SETZERO_FLOAT(); + const AVX_FLOATS m_one = AVX_SET_FLOAT((DOUBLE) 1); const AVX_FLOATS m_xdiff = AVX_SUBTRACT_FLOATS(m_x1, m_xpos); //(x[j] - x0) const AVX_FLOATS m_ydiff = AVX_SUBTRACT_FLOATS(m_y1, m_ypos); //(y[j] - y0) @@ -573,7 +577,7 @@ static inline int countpairs_s_mu_avx_intrinsics_DOUBLE(const int64_t N0, DOUBLE const AVX_FLOATS m_sqr_ydiff = AVX_SQUARE_FLOAT(m_ydiff); //(y0 - y[j])^2 const AVX_FLOATS m_sqr_zdiff = AVX_SQUARE_FLOAT(m_zdiff); //(z0 - z[j])^2 - AVX_FLOATS s2 = AVX_ADD_FLOATS(m_sqr_zdiff, AVX_ADD_FLOATS(m_sqr_xdiff, m_sqr_ydiff));//s^2 = dz^2 + dx^2 + dy^2 + AVX_FLOATS s2 = AVX_ADD_FLOATS(m_sqr_zdiff, AVX_ADD_FLOATS(m_sqr_xdiff, m_sqr_ydiff));//s^2 = dz^2 + dx^2 + dy^2 AVX_FLOATS m_mask_left; AVX_FLOATS max_sqr_dz = AVX_MULTIPLY_FLOATS(s2, m_sqr_mumax); @@ -590,7 +594,7 @@ static inline int countpairs_s_mu_avx_intrinsics_DOUBLE(const int64_t N0, DOUBLE j=N1;//but do not break yet, this chunk might contain valid pairs } - const AVX_FLOATS m_mu_mask = AVX_COMPARE_FLOATS(m_sqr_zdiff, max_sqr_dz, _CMP_LT_OQ); + const AVX_FLOATS m_mu_mask = AVX_COMPARE_FLOATS(max_sqr_dz, m_sqr_zdiff, _CMP_GE_OQ); const AVX_FLOATS m_smax_mask = AVX_COMPARE_FLOATS(s2, m_sqr_smax, _CMP_LT_OQ);//check for s2 < sqr_smax const AVX_FLOATS m_smin_mask = AVX_COMPARE_FLOATS(s2, m_sqr_smin, _CMP_GE_OQ);//check for s2 >= sqr_smin const AVX_FLOATS m_s2_mask = AVX_BITWISE_AND(m_smax_mask,m_smin_mask); @@ -610,6 +614,8 @@ static inline int countpairs_s_mu_avx_intrinsics_DOUBLE(const int64_t N0, DOUBLE //There is some s2 that satisfies sqr_smin <= s2 < sqr_smax && mu_min <= |dz| < mu_max s2 = AVX_BLEND_FLOATS_WITH_MASK(m_sqr_smax, s2, m_mask_left); /*m_sqr_mu := dz^2/s^2 (with masked elements set to mu_max */ + const AVX_FLOATS m_mask_szero = AVX_COMPARE_FLOATS(m_zero, s2, _CMP_LT_OQ); + const AVX_FLOATS m_sqr_norm_s = AVX_BLEND_FLOATS_WITH_MASK(m_one, s2, m_mask_szero); // to avoid division by 0 AVX_FLOATS m_sqr_mu = AVX_SETZERO_FLOAT(); /* Check if fast_divide is enabled and either use the normal divide or @@ -618,7 +624,7 @@ static inline int countpairs_s_mu_avx_intrinsics_DOUBLE(const int64_t N0, DOUBLE macro is defined in `avx_calls.h` */ - CHECK_AND_FAST_DIVIDE_AVX(m_sqr_mu, m_sqr_zdiff, s2, fast_divide_and_NR_steps); + CHECK_AND_FAST_DIVIDE_AVX(m_sqr_mu, m_sqr_zdiff, m_sqr_norm_s, fast_divide_and_NR_steps); const AVX_FLOATS m_mu = AVX_SQRT_FLOAT(AVX_BLEND_FLOATS_WITH_MASK(m_sqr_mumax, m_sqr_mu, m_mask_left)); @@ -697,9 +703,8 @@ static inline int countpairs_s_mu_avx_intrinsics_DOUBLE(const int64_t N0, DOUBLE const DOUBLE s2 = sqr_dx_dy + sqr_dz; if(s2 >= sqr_smax || s2 < sqr_smin) continue; - if(sqr_dz >= s2 * sqr_mumax) continue; - - const DOUBLE mu = SQRT(sqr_dz/s2); + if(sqr_dz > s2 * sqr_mumax) continue; // gt in case s2 (hence sqr_dz) is 0. + const DOUBLE mu = s2 > 0. ? SQRT(sqr_dz/s2) : 0.; DOUBLE s = ZERO, pairweight = ZERO; if(need_savg || bin_type == BIN_LIN) { @@ -972,9 +977,10 @@ static inline int countpairs_s_mu_sse_intrinsics_DOUBLE(const int64_t N0, DOUBLE const SSE_FLOATS m_sqr_smax = m_supp_sqr[nsbin-1]; const SSE_FLOATS m_sqr_smin = m_supp_sqr[0]; const SSE_FLOATS m_sqr_mumax = SSE_SET_FLOAT(sqr_mumax); - const SSE_FLOATS m_inv_dmu = SSE_SET_FLOAT(inv_dmu); - const SSE_FLOATS m_nmu_bins = SSE_SET_FLOAT((DOUBLE) nmu_bins); - const SSE_FLOATS m_one = SSE_SET_FLOAT((DOUBLE) 1); + const SSE_FLOATS m_inv_dmu = SSE_SET_FLOAT(inv_dmu); + const SSE_FLOATS m_nmu_bins = SSE_SET_FLOAT((DOUBLE) nmu_bins); + const SSE_FLOATS m_zero = SSE_SETZERO_FLOAT(); + const SSE_FLOATS m_one = SSE_SET_FLOAT((DOUBLE) 1); const SSE_FLOATS m_xdiff = SSE_SUBTRACT_FLOATS(m_x1, m_xpos); //(x[j] - x0) const SSE_FLOATS m_ydiff = SSE_SUBTRACT_FLOATS(m_y1, m_ypos); //(y[j] - y0) @@ -984,7 +990,7 @@ static inline int countpairs_s_mu_sse_intrinsics_DOUBLE(const int64_t N0, DOUBLE const SSE_FLOATS m_sqr_ydiff = SSE_SQUARE_FLOAT(m_ydiff); const SSE_FLOATS m_sqr_zdiff = SSE_SQUARE_FLOAT(m_zdiff); - SSE_FLOATS s2 = SSE_ADD_FLOATS(m_sqr_zdiff, SSE_ADD_FLOATS(m_sqr_xdiff, m_sqr_ydiff));//s^2 = dx^2 + dy^2 + dz^2 + SSE_FLOATS s2 = SSE_ADD_FLOATS(m_sqr_zdiff, SSE_ADD_FLOATS(m_sqr_xdiff, m_sqr_ydiff));//s^2 = dx^2 + dy^2 + dz^2 SSE_FLOATS m_mask_left; SSE_FLOATS max_sqr_dz = SSE_MULTIPLY_FLOATS(s2, m_sqr_mumax); @@ -996,15 +1002,15 @@ static inline int countpairs_s_mu_sse_intrinsics_DOUBLE(const int64_t N0, DOUBLE //that implies the zdiff values are also monotonically increasing //Therefore, if any of the zdiff values are >= max_dz, then //no future iteration in j can produce a zdiff value less than pimax. - SSE_FLOATS m_mask_geq_pimax = SSE_COMPARE_FLOATS_GE(m_zdiff,m_max_dz); + SSE_FLOATS m_mask_geq_pimax = SSE_COMPARE_FLOATS_GE(m_zdiff, m_max_dz); if(SSE_TEST_COMPARISON(m_mask_geq_pimax) > 0) { j=N1;//but do not break yet, this chunk might contain valid pairs } - const SSE_FLOATS m_mu_mask = SSE_COMPARE_FLOATS_LT(m_sqr_zdiff, max_sqr_dz); + const SSE_FLOATS m_mu_mask = SSE_COMPARE_FLOATS_GE(max_sqr_dz, m_sqr_zdiff); const SSE_FLOATS m_smax_mask = SSE_COMPARE_FLOATS_LT(s2, m_sqr_smax); const SSE_FLOATS m_smin_mask = SSE_COMPARE_FLOATS_GE(s2, m_sqr_smin); - const SSE_FLOATS m_s2_mask = SSE_BITWISE_AND(m_smax_mask,m_smin_mask); + const SSE_FLOATS m_s2_mask = SSE_BITWISE_AND(m_smax_mask, m_smin_mask); //Create a combined mask by bitwise and of m1 and m_mask_left. //This gives us the mask for all sqr_smin <= s2 < sqr_smax @@ -1020,7 +1026,9 @@ static inline int countpairs_s_mu_sse_intrinsics_DOUBLE(const int64_t N0, DOUBLE //There is some s2 that satisfies sqr_smin <= s2 < sqr_smax && mu_min <= |dz| < mu_max s2 = SSE_BLEND_FLOATS_WITH_MASK(m_sqr_smax, s2, m_mask_left); - const SSE_FLOATS m_mu = SSE_SQRT_FLOAT(SSE_BLEND_FLOATS_WITH_MASK(m_sqr_mumax, SSE_DIVIDE_FLOATS(m_sqr_zdiff, s2), m_mask_left)); + const SSE_FLOATS m_mask_szero = SSE_COMPARE_FLOATS_LT(m_zero, s2); + const SSE_FLOATS m_sqr_norm_s = SSE_BLEND_FLOATS_WITH_MASK(m_one, s2, m_mask_szero); // to avoid division by 0 + const SSE_FLOATS m_mu = SSE_SQRT_FLOAT(SSE_BLEND_FLOATS_WITH_MASK(m_sqr_mumax, SSE_DIVIDE_FLOATS(m_sqr_zdiff, m_sqr_norm_s), m_mask_left)); if(need_savg || bin_type == BIN_LIN) { union_mDperp.m_Dperp = SSE_SQRT_FLOAT(s2); @@ -1048,7 +1056,7 @@ static inline int countpairs_s_mu_sse_intrinsics_DOUBLE(const int64_t N0, DOUBLE //XOR with 0xFFFF... gives the bins that are smaller than m_supp_sqr[kbin] (and is faster than cmp_p(s/d) in theory) //m_mask_left = SSE_XOR_FLOATS(m_mask_low, m_all_ones); const int test = SSE_TEST_COMPARISON(m_mask_left); - if(test==0) { + if(test == 0) { break; } } @@ -1094,9 +1102,10 @@ static inline int countpairs_s_mu_sse_intrinsics_DOUBLE(const int64_t N0, DOUBLE const DOUBLE sqr_dx_dy = dx*dx + dy*dy; const DOUBLE sqr_dz = dz*dz; const DOUBLE s2 = sqr_dx_dy + sqr_dz; + if(s2 >= sqr_smax || s2 < sqr_smin) continue; - if(sqr_dz >= s2 * sqr_mumax) continue; - const DOUBLE mu = SQRT(sqr_dz/s2); + if(sqr_dz > s2 * sqr_mumax) continue; + const DOUBLE mu = s2 > 0. ? SQRT(sqr_dz/s2) : 0.; DOUBLE s = ZERO, pairweight = ZERO; if(need_weightavg){ @@ -1336,12 +1345,12 @@ static inline int countpairs_s_mu_fallback_DOUBLE(const int64_t N0, DOUBLE *x0, const DOUBLE sqr_dx_dy = dx*dx + dy*dy; const DOUBLE sqr_dz = dz*dz; - const DOUBLE s2 = sqr_dx_dy + sqr_dz; + const DOUBLE s2 = sqr_dx_dy + sqr_dz; if(s2 >= sqr_smax || s2 < sqr_smin) continue; - if(sqr_dz >= s2 * sqr_mu_max) continue; + if(sqr_dz > s2 * sqr_mu_max) continue; // gt in case s2 (hence sqr_dz) is 0. + const DOUBLE mu = s2 > 0. ? SQRT(sqr_dz/s2) : 0.; - const DOUBLE mu = SQRT(sqr_dz/s2); DOUBLE s = ZERO, pairweight = ZERO; if(need_savg || bin_type == BIN_LIN) { s = SQRT(s2); @@ -1355,7 +1364,7 @@ static inline int countpairs_s_mu_fallback_DOUBLE(const int64_t N0, DOUBLE *x0, } int mu_bin = (int) (mu*inv_dmu); - mu_bin = mu_bin > nmu_bins ? nmu_bins:mu_bin; + mu_bin = mu_bin > nmu_bins ? nmu_bins : mu_bin; int kbin = 0; if (bin_type == BIN_LIN) { diff --git a/theory/wp/countpairs_wp_impl.c.src b/theory/wp/countpairs_wp_impl.c.src index 0dae5469..acfbbf70 100644 --- a/theory/wp/countpairs_wp_impl.c.src +++ b/theory/wp/countpairs_wp_impl.c.src @@ -562,11 +562,9 @@ int countpairs_wp_DOUBLE(const int64_t ND, DOUBLE * restrict X, DOUBLE * restric // The self-pairs have non-zero weight, though. So, fix that here. if(need_weightavg){ // Keep in mind this is an autocorrelation (i.e. only one particle set to consider) + pair_struct_DOUBLE pair; + set_pair_struct_DOUBLE(&pair, (weight_struct_DOUBLE *) &(extra->weights0), (weight_struct_DOUBLE *) &(extra->weights0), &(extra->pair_weight)); weight_func_t_DOUBLE weight_func = get_weight_func_by_method_DOUBLE(extra->weight_method); - pair_struct_DOUBLE pair = {.num_weights = extra->weights0.num_weights, - .num_integer_weights = extra->weights0.num_integer_weights, - .dx.d=0., .dy.d=0., .dz.d=0., // always 0 separation - .parx.d=0., .pary.d=0., .parz.d=0.}; for(int64_t j = 0; j < ND; j++){ for(int w = 0; w < pair.num_weights; w++){ pair.weights0[w].d = ((DOUBLE *) extra->weights0.weights[w])[j]; diff --git a/theory/xi/countpairs_xi_impl.c.src b/theory/xi/countpairs_xi_impl.c.src index 358f8825..be0c244d 100644 --- a/theory/xi/countpairs_xi_impl.c.src +++ b/theory/xi/countpairs_xi_impl.c.src @@ -525,11 +525,9 @@ int countpairs_xi_DOUBLE(const int64_t ND, DOUBLE * restrict X, DOUBLE * restric // The self-pairs have non-zero weight, though. So, fix that here. if(need_weightavg){ // Keep in mind this is an autocorrelation (i.e. only one particle set to consider) + pair_struct_DOUBLE pair; + set_pair_struct_DOUBLE(&pair, (weight_struct_DOUBLE *) &(extra->weights0), (weight_struct_DOUBLE *) &(extra->weights0), &(extra->pair_weight)); weight_func_t_DOUBLE weight_func = get_weight_func_by_method_DOUBLE(extra->weight_method); - pair_struct_DOUBLE pair = {.num_weights = extra->weights0.num_weights, - .num_integer_weights = extra->weights0.num_integer_weights, - .dx.d=0., .dy.d=0., .dz.d=0., // always 0 separation - .parx.d=0., .pary.d=0., .parz.d=0.}; for(int64_t j = 0; j < ND; j++){ for(int w = 0; w < pair.num_weights; w++){ pair.weights0[w].d = ((DOUBLE *) extra->weights0.weights[w])[j]; diff --git a/utils/weight_functions.h.src b/utils/weight_functions.h.src index 9052039c..60b751a0 100644 --- a/utils/weight_functions.h.src +++ b/utils/weight_functions.h.src @@ -64,14 +64,17 @@ typedef struct // set pair_struct_DOUBLE from two weight_struct_DOUBLE -static inline int set_pair_struct_DOUBLE(pair_struct_DOUBLE *pair, const weight_struct_DOUBLE *local_w0, const weight_struct_DOUBLE *local_w1, const pair_weight_struct *pair_w) { - pair->num_weights = local_w0->num_weights; - pair->num_integer_weights = local_w1->num_integer_weights; +static inline int set_pair_struct_DOUBLE(pair_struct_DOUBLE *pair, const weight_struct_DOUBLE *weights0, const weight_struct_DOUBLE *weights1, const pair_weight_struct *pair_w) { + pair->num_weights = weights0->num_weights; + pair->num_integer_weights = weights1->num_integer_weights; pair->pair_weight.weight = pair_w->weight; pair->pair_weight.sep = pair_w->sep; pair->pair_weight.num = pair_w->num; pair->noffset = pair_w->noffset; pair->default_value = (DOUBLE) pair_w->default_value; + pair->dx.d = 0.; pair->dy.d = 0.; pair->dz.d = 0.; + pair->parx.d = 0.; pair->pary.d = 0.; pair->parz.d = 0.; + pair->costheta.d = 1.; return EXIT_SUCCESS; } @@ -127,20 +130,20 @@ static inline DOUBLE inverse_bitwise_DOUBLE(pair_struct_DOUBLE *pair){ nbits += POPCOUNT(*((LONG *) &(pair->weights0[w].d)) & *((LONG *) &(pair->weights1[w].d))); } DOUBLE weight = (nbits == 0) ? pair->default_value : 1./nbits; - for (int w=pair->num_integer_weights;wnum_weights;w++) { + for (int w=pair->num_integer_weights; wnum_weights; w++) { weight *= pair->weights0[w].d*pair->weights1[w].d; } int num = pair->pair_weight.num; if (num) { DOUBLE costheta = pair->costheta.d; DOUBLE *pair_sep = pair->pair_weight.sep; - if (costheta >= pair_sep[num-1] || (costheta < pair_sep[0])) { + if (costheta > pair_sep[num-1] || (costheta <= pair_sep[0])) { return weight; } else { DOUBLE *pair_weight = pair->pair_weight.weight; - for (int kbin=num-2;kbin>=0;kbin--) { - if(costheta >= pair_sep[kbin]) { + for (int kbin=0;kbinweights0[w].da512[jj])) & *((LONG *) &(pair->weights1[w].da512[jj]))); } DOUBLE weight = (nbits == 0) ? pair->default_value : 1./nbits; - for (int w=pair->num_integer_weights;wnum_weights;w++) { + for (int w=pair->num_integer_weights; wnum_weights; w++) { weight *= pair->weights0[w].da512[jj]*pair->weights1[w].da512[jj]; } int num = pair->pair_weight.num; if (num) { DOUBLE costheta = pair->costheta.da512[jj]; DOUBLE *pair_sep = pair->pair_weight.sep; - if (costheta >= pair_sep[num-1] || (costheta < pair_sep[0])) { + if (costheta > pair_sep[num-1] || (costheta <= pair_sep[0])) { ; } else { - DOUBLE *pair_weight = pair->pair_weight.weight; - for (int kbin=num-2;kbin>=0;kbin--) { - if(costheta >= pair_sep[kbin]) { + for (int kbin=0;kbinweights0[w].da[jj])) & *((LONG *) &(pair->weights1[w].da[jj]))); } DOUBLE weight = (nbits == 0) ? pair->default_value : 1./nbits; - for (int w=pair->num_integer_weights;wnum_weights;w++) { + for (int w=pair->num_integer_weights; wnum_weights; w++) { weight *= pair->weights0[w].da[jj]*pair->weights1[w].da[jj]; } int num = pair->pair_weight.num; if (num) { DOUBLE costheta = pair->costheta.da[jj]; DOUBLE *pair_sep = pair->pair_weight.sep; - if (costheta >= pair_sep[num-1] || (costheta < pair_sep[0])) { + if (costheta > pair_sep[num-1] || (costheta <= pair_sep[0])) { ; } else { DOUBLE *pair_weight = pair->pair_weight.weight; - for (int kbin=num-2;kbin>=0;kbin--) { - if(costheta >= pair_sep[kbin]) { + for (int kbin=0;kbinweights0[w].ds[jj])) & *((LONG *) &(pair->weights1[w].ds[jj]))); } DOUBLE weight = (nbits == 0) ? pair->default_value : 1./nbits; - for (int w=pair->num_integer_weights;wnum_weights;w++) { + for (int w=pair->num_integer_weights; wnum_weights; w++) { weight *= pair->weights0[w].ds[jj]*pair->weights1[w].ds[jj]; } int num = pair->pair_weight.num; if (num) { DOUBLE costheta = pair->costheta.ds[jj]; DOUBLE *pair_sep = pair->pair_weight.sep; - if (costheta >= pair_sep[num-1] || (costheta < pair_sep[0])) { + if (costheta > pair_sep[num-1] || (costheta <= pair_sep[0])) { ; } else { DOUBLE *pair_weight = pair->pair_weight.weight; - for (int kbin=num-2;kbin>=0;kbin--) { - if(costheta >= pair_sep[kbin]) { + for (int kbin=0;kbin