diff --git a/src/main_nep/fitness.cu b/src/main_nep/fitness.cu index 5c63b077e..56d4b7055 100644 --- a/src/main_nep/fitness.cu +++ b/src/main_nep/fitness.cu @@ -85,10 +85,18 @@ Fitness::~Fitness() { fclose(fid_loss_out); } void Fitness::compute( const int generation, Parameters& para, const float* population, float* fitness) { + if (generation == 0) { + std::vector dummy_solution(para.number_of_variables, 1.0f); + for (int n = 0; n < num_batches; ++n) { + potential->find_force(para, dummy_solution.data(), train_set[n], true); + } + potential->find_force(para, dummy_solution.data(), test_set, true); + } + int batch_id = generation % num_batches; for (int n = 0; n < para.population_size; ++n) { const float* individual = population + n * para.number_of_variables; - potential->find_force(para, individual, train_set[batch_id]); + potential->find_force(para, individual, train_set[batch_id], false); fitness[n + 0 * para.population_size] = train_set[batch_id].get_rmse_energy(); fitness[n + 1 * para.population_size] = train_set[batch_id].get_rmse_force(); fitness[n + 2 * para.population_size] = train_set[batch_id].get_rmse_virial(); @@ -121,7 +129,7 @@ void Fitness::report_error( { if (0 == (generation + 1) % 100) { - potential->find_force(para, elite, test_set); + potential->find_force(para, elite, test_set, false); float rmse_energy_test = test_set.get_rmse_energy(); float rmse_force_test = test_set.get_rmse_force(); float rmse_virial_test = test_set.get_rmse_virial(); @@ -143,6 +151,7 @@ void Fitness::report_error( for (int m = 0; m < para.number_of_variables; ++m) { fprintf(fid_nep, "%15.7e\n", elite[m]); } + para.q_scaler_gpu.copy_to_host(para.q_scaler_cpu.data()); for (int d = 0; d < para.q_scaler_cpu.size(); ++d) { fprintf(fid_nep, "%15.7e\n", para.q_scaler_cpu[d]); } @@ -197,7 +206,7 @@ void Fitness::report_error( FILE* fid_virial = my_fopen(file_virial, "w"); for (int batch_id = 0; batch_id < num_batches; ++batch_id) { - potential->find_force(para, elite, train_set[batch_id]); + potential->find_force(para, elite, train_set[batch_id], false); update_energy_force_virial(fid_energy, fid_force, fid_virial, train_set[batch_id]); } diff --git a/src/main_nep/nep.cu b/src/main_nep/nep.cu index 72c572c12..6f3c19015 100644 --- a/src/main_nep/nep.cu +++ b/src/main_nep/nep.cu @@ -238,6 +238,46 @@ void NEP2::update_potential(const float* parameters, ANN& ann) } } +static void __global__ find_max_min(const int N, const float* g_q, float* g_q_scaler) +{ + const int tid = threadIdx.x; + const int bid = blockIdx.x; + __shared__ float s_max[1024]; + __shared__ float s_min[1024]; + s_max[tid] = -1000000.0f; // a small number + s_min[tid] = +1000000.0f; // a large number + const int stride = 1024; + const int number_of_rounds = (N - 1) / stride + 1; + for (int round = 0; round < number_of_rounds; ++round) { + const int n = round * stride + tid; + if (n < N) { + const int m = n + N * bid; + float q = g_q[m]; + if (q > s_max[tid]) { + s_max[tid] = q; + } + if (q < s_min[tid]) { + s_min[tid] = q; + } + } + } + __syncthreads(); + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { + if (tid < offset) { + if (s_max[tid] < s_max[tid + offset]) { + s_max[tid] = s_max[tid + offset]; + } + if (s_min[tid] > s_min[tid + offset]) { + s_min[tid] = s_min[tid + offset]; + } + } + __syncthreads(); + } + if (tid == 0) { + g_q_scaler[bid] = min(g_q_scaler[bid], 1.0f / (s_max[0] - s_min[0])); + } +} + static __device__ void apply_ann_one_layer(const NEP2::ANN& ann, float* q, float& energy, float* energy_derivative) { @@ -451,7 +491,8 @@ static __global__ void find_force_angular( } } -void NEP2::find_force(Parameters& para, const float* parameters, Dataset& dataset) +void NEP2::find_force( + Parameters& para, const float* parameters, Dataset& dataset, bool calculate_q_scaler) { CHECK(cudaMemcpyToSymbol(c_parameters, parameters, sizeof(float) * annmb.num_para)); float* address_c_parameters; @@ -485,6 +526,12 @@ void NEP2::find_force(Parameters& para, const float* parameters, Dataset& datase nep_data.z12_angular.data(), nep_data.descriptors.data(), nep_data.sum_fxyz.data()); CUDA_CHECK_KERNEL + if (calculate_q_scaler) { + find_max_min<<>>( + dataset.N, nep_data.descriptors.data(), para.q_scaler_gpu.data()); + CUDA_CHECK_KERNEL + } + apply_ann<<>>( dataset.N, paramb, annmb, nep_data.descriptors.data(), para.q_scaler_gpu.data(), dataset.energy.data(), nep_data.Fp.data()); diff --git a/src/main_nep/nep.cuh b/src/main_nep/nep.cuh index 1526c2439..62b6c983e 100644 --- a/src/main_nep/nep.cuh +++ b/src/main_nep/nep.cuh @@ -66,7 +66,8 @@ public: int N, int N_times_max_NN_radial, int N_times_max_NN_angular); - void find_force(Parameters& para, const float* parameters, Dataset& dataset); + void + find_force(Parameters& para, const float* parameters, Dataset& dataset, bool calculate_q_scaler); private: ParaMB paramb; diff --git a/src/main_nep/parameters.cu b/src/main_nep/parameters.cu index feff45b65..2e55a273b 100644 --- a/src/main_nep/parameters.cu +++ b/src/main_nep/parameters.cu @@ -103,14 +103,7 @@ Parameters::Parameters(char* input_dir) } int dim = (n_max_radial + 1) + (n_max_angular + 1) * L_max; - q_scaler_cpu.resize(dim, 1.0f); - float factor_cutoff = rc_radial * rc_radial * rc_radial / (rc_angular * rc_angular * rc_angular); - for (int l = 1; l <= L_max; ++l) { - float factor = 4.0f * 3.1415927f / (2 * l + 1) * factor_cutoff; - for (int n = 0; n <= n_max_angular; ++n) { - q_scaler_cpu[(l - 1) * (n_max_angular + 1) + n + n_max_radial + 1] = factor; - } - } + q_scaler_cpu.resize(dim, 1.0e10f); q_scaler_gpu.resize(dim); q_scaler_gpu.copy_from_host(q_scaler_cpu.data()); diff --git a/src/main_nep/potential.cuh b/src/main_nep/potential.cuh index 3f5585516..ffa3db8d7 100644 --- a/src/main_nep/potential.cuh +++ b/src/main_nep/potential.cuh @@ -23,5 +23,6 @@ class Potential { public: virtual ~Potential() = default; - virtual void find_force(Parameters& para, const float* parameters, Dataset& dataset) = 0; + virtual void find_force( + Parameters& para, const float* parameters, Dataset& dataset, bool calculate_q_scaler) = 0; };