Skip to content

Commit

Permalink
Merge pull request #181 from awslabs/sjg/libceed-gpu-dev
Browse files Browse the repository at this point in the history
Collection of minor performance fixes during profiling and GPU testing
  • Loading branch information
sebastiangrimberg authored Feb 7, 2024
2 parents 2777527 + 3323dda commit 7d26775
Show file tree
Hide file tree
Showing 18 changed files with 249 additions and 199 deletions.
3 changes: 2 additions & 1 deletion palace/fem/fespace.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -300,7 +300,8 @@ class AuxiliaryFiniteElementSpaceHierarchy
std::vector<const Operator *> GetDiscreteInterpolators() const
{
std::vector<const Operator *> G_(GetNumLevels());
for (std::size_t l = 0; l < G_.size(); l++)
G_[0] = nullptr; // No discrete interpolator for coarsest level
for (std::size_t l = 1; l < G_.size(); l++)
{
G_[l] = &GetDiscreteInterpolatorAtLevel(l);
}
Expand Down
2 changes: 1 addition & 1 deletion palace/fem/libceed/basis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ void InitInterpolatorBasis(const mfem::FiniteElement &trial_fe,
if constexpr (false)
{
std::cout << "New interpolator basis (" << ceed << ", " << &trial_fe << ", " << &test_fe
<< ")\n";
<< ", " << (trial_fe.GetMapType() == test_fe.GetMapType()) << ")\n";
}
if constexpr (false)
{
Expand Down
5 changes: 4 additions & 1 deletion palace/fem/libceed/ceed.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include "ceed.hpp"

#include <string_view>
#include "utils/omp.hpp"

#if defined(MFEM_USE_OPENMP)
Expand Down Expand Up @@ -31,7 +32,9 @@ void Initialize(const char *resource, const char *jit_source_dir)
PalacePragmaOmp(master)
{
#if defined(MFEM_USE_OPENMP)
const int nt = omp_get_num_threads();
// Only parallelize libCEED operators over threads when not using the GPU.
const int nt =
!std::string_view(resource).compare(0, 4, "/cpu") ? omp_get_num_threads() : 1;
#else
const int nt = 1;
#endif
Expand Down
7 changes: 6 additions & 1 deletion palace/fem/libceed/integrator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -346,7 +346,9 @@ int CeedGeometryDataGetSpaceDimension(CeedElemRestriction geom_data_restr, CeedI

void AssembleCeedGeometryData(Ceed ceed, CeedElemRestriction mesh_restr,
CeedBasis mesh_basis, CeedVector mesh_nodes,
CeedVector geom_data, CeedElemRestriction geom_data_restr)
CeedElemRestriction attr_restr, CeedBasis attr_basis,
CeedVector elem_attr, CeedVector geom_data,
CeedElemRestriction geom_data_restr)
{
CeedInt dim, space_dim, num_elem, num_qpts;
PalaceCeedCall(ceed, CeedBasisGetDimension(mesh_basis, &dim));
Expand Down Expand Up @@ -389,6 +391,7 @@ void AssembleCeedGeometryData(Ceed ceed, CeedElemRestriction mesh_restr,
}

// Inputs
PalaceCeedCall(ceed, CeedQFunctionAddInput(build_qf, "attr", 1, CEED_EVAL_INTERP));
PalaceCeedCall(ceed, CeedQFunctionAddInput(build_qf, "q_w", 1, CEED_EVAL_WEIGHT));
PalaceCeedCall(
ceed, CeedQFunctionAddInput(build_qf, "grad_x", space_dim * dim, CEED_EVAL_GRAD));
Expand All @@ -409,6 +412,8 @@ void AssembleCeedGeometryData(Ceed ceed, CeedElemRestriction mesh_restr,
PalaceCeedCall(ceed, CeedOperatorCreate(ceed, build_qf, nullptr, nullptr, &build_op));
PalaceCeedCall(ceed, CeedQFunctionDestroy(&build_qf));

PalaceCeedCall(ceed,
CeedOperatorSetField(build_op, "attr", attr_restr, attr_basis, elem_attr));
PalaceCeedCall(ceed, CeedOperatorSetField(build_op, "q_w", CEED_ELEMRESTRICTION_NONE,
mesh_basis, CEED_VECTOR_NONE));
PalaceCeedCall(ceed, CeedOperatorSetField(build_op, "grad_x", mesh_restr, mesh_basis,
Expand Down
4 changes: 3 additions & 1 deletion palace/fem/libceed/integrator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,9 @@ int CeedGeometryDataGetSpaceDimension(CeedElemRestriction geom_data_restr, CeedI
// libCEED operator.
void AssembleCeedGeometryData(Ceed ceed, CeedElemRestriction mesh_restr,
CeedBasis mesh_basis, CeedVector mesh_nodes,
CeedVector geom_data, CeedElemRestriction geom_data_restr);
CeedElemRestriction attr_restr, CeedBasis attr_basis,
CeedVector elem_attr, CeedVector geom_data,
CeedElemRestriction geom_data_restr);

// Construct libCEED operator using the given quadrature data, element restriction, and
// basis objects.
Expand Down
106 changes: 48 additions & 58 deletions palace/fem/mesh.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,10 +137,8 @@ auto GetElementIndices(const mfem::ParMesh &mesh, bool use_bdr, int start, int s
return element_indices;
}

template <typename T>
auto AssembleGeometryData(const mfem::GridFunction &mesh_nodes, Ceed ceed,
mfem::Geometry::Type geom, std::vector<int> &indices,
T GetCeedAttribute)
auto AssembleGeometryData(Ceed ceed, mfem::Geometry::Type geom, std::vector<int> &indices,
const mfem::GridFunction &mesh_nodes, const Vector &elem_attr)
{
const mfem::FiniteElementSpace &mesh_fespace = *mesh_nodes.FESpace();
const mfem::Mesh &mesh = *mesh_fespace.GetMesh();
Expand All @@ -151,68 +149,50 @@ auto AssembleGeometryData(const mfem::GridFunction &mesh_nodes, Ceed ceed,
data.indices = std::move(indices);
const std::size_t num_elem = data.indices.size();

// Allocate storage for geometry factor data (stored as attribute + quadrature weight +
// Jacobian, column-major).
// Construct mesh node element restriction and basis.
CeedElemRestriction mesh_restr =
FiniteElementSpace::BuildCeedElemRestriction(mesh_fespace, ceed, geom, data.indices);
CeedBasis mesh_basis = FiniteElementSpace::BuildCeedBasis(mesh_fespace, ceed, geom);
CeedInt num_qpts, geom_data_size = 2 + data.space_dim * data.dim;
CeedVector mesh_nodes_vec;
ceed::InitCeedVector(mesh_nodes, ceed, &mesh_nodes_vec);
CeedInt num_qpts;
PalaceCeedCall(ceed, CeedBasisGetNumQuadraturePoints(mesh_basis, &num_qpts));
PalaceCeedCall(
ceed, CeedVectorCreate(ceed, num_elem * num_qpts * geom_data_size, &data.geom_data));

// Data for quadrature point i, component j, element k is found at index i * strides[0] +
// j * strides[1] + k * strides[2].
CeedMemType mem;
CeedInt strides[3];
PalaceCeedCall(ceed, CeedGetPreferredMemType(ceed, &mem));
if (mfem::Device::Allows(mfem::Backend::DEVICE_MASK) && mem == CEED_MEM_DEVICE)
{
// GPU backends have CEED_STRIDES_BACKEND = {1, num_elem * num_qpts, num_qpts}.
strides[0] = 1;
strides[1] = num_elem * num_qpts;
strides[2] = num_qpts;
}
else
// Construct element attribute element restriction and basis.
CeedElemRestriction attr_restr;
CeedBasis attr_basis;
PalaceCeedCall(ceed, CeedElemRestrictionCreateStrided(ceed, num_elem, 1, 1, num_elem,
CEED_STRIDES_BACKEND, &attr_restr));
{
// CPU backends have CEED_STRIDES_BACKEND = {1, num_qpts, num_qpts * geom_data_size}.
strides[0] = 1;
strides[1] = num_qpts;
strides[2] = num_qpts * geom_data_size;
Vector Bt(num_qpts);
Bt = 1.0;
PalaceCeedCall(ceed,
CeedBasisCreateH1(ceed, ceed::GetCeedTopology(geom), 1, 1, num_qpts,
Bt.GetData(), nullptr, nullptr, nullptr, &attr_basis));
}
PalaceCeedCall(ceed,
CeedElemRestrictionCreateStrided(ceed, num_elem, num_qpts, geom_data_size,
num_elem * num_qpts * geom_data_size,
strides, &data.geom_data_restr));

// Compute the required geometry factors at quadrature points.
CeedVector mesh_nodes_vec;
ceed::InitCeedVector(mesh_nodes, ceed, &mesh_nodes_vec);
CeedVector elem_attr_vec;
ceed::InitCeedVector(elem_attr, ceed, &elem_attr_vec);

ceed::AssembleCeedGeometryData(ceed, mesh_restr, mesh_basis, mesh_nodes_vec,
data.geom_data, data.geom_data_restr);
// Allocate storage for geometry factor data (stored as attribute + quadrature weight +
// Jacobian, column-major).
CeedInt geom_data_size = 2 + data.space_dim * data.dim;
PalaceCeedCall(
ceed, CeedVectorCreate(ceed, num_elem * num_qpts * geom_data_size, &data.geom_data));
PalaceCeedCall(
ceed, CeedElemRestrictionCreateStrided(ceed, num_elem, num_qpts, geom_data_size,
num_elem * num_qpts * geom_data_size,
CEED_STRIDES_BACKEND, &data.geom_data_restr));

// Compute the required geometry factors at quadrature points.
ceed::AssembleCeedGeometryData(ceed, mesh_restr, mesh_basis, mesh_nodes_vec, attr_restr,
attr_basis, elem_attr_vec, data.geom_data,
data.geom_data_restr);
PalaceCeedCall(ceed, CeedVectorDestroy(&mesh_nodes_vec));
PalaceCeedCall(ceed, CeedElemRestrictionDestroy(&mesh_restr));
PalaceCeedCall(ceed, CeedBasisDestroy(&mesh_basis));

// Compute element attribute quadrature data. All inputs to a QFunction require the same
// number of quadrature points, so we store the attribute at each quadrature point. This
// is the first component of the quadrature data.
{
CeedScalar *geom_data_array;
PalaceCeedCall(ceed,
CeedVectorGetArray(data.geom_data, CEED_MEM_HOST, &geom_data_array));
for (std::size_t k = 0; k < num_elem; k++)
{
const auto attr = GetCeedAttribute(data.indices[k]);
for (CeedInt i = 0; i < num_qpts; i++)
{
geom_data_array[i * strides[0] + k * strides[2]] = attr;
}
}
CeedVectorRestoreArray(data.geom_data, &geom_data_array);
}
PalaceCeedCall(ceed, CeedVectorDestroy(&elem_attr_vec));
PalaceCeedCall(ceed, CeedElemRestrictionDestroy(&attr_restr));
PalaceCeedCall(ceed, CeedBasisDestroy(&attr_basis));

return data;
}
Expand Down Expand Up @@ -278,8 +258,13 @@ auto BuildCeedGeomFactorData(
}();
for (auto &[geom, indices] : element_indices)
{
geom_data_map.emplace(geom, AssembleGeometryData(*mesh.GetNodes(), ceed, geom,
indices, GetCeedAttribute));
Vector elem_attr(indices.size());
for (std::size_t k = 0; k < indices.size(); k++)
{
elem_attr[k] = GetCeedAttribute(indices[k]);
}
geom_data_map.emplace(
geom, AssembleGeometryData(ceed, geom, indices, *mesh.GetNodes(), elem_attr));
}
}

Expand All @@ -304,8 +289,13 @@ auto BuildCeedGeomFactorData(
};
for (auto &[geom, indices] : element_indices)
{
geom_data_map.emplace(geom, AssembleGeometryData(*mesh.GetNodes(), ceed, geom,
indices, GetCeedAttribute));
Vector elem_attr(indices.size());
for (std::size_t k = 0; k < indices.size(); k++)
{
elem_attr[k] = GetCeedAttribute(indices[k]);
}
geom_data_map.emplace(
geom, AssembleGeometryData(ceed, geom, indices, *mesh.GetNodes(), elem_attr));
}
}

Expand Down
12 changes: 6 additions & 6 deletions palace/fem/qfunctions/21/geom_21_qf.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,19 @@
CEED_QFUNCTION(f_build_geom_factor_21)(void *, CeedInt Q, const CeedScalar *const *in,
CeedScalar *const *out)
{
const CeedScalar *qw = in[0], *J = in[1];
CeedScalar *attr = out[0], *wdetJ = out[0] + Q, *adjJt = out[0] + 2 * Q;
const CeedScalar *attr = in[0], *qw = in[1], *J = in[2];
CeedScalar *qd_attr = out[0], *qd_wdetJ = out[0] + Q, *qd_adjJt = out[0] + 2 * Q;

CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
{
CeedScalar J_loc[2], adjJt_loc[2];
MatUnpack21(J + i, Q, J_loc);
const CeedScalar detJ = AdjJt21<true>(J_loc, adjJt_loc);

attr[i] = 0;
wdetJ[i] = qw[i] * detJ;
adjJt[i + Q * 0] = adjJt_loc[0] / detJ;
adjJt[i + Q * 1] = adjJt_loc[1] / detJ;
qd_attr[i] = attr[i];
qd_wdetJ[i] = qw[i] * detJ;
qd_adjJt[i + Q * 0] = adjJt_loc[0] / detJ;
qd_adjJt[i + Q * 1] = adjJt_loc[1] / detJ;
}
return 0;
}
Expand Down
16 changes: 8 additions & 8 deletions palace/fem/qfunctions/22/geom_22_qf.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,21 @@
CEED_QFUNCTION(f_build_geom_factor_22)(void *, CeedInt Q, const CeedScalar *const *in,
CeedScalar *const *out)
{
const CeedScalar *qw = in[0], *J = in[1];
CeedScalar *attr = out[0], *wdetJ = out[0] + Q, *adjJt = out[0] + 2 * Q;
const CeedScalar *attr = in[0], *qw = in[1], *J = in[2];
CeedScalar *qd_attr = out[0], *qd_wdetJ = out[0] + Q, *qd_adjJt = out[0] + 2 * Q;

CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
{
CeedScalar J_loc[4], adjJt_loc[4];
MatUnpack22(J + i, Q, J_loc);
const CeedScalar detJ = AdjJt22<true>(J_loc, adjJt_loc);

attr[i] = 0;
wdetJ[i] = qw[i] * detJ;
adjJt[i + Q * 0] = adjJt_loc[0] / detJ;
adjJt[i + Q * 1] = adjJt_loc[1] / detJ;
adjJt[i + Q * 2] = adjJt_loc[2] / detJ;
adjJt[i + Q * 3] = adjJt_loc[3] / detJ;
qd_attr[i] = attr[i];
qd_wdetJ[i] = qw[i] * detJ;
qd_adjJt[i + Q * 0] = adjJt_loc[0] / detJ;
qd_adjJt[i + Q * 1] = adjJt_loc[1] / detJ;
qd_adjJt[i + Q * 2] = adjJt_loc[2] / detJ;
qd_adjJt[i + Q * 3] = adjJt_loc[3] / detJ;
}
return 0;
}
Expand Down
20 changes: 10 additions & 10 deletions palace/fem/qfunctions/32/geom_32_qf.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,23 +9,23 @@
CEED_QFUNCTION(f_build_geom_factor_32)(void *, CeedInt Q, const CeedScalar *const *in,
CeedScalar *const *out)
{
const CeedScalar *qw = in[0], *J = in[1];
CeedScalar *attr = out[0], *wdetJ = out[0] + Q, *adjJt = out[0] + 2 * Q;
const CeedScalar *attr = in[0], *qw = in[1], *J = in[2];
CeedScalar *qd_attr = out[0], *qd_wdetJ = out[0] + Q, *qd_adjJt = out[0] + 2 * Q;

CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
{
CeedScalar J_loc[6], adjJt_loc[6];
MatUnpack32(J + i, Q, J_loc);
const CeedScalar detJ = AdjJt32<true>(J_loc, adjJt_loc);

attr[i] = 0;
wdetJ[i] = qw[i] * detJ;
adjJt[i + Q * 0] = adjJt_loc[0] / detJ;
adjJt[i + Q * 1] = adjJt_loc[1] / detJ;
adjJt[i + Q * 2] = adjJt_loc[2] / detJ;
adjJt[i + Q * 3] = adjJt_loc[3] / detJ;
adjJt[i + Q * 4] = adjJt_loc[4] / detJ;
adjJt[i + Q * 5] = adjJt_loc[5] / detJ;
qd_attr[i] = attr[i];
qd_wdetJ[i] = qw[i] * detJ;
qd_adjJt[i + Q * 0] = adjJt_loc[0] / detJ;
qd_adjJt[i + Q * 1] = adjJt_loc[1] / detJ;
qd_adjJt[i + Q * 2] = adjJt_loc[2] / detJ;
qd_adjJt[i + Q * 3] = adjJt_loc[3] / detJ;
qd_adjJt[i + Q * 4] = adjJt_loc[4] / detJ;
qd_adjJt[i + Q * 5] = adjJt_loc[5] / detJ;
}
return 0;
}
Expand Down
26 changes: 13 additions & 13 deletions palace/fem/qfunctions/33/geom_33_qf.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,26 +9,26 @@
CEED_QFUNCTION(f_build_geom_factor_33)(void *, CeedInt Q, const CeedScalar *const *in,
CeedScalar *const *out)
{
const CeedScalar *qw = in[0], *J = in[1];
CeedScalar *attr = out[0], *wdetJ = out[0] + Q, *adjJt = out[0] + 2 * Q;
const CeedScalar *attr = in[0], *qw = in[1], *J = in[2];
CeedScalar *qd_attr = out[0], *qd_wdetJ = out[0] + Q, *qd_adjJt = out[0] + 2 * Q;

CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++)
{
CeedScalar J_loc[9], adjJt_loc[9];
MatUnpack33(J + i, Q, J_loc);
const CeedScalar detJ = AdjJt33<true>(J_loc, adjJt_loc);

attr[i] = 0;
wdetJ[i] = qw[i] * detJ;
adjJt[i + Q * 0] = adjJt_loc[0] / detJ;
adjJt[i + Q * 1] = adjJt_loc[1] / detJ;
adjJt[i + Q * 2] = adjJt_loc[2] / detJ;
adjJt[i + Q * 3] = adjJt_loc[3] / detJ;
adjJt[i + Q * 4] = adjJt_loc[4] / detJ;
adjJt[i + Q * 5] = adjJt_loc[5] / detJ;
adjJt[i + Q * 6] = adjJt_loc[6] / detJ;
adjJt[i + Q * 7] = adjJt_loc[7] / detJ;
adjJt[i + Q * 8] = adjJt_loc[8] / detJ;
qd_attr[i] = attr[i];
qd_wdetJ[i] = qw[i] * detJ;
qd_adjJt[i + Q * 0] = adjJt_loc[0] / detJ;
qd_adjJt[i + Q * 1] = adjJt_loc[1] / detJ;
qd_adjJt[i + Q * 2] = adjJt_loc[2] / detJ;
qd_adjJt[i + Q * 3] = adjJt_loc[3] / detJ;
qd_adjJt[i + Q * 4] = adjJt_loc[4] / detJ;
qd_adjJt[i + Q * 5] = adjJt_loc[5] / detJ;
qd_adjJt[i + Q * 6] = adjJt_loc[6] / detJ;
qd_adjJt[i + Q * 7] = adjJt_loc[7] / detJ;
qd_adjJt[i + Q * 8] = adjJt_loc[8] / detJ;
}
return 0;
}
Expand Down
5 changes: 3 additions & 2 deletions palace/fem/qfunctions/geom_qf.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@

// libCEED QFunction for building geometry factors for integration and transformations.
// At every quadrature point, compute qw * det(J) and adj(J)^T / |J| and store the result.
// in[0] is quadrature weights, shape [Q]
// in[1] is Jacobians, shape [qcomp=dim, ncomp=space_dim, Q]
// in[0] is element attributes, shape [Q]
// in[1] is quadrature weights, shape [Q]
// in[2] is Jacobians, shape [qcomp=dim, ncomp=space_dim, Q]
// out[0] is quadrature data, stored as {attribute, Jacobian determinant, (transpose)
// adjugate Jacobian} quadrature data, shape [ncomp=2+space_dim*dim, Q]

Expand Down
Loading

0 comments on commit 7d26775

Please sign in to comment.