Skip to content

Commit

Permalink
Playing with Highway code
Browse files Browse the repository at this point in the history
  • Loading branch information
sherm1 committed Jan 10, 2025
1 parent 50f906f commit 8df8522
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 19 deletions.
4 changes: 2 additions & 2 deletions .bazeliskrc
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# When bazelisk in use (as is typical, per Drake install_prereqs), this dotfile
# specifies which version of Bazel should be used to build and test Drake.
# Keep this in sync with doc/_pages/from_source.md (only the major.minor part).
#USE_BAZEL_VERSION=8.0.0
USE_BAZEL_VERSION=7.4.1
USE_BAZEL_VERSION=8.0.0
#USE_BAZEL_VERSION=7.4.1

# For some reason the google mirrors are very flaky in Drake CI in EC2, so
# we'll point to the GitHub mirrors instead.
Expand Down
48 changes: 31 additions & 17 deletions math/fast_pose_composition_functions_avx2_fma.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

#include <algorithm>
#include <cstdint>
#include <iostream>

// This is the magic juju that compiles our impl functions for multiple CPUs.
#undef HWY_TARGET_INCLUDE
Expand Down Expand Up @@ -615,14 +616,21 @@ We want to perform two matrix-vector products:
We can do this in 6 SIMD instructions. We end up doing 40 flops and throwing
10 of them away.
llvm-mca says 620 cycles / 100 iterations
*/
void ReexpressSpatialVectorImpl(const double* R_AB, const double* V_B,
double* V_A) {
const hn::FixedTag<double, 4> tag;

const auto abc_ = hn::LoadU(tag, R_AB); // (d is loaded but unused)
const auto def_ = hn::LoadU(tag, R_AB + 3); // (g is loaded but unused)
const auto ghi_ = hn::LoadN(tag, R_AB + 6, 3);

// llvm-mca rates this two-step implementation as half a cycle per iteration
// better than the equivalent `ghi0 = hn::LoadN(tag, R_AB + 6, 3)`, which gives
// 670 cycles / 100 iterations (gcc 11.4 & clang 14.0.0).
const auto fghi = hn::LoadU(tag, R_AB + 5); // (f not wanted)
const auto ghi0 = hn::SlideDownLanes(tag, fghi, 1);

const auto xxx_ = hn::Set(tag, V_B[0]);
const auto yyy_ = hn::Set(tag, V_B[1]);
Expand All @@ -631,7 +639,7 @@ void ReexpressSpatialVectorImpl(const double* R_AB, const double* V_B,
// Vector XYZ: X Y Z _
auto XYZ_ = hn::Mul(abc_, xxx_); // ax bx cx _
XYZ_ = hn::MulAdd(def_, yyy_, XYZ_); // +dy +ey +fy _
XYZ_ = hn::MulAdd(ghi_, zzz_, XYZ_); // +gz +hz +iz _
XYZ_ = hn::MulAdd(ghi0, zzz_, XYZ_); // +gz +hz +iz _

const auto rrr_ = hn::Set(tag, V_B[3]);
const auto sss_ = hn::Set(tag, V_B[4]);
Expand All @@ -640,7 +648,7 @@ void ReexpressSpatialVectorImpl(const double* R_AB, const double* V_B,
// Vector RST: R S T _
auto RST_ = hn::Mul(abc_, rrr_); // ar br cr _
RST_ = hn::MulAdd(def_, sss_, RST_); // +ds +es +fs _
RST_ = hn::MulAdd(ghi_, ttt_, RST_); // +gt +ht +it _
RST_ = hn::MulAdd(ghi0, ttt_, RST_); // +gt +ht +it _

hn::StoreU(XYZ_, tag, V_A); // 4-wide write temporarily overwrites R
hn::StoreN(RST_, tag, V_A + 3, 3); // 3-wide write to stay in bounds
Expand All @@ -667,7 +675,7 @@ void CrossProductImpl(const double* w, const double* r, double* wXr) {
hn::StoreN(wXr_, tag, wXr, 3);
}


/*
// w x w x r
void CrossCrossProductImpl(const double* w, const double* r, double* wXwXr) {
const hn::FixedTag<double, 4> tag;
Expand All @@ -691,32 +699,38 @@ void CrossCrossProductImpl(const double* w, const double* r, double* wXwXr) {
hn::StoreN(wXwXr_, tag, wXwXr, 3);
}
*/

/*
// TODO(sherm1) Untested -- does this even work?
// G is a - - but symmetric, so we need columns abc, bde, cef
// b d -
// c e f
/* This is 522 cycles according to llvm-mca */
// Caution: symmetric elements might be NaN; don't compute with them.
// This is 619 cycles according to llvm-mca
void SymTimesVectorImpl(const double* G, const double* w, double* Gw) {
const hn::FixedTag<double, 4> tag;
const auto abc_ = hn::LoadU(tag, G);
const auto uuu_ = hn::Set(tag, w[0]);
auto Gw_ = hn::Mul(abc_, uuu_); // au bu cu _

const auto abc0 = hn::LoadN(tag, G, 3); // Avoid the NaN
const auto c_de = hn::LoadU(tag, G + 2);
const auto abde = hn::ConcatUpperLower(tag, c_de, abc_);
const auto bde0 = hn::ShiftLeftLanes<1>(tag, abde);
const auto vvv_ = hn::Set(tag, w[1]);
Gw_ = hn::MulAdd(bde0, vvv_, Gw_); // +bv +dv +ev
const double f = G[8];
const auto uuuu = hn::Set(tag, w[0]);
const auto vvvv = hn::Set(tag, w[1]);
const auto wwww = hn::Set(tag, w[2]);
const auto abde = hn::ConcatUpperLower(tag, c_de, abc0);
const auto bde0 = hn::SlideUpLanes(tag, abde, 1);
const auto ced_ = hn::Per4LaneBlockShuffle<2, 1, 3, 0>(c_de);
const double f = G[8];
const auto cef_ = hn::InsertLane(ced_, 1, f);
const auto www_ = hn::Set(tag, w[2]);
Gw_ = hn::MulAdd(cef_, www_, Gw_); // +cw +ew +fw
const auto cef0 = hn::InsertLane(cef_, 0, 0.0);
hn::StoreN(Gw_, tag, Gw, 3);
auto Gw0 = hn::Mul(abc0, uuuu); // au bu cu 0
Gw0 = hn::MulAdd(bde0, vvvv, Gw0); // +bv +dv +ev 0
Gw0 = hn::MulAdd(cef0, wwww, Gw0); // +cw +ew +fw 0
hn::StoreN(Gw0, tag, Gw, 3);
}
*/

/* This is considerably slower (617 cycles)
void SymTimesVectorImpl2(const double* G, const double* w, double* Gw) {
Expand Down

0 comments on commit 8df8522

Please sign in to comment.