diff --git a/include/oklt/core/target_backends.h b/include/oklt/core/target_backends.h index bff3463a..075c82e5 100644 --- a/include/oklt/core/target_backends.h +++ b/include/oklt/core/target_backends.h @@ -14,6 +14,7 @@ enum struct TargetBackend : unsigned char { CUDA, ///< CUDA backend. HIP, ///< HIP backend. DPCPP, ///< DPCPP backend. + METAL, ///< Metal backend. _LAUNCHER, ///< Launcher backend. }; diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 182f1e03..957bf743 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -105,6 +105,20 @@ set (OCCA_TRANSPILER_SOURCES attributes/backend/dpcpp/common.cpp attributes/backend/dpcpp/common.h + # Metal + attributes/backend/metal/kernel.cpp + attributes/backend/metal/translation_unit.cpp + attributes/backend/metal/outer.cpp + attributes/backend/metal/inner.cpp + attributes/backend/metal/tile.cpp + attributes/backend/metal/shared.cpp + attributes/backend/metal/restrict.cpp + attributes/backend/metal/atomic.cpp + attributes/backend/metal/barrier.cpp + attributes/backend/metal/exclusive.cpp + attributes/backend/metal/common.cpp + attributes/backend/metal/common.h + # Serial subset attributes/utils/serial_subset/empty.cpp attributes/utils/serial_subset/kernel.cpp diff --git a/lib/attributes/backend/metal/atomic.cpp b/lib/attributes/backend/metal/atomic.cpp new file mode 100644 index 00000000..bf0ebd62 --- /dev/null +++ b/lib/attributes/backend/metal/atomic.cpp @@ -0,0 +1,24 @@ +#include "attributes/backend/metal/common.h" + +#include + +namespace { +using namespace oklt; +using namespace clang; + +HandleResult handleAtomicStmtAttribute(SessionStage& s, const Stmt& stmt, const Attr& a) { + SPDLOG_DEBUG("Handle attribute [{}]", a.getNormalizedFullName()); + + removeAttribute(s, a); + return {}; +} + +__attribute__((constructor)) void registerAttrBackend() { + auto ok = + registerBackendHandler(TargetBackend::METAL, ATOMIC_ATTR_NAME, handleAtomicStmtAttribute); + + if (!ok) { + SPDLOG_ERROR("[METAL] Failed to register {} attribute handler", ATOMIC_ATTR_NAME); + } +} +} // namespace diff --git a/lib/attributes/backend/metal/barrier.cpp b/lib/attributes/backend/metal/barrier.cpp new file mode 100644 index 00000000..ff01c9b4 --- /dev/null +++ b/lib/attributes/backend/metal/barrier.cpp @@ -0,0 +1,31 @@ +#include "attributes/backend/metal/common.h" + +#include +#include + +#include + +namespace { +using namespace oklt; +using namespace clang; + +oklt::HandleResult handleBarrierAttribute(SessionStage& stage, + const clang::Stmt& stmt, + const clang::Attr& attr) { + SPDLOG_DEBUG("Handle [@barrier] attribute"); + + auto range = getAttrFullSourceRange(attr); + stage.getRewriter().ReplaceText(range, metal::SYNC_THREADS_BARRIER); + + return {}; +} + +__attribute__((constructor)) void registerAttrBackend() { + auto ok = + registerBackendHandler(TargetBackend::METAL, BARRIER_ATTR_NAME, handleBarrierAttribute); + + if (!ok) { + SPDLOG_ERROR("[METAL] Failed to register {} attribute handler", BARRIER_ATTR_NAME); + } +} +} // namespace diff --git a/lib/attributes/backend/metal/common.cpp b/lib/attributes/backend/metal/common.cpp new file mode 100644 index 00000000..cd521551 --- /dev/null +++ b/lib/attributes/backend/metal/common.cpp @@ -0,0 +1,62 @@ +#include "attributes/backend/metal/common.h" +#include "core/sema/okl_sema_ctx.h" +#include "core/utils/range_to_string.h" +#include "util/string_utils.hpp" + +#include + +namespace oklt::metal { +using namespace clang; + +std::string axisToStr(const Axis& axis) { + static std::map mapping{{Axis::X, "x"}, {Axis::Y, "y"}, {Axis::Z, "z"}}; + return mapping[axis]; +} + +std::string getIdxVariable(const AttributedLoop& loop) { + auto strAxis = axisToStr(loop.axis); + switch (loop.type) { + case (LoopType::Inner): + return util::fmt("_occa_thread_position.{}", strAxis).value(); + case (LoopType::Outer): + return util::fmt(" _occa_group_position.{}", strAxis).value(); + default: // Incorrect case + return ""; + } +} + +std::string getTiledVariableName(const OklLoopInfo& forLoop) { + return "_occa_tiled_" + forLoop.var.name; +} + +std::string buildInnerOuterLoopIdxLine(const OklLoopInfo& forLoop, + const AttributedLoop& loop, + int& openedScopeCounter, + oklt::Rewriter& rewriter) { + static_cast(openedScopeCounter); + auto idx = getIdxVariable(loop); + auto op = forLoop.IsInc() ? "+" : "-"; + + std::string res; + if (forLoop.isUnary()) { + res = std::move(util::fmt("{} {} = ({}) {} {};\n", + forLoop.var.typeName, + forLoop.var.name, + getLatestSourceText(forLoop.range.start, rewriter), + op, + idx) + .value()); + } else { + res = std::move(util::fmt("{} {} = ({}) {} (({}) * {});\n", + forLoop.var.typeName, + forLoop.var.name, + getLatestSourceText(forLoop.range.start, rewriter), + op, + getLatestSourceText(forLoop.inc.val, rewriter), + idx) + .value()); + } + return res; +} + +} // namespace oklt::metal diff --git a/lib/attributes/backend/metal/common.h b/lib/attributes/backend/metal/common.h new file mode 100644 index 00000000..b6dbe71f --- /dev/null +++ b/lib/attributes/backend/metal/common.h @@ -0,0 +1,36 @@ +#include "attributes/attribute_names.h" +#include "attributes/utils/code_gen.h" +#include "attributes/utils/default_handlers.h" +#include "attributes/utils/kernel_utils.h" +#include "attributes/utils/utils.h" +#include "core/handler_manager/backend_handler.h" +#include "core/rewriter/rewriter_proxy.h" +#include "core/sema/okl_sema_ctx.h" +#include "core/transpiler_session/session_stage.h" +#include "core/utils/attributes.h" +#include "core/utils/range_to_string.h" + +#include + +namespace clang { +class Rewriter; +} + +namespace oklt { +struct OklLoopInfo; +} + +namespace oklt::metal { +std::string axisToStr(const Axis& axis); +std::string getIdxVariable(const AttributedLoop& loop); +std::string getTiledVariableName(const OklLoopInfo& forLoop); + +// Produces something like: int i = start +- (inc * _occa_group_position.x); +// or: int i = start +- (inc * _occa_thread_position.x); +std::string buildInnerOuterLoopIdxLine(const OklLoopInfo& forLoop, + const AttributedLoop& loop, + int& openedScopeCounter, + oklt::Rewriter& rewriter); + +const std::string SYNC_THREADS_BARRIER = "threadgroup_barrier(mem_flags::mem_threadgroup)"; +} // namespace oklt::metal diff --git a/lib/attributes/backend/metal/exclusive.cpp b/lib/attributes/backend/metal/exclusive.cpp new file mode 100644 index 00000000..177bb3b9 --- /dev/null +++ b/lib/attributes/backend/metal/exclusive.cpp @@ -0,0 +1,53 @@ +#include "attributes/backend/metal/common.h" + +#include + +namespace { +using namespace oklt; +using namespace clang; + +HandleResult handleExclusiveDeclAttribute(SessionStage& s, const Decl& decl, const Attr& a) { + SPDLOG_DEBUG("Handle [@exclusive] attribute (Decl)"); + + removeAttribute(s, a); + return {}; +} + +HandleResult handleExclusiveVarAttribute(SessionStage& s, const VarDecl& decl, const Attr& a) { + SPDLOG_DEBUG("Handle [@exclusive] attribute"); + + removeAttribute(s, a); + + auto& sema = s.tryEmplaceUserCtx(); + auto loopInfo = sema.getLoopInfo(); + if (loopInfo && loopInfo->isRegular()) { + loopInfo = loopInfo->getAttributedParent(); + } + if (loopInfo && loopInfo->has(LoopType::Inner)) { + return tl::make_unexpected( + Error{{}, "Cannot define [@exclusive] variables inside an [@inner] loop"}); + } + + auto child = loopInfo ? loopInfo->getFirstAttributedChild() : nullptr; + bool isInnerChild = child && child->has(LoopType::Inner); + if (!loopInfo || !loopInfo->has(LoopType::Outer) || !isInnerChild) { + return tl::make_unexpected( + Error{{}, "Must define [@exclusive] variables between [@outer] and [@inner] loops"}); + } + + return defaultHandleExclusiveDeclAttribute(s, decl, a); +} + +__attribute__((constructor)) void registerAttrBackend() { + auto ok = registerBackendHandler( + TargetBackend::METAL, EXCLUSIVE_ATTR_NAME, handleExclusiveDeclAttribute); + ok &= registerBackendHandler( + TargetBackend::METAL, EXCLUSIVE_ATTR_NAME, handleExclusiveVarAttribute); + ok &= registerBackendHandler( + TargetBackend::METAL, EXCLUSIVE_ATTR_NAME, defaultHandleExclusiveStmtAttribute); + + if (!ok) { + SPDLOG_ERROR("[METAL] Failed to register {} attribute handler", EXCLUSIVE_ATTR_NAME); + } +} +} // namespace diff --git a/lib/attributes/backend/metal/inner.cpp b/lib/attributes/backend/metal/inner.cpp new file mode 100644 index 00000000..5f14befd --- /dev/null +++ b/lib/attributes/backend/metal/inner.cpp @@ -0,0 +1,49 @@ +#include "attributes/backend/metal/common.h" +#include "attributes/frontend/params/loop.h" + +#include + +namespace { +using namespace oklt; +using namespace clang; + +HandleResult handleInnerAttribute(SessionStage& s, + const clang::ForStmt& forStmt, + const clang::Attr& a, + const AttributedLoop* params) { + SPDLOG_DEBUG("Handle [@inner] attribute"); + handleChildAttr(s, forStmt, NO_BARRIER_ATTR_NAME); + + auto& sema = s.tryEmplaceUserCtx(); + auto loopInfo = sema.getLoopInfo(forStmt); + if (!loopInfo) { + return tl::make_unexpected( + Error{std::error_code(), "@inner: failed to fetch loop meta data from sema"}); + } + + // Auto Axis in loopInfo are replaced with specific. + // TODO: maybe somehow update params earlier? + auto updatedParams = *params; + updatedParams.axis = loopInfo->axis.front(); + + std::string afterRBraceCode = ""; + if (loopInfo->shouldSync()) { + afterRBraceCode += metal::SYNC_THREADS_BARRIER + ";\n"; + } + + int openedScopeCounter = 0; + auto prefixCode = metal::buildInnerOuterLoopIdxLine( + *loopInfo, updatedParams, openedScopeCounter, s.getRewriter()); + auto suffixCode = buildCloseScopes(openedScopeCounter); + + return replaceAttributedLoop(s, forStmt, a, suffixCode, afterRBraceCode, prefixCode, true); +} + +__attribute__((constructor)) void registerBackendHandler() { + auto ok = registerBackendHandler(TargetBackend::METAL, INNER_ATTR_NAME, handleInnerAttribute); + + if (!ok) { + SPDLOG_ERROR("[METAL] Failed to register {} attribute handler", INNER_ATTR_NAME); + } +} +} // namespace diff --git a/lib/attributes/backend/metal/kernel.cpp b/lib/attributes/backend/metal/kernel.cpp new file mode 100644 index 00000000..bf90b505 --- /dev/null +++ b/lib/attributes/backend/metal/kernel.cpp @@ -0,0 +1,147 @@ +#include +#include "util/string_utils.hpp" + +#include "attributes/backend/metal/common.h" +#include "core/rewriter/rewriter_proxy.h" +#include "core/transpiler_session/attributed_type_map.h" +#include "core/utils/type_converter.h" +#include "pipeline/core/error_codes.h" + +#include + +namespace { +using namespace oklt; +using namespace clang; + +const std::string FUNC_PREFIX = "kernel"; + +std::string getFunctionName(const FunctionDecl& func, size_t n) { + return util::fmt("_occa_{}_{}", func.getNameAsString(), n).value(); +} + +std::string getFunctionParamStr(SessionStage& stage, + const FunctionDecl& func, + KernelInfo& kernelInfo) { + std::stringstream out; + + auto& r = stage.getRewriter(); + auto& m = stage.tryEmplaceUserCtx(); + + kernelInfo.args.reserve(func.getNumParams() + 2); + + out << "("; + + size_t n = 0; + for (auto p : func.parameters()) { + if (!p) { + continue; + } + + auto qt = p->getType(); + std::string typeStr = qt.getNonReferenceType().getAsString(); + if (qt->isPointerType()) { + auto qtStr = getCleanTypeString( + QualType(qt.getNonReferenceType().getTypePtr()->getUnqualifiedDesugaredType(), 0)); + typeStr = util::fmt("device {}", qtStr).value(); + } else { + qt.removeLocalConst(); + auto qtStr = getCleanTypeString( + QualType(qt.getNonReferenceType().getTypePtr()->getUnqualifiedDesugaredType(), 0)); + typeStr = + util::fmt("constant {} {}", qtStr, (qt.getTypePtrOrNull() ? "&" : "*")).value(); + } + + if (m.has(func.getASTContext(), qt, {RESTRICT_ATTR_NAME})) { + typeStr += " __restrict__"; + } + + if (n != 0) { + out << ", "; + } + + out << util::fmt("{} {} [[buffer({})]]", typeStr, p->getNameAsString(), n).value(); + + ++n; + } + + if (n != 0) { + out << ", "; + } + + out << util::fmt( + "{} {} [[{}]]", "uint3", "_occa_group_position", "threadgroup_position_in_grid") + .value(); + out << ", "; + out << util::fmt( + "{} {} [[{}]]", "uint3", "_occa_thread_position", "thread_position_in_threadgroup") + .value(); + + out << ")"; + + return out.str(); +} + +HandleResult handleKernelAttribute(SessionStage& s, + const clang::FunctionDecl& func, + const clang::Attr& a) { + SPDLOG_DEBUG("Handle [@kernel] attribute for function '{}'", func.getNameAsString()); + + auto& sema = s.tryEmplaceUserCtx(); + auto& rewriter = s.getRewriter(); + + auto oklKernelInfo = toOklKernelInfo(func); + if (!sema.getParsingKernelInfo() || !oklKernelInfo) { + return tl::make_unexpected( + Error{OkltPipelineErrorCode::INTERNAL_ERROR_KERNEL_INFO_NULL, "handleKernelAttribute"}); + } + + auto kernelInfo = *sema.getParsingKernelInfo(); + auto& kernels = sema.getProgramMetaData().kernels; + + auto typeStr = rewriter.getRewrittenText(func.getReturnTypeSourceRange()); + auto paramStr = getFunctionParamStr(s, func, oklKernelInfo.value()); + + if (auto verified = verifyLoops(s, kernelInfo); !verified) { + return tl::make_unexpected(std::move(verified.error())); + } + + auto startPos = getAttrFullSourceRange(a).getBegin(); + size_t n = 0; + for (auto* child : kernelInfo.topLevelOuterLoops) { + if (!child) { + continue; + } + kernels.push_back(oklKernelInfo.value()); + auto& meta = kernels.back(); + meta.name = getFunctionName(func, n); + + handleChildAttr(s, child->stmt, MAX_INNER_DIMS_NAME); + + std::stringstream out; + if (n != 0) { + out << "}\n\n"; + } + out << FUNC_PREFIX << " "; + out << typeStr << " " << getFunctionName(func, n) << paramStr << " {\n"; + + auto endPos = getAttrFullSourceRange(*child->attr).getBegin(); + rewriter.ReplaceText(SourceRange{startPos, endPos}, out.str()); + + auto body = dyn_cast_or_null(child->stmt.getBody()); + startPos = (body ? body->getEndLoc() : child->stmt.getRParenLoc()).getLocWithOffset(1); + ++n; + } + + rewriter.ReplaceText(SourceRange{startPos, func.getEndLoc()}, "\n}\n"); + + return {}; +} + +__attribute__((constructor)) void registerKernelHandler() { + auto ok = registerBackendHandler(TargetBackend::METAL, KERNEL_ATTR_NAME, handleKernelAttribute); + + if (!ok) { + SPDLOG_ERROR("[METAL] Failed to register {} attribute handler", KERNEL_ATTR_NAME); + } +} +} // namespace diff --git a/lib/attributes/backend/metal/outer.cpp b/lib/attributes/backend/metal/outer.cpp new file mode 100644 index 00000000..171b4b1b --- /dev/null +++ b/lib/attributes/backend/metal/outer.cpp @@ -0,0 +1,43 @@ +#include "attributes/backend/metal/common.h" +#include "attributes/frontend/params/loop.h" + +#include + +namespace { +using namespace oklt; +using namespace clang; + +HandleResult handleOuterAttribute(SessionStage& s, + const clang::ForStmt& forStmt, + const clang::Attr& a, + const AttributedLoop* params) { + SPDLOG_DEBUG("Handle [@outer] attribute"); + + auto& sema = s.tryEmplaceUserCtx(); + auto loopInfo = sema.getLoopInfo(forStmt); + if (!loopInfo) { + return tl::make_unexpected(Error{ + .ec = std::error_code(), .desc = "@outer: failed to fetch loop meta data from sema"}); + } + + // Auto Axis in loopInfo are replaced with specific. + // TODO: maybe somehow update params earlier? + auto updatedParams = *params; + updatedParams.axis = loopInfo->axis.front(); + + int openedScopeCounter = 0; + auto prefixCode = metal::buildInnerOuterLoopIdxLine( + *loopInfo, updatedParams, openedScopeCounter, s.getRewriter()); + auto suffixCode = buildCloseScopes(openedScopeCounter); + + return replaceAttributedLoop(s, forStmt, a, suffixCode, prefixCode, true); +} + +__attribute__((constructor)) void registerBackendHandler() { + auto ok = registerBackendHandler(TargetBackend::METAL, OUTER_ATTR_NAME, handleOuterAttribute); + + if (!ok) { + SPDLOG_ERROR("[METAL] Failed to register {} attribute handler", OUTER_ATTR_NAME); + } +} +} // namespace diff --git a/lib/attributes/backend/metal/restrict.cpp b/lib/attributes/backend/metal/restrict.cpp new file mode 100644 index 00000000..9e29a203 --- /dev/null +++ b/lib/attributes/backend/metal/restrict.cpp @@ -0,0 +1,31 @@ +#include "attributes/backend/metal/common.h" + +#include + +namespace { +using namespace oklt; +using namespace clang; +const std::string RESTRICT_MODIFIER = "__restrict__ "; + +HandleResult handleRestrictAttribute(SessionStage& s, const Decl& decl, const Attr& a) { + SPDLOG_DEBUG("Handle [@restrict] attribute"); + + removeAttribute(s, a); + if (isa(decl)) { + s.getRewriter().InsertTextBefore(decl.getLocation(), RESTRICT_MODIFIER); + } + + return {}; +} + +__attribute__((constructor)) void registerCUDARestrictHandler() { + auto ok = + registerBackendHandler(TargetBackend::METAL, RESTRICT_ATTR_NAME, handleRestrictAttribute); + ok &= + registerBackendHandler(TargetBackend::METAL, RESTRICT_ATTR_NAME, emptyHandleStmtAttribute); + + if (!ok) { + SPDLOG_ERROR("[METAL] Failed to register {} attribute handler", RESTRICT_ATTR_NAME); + } +} +} // namespace diff --git a/lib/attributes/backend/metal/shared.cpp b/lib/attributes/backend/metal/shared.cpp new file mode 100644 index 00000000..97edcb63 --- /dev/null +++ b/lib/attributes/backend/metal/shared.cpp @@ -0,0 +1,61 @@ +#include "attributes/backend/metal/common.h" + +#include "core/transpiler_session/session_stage.h" +#include "core/utils/attributes.h" + +#include + +namespace { +using namespace oklt; +using namespace clang; + +const std::string SHARED_MODIFIER = "threadgroup"; + +HandleResult handleSharedDeclAttribute(SessionStage& s, const Decl& var, const Attr& a) { + SPDLOG_DEBUG("Handle [@shared] attribute"); + + return removeAttribute(s, a); +} + +HandleResult handleSharedVarAttribute(SessionStage& s, const VarDecl& d, const Attr& a) { + SPDLOG_DEBUG("Handle [@shared] attribute"); + + removeAttribute(s, a); + + auto& sema = s.tryEmplaceUserCtx(); + auto loopInfo = sema.getLoopInfo(); + if (loopInfo && loopInfo->isRegular()) { + loopInfo = loopInfo->getAttributedParent(); + } + if (loopInfo && loopInfo->has(LoopType::Inner)) { + return tl::make_unexpected( + Error{{}, "Cannot define [@shared] variables inside an [@inner] loop"}); + } + auto child = loopInfo ? loopInfo->getFirstAttributedChild() : nullptr; + bool isInnerChild = child && child->has(LoopType::Inner); + + // This diagnostic is applied only to variable declaration + if (!loopInfo || !loopInfo->has(LoopType::Outer) || !isInnerChild) { + return tl::make_unexpected( + Error{{}, "Must define [@shared] variables between [@outer] and [@inner] loops"}); + } + + s.getRewriter().InsertTextBefore(d.getTypeSpecStartLoc(), SHARED_MODIFIER + " "); + + return defaultHandleSharedDeclAttribute(s, d, a); +} + +__attribute__((constructor)) void registerCUDASharedAttrBackend() { + auto ok = + registerBackendHandler(TargetBackend::METAL, SHARED_ATTR_NAME, handleSharedDeclAttribute); + ok &= registerBackendHandler(TargetBackend::METAL, SHARED_ATTR_NAME, handleSharedVarAttribute); + + // Empty Stmt handler since @shared variable is of attributed type, it is called on DeclRefExpr + ok &= registerBackendHandler( + TargetBackend::METAL, SHARED_ATTR_NAME, defaultHandleSharedStmtAttribute); + + if (!ok) { + SPDLOG_ERROR("[METAL] Failed to register {} attribute handler", SHARED_ATTR_NAME); + } +} +} // namespace diff --git a/lib/attributes/backend/metal/tile.cpp b/lib/attributes/backend/metal/tile.cpp new file mode 100644 index 00000000..88eba068 --- /dev/null +++ b/lib/attributes/backend/metal/tile.cpp @@ -0,0 +1,254 @@ +#include "attributes/frontend/params/tile.h" +#include "attributes/backend/metal/common.h" + +#include + +namespace { +using namespace oklt; +using namespace clang; + +// Produces something like: int _occa_tiled_i = init +- ((tileSize * inc) * _occa_group_position.x); +// or: int _occa_tiled_i = init +- ((tileSize * inc) * +// _occa_thread_position.x); +std::string buildIinnerOuterLoopIdxLineFirst(const OklLoopInfo& forLoop, + const AttributedLoop& loop, + const TileParams* params, + int& openedScopeCounter, + oklt::Rewriter& rewriter) { + auto tiledVar = metal::getTiledVariableName(forLoop); + auto idx = metal::getIdxVariable(loop); + auto op = forLoop.IsInc() ? "+" : "-"; + + std::string res; + if (forLoop.isUnary()) { + res = std::move(util::fmt("{} {} = ({}) {} (({}) * {});\n", + forLoop.var.typeName, + tiledVar, + getLatestSourceText(forLoop.range.start, rewriter), + op, + params->tileSize, + idx) + .value()); + } else { + res = std::move(util::fmt("{} {} = ({}) {} ((({}) * {}) * {});\n", + forLoop.var.typeName, + tiledVar, + getLatestSourceText(forLoop.range.start, rewriter), + op, + params->tileSize, + getLatestSourceText(forLoop.inc.val, rewriter), + idx) + .value()); + } + ++openedScopeCounter; + return " {\n" + res; +} + +// Produces something like: int i = _occa_tiled_i +- (inc * _occa_group_position.x); +// or: int i = _occa_tiled_i +- (inc * _occa_thread_position.x); +std::string buildInnerOuterLoopIdxLineSecond(const OklLoopInfo& forLoop, + const AttributedLoop& loop, + const TileParams* params, + int& openedScopeCounter, + oklt::Rewriter& rewriter) { + static_cast(params); + auto tiledVar = metal::getTiledVariableName(forLoop); + auto idx = metal::getIdxVariable(loop); + auto op = forLoop.IsInc() ? "+" : "-"; + + std::string res; + if (forLoop.isUnary()) { + res = std::move( + util::fmt( + "{} {} = {} {} {};\n", forLoop.var.typeName, forLoop.var.name, tiledVar, op, idx) + .value()); + } else { + res = std::move(util::fmt("{} {} = {} {} (({}) * {});\n", + forLoop.var.typeName, + forLoop.var.name, + tiledVar, + op, + getLatestSourceText(forLoop.inc.val, rewriter), + idx) + .value()); + } + ++openedScopeCounter; + return " {\n" + res; // Open new scope +} + +// Produces something like: +// for (int _occa_tiled_i = start; _occa_tiled_i < end; _occa_tiled_i += tileSize) { +// or: for (int _occa_tiled_i = start; _occa_tiled_i > end; _occa_tiled_i -= tileSize) { +std::string buildRegularLoopIdxLineFirst(const OklLoopInfo& forLoop, + const AttributedLoop& regularLoop, + const TileParams* params, + int& openedScopeCounter, + oklt::Rewriter& rewriter) { + auto tiledVar = metal::getTiledVariableName(forLoop); + auto assignUpdate = forLoop.IsInc() ? "+=" : "-="; + auto cmpOpStr = getCondCompStr(forLoop.condition.op); + + auto res = util::fmt("for ({} {} = {}; {} {} {}; {} {} ({}))", + forLoop.var.typeName, + tiledVar, + getLatestSourceText(forLoop.range.start, rewriter), + tiledVar, + cmpOpStr, + getLatestSourceText(forLoop.range.end, rewriter), + tiledVar, + assignUpdate, + params->tileSize) + .value(); // shouldn't fail + + ++openedScopeCounter; + return res + " {\n"; // Open new scope (Note: after line unlike @outer and @inner) +} + +// Produces something like: for (int i = _occa_tiled_i; i < (_occa_tiled_i + tileSize); ++i) +// Produces something like: for (int i = _occa_tiled_i; i < (_occa_tiled_i + tileSize); i+=inc) +// or: for (int i = _occa_tiled_i; i > (_occa_tiled_i - tileSize); --i) +// or: for (int i = _occa_tiled_i; i > (_occa_tiled_i - tileSize); i-=inc) +std::string buildRegularLoopIdxLineSecond(const OklLoopInfo& forLoop, + const AttributedLoop& regularLoop, + const TileParams* params, + int& openedScopeCounter, + oklt::Rewriter& rewriter) { + auto tiledVar = metal::getTiledVariableName(forLoop); + auto& stmt = forLoop.stmt; + auto op = forLoop.IsInc() ? "+" : "-"; + auto cmp = forLoop.IsInc() ? "<" : ">"; + + std::string res; + if (forLoop.isUnary()) { + auto unaryStr = getUnaryStr(forLoop.inc.op.uo, forLoop.var.name); // ++i/i++/--i/i-- + res = util::fmt("for ({} {} = {}; {} {} ({} {} ({})); {})", + forLoop.var.typeName, + forLoop.var.name, + tiledVar, + forLoop.var.name, + cmp, + tiledVar, + op, + params->tileSize, + unaryStr) + .value(); + } else { + auto assignUpdate = forLoop.IsInc() ? "+=" : "-="; + res = util::fmt("for ({} {} = {}; {} {} ({} {} ({})); {} {} {})", + forLoop.var.typeName, + forLoop.var.name, + tiledVar, + forLoop.var.name, + cmp, + tiledVar, + op, + params->tileSize, + forLoop.var.name, + assignUpdate, + getLatestSourceText(forLoop.inc.val, rewriter)) + .value(); + } + + if (params->check || !llvm::isa(stmt.getBody())) { + ++openedScopeCounter; + res += " {\n"; + } + + return res; +} + +std::string buildLoopIdxLine(const OklLoopInfo& forLoop, + const TileParams* params, + const LoopOrder& ord, + int& openedScopeCounter, + oklt::Rewriter& rewriter) { + static std::map< + std::tuple, + std::function> + mapping{ + {{LoopType::Inner, LoopOrder::First}, buildIinnerOuterLoopIdxLineFirst}, + {{LoopType::Outer, LoopOrder::First}, buildIinnerOuterLoopIdxLineFirst}, + {{LoopType::Regular, LoopOrder::First}, buildRegularLoopIdxLineFirst}, + {{LoopType::Inner, LoopOrder::Second}, buildInnerOuterLoopIdxLineSecond}, + {{LoopType::Outer, LoopOrder::Second}, buildInnerOuterLoopIdxLineSecond}, + {{LoopType::Regular, LoopOrder::Second}, buildRegularLoopIdxLineSecond}, + }; + auto& loop = ord == LoopOrder::First ? params->firstLoop : params->secondLoop; + return mapping[{loop.type, ord}](forLoop, loop, params, openedScopeCounter, rewriter); +} + +std::string buildCheckLine(const OklLoopInfo& forLoop, + const TileParams* params, + int& openedScopeCounter, + oklt::Rewriter& rewriter) { + if (!params->check) { + return ""; + } + auto cmpStr = getCondCompStr(forLoop.condition.op); + + auto res = util::fmt("if ({} {} {})", + forLoop.var.name, + cmpStr, + getLatestSourceText(forLoop.range.end, rewriter)) + .value(); + + auto& stmt = forLoop.stmt; + if (!isa(stmt.getBody())) { + ++openedScopeCounter; + res += " {\n"; + } + + return res; +} + +std::string buildPreffixTiledCode(const OklLoopInfo& forLoop, + const TileParams* params, + int& openedScopeCounter, + oklt::Rewriter& rewriter) { + std::string res; + res += buildLoopIdxLine(forLoop, params, LoopOrder::First, openedScopeCounter, rewriter); + res += buildLoopIdxLine(forLoop, params, LoopOrder::Second, openedScopeCounter, rewriter); + res += buildCheckLine(forLoop, params, openedScopeCounter, rewriter); + return res; +} + +HandleResult handleTileAttribute(SessionStage& s, + const clang::ForStmt& forStmt, + const clang::Attr& a, + const TileParams* params) { + SPDLOG_DEBUG("Handle [@tile] attribute"); + + auto& sema = s.tryEmplaceUserCtx(); + auto loopInfo = sema.getLoopInfo(forStmt); + if (!loopInfo) { + return tl::make_unexpected(Error{{}, "@tile: failed to fetch loop meta data from sema"}); + } + + auto updatedParams = *params; + // Auto Axis in loopInfo are replaced with specific. TODO: maybe somehow update params earlier? + updatedParams.firstLoop.axis = loopInfo->axis[0]; + updatedParams.secondLoop.axis = loopInfo->axis[1]; + + int openedScopeCounter = 0; + auto prefixCode = + buildPreffixTiledCode(*loopInfo, &updatedParams, openedScopeCounter, s.getRewriter()); + auto suffixCode = buildCloseScopes(openedScopeCounter); + std::string afterRBraceCode = ""; + if (loopInfo->shouldSync()) { + afterRBraceCode += metal::SYNC_THREADS_BARRIER + ";"; + } + + handleChildAttr(s, forStmt, NO_BARRIER_ATTR_NAME); + + return replaceAttributedLoop(s, forStmt, a, suffixCode, afterRBraceCode, prefixCode, false); +} + +__attribute__((constructor)) void registerHIPTileAttrBackend() { + auto ok = registerBackendHandler(TargetBackend::METAL, TILE_ATTR_NAME, handleTileAttribute); + + if (!ok) { + SPDLOG_ERROR("[METAL] Failed to register {} attribute handler", TILE_ATTR_NAME); + } +} +} // namespace diff --git a/lib/attributes/backend/metal/translation_unit.cpp b/lib/attributes/backend/metal/translation_unit.cpp new file mode 100644 index 00000000..03dbe4b1 --- /dev/null +++ b/lib/attributes/backend/metal/translation_unit.cpp @@ -0,0 +1,25 @@ +#include "attributes/utils/replace_attribute.h" +#include "core/handler_manager/implicid_handler.h" + +#include + +namespace { +using namespace oklt; +using namespace clang; + +const std::string_view METAL_INCLUDE1 = ""; +const std::string_view METAL_INCLUDE2 = ""; +const std::string_view METAL_NS = "metal"; + +HandleResult handleTranslationUnit(SessionStage& s, const clang::TranslationUnitDecl& decl) { + return oklt::handleTranslationUnit(s, decl, {METAL_INCLUDE1, METAL_INCLUDE2}, {METAL_NS}); +} + +__attribute__((constructor)) void registerTranslationUnitAttrBackend() { + auto ok = registerImplicitHandler(TargetBackend::METAL, handleTranslationUnit); + + if (!ok) { + SPDLOG_ERROR("[METAL] Failed to register implicit handler for translation unit"); + } +} +} // namespace diff --git a/lib/core/target_backends.cpp b/lib/core/target_backends.cpp index 9a435edf..6e3e4857 100644 --- a/lib/core/target_backends.cpp +++ b/lib/core/target_backends.cpp @@ -12,6 +12,7 @@ tl::expected backendFromString(const std::string& ty {"cuda", TargetBackend::CUDA}, {"hip", TargetBackend::HIP}, {"dpcpp", TargetBackend::DPCPP}, + {"metal", TargetBackend::METAL}, {"launcher", TargetBackend::_LAUNCHER}, }; @@ -34,6 +35,8 @@ std::string backendToString(TargetBackend backend) { return std::string{"hip"}; case TargetBackend::DPCPP: return std::string{"dpcpp"}; + case TargetBackend::METAL: + return std::string{"metal"}; case TargetBackend::_LAUNCHER: return std::string{"launcher"}; } @@ -55,6 +58,7 @@ bool isDeviceCategory(TargetBackend backend) { case TargetBackend::CUDA: case TargetBackend::HIP: case TargetBackend::DPCPP: + case TargetBackend::METAL: return true; default: return false; diff --git a/script/regenerate_test_ref.py b/script/regenerate_test_ref.py index 63876c91..4ebc7c9b 100644 --- a/script/regenerate_test_ref.py +++ b/script/regenerate_test_ref.py @@ -9,7 +9,8 @@ class Backend(Enum): CUDA = 2 HIP = 3 DPCPP = 4 - LAUNCHER = 5 + METAL = 5 + LAUNCHER = 6 def from_str(s: str) -> "Backend": s = s.lower() @@ -19,6 +20,8 @@ def from_str(s: str) -> "Backend": return Backend.OPENMP if s == "dpcpp": return Backend.DPCPP + if s == "dpcpp": + return Backend.METAL if s == "cuda": return Backend.CUDA if s == "hip": @@ -33,6 +36,8 @@ def to_str(self) -> str: return "openmp" if self == Backend.DPCPP: return "dpcpp" + if self == Backend.METAL: + return "metal" if self == Backend.CUDA: return "cuda" if self == Backend.HIP: @@ -63,7 +68,7 @@ def main(occa_tool: str, data_path: str, backend: Backend, verbose: bool): "--data", "-d", type=str, required=True, help="Test data directory path" ) parser.add_argument( - "--backend", "-b", type=str, required=True, help="serial/openmp/cuda/hip/dpcppp" + "--backend", "-b", type=str, required=True, help="serial/openmp/cuda/hip/dpcppp/metal" ) parser.add_argument( "--verbose", "-v", default=False, action="store_const", const=True diff --git a/tests/functional/configs/test_suite_transpiler/backends/metal/atomic.json b/tests/functional/configs/test_suite_transpiler/backends/metal/atomic.json new file mode 100644 index 00000000..6bc75e1d --- /dev/null +++ b/tests/functional/configs/test_suite_transpiler/backends/metal/atomic.json @@ -0,0 +1,112 @@ +[ + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/atomic/atomic_add.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/atomic/atomic_add_ref.cpp" + }, + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/atomic/atomic_sub.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/atomic/atomic_sub_ref.cpp" + }, + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/atomic/atomic_exch.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/atomic/atomic_exch_ref.cpp" + }, + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/atomic/atomic_and.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/atomic/atomic_and_ref.cpp" + }, + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/atomic/atomic_or.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/atomic/atomic_or_ref.cpp" + }, + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/atomic/atomic_xor.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/atomic/atomic_xor_ref.cpp" + }, + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/atomic/atomic_inc.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/atomic/atomic_inc_ref.cpp" + }, + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/atomic/atomic_dec.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/atomic/atomic_dec_ref.cpp" + }, + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/atomic/atomic_compound_statement.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/atomic/atomic_compound_statement_ref.cpp" + }, + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/atomic/issue_case.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/atomic/issue_case_ref.cpp" + } +] diff --git a/tests/functional/configs/test_suite_transpiler/backends/metal/barrier.json b/tests/functional/configs/test_suite_transpiler/backends/metal/barrier.json new file mode 100644 index 00000000..f306472c --- /dev/null +++ b/tests/functional/configs/test_suite_transpiler/backends/metal/barrier.json @@ -0,0 +1,24 @@ +[ + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/barrier/barrier_builtin.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/barrier/barrier_builtin_ref.cpp" + }, + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/barrier/barrier_warp.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/barrier/barrier_warp_ref.cpp" + } +] diff --git a/tests/functional/configs/test_suite_transpiler/backends/metal/const_global.json b/tests/functional/configs/test_suite_transpiler/backends/metal/const_global.json new file mode 100644 index 00000000..b93912e3 --- /dev/null +++ b/tests/functional/configs/test_suite_transpiler/backends/metal/const_global.json @@ -0,0 +1,46 @@ +[ + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/implicit/const_global_variable.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/implicit/const_global_variable_ref.cpp" + }, + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/implicit/const_global_const_size_array.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/implicit/const_global_const_size_array_ref.cpp" + }, + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/implicit/const_global_pointer.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/implicit/const_global_pointer_ref.cpp" + }, + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/implicit/extern_const_global_array.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/implicit/extern_const_global_array_ref.cpp" + } +] diff --git a/tests/functional/configs/test_suite_transpiler/backends/metal/exclusive.json b/tests/functional/configs/test_suite_transpiler/backends/metal/exclusive.json new file mode 100644 index 00000000..86959c2d --- /dev/null +++ b/tests/functional/configs/test_suite_transpiler/backends/metal/exclusive.json @@ -0,0 +1,24 @@ +[ + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/exclusive/exclusive_builtin.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/exclusive/exclusive_builtin_ref.cpp" + }, + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/exclusive/exclusive_in_typedecl.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/exclusive/exclusive_in_typedecl_ref.cpp" + } +] diff --git a/tests/functional/configs/test_suite_transpiler/backends/metal/implicit.json b/tests/functional/configs/test_suite_transpiler/backends/metal/implicit.json new file mode 100644 index 00000000..c843f7f5 --- /dev/null +++ b/tests/functional/configs/test_suite_transpiler/backends/metal/implicit.json @@ -0,0 +1,68 @@ +[ + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/implicit/const_global_const_size_array.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/implicit/const_global_const_size_array_ref.cpp" + }, + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/implicit/const_global_pointer.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/implicit/const_global_pointer_ref.cpp" + }, + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/implicit/const_global_variable.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/implicit/const_global_variable_ref.cpp" + }, + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/implicit/constexpr.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/implicit/constexpr_ref.cpp" + }, + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/implicit/extern_const_global_array.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/implicit/extern_const_global_array_ref.cpp" + }, + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/implicit/non_kernel_function.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/implicit/non_kernel_function_ref.cpp" + } +] diff --git a/tests/functional/configs/test_suite_transpiler/backends/metal/inner_outer.json b/tests/functional/configs/test_suite_transpiler/backends/metal/inner_outer.json new file mode 100644 index 00000000..57a8bfc0 --- /dev/null +++ b/tests/functional/configs/test_suite_transpiler/backends/metal/inner_outer.json @@ -0,0 +1,57 @@ +[ + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/outer_inner/outer_inner_inc.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/outer_inner/outer_inner_inc_ref.cpp" + }, + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/outer_inner/outer_inner_dec.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/outer_inner/outer_inner_dec_ref.cpp" + }, + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/outer_inner/outer_inner_multiple.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/outer_inner/outer_inner_multiple_ref.cpp" + }, + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/outer_inner/outer_inner_split.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/outer_inner/outer_inner_split_ref.cpp" + }, + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/outer_inner/outer_inner_regular_at_same_level_as_attributed.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/outer_inner/outer_inner_regular_at_same_level_as_attributed_ref.cpp" + } +] diff --git a/tests/functional/configs/test_suite_transpiler/backends/metal/max_inner_dims.json b/tests/functional/configs/test_suite_transpiler/backends/metal/max_inner_dims.json new file mode 100644 index 00000000..debb8544 --- /dev/null +++ b/tests/functional/configs/test_suite_transpiler/backends/metal/max_inner_dims.json @@ -0,0 +1,13 @@ +[ + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/max_inner_loops/outer_inner_split_max.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/max_inner_loops/outer_inner_split_max_ref.cpp" + } +] diff --git a/tests/functional/configs/test_suite_transpiler/backends/metal/nobarrier.json b/tests/functional/configs/test_suite_transpiler/backends/metal/nobarrier.json new file mode 100644 index 00000000..bdf1f565 --- /dev/null +++ b/tests/functional/configs/test_suite_transpiler/backends/metal/nobarrier.json @@ -0,0 +1,13 @@ +[ + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/nobarrier/nobarrier_builtin.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/nobarrier/nobarrier_builtin_ref.cpp" + } +] diff --git a/tests/functional/configs/test_suite_transpiler/backends/metal/non_kernel_function.json b/tests/functional/configs/test_suite_transpiler/backends/metal/non_kernel_function.json new file mode 100644 index 00000000..c67cc666 --- /dev/null +++ b/tests/functional/configs/test_suite_transpiler/backends/metal/non_kernel_function.json @@ -0,0 +1,13 @@ +[ + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/implicit/non_kernel_function.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/implicit/non_kernel_function_ref.cpp" + } +] diff --git a/tests/functional/configs/test_suite_transpiler/backends/metal/restrict.json b/tests/functional/configs/test_suite_transpiler/backends/metal/restrict.json new file mode 100644 index 00000000..114a3a73 --- /dev/null +++ b/tests/functional/configs/test_suite_transpiler/backends/metal/restrict.json @@ -0,0 +1,46 @@ +[ + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/restrict/restrict_builtin_types.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/restrict/restrict_builtin_types_ref.cpp" + }, + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/restrict/restrict_complex_types.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/restrict/restrict_complex_types_ref.cpp" + }, + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/restrict/restrict_namespaced_types.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/restrict/restrict_namespaced_types_ref.cpp" + }, + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/restrict/restrict_return_type.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/restrict/restrict_return_type_ref.cpp" + } +] diff --git a/tests/functional/configs/test_suite_transpiler/backends/metal/shared.json b/tests/functional/configs/test_suite_transpiler/backends/metal/shared.json new file mode 100644 index 00000000..dbb67e6b --- /dev/null +++ b/tests/functional/configs/test_suite_transpiler/backends/metal/shared.json @@ -0,0 +1,57 @@ +[ + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/shared/shared_builtin_types.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/shared/shared_builtin_types_ref.cpp" + }, + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/shared/shared_struct_types.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/shared/shared_struct_types_ref.cpp" + }, + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/shared/shared_template_type.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/shared/shared_template_type_ref.cpp" + }, + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/shared/shared_between_tiles.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/shared/shared_between_tiles_ref.cpp" + }, + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/shared/shared_in_typedecl.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/shared/shared_in_typedecl_ref.cpp" + } +] diff --git a/tests/functional/configs/test_suite_transpiler/backends/metal/suite.json b/tests/functional/configs/test_suite_transpiler/backends/metal/suite.json new file mode 100644 index 00000000..413936e0 --- /dev/null +++ b/tests/functional/configs/test_suite_transpiler/backends/metal/suite.json @@ -0,0 +1,14 @@ +[ + "implicit.json", + "const_global.json", + "non_kernel_function.json", + "max_inner_dims.json", + "tile.json", + "inner_outer.json", + "atomic.json", + "barrier.json", + "nobarrier.json", + "restrict.json", + "exclusive.json", + "shared.json" +] diff --git a/tests/functional/configs/test_suite_transpiler/backends/metal/tile.json b/tests/functional/configs/test_suite_transpiler/backends/metal/tile.json new file mode 100644 index 00000000..eea02bc7 --- /dev/null +++ b/tests/functional/configs/test_suite_transpiler/backends/metal/tile.json @@ -0,0 +1,46 @@ +[ + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/tile/outer_inner_inc.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/tile/outer_inner_inc_ref.cpp" + }, + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/tile/outer_inner_dec.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/tile/outer_inner_dec_ref.cpp" + }, + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/tile/outer_inner_regular_inc.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/tile/outer_inner_regular_inc_ref.cpp" + }, + { + "action": "normalize_and_transpile", + "action_config": { + "backend": "metal", + "source": "transpiler/backends/metal/tile/outer_inner_regular_dec.cpp", + "includes": [], + "defs": [], + "launcher": "" + }, + "reference": "transpiler/backends/metal/tile/outer_inner_regular_dec_ref.cpp" + } +] diff --git a/tests/functional/data/transpiler/backends/metal/atomic/atomic_add.cpp b/tests/functional/data/transpiler/backends/metal/atomic/atomic_add.cpp new file mode 100644 index 00000000..ace9abb7 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/atomic/atomic_add.cpp @@ -0,0 +1,37 @@ +@kernel void atomic_add_builtin(const int* iVec, int* iSum, const float* fVec, float* fSum) { + @outer for (int i = 0; i < 1; ++i) { + @inner for (int j = 0; j < 1; ++j) { + @atomic* iSum += iVec[0]; + @atomic* fSum += fVec[0]; + } + } +} + +struct ComplexTypeF32 { + float real; + float imag; +}; + +@kernel void atomic_add_struct(const ComplexTypeF32* vec, ComplexTypeF32* sum) { + @outer for (int i = 0; i < 1; ++i) { + @inner for (int j = 0; j < 1; ++j) { + @atomic sum->real += vec[0].real; + @atomic sum->imag += vec[0].imag; + } + } +} + +template +struct ComplexType { + T real; + T imag; +}; + +@kernel void atomic_add_template(const ComplexType* vec, ComplexType* sum) { + @outer for (int i = 0; i < 1; ++i) { + @inner for (int j = 0; j < 1; ++j) { + @atomic sum->real += vec[0].real; + @atomic sum->imag += vec[0].imag; + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/atomic/atomic_add_ref.cpp b/tests/functional/data/transpiler/backends/metal/atomic/atomic_add_ref.cpp new file mode 100644 index 00000000..1b661605 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/atomic/atomic_add_ref.cpp @@ -0,0 +1,61 @@ +#include +#include +using namespace metal; + +kernel void _occa_atomic_add_builtin_0(device const int* iVec [[buffer(0)]], + device int* iSum [[buffer(1)]], + device const float* fVec [[buffer(2)]], + device float* fSum [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position + [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.x; + { + int j = (0) + _occa_thread_position.x; + *iSum += iVec[0]; + *fSum += fVec[0]; + } + } +} + +struct ComplexTypeF32 { + float real; + float imag; +}; + +kernel void _occa_atomic_add_struct_0(device const ComplexTypeF32* vec [[buffer(0)]], + device ComplexTypeF32* sum [[buffer(1)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position + [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.x; + { + int j = (0) + _occa_thread_position.x; + sum->real += vec[0].real; + sum->imag += vec[0].imag; + } + } +} + +template +struct ComplexType { + T real; + T imag; +}; + +kernel void _occa_atomic_add_template_0(device const ComplexType* vec [[buffer(0)]], + device ComplexType* sum [[buffer(1)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position + [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.x; + { + int j = (0) + _occa_thread_position.x; + sum->real += vec[0].real; + sum->imag += vec[0].imag; + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/atomic/atomic_and.cpp b/tests/functional/data/transpiler/backends/metal/atomic/atomic_and.cpp new file mode 100644 index 00000000..4e3ee114 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/atomic/atomic_and.cpp @@ -0,0 +1,21 @@ +@kernel void atomic_and_builtin(const unsigned int* masks, unsigned int* mask) { + @outer for (int i = 0; i < 1; ++i) { + @inner for (int j = 0; j < 1; ++j) { + @atomic* mask &= masks[0]; + } + } +} + +struct ComplexMaskType { + unsigned int mask1; + unsigned int mask2; +}; + +@kernel void atomic_and_struct(const ComplexMaskType* masks, ComplexMaskType* mask) { + @outer for (int i = 0; i < 1; ++i) { + @inner for (int j = 0; j < 1; ++j) { + @atomic mask->mask1 &= masks[0].mask1; + @atomic mask->mask2 &= masks[0].mask2; + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/atomic/atomic_and_ref.cpp b/tests/functional/data/transpiler/backends/metal/atomic/atomic_and_ref.cpp new file mode 100644 index 00000000..94aeb538 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/atomic/atomic_and_ref.cpp @@ -0,0 +1,37 @@ +#include +#include +using namespace metal; + +kernel void _occa_atomic_and_builtin_0(device const unsigned int* masks [[buffer(0)]], + device unsigned int* mask [[buffer(1)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position + [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.x; + { + int j = (0) + _occa_thread_position.x; + *mask &= masks[0]; + } + } +} + +struct ComplexMaskType { + unsigned int mask1; + unsigned int mask2; +}; + +kernel void _occa_atomic_and_struct_0(device const ComplexMaskType* masks [[buffer(0)]], + device ComplexMaskType* mask [[buffer(1)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position + [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.x; + { + int j = (0) + _occa_thread_position.x; + mask->mask1 &= masks[0].mask1; + mask->mask2 &= masks[0].mask2; + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/atomic/atomic_compound_statement.cpp b/tests/functional/data/transpiler/backends/metal/atomic/atomic_compound_statement.cpp new file mode 100644 index 00000000..f72bfccb --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/atomic/atomic_compound_statement.cpp @@ -0,0 +1,11 @@ +@kernel void test_kernel() { + @outer for (int i = 0; i < 32; ++i) { + @shared float shm[32]; + @inner for (int j = 0; j < 32; ++j) { + @atomic { + shm[i * j]++; + j += 32; + } + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/atomic/atomic_compound_statement_ref.cpp b/tests/functional/data/transpiler/backends/metal/atomic/atomic_compound_statement_ref.cpp new file mode 100644 index 00000000..3d35d253 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/atomic/atomic_compound_statement_ref.cpp @@ -0,0 +1,18 @@ +#include +#include +using namespace metal; + +kernel void _occa_test_kernel_0(uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.x; + threadgroup float shm[32]; + { + int j = (0) + _occa_thread_position.x; + { + shm[i * j]++; + j += 32; + } + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/atomic/atomic_dec.cpp b/tests/functional/data/transpiler/backends/metal/atomic/atomic_dec.cpp new file mode 100644 index 00000000..6ba31fa1 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/atomic/atomic_dec.cpp @@ -0,0 +1,23 @@ + +@kernel void atomic_dec_builtin(unsigned int* value) { + @outer for (int i = 0; i < 1; ++i) { + @inner for (int j = 0; j < 1; ++j) { + @atomic--(*value); + } + } + // @atomic (*value)--; normalizer issue +} + +struct ComplexMaskType { + unsigned int val1; + int val2; +}; + +@kernel void atomic_dec_struct(ComplexMaskType* value) { + @outer for (int i = 0; i < 1; ++i) { + @inner for (int j = 0; j < 1; ++j) { + @atomic-- value->val1; + @atomic value->val2--; + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/atomic/atomic_dec_ref.cpp b/tests/functional/data/transpiler/backends/metal/atomic/atomic_dec_ref.cpp new file mode 100644 index 00000000..115600d6 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/atomic/atomic_dec_ref.cpp @@ -0,0 +1,35 @@ +#include +#include +using namespace metal; + +kernel void _occa_atomic_dec_builtin_0(device unsigned int* value [[buffer(0)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position + [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.x; + { + int j = (0) + _occa_thread_position.x; + --(*value); + } + } +} + +struct ComplexMaskType { + unsigned int val1; + int val2; +}; + +kernel void _occa_atomic_dec_struct_0(device ComplexMaskType* value [[buffer(0)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position + [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.x; + { + int j = (0) + _occa_thread_position.x; + --value->val1; + value->val2--; + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/atomic/atomic_exch.cpp b/tests/functional/data/transpiler/backends/metal/atomic/atomic_exch.cpp new file mode 100644 index 00000000..ceff6d67 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/atomic/atomic_exch.cpp @@ -0,0 +1,36 @@ + +@kernel void atomic_exch_builtin(const int* iVec, int* iSum, const float* fVec, float* fSum) { + @outer for (int i = 0; i < 1; ++i) { + @inner for (int j = 0; j < 1; ++j) { + @atomic* iSum = iVec[0]; + @atomic* fSum = fVec[0]; + } + } +} + +struct ComplexTypeF32 { + float real; + float imag; +}; + +@kernel void atomic_exch_struct(const ComplexTypeF32* vec, ComplexTypeF32* result) { + @outer for (int i = 0; i < 1; ++i) { + @inner for (int j = 0; j < 1; ++j) { + @atomic* result = vec[0]; + } + } +} + +template +struct ComplexType { + T real; + T imag; +}; + +@kernel void atomic_exch_template(const ComplexType* vec, ComplexType* result) { + @outer for (int i = 0; i < 1; ++i) { + @inner for (int j = 0; j < 1; ++j) { + @atomic* result = vec[0]; + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/atomic/atomic_exch_ref.cpp b/tests/functional/data/transpiler/backends/metal/atomic/atomic_exch_ref.cpp new file mode 100644 index 00000000..9d6969e3 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/atomic/atomic_exch_ref.cpp @@ -0,0 +1,60 @@ +#include +#include +using namespace metal; + +kernel void _occa_atomic_exch_builtin_0(device const int* iVec [[buffer(0)]], + device int* iSum [[buffer(1)]], + device const float* fVec [[buffer(2)]], + device float* fSum [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position + [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.x; + { + int j = (0) + _occa_thread_position.x; + *iSum = iVec[0]; + *fSum = fVec[0]; + } + } +} + +struct ComplexTypeF32 { + float real; + float imag; +}; + +kernel void _occa_atomic_exch_struct_0(device const ComplexTypeF32* vec [[buffer(0)]], + device ComplexTypeF32* result [[buffer(1)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position + [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.x; + { + int j = (0) + _occa_thread_position.x; + *result = vec[0]; + } + } +} + +template +struct ComplexType { + T real; + T imag; +}; + +kernel void _occa_atomic_exch_template_0(device const ComplexType* vec [[buffer(0)]], + device ComplexType* result [[buffer(1)]], + uint3 _occa_group_position + [[threadgroup_position_in_grid]], + uint3 _occa_thread_position + [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.x; + { + int j = (0) + _occa_thread_position.x; + *result = vec[0]; + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/atomic/atomic_inc.cpp b/tests/functional/data/transpiler/backends/metal/atomic/atomic_inc.cpp new file mode 100644 index 00000000..b3b44f3e --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/atomic/atomic_inc.cpp @@ -0,0 +1,23 @@ + +@kernel void atomic_inc_builtin(unsigned int* value) { + @outer for (int i = 0; i < 1; ++i) { + @inner for (int j = 0; j < 1; ++j) { + @atomic++(*value); + } + } + // @atomic (*value)++; normalizer issue +} + +struct ComplexMaskType { + unsigned int val1; + int val2; +}; + +@kernel void atomic_inc_struct(ComplexMaskType* value) { + @outer for (int i = 0; i < 1; ++i) { + @inner for (int j = 0; j < 1; ++j) { + @atomic++ value->val1; + @atomic value->val2++; + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/atomic/atomic_inc_ref.cpp b/tests/functional/data/transpiler/backends/metal/atomic/atomic_inc_ref.cpp new file mode 100644 index 00000000..f7c49c0f --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/atomic/atomic_inc_ref.cpp @@ -0,0 +1,35 @@ +#include +#include +using namespace metal; + +kernel void _occa_atomic_inc_builtin_0(device unsigned int* value [[buffer(0)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position + [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.x; + { + int j = (0) + _occa_thread_position.x; + ++(*value); + } + } +} + +struct ComplexMaskType { + unsigned int val1; + int val2; +}; + +kernel void _occa_atomic_inc_struct_0(device ComplexMaskType* value [[buffer(0)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position + [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.x; + { + int j = (0) + _occa_thread_position.x; + ++value->val1; + value->val2++; + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/atomic/atomic_or.cpp b/tests/functional/data/transpiler/backends/metal/atomic/atomic_or.cpp new file mode 100644 index 00000000..8fd2e00a --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/atomic/atomic_or.cpp @@ -0,0 +1,22 @@ + +@kernel void atomic_and_builtin(const unsigned int* masks, unsigned int* mask) { + @outer for (int i = 0; i < 1; ++i) { + @inner for (int j = 0; j < 1; ++j) { + @atomic* mask |= masks[0]; + } + } +} + +struct ComplexMaskType { + unsigned int mask1; + unsigned int mask2; +}; + +@kernel void atomic_and_struct(const ComplexMaskType* masks, ComplexMaskType* mask) { + @outer for (int i = 0; i < 1; ++i) { + @inner for (int j = 0; j < 1; ++j) { + @atomic mask->mask1 |= masks[0].mask1; + @atomic mask->mask2 |= masks[0].mask2; + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/atomic/atomic_or_ref.cpp b/tests/functional/data/transpiler/backends/metal/atomic/atomic_or_ref.cpp new file mode 100644 index 00000000..c19f0ddc --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/atomic/atomic_or_ref.cpp @@ -0,0 +1,37 @@ +#include +#include +using namespace metal; + +kernel void _occa_atomic_and_builtin_0(device const unsigned int* masks [[buffer(0)]], + device unsigned int* mask [[buffer(1)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position + [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.x; + { + int j = (0) + _occa_thread_position.x; + *mask |= masks[0]; + } + } +} + +struct ComplexMaskType { + unsigned int mask1; + unsigned int mask2; +}; + +kernel void _occa_atomic_and_struct_0(device const ComplexMaskType* masks [[buffer(0)]], + device ComplexMaskType* mask [[buffer(1)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position + [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.x; + { + int j = (0) + _occa_thread_position.x; + mask->mask1 |= masks[0].mask1; + mask->mask2 |= masks[0].mask2; + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/atomic/atomic_sub.cpp b/tests/functional/data/transpiler/backends/metal/atomic/atomic_sub.cpp new file mode 100644 index 00000000..df1b335d --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/atomic/atomic_sub.cpp @@ -0,0 +1,38 @@ + +@kernel void atomic_sub_builtin(const int* iVec, int* iSum, const float* fVec, float* fSum) { + @outer for (int i = 0; i < 1; ++i) { + @inner for (int j = 0; j < 1; ++j) { + @atomic* iSum -= iVec[0]; + @atomic* fSum -= fVec[0]; + } + } +} + +struct ComplexTypeF32 { + float real; + float imag; +}; + +@kernel void atomic_sub_struct(const ComplexTypeF32* vec, ComplexTypeF32* sum) { + @outer for (int i = 0; i < 1; ++i) { + @inner for (int j = 0; j < 1; ++j) { + @atomic sum->real -= vec[0].real; + @atomic sum->imag -= vec[0].imag; + } + } +} + +template +struct ComplexType { + T real; + T imag; +}; + +@kernel void atomic_sub_template(const ComplexType* vec, ComplexType* sum) { + @outer for (int i = 0; i < 1; ++i) { + @inner for (int j = 0; j < 1; ++j) { + @atomic sum->real -= vec[0].real; + @atomic sum->imag -= vec[0].imag; + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/atomic/atomic_sub_ref.cpp b/tests/functional/data/transpiler/backends/metal/atomic/atomic_sub_ref.cpp new file mode 100644 index 00000000..40dad89d --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/atomic/atomic_sub_ref.cpp @@ -0,0 +1,61 @@ +#include +#include +using namespace metal; + +kernel void _occa_atomic_sub_builtin_0(device const int* iVec [[buffer(0)]], + device int* iSum [[buffer(1)]], + device const float* fVec [[buffer(2)]], + device float* fSum [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position + [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.x; + { + int j = (0) + _occa_thread_position.x; + *iSum -= iVec[0]; + *fSum -= fVec[0]; + } + } +} + +struct ComplexTypeF32 { + float real; + float imag; +}; + +kernel void _occa_atomic_sub_struct_0(device const ComplexTypeF32* vec [[buffer(0)]], + device ComplexTypeF32* sum [[buffer(1)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position + [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.x; + { + int j = (0) + _occa_thread_position.x; + sum->real -= vec[0].real; + sum->imag -= vec[0].imag; + } + } +} + +template +struct ComplexType { + T real; + T imag; +}; + +kernel void _occa_atomic_sub_template_0(device const ComplexType* vec [[buffer(0)]], + device ComplexType* sum [[buffer(1)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position + [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.x; + { + int j = (0) + _occa_thread_position.x; + sum->real -= vec[0].real; + sum->imag -= vec[0].imag; + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/atomic/atomic_xor.cpp b/tests/functional/data/transpiler/backends/metal/atomic/atomic_xor.cpp new file mode 100644 index 00000000..e25eaacd --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/atomic/atomic_xor.cpp @@ -0,0 +1,22 @@ + +@kernel void atomic_and_builtin(const unsigned int* masks, unsigned int* mask) { + @outer for (int i = 0; i < 1; ++i) { + @inner for (int j = 0; j < 1; ++j) { + @atomic* mask ^= masks[0]; + } + } +} + +struct ComplexMaskType { + unsigned int mask1; + unsigned int mask2; +}; + +@kernel void atomic_and_struct(const ComplexMaskType* masks, ComplexMaskType* mask) { + @outer for (int i = 0; i < 1; ++i) { + @inner for (int j = 0; j < 1; ++j) { + @atomic mask->mask1 ^= masks[0].mask1; + @atomic mask->mask2 ^= masks[0].mask2; + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/atomic/atomic_xor_ref.cpp b/tests/functional/data/transpiler/backends/metal/atomic/atomic_xor_ref.cpp new file mode 100644 index 00000000..d29db56e --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/atomic/atomic_xor_ref.cpp @@ -0,0 +1,37 @@ +#include +#include +using namespace metal; + +kernel void _occa_atomic_and_builtin_0(device const unsigned int* masks [[buffer(0)]], + device unsigned int* mask [[buffer(1)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position + [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.x; + { + int j = (0) + _occa_thread_position.x; + *mask ^= masks[0]; + } + } +} + +struct ComplexMaskType { + unsigned int mask1; + unsigned int mask2; +}; + +kernel void _occa_atomic_and_struct_0(device const ComplexMaskType* masks [[buffer(0)]], + device ComplexMaskType* mask [[buffer(1)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position + [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.x; + { + int j = (0) + _occa_thread_position.x; + mask->mask1 ^= masks[0].mask1; + mask->mask2 ^= masks[0].mask2; + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/atomic/issue_case.cpp b/tests/functional/data/transpiler/backends/metal/atomic/issue_case.cpp new file mode 100644 index 00000000..693c65dd --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/atomic/issue_case.cpp @@ -0,0 +1,14 @@ + +struct ComplexTypeF32 { + ComplexTypeF32& operator=(const ComplexTypeF32&) = default; + float real; + float imag; +}; + +@kernel void atomic_exch_struct(const ComplexTypeF32* vec, ComplexTypeF32* result) { + @outer for (int i = 0; i < 1; ++i) { + @inner for (int j = 0; j < 1; ++j) { + @atomic* result = vec[0]; + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/atomic/issue_case_ref.cpp b/tests/functional/data/transpiler/backends/metal/atomic/issue_case_ref.cpp new file mode 100644 index 00000000..6a6f6e3f --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/atomic/issue_case_ref.cpp @@ -0,0 +1,23 @@ +#include +#include +using namespace metal; + +struct ComplexTypeF32 { + ComplexTypeF32& operator=(const ComplexTypeF32&) = default; + float real; + float imag; +}; + +kernel void _occa_atomic_exch_struct_0(device const ComplexTypeF32* vec [[buffer(0)]], + device ComplexTypeF32* result [[buffer(1)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position + [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.x; + { + int j = (0) + _occa_thread_position.x; + *result = vec[0]; + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/barrier/barrier_builtin.cpp b/tests/functional/data/transpiler/backends/metal/barrier/barrier_builtin.cpp new file mode 100644 index 00000000..ed6ee3d9 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/barrier/barrier_builtin.cpp @@ -0,0 +1,19 @@ +static float add(const float* a, int i, const float* b, int j) { + return a[i] + b[j]; +} + +// TODO: fix preprocessor handling and try with define +#define BLOCK_SIZE 4 + +@kernel void addVectors(const int N, const float* a, const float* b, float* ab) { + @outer for (int i = 0; i < N; i += BLOCK_SIZE) { + @shared float s_b[BLOCK_SIZE]; + const float* g_a = a; + @inner for (int j = 0; j < BLOCK_SIZE; ++j) { + s_b[j] = b[i + j]; + @barrier; + + ab[i + j] = add(g_a, i + j, s_b, j); + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/barrier/barrier_builtin_ref.cpp b/tests/functional/data/transpiler/backends/metal/barrier/barrier_builtin_ref.cpp new file mode 100644 index 00000000..c431a464 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/barrier/barrier_builtin_ref.cpp @@ -0,0 +1,27 @@ +#include +#include +using namespace metal; + +static float add(const float* a, int i, const float* b, int j) { + return a[i] + b[j]; +} + +// TODO: fix preprocessor handling and try with define +kernel void _occa_addVectors_0(constant int& N [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int i = (0) + ((4) * _occa_group_position.x); + threadgroup float s_b[4]; + const float* g_a = a; + { + int j = (0) + _occa_thread_position.x; + s_b[j] = b[i + j]; + threadgroup_barrier(mem_flags::mem_threadgroup); + ab[i + j] = add(g_a, i + j, s_b, j); + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/barrier/barrier_warp.cpp b/tests/functional/data/transpiler/backends/metal/barrier/barrier_warp.cpp new file mode 100644 index 00000000..86b7e1c3 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/barrier/barrier_warp.cpp @@ -0,0 +1,7 @@ +@kernel void test_kern() { + @outer for (int i = 0; i < 10; ++i) { + @inner for (int j = 0; j < 10; ++j) { + @barrier("warp"); + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/barrier/barrier_warp_ref.cpp b/tests/functional/data/transpiler/backends/metal/barrier/barrier_warp_ref.cpp new file mode 100644 index 00000000..fb4bf79b --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/barrier/barrier_warp_ref.cpp @@ -0,0 +1,14 @@ +#include +#include +using namespace metal; + +kernel void _occa_test_kern_0(uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.x; + { + int j = (0) + _occa_thread_position.x; + threadgroup_barrier(mem_flags::mem_threadgroup); + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/exclusive/exclusive_builtin.cpp b/tests/functional/data/transpiler/backends/metal/exclusive/exclusive_builtin.cpp new file mode 100644 index 00000000..e52b9df5 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/exclusive/exclusive_builtin.cpp @@ -0,0 +1,20 @@ +static float add(const float* a, int i, const float* b, int j) { + return a[i] + b[j]; +} + +// TODO: fix preprocessor handling and try with define +// #define BLOCK_SIZE 4 +const int BLOCK_SIZE = 4; + +@kernel void addVectors(const int N, const float* a, const float* b, float* ab) { + @outer for (int i = 0; i < N; i += BLOCK_SIZE) { + @shared float s_b[BLOCK_SIZE]; + @exclusive const float* g_a = a; + @inner for (int j = 0; j < BLOCK_SIZE; ++j) { + s_b[j] = b[i + j]; + @barrier; + + ab[i + j] = add(g_a, i + j, s_b, j); + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/exclusive/exclusive_builtin_ref.cpp b/tests/functional/data/transpiler/backends/metal/exclusive/exclusive_builtin_ref.cpp new file mode 100644 index 00000000..a6822853 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/exclusive/exclusive_builtin_ref.cpp @@ -0,0 +1,30 @@ +#include +#include +using namespace metal; + +static float add(const float* a, int i, const float* b, int j) { + return a[i] + b[j]; +} + +// TODO: fix preprocessor handling and try with define +// #define BLOCK_SIZE 4 +const int BLOCK_SIZE = 4; + +kernel void _occa_addVectors_0(constant int& N [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int i = (0) + ((BLOCK_SIZE)*_occa_group_position.x); + threadgroup float s_b[BLOCK_SIZE]; + const float* g_a = a; + { + int j = (0) + _occa_thread_position.x; + s_b[j] = b[i + j]; + threadgroup_barrier(mem_flags::mem_threadgroup); + ab[i + j] = add(g_a, i + j, s_b, j); + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/exclusive/exclusive_in_typedecl.cpp b/tests/functional/data/transpiler/backends/metal/exclusive/exclusive_in_typedecl.cpp new file mode 100644 index 00000000..ee8e5df6 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/exclusive/exclusive_in_typedecl.cpp @@ -0,0 +1,10 @@ +typedef float ex_float32_t @exclusive; + +@kernel void test_kernel() { + @outer for (int i = 0; i < 32; ++i) { + ex_float32_t d[32]; + @inner for (int j = 0; j < 32; ++j) { + d[j] = i - j; + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/exclusive/exclusive_in_typedecl_ref.cpp b/tests/functional/data/transpiler/backends/metal/exclusive/exclusive_in_typedecl_ref.cpp new file mode 100644 index 00000000..048daa7b --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/exclusive/exclusive_in_typedecl_ref.cpp @@ -0,0 +1,17 @@ +#include +#include +using namespace metal; + +typedef float ex_float32_t; + +kernel void _occa_test_kernel_0(uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.x; + ex_float32_t d[32]; + { + int j = (0) + _occa_thread_position.x; + d[j] = i - j; + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/implicit/const_global_const_size_array.cpp b/tests/functional/data/transpiler/backends/metal/implicit/const_global_const_size_array.cpp new file mode 100644 index 00000000..026bc6a0 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/implicit/const_global_const_size_array.cpp @@ -0,0 +1,21 @@ +// const array +const int arr_const0[12] = {0}; +int const arr_const1[12] = {0}; + +// Stupid formatting +const int arr_const2[12] = {0}; + +// Deduced size +const float arr_const3[] = {1., 2., 3., 4., 5., 6.}; + +// Multidimensional +const float arr_const4[][2] = {{1., 2.}, {3., 4.}, {5., 6.}}; +const float arr_const5[][3][2] = {{{1., 2.}, {3., 4.}, {5., 6.}}, {{1., 2.}, {3., 4.}, {5., 6.}}}; + +// At least one @kern function is required +@kernel void kern() { + @outer for (int i = 0; i < 32; ++i) { + @inner for (int j = 0; j < 32; ++j) { + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/implicit/const_global_const_size_array_ref.cpp b/tests/functional/data/transpiler/backends/metal/implicit/const_global_const_size_array_ref.cpp new file mode 100644 index 00000000..1fb590c0 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/implicit/const_global_const_size_array_ref.cpp @@ -0,0 +1,23 @@ +#include +#include +using namespace metal; + +// const array +const int arr_const0[12] = {0}; +int const arr_const1[12] = {0}; +// Stupid formatting +const int arr_const2[12] = {0}; +// Deduced size +const float arr_const3[] = {1., 2., 3., 4., 5., 6.}; +// Multidimensional +const float arr_const4[][2] = {{1., 2.}, {3., 4.}, {5., 6.}}; +const float arr_const5[][3][2] = {{{1., 2.}, {3., 4.}, {5., 6.}}, {{1., 2.}, {3., 4.}, {5., 6.}}}; + +// At least one @kern function is required +kernel void _occa_kern_0(uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.x; + { int j = (0) + _occa_thread_position.x; } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/implicit/const_global_pointer.cpp b/tests/functional/data/transpiler/backends/metal/implicit/const_global_pointer.cpp new file mode 100644 index 00000000..5c25d5c4 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/implicit/const_global_pointer.cpp @@ -0,0 +1,21 @@ +// pointer to const +const int* ptr_const0 = 0; +int const* ptr_const1 = 0; + +// const pointer to const +const int* const ptr_const2 = 0; +int const* const ptr_const3 = 0; + +// const pointer to non const +int* const ptr_const4 = 0; + +// Stupid formatting +const int* ptr_const5 = 0; + +// At least one @kern function is required +@kernel void kern() { + @outer for (int i = 0; i < 32; ++i) { + @inner for (int j = 0; j < 32; ++j) { + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/implicit/const_global_pointer_ref.cpp b/tests/functional/data/transpiler/backends/metal/implicit/const_global_pointer_ref.cpp new file mode 100644 index 00000000..ff81c73e --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/implicit/const_global_pointer_ref.cpp @@ -0,0 +1,23 @@ +#include +#include +using namespace metal; + +// pointer to const +const int* ptr_const0 = 0; +int const* ptr_const1 = 0; +// const pointer to const +const int* const ptr_const2 = 0; +int const* const ptr_const3 = 0; +// const pointer to non const +int* const ptr_const4 = 0; +// Stupid formatting +const int* ptr_const5 = 0; + +// At least one @kern function is required +kernel void _occa_kern_0(uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.x; + { int j = (0) + _occa_thread_position.x; } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/implicit/const_global_variable.cpp b/tests/functional/data/transpiler/backends/metal/implicit/const_global_variable.cpp new file mode 100644 index 00000000..8f553644 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/implicit/const_global_variable.cpp @@ -0,0 +1,20 @@ +// int const, const int +const int var_const0 = 0; +int const var_const1 = 0; + +// volatile qualifier +volatile const int var_const2 = 0; +volatile int const var_const3 = 0; + +// Stupid formatting +const int var_const4 = 0; + +int const var_const5 = 0; + +// At least one @kern function is required +@kernel void kern() { + @outer for (int i = 0; i < 32; ++i) { + @inner for (int j = 0; j < 32; ++j) { + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/implicit/const_global_variable_ref.cpp b/tests/functional/data/transpiler/backends/metal/implicit/const_global_variable_ref.cpp new file mode 100644 index 00000000..c2c5cda1 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/implicit/const_global_variable_ref.cpp @@ -0,0 +1,22 @@ +#include +#include +using namespace metal; + +// int const, const int +const int var_const0 = 0; +int const var_const1 = 0; +// volatile qualifier +volatile const int var_const2 = 0; +volatile int const var_const3 = 0; +// Stupid formatting +const int var_const4 = 0; +int const var_const5 = 0; + +// At least one @kern function is required +kernel void _occa_kern_0(uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.x; + { int j = (0) + _occa_thread_position.x; } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/implicit/constexpr.cpp b/tests/functional/data/transpiler/backends/metal/implicit/constexpr.cpp new file mode 100644 index 00000000..52cef2ca --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/implicit/constexpr.cpp @@ -0,0 +1,13 @@ +constexpr float f = 13; + +class HelloClass { + public: + static constexpr int a = 2 + 2; +}; + +@kernel void test() { + for (int i = 0; i < 10; ++i; @outer) { + for (int j = 0; j < 10; ++j; @inner) { + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/implicit/constexpr_ref.cpp b/tests/functional/data/transpiler/backends/metal/implicit/constexpr_ref.cpp new file mode 100644 index 00000000..156fbaeb --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/implicit/constexpr_ref.cpp @@ -0,0 +1,18 @@ +#include +#include +using namespace metal; + +constexpr float f = 13; + +class HelloClass { + public: + static constexpr int a = 2 + 2; +}; + +kernel void _occa_test_0(uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.x; + { int j = (0) + _occa_thread_position.x; } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/implicit/extern_const_global_array.cpp b/tests/functional/data/transpiler/backends/metal/implicit/extern_const_global_array.cpp new file mode 100644 index 00000000..ae0e51ec --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/implicit/extern_const_global_array.cpp @@ -0,0 +1,14 @@ +struct S { + int hello[12]; +}; +extern const int arr_0[]; +extern const float arr_1[]; +extern const S arr_2[]; + +// At least one @kern function is required +@kernel void kern() { + @outer for (int i = 0; i < 32; ++i) { + @inner for (int j = 0; j < 32; ++j) { + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/implicit/extern_const_global_array_ref.cpp b/tests/functional/data/transpiler/backends/metal/implicit/extern_const_global_array_ref.cpp new file mode 100644 index 00000000..3e3a2730 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/implicit/extern_const_global_array_ref.cpp @@ -0,0 +1,20 @@ +#include +#include +using namespace metal; + +struct S { + int hello[12]; +}; + +extern const int arr_0[]; +extern const float arr_1[]; +extern const S arr_2[]; + +// At least one @kern function is required +kernel void _occa_kern_0(uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.x; + { int j = (0) + _occa_thread_position.x; } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/implicit/non_kernel_function.cpp b/tests/functional/data/transpiler/backends/metal/implicit/non_kernel_function.cpp new file mode 100644 index 00000000..e8b5ad50 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/implicit/non_kernel_function.cpp @@ -0,0 +1,15 @@ +static float add1(const float* a, int i, const float* b, int j) { + return a[i] + b[i]; +} + +float add2(const float* a, int i, const float* b, int j) { + return a[i] + b[i]; +} + +// At least one @kern function is required +@kernel void kern() { + @outer for (int i = 0; i < 32; ++i) { + @inner for (int j = 0; j < 32; ++j) { + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/implicit/non_kernel_function_ref.cpp b/tests/functional/data/transpiler/backends/metal/implicit/non_kernel_function_ref.cpp new file mode 100644 index 00000000..922b44e1 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/implicit/non_kernel_function_ref.cpp @@ -0,0 +1,20 @@ +#include +#include +using namespace metal; + +static float add1(const float* a, int i, const float* b, int j) { + return a[i] + b[i]; +} + +float add2(const float* a, int i, const float* b, int j) { + return a[i] + b[i]; +} + +// At least one @kern function is required +kernel void _occa_kern_0(uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.x; + { int j = (0) + _occa_thread_position.x; } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/max_inner_loops/outer_inner_split_max.cpp b/tests/functional/data/transpiler/backends/metal/max_inner_loops/outer_inner_split_max.cpp new file mode 100644 index 00000000..a2ddca2b --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/max_inner_loops/outer_inner_split_max.cpp @@ -0,0 +1,83 @@ + +@kernel void test0(const int entries, const float* a, const float* b, float* ab) { + int before0 = 0; + @outer for (int x = 0; x < 10; ++x; @max_inner_dims(2, 3, 5)) { + // int before1 = 1 + before0; + int before1 = 1; + @outer for (int y = 0; y < 20; ++y) { + int before2 = 1 + before1; + @outer for (int z = 0; z < 30; ++z) { + int before3 = 1 + before2; + @inner for (int n = 0; n < 2; ++n) { + int after0 = 1 + before3; + @inner for (int m = 0; m < 3; ++m) { + int after1 = 1 + after0; + @inner for (int k = 0; k < 5; ++k) { + int after2 = 1 + after1; + ab[x] = + a[x] + b[x] + static_cast(k + m + n + z + y + x + after2); + } + @inner for (int k = 0; k < 5; ++k) { + int after2 = 1 + after1; + ab[x] = + a[x] + b[x] + static_cast(k + m + n + z + y + x + after2); + } + } + @inner for (int m = 0; m < 5; ++m) { + int after1 = 1 + after0; + @inner for (int k = 0; k < 7; ++k) { + int after2 = 1 + after1; + ab[x] = + a[x] + b[x] + static_cast(k + m + n + z + y + x + after2); + } + @inner for (int k = 0; k < 7; ++k) { + int after2 = 1 + after1; + ab[x] = + a[x] + b[x] + static_cast(k + m + n + z + y + x + after2); + } + } + } + } + } + } + int before00 = 1 + before0; + @max_inner_dims(3, 5, 7) for (int x = 0; x < 10; ++x; @outer) { + // int before1 = 1 + before00; + int before1 = 1; + @outer for (int y = 0; y < 20; ++y) { + int before2 = 1 + before1; + @outer for (int z = 0; z < 30; ++z) { + int before3 = 1 + before2; + @inner for (int n = 0; n < 2; ++n) { + int after0 = 1 + before3; + @inner for (int m = 0; m < 3; ++m) { + int after1 = 1 + after0; + @inner for (int k = 0; k < 5; ++k) { + int after2 = 1 + after1; + ab[x] = + a[x] + b[x] + static_cast(k + m + n + z + y + x + after2); + } + @inner for (int k = 0; k < 5; ++k) { + int after2 = 1 + after1; + ab[x] = + a[x] + b[x] + static_cast(k + m + n + z + y + x + after2); + } + } + @inner for (int m = 0; m < 5; ++m) { + int after1 = 1 + after0; + @inner for (int k = 0; k < 7; ++k) { + int after2 = 1 + after1; + ab[x] = + a[x] + b[x] + static_cast(k + m + n + z + y + x + after2); + } + @inner for (int k = 0; k < 7; ++k) { + int after2 = 1 + after1; + ab[x] = + a[x] + b[x] + static_cast(k + m + n + z + y + x + after2); + } + } + } + } + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/max_inner_loops/outer_inner_split_max_ref.cpp b/tests/functional/data/transpiler/backends/metal/max_inner_loops/outer_inner_split_max_ref.cpp new file mode 100644 index 00000000..7c91dec7 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/max_inner_loops/outer_inner_split_max_ref.cpp @@ -0,0 +1,117 @@ +#include +#include +using namespace metal; + +kernel void _occa_test0_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int x = (0) + _occa_group_position.z; + // int before1 = 1 + before0; + int before1 = 1; + { + int y = (0) + _occa_group_position.y; + int before2 = 1 + before1; + { + int z = (0) + _occa_group_position.x; + int before3 = 1 + before2; + { + int n = (0) + _occa_thread_position.z; + int after0 = 1 + before3; + { + int m = (0) + _occa_thread_position.y; + int after1 = 1 + after0; + { + int k = (0) + _occa_thread_position.x; + int after2 = 1 + after1; + ab[x] = + a[x] + b[x] + static_cast(k + m + n + z + y + x + after2); + } + { + int k = (0) + _occa_thread_position.x; + int after2 = 1 + after1; + ab[x] = + a[x] + b[x] + static_cast(k + m + n + z + y + x + after2); + } + } + { + int m = (0) + _occa_thread_position.y; + int after1 = 1 + after0; + { + int k = (0) + _occa_thread_position.x; + int after2 = 1 + after1; + ab[x] = + a[x] + b[x] + static_cast(k + m + n + z + y + x + after2); + } + { + int k = (0) + _occa_thread_position.x; + int after2 = 1 + after1; + ab[x] = + a[x] + b[x] + static_cast(k + m + n + z + y + x + after2); + } + } + } + } + } + } +} + +kernel void _occa_test0_1(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int x = (0) + _occa_group_position.z; + // int before1 = 1 + before00; + int before1 = 1; + { + int y = (0) + _occa_group_position.y; + int before2 = 1 + before1; + { + int z = (0) + _occa_group_position.x; + int before3 = 1 + before2; + { + int n = (0) + _occa_thread_position.z; + int after0 = 1 + before3; + { + int m = (0) + _occa_thread_position.y; + int after1 = 1 + after0; + { + int k = (0) + _occa_thread_position.x; + int after2 = 1 + after1; + ab[x] = + a[x] + b[x] + static_cast(k + m + n + z + y + x + after2); + } + { + int k = (0) + _occa_thread_position.x; + int after2 = 1 + after1; + ab[x] = + a[x] + b[x] + static_cast(k + m + n + z + y + x + after2); + } + } + { + int m = (0) + _occa_thread_position.y; + int after1 = 1 + after0; + { + int k = (0) + _occa_thread_position.x; + int after2 = 1 + after1; + ab[x] = + a[x] + b[x] + static_cast(k + m + n + z + y + x + after2); + } + { + int k = (0) + _occa_thread_position.x; + int after2 = 1 + after1; + ab[x] = + a[x] + b[x] + static_cast(k + m + n + z + y + x + after2); + } + } + } + } + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/nobarrier/nobarrier_builtin.cpp b/tests/functional/data/transpiler/backends/metal/nobarrier/nobarrier_builtin.cpp new file mode 100644 index 00000000..75274256 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/nobarrier/nobarrier_builtin.cpp @@ -0,0 +1,32 @@ +@kernel void hello_kern() { + for (int i = 0; i < 10; ++i; @outer) { + @shared int shm[10]; + for (int j = 0; j < 10; ++j; @inner) { + shm[j] = j; + } + + @nobarrier for (int j = 0; j < 10; ++j; @inner) { + shm[j] = j; + } + + for (int j = 0; j < 10; ++j; @inner) { + shm[j] = j; + } + + for (int j = 0; j < 10; ++j; @inner) { + shm[j] = j; + } + } +} + +@kernel void priority_issue() { + @outer for (int i = 0; i < 32; ++i) { + @shared float shm[32]; + @nobarrier for (int j = 0; j < 32; ++j; @inner) { + shm[i] = i; + } + @inner for (int j = 0; j < 32; ++j) { + @atomic shm[i * j] += 32; + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/nobarrier/nobarrier_builtin_ref.cpp b/tests/functional/data/transpiler/backends/metal/nobarrier/nobarrier_builtin_ref.cpp new file mode 100644 index 00000000..28bf3f83 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/nobarrier/nobarrier_builtin_ref.cpp @@ -0,0 +1,45 @@ +#include +#include +using namespace metal; + +kernel void _occa_hello_kern_0(uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.x; + threadgroup int shm[10]; + { + int j = (0) + _occa_thread_position.x; + shm[j] = j; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + { + int j = (0) + _occa_thread_position.x; + shm[j] = j; + } + { + int j = (0) + _occa_thread_position.x; + shm[j] = j; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + { + int j = (0) + _occa_thread_position.x; + shm[j] = j; + } + } +} + +kernel void _occa_priority_issue_0(uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.x; + threadgroup float shm[32]; + { + int j = (0) + _occa_thread_position.x; + shm[i] = i; + } + { + int j = (0) + _occa_thread_position.x; + shm[i * j] += 32; + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/outer_inner/outer_inner_dec.cpp b/tests/functional/data/transpiler/backends/metal/outer_inner/outer_inner_dec.cpp new file mode 100644 index 00000000..d2254673 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/outer_inner/outer_inner_dec.cpp @@ -0,0 +1,83 @@ +const int offset = 1; + +// template +float add(float a, float b) { + return a + b + offset; +} + +// Outer -> inner +@kernel void addVectors0(const int entries, const float* a, const float* b, float* ab) { + @outer for (int j = entries - 1; j >= 0; j -= 1) { + @inner for (int i = entries - 1; i >= 0; i -= 1) { + ab[i] = add(a[i], b[i]); + } + } +} + +// Outer -> inner non 1 increment +@kernel void addVectors1(const int entries, const float* a, const float* b, float* ab) { + @outer for (int j = entries - 1; j >= 0; j -= 2) { + @inner for (int i = entries - 1; i >= 0; i -= 2) { + ab[i] = add(a[i], b[i]); + } + } +} + +// Outer -> inner unary post add +@kernel void addVectors2(const int entries, const float* a, const float* b, float* ab) { + @outer for (int j = entries - 1; j >= 0; j--) { + @inner for (int i = entries; i > 0; i--) { + ab[i] = add(a[i], b[i]); + } + } +} + +// Outer -> inner unary pre add +@kernel void addVectors3(const int entries, const float* a, const float* b, float* ab) { + @outer for (int j = entries - 1; j >= 0; --j) { + @inner for (int i = entries - 1; i >= 0; --i) { + ab[i] = add(a[i], b[i]); + } + } +} + +// Outer -> outer -> inner -> inner +// TODO: change after sema calculates dimensions +@kernel void addVectors4(const int entries, const float* a, const float* b, float* ab) { + @outer for (int i = entries - 1; i >= 0; --i) { + @outer for (int j = entries - 1; j >= 0; --j) { + @inner for (int k = entries - 1; k >= 0; --k) { + @inner for (int ii = entries - 1; ii >= 0; --ii) { + ab[ii + k] = add(a[i], b[j]); + } + } + } + } +} + +// Outer -> outer -> inner -> inner + manual dimensions specification +@kernel void addVectors5(const int entries, const float* a, const float* b, float* ab) { + @outer(1) for (int i = entries - 1; i >= 0; --i) { + @outer(0) for (int j = entries - 1; j >= 0; --j) { + @inner(1) for (int k = entries - 1; k >= 0; --k) { + @inner(0) for (int ii = entries - 1; ii >= 0; --ii) { + ab[ii + k] = add(a[i], b[j]); + } + } + } + } +} + +// Outer -> outer -> inner -> inner + partially manual dimensions specification +// TODO: change after sema calculates dimensions +@kernel void addVectors6(const int entries, const float* a, const float* b, float* ab) { + @outer for (int i = entries - 1; i >= 0; --i) { + @outer(0) for (int j = entries - 1; j >= 0; --j) { + @inner for (int k = entries - 1; k >= 0; --k) { + @inner(0) for (int ii = entries - 1; ii >= 0; --ii) { + ab[ii + k] = add(a[i], b[j]); + } + } + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/outer_inner/outer_inner_dec_ref.cpp b/tests/functional/data/transpiler/backends/metal/outer_inner/outer_inner_dec_ref.cpp new file mode 100644 index 00000000..eb70f8e2 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/outer_inner/outer_inner_dec_ref.cpp @@ -0,0 +1,142 @@ +#include +#include +using namespace metal; + +const int offset = 1; + +// template +float add(float a, float b) { + return a + b + offset; +} + +// Outer -> inner +kernel void _occa_addVectors0_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int j = (entries - 1) - ((1) * _occa_group_position.x); + { + int i = (entries - 1) - ((1) * _occa_thread_position.x); + ab[i] = add(a[i], b[i]); + } + } +} + +// Outer -> inner non 1 increment +kernel void _occa_addVectors1_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int j = (entries - 1) - ((2) * _occa_group_position.x); + { + int i = (entries - 1) - ((2) * _occa_thread_position.x); + ab[i] = add(a[i], b[i]); + } + } +} + +// Outer -> inner unary post add +kernel void _occa_addVectors2_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int j = (entries - 1) - _occa_group_position.x; + { + int i = (entries)-_occa_thread_position.x; + ab[i] = add(a[i], b[i]); + } + } +} + +// Outer -> inner unary pre add +kernel void _occa_addVectors3_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int j = (entries - 1) - _occa_group_position.x; + { + int i = (entries - 1) - _occa_thread_position.x; + ab[i] = add(a[i], b[i]); + } + } +} + +// Outer -> outer -> inner -> inner +// TODO: change after sema calculates dimensions +kernel void _occa_addVectors4_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int i = (entries - 1) - _occa_group_position.y; + { + int j = (entries - 1) - _occa_group_position.x; + { + int k = (entries - 1) - _occa_thread_position.y; + { + int ii = (entries - 1) - _occa_thread_position.x; + ab[ii + k] = add(a[i], b[j]); + } + } + } + } +} + +// Outer -> outer -> inner -> inner + manual dimensions specification +kernel void _occa_addVectors5_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int i = (entries - 1) - _occa_group_position.y; + { + int j = (entries - 1) - _occa_group_position.x; + { + int k = (entries - 1) - _occa_thread_position.y; + { + int ii = (entries - 1) - _occa_thread_position.x; + ab[ii + k] = add(a[i], b[j]); + } + } + } + } +} + +// Outer -> outer -> inner -> inner + partially manual dimensions specification +// TODO: change after sema calculates dimensions +kernel void _occa_addVectors6_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int i = (entries - 1) - _occa_group_position.y; + { + int j = (entries - 1) - _occa_group_position.x; + { + int k = (entries - 1) - _occa_thread_position.y; + { + int ii = (entries - 1) - _occa_thread_position.x; + ab[ii + k] = add(a[i], b[j]); + } + } + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/outer_inner/outer_inner_inc.cpp b/tests/functional/data/transpiler/backends/metal/outer_inner/outer_inner_inc.cpp new file mode 100644 index 00000000..a11c3d7b --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/outer_inner/outer_inner_inc.cpp @@ -0,0 +1,82 @@ +const int offset = 1; + +// template +float add(float a, float b) { + return a + b + offset; +} + +// Outer -> inner +@kernel void addVectors0(const int entries, const float* a, const float* b, float* ab) { + @outer for (int j = 0; j < entries; j += 1) { + @inner for (int i = 0; i < entries; i += 1) { + ab[i] = add(a[i], b[i]); + } + } +} + +// Outer -> inner non 1 increment +@kernel void addVectors1(const int entries, const float* a, const float* b, float* ab) { + @outer for (int j = 0; j < entries; j += 2) { + @inner for (int i = 0; i < entries; i += 2) { + ab[i] = add(a[i], b[i]); + } + } +} + +// Outer -> inner unary post add +@kernel void addVectors2(const int entries, const float* a, const float* b, float* ab) { + @outer for (int j = 0; j < entries; j++) { + @inner for (int i = 0; i <= entries - 1; i++) { + ab[i] = add(a[i], b[i]); + } + } +} + +// Outer -> inner unary pre add +@kernel void addVectors3(const int entries, const float* a, const float* b, float* ab) { + @outer for (int j = 0; j < entries; ++j) { + @inner for (int i = 0; i < entries; ++i) { + ab[i] = add(a[i], b[i]); + } + } +} + +// Outer -> outer -> inner -> inner +// TODO: change after sema calculates dimensions +@kernel void addVectors4(const int entries, const float* a, const float* b, float* ab) { + @outer for (int i = 0; i < entries; ++i) { + @outer for (int j = 0; j < entries; ++j) { + @inner for (int k = 0; k < entries; ++k) { + @inner for (int ii = 0; ii < entries; ++ii) { + ab[ii + k] = add(a[i], b[j]); + } + } + } + } +} + +// Outer -> outer -> inner -> inner + manual dimensions specification +@kernel void addVectors5(const int entries, const float* a, const float* b, float* ab) { + @outer(1) for (int i = 0; i < entries; ++i) { + @outer(0) for (int j = 0; j < entries; ++j) { + @inner(1) for (int k = 0; k < entries; ++k) { + @inner(0) for (int ii = 0; ii < entries; ++ii) { + ab[ii + k] = add(a[i], b[j]); + } + } + } + } +} + +// Outer -> outer -> inner -> inner + partially manual dimensions specification +@kernel void addVectors6(const int entries, const float* a, const float* b, float* ab) { + @outer for (int i = 0; i < entries; ++i) { + @outer(0) for (int j = 0; j < entries; ++j) { + @inner for (int k = 0; k < entries; ++k) { + @inner(0) for (int ii = 0; ii < entries; ++ii) { + ab[ii + k] = add(a[i], b[j]); + } + } + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/outer_inner/outer_inner_inc_ref.cpp b/tests/functional/data/transpiler/backends/metal/outer_inner/outer_inner_inc_ref.cpp new file mode 100644 index 00000000..5373f422 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/outer_inner/outer_inner_inc_ref.cpp @@ -0,0 +1,141 @@ +#include +#include +using namespace metal; + +const int offset = 1; + +// template +float add(float a, float b) { + return a + b + offset; +} + +// Outer -> inner +kernel void _occa_addVectors0_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int j = (0) + ((1) * _occa_group_position.x); + { + int i = (0) + ((1) * _occa_thread_position.x); + ab[i] = add(a[i], b[i]); + } + } +} + +// Outer -> inner non 1 increment +kernel void _occa_addVectors1_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int j = (0) + ((2) * _occa_group_position.x); + { + int i = (0) + ((2) * _occa_thread_position.x); + ab[i] = add(a[i], b[i]); + } + } +} + +// Outer -> inner unary post add +kernel void _occa_addVectors2_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int j = (0) + _occa_group_position.x; + { + int i = (0) + _occa_thread_position.x; + ab[i] = add(a[i], b[i]); + } + } +} + +// Outer -> inner unary pre add +kernel void _occa_addVectors3_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int j = (0) + _occa_group_position.x; + { + int i = (0) + _occa_thread_position.x; + ab[i] = add(a[i], b[i]); + } + } +} + +// Outer -> outer -> inner -> inner +// TODO: change after sema calculates dimensions +kernel void _occa_addVectors4_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.y; + { + int j = (0) + _occa_group_position.x; + { + int k = (0) + _occa_thread_position.y; + { + int ii = (0) + _occa_thread_position.x; + ab[ii + k] = add(a[i], b[j]); + } + } + } + } +} + +// Outer -> outer -> inner -> inner + manual dimensions specification +kernel void _occa_addVectors5_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.y; + { + int j = (0) + _occa_group_position.x; + { + int k = (0) + _occa_thread_position.y; + { + int ii = (0) + _occa_thread_position.x; + ab[ii + k] = add(a[i], b[j]); + } + } + } + } +} + +// Outer -> outer -> inner -> inner + partially manual dimensions specification +kernel void _occa_addVectors6_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.y; + { + int j = (0) + _occa_group_position.x; + { + int k = (0) + _occa_thread_position.y; + { + int ii = (0) + _occa_thread_position.x; + ab[ii + k] = add(a[i], b[j]); + } + } + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/outer_inner/outer_inner_multiple.cpp b/tests/functional/data/transpiler/backends/metal/outer_inner/outer_inner_multiple.cpp new file mode 100644 index 00000000..c53693e2 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/outer_inner/outer_inner_multiple.cpp @@ -0,0 +1,108 @@ +const int offset = 1; + +// template +float add(float a, float b) { + return a + b + offset; +} + +// with shared memory usage (should be automatic sync) +@kernel void addVectors(const int entries, float* a, float* b, float* ab, float* mat @dim(10, 10)) { + for (int i = 0; i < entries; i += 1; @outer) { + for (int i2 = 0; i2 < entries; i2 += 1; @outer) { + @shared int shm[32]; + @shared int shm2[32]; + for (int j = 0; j < entries; j += 1; @inner) { + shm[j] = 0; // shared memory usage -> should be barrier after @inner loop + mat(0, 0) = 12; + for (int k = 0; k < entries; k += 1; @inner) { + for (int ii = 0; ii < entries; ii += 1; @inner) { + ab[i] = add(a[i], b[k]); + } + ab[i] = add(a[i], b[k]); + } + for (int k = 0; k < entries; k += 1; @inner) { + for (int ii = 0; ii < entries; ii += 1; @inner) { + ab[i] = add(a[i], b[k]); + } + + ab[i] = add(a[i], b[k]); + } + } + + for (int j = 0; j < entries; j += 1; @tile(4, @inner, @inner, check = false)) { + for (int k = 0; k < entries; k += 1; @inner) { + // shared memory usage -> should be barrier, since @tile is inner, inner + shm[j] = 0; + } + } + + for (int j = 0; j < entries; j += 1; @inner) { + shm[j] = 0; + for (int k = 0; k < entries; k += 1; @inner) { + for (int ii = 0; ii < entries; ii += 1; @inner) { + ab[i] = add(a[i], b[k]); + } + + ab[i] = add(a[i], b[k]); + } + + for (int k = 0; k < entries; k += 1; @tile(4, @inner, @inner, check = false)) { + ab[i] = add(a[i], b[k]); + } + } + } + } +} + +// without shared memory usage (should be no automatic sync) +@kernel void addVectors1(const int entries, + float* a, + float* b, + float* ab, + float* mat @dim(10, 10)) { + for (int i = 0; i < entries; i += 1; @outer) { + for (int i2 = 0; i2 < entries; i2 += 1; @outer) { + @shared int shm[32]; + @shared int shm2[32]; + for (int j = 0; j < entries; j += 1; @inner) { + // shm[j] = 0; // shared memory usage -> should be barrier after @inner loop + mat(0, 0) = 12; + for (int k = 0; k < entries; k += 1; @inner) { + for (int ii = 0; ii < entries; ii += 1; @inner) { + ab[i] = add(a[i], b[k]); + } + ab[i] = add(a[i], b[k]); + } + for (int k = 0; k < entries; k += 1; @inner) { + for (int ii = 0; ii < entries; ii += 1; @inner) { + ab[i] = add(a[i], b[k]); + } + + ab[i] = add(a[i], b[k]); + } + } + + for (int j = 0; j < entries; j += 1; @tile(4, @inner, @inner, check = false)) { + for (int k = 0; k < entries; k += 1; @inner) { + // shared memory usage -> should be barrier, since @tile is inner, inner + // shm[j] = 0; + } + } + + for (int j = 0; j < entries; j += 1; @inner) { + shm[j] = 0; + for (int k = 0; k < entries; k += 1; @inner) { + for (int ii = 0; ii < entries; ii += 1; @inner) { + ab[i] = add(a[i], b[k]); + } + + ab[i] = add(a[i], b[k]); + } + + for (int k = 0; k < entries; k += 1; @tile(4, @inner, @inner, check = false)) { + ab[i] = add(a[i], b[k]); + } + } + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/outer_inner/outer_inner_multiple_ref.cpp b/tests/functional/data/transpiler/backends/metal/outer_inner/outer_inner_multiple_ref.cpp new file mode 100644 index 00000000..75e5d2eb --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/outer_inner/outer_inner_multiple_ref.cpp @@ -0,0 +1,156 @@ +#include +#include +using namespace metal; + +const int offset = 1; + +// template +float add(float a, float b) { + return a + b + offset; +} + +// with shared memory usage (should be automatic sync) +kernel void _occa_addVectors_0(constant int& entries [[buffer(0)]], + device float* a [[buffer(1)]], + device float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + device float* mat [[buffer(4)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int i = (0) + ((1) * _occa_group_position.y); + { + int i2 = (0) + ((1) * _occa_group_position.x); + threadgroup int shm[32]; + threadgroup int shm2[32]; + { + int j = (0) + ((1) * _occa_thread_position.z); + shm[j] = 0; // shared memory usage -> should be barrier after @inner loop + mat[0 + (10 * (0))] = 12; + { + int k = (0) + ((1) * _occa_thread_position.y); + { + int ii = (0) + ((1) * _occa_thread_position.x); + ab[i] = add(a[i], b[k]); + } + ab[i] = add(a[i], b[k]); + } + { + int k = (0) + ((1) * _occa_thread_position.y); + { + int ii = (0) + ((1) * _occa_thread_position.x); + ab[i] = add(a[i], b[k]); + } + ab[i] = add(a[i], b[k]); + } + } + threadgroup_barrier(mem_flags::mem_threadgroup); + { + int _occa_tiled_j = (0) + (((4) * 1) * _occa_thread_position.z); + { + int j = _occa_tiled_j + ((1) * _occa_thread_position.y); + { + { + int k = (0) + ((1) * _occa_thread_position.x); + // shared memory usage -> should be barrier, since @tile is inner, + // inner + shm[j] = 0; + } + } + } + } + threadgroup_barrier(mem_flags::mem_threadgroup); + { + int j = (0) + ((1) * _occa_thread_position.z); + shm[j] = 0; + { + int k = (0) + ((1) * _occa_thread_position.y); + { + int ii = (0) + ((1) * _occa_thread_position.x); + ab[i] = add(a[i], b[k]); + } + ab[i] = add(a[i], b[k]); + } + { + int _occa_tiled_k = (0) + (((4) * 1) * _occa_thread_position.y); + { + int k = _occa_tiled_k + ((1) * _occa_thread_position.x); + { ab[i] = add(a[i], b[k]); } + } + } + } + } + } +} + +// without shared memory usage (should be no automatic sync) +kernel void _occa_addVectors1_0(constant int& entries [[buffer(0)]], + device float* a [[buffer(1)]], + device float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + device float* mat [[buffer(4)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int i = (0) + ((1) * _occa_group_position.y); + { + int i2 = (0) + ((1) * _occa_group_position.x); + threadgroup int shm[32]; + threadgroup int shm2[32]; + { + int j = (0) + ((1) * _occa_thread_position.z); + // shm[j] = 0; // shared memory usage -> should be barrier after @inner + // loop + mat[0 + (10 * (0))] = 12; + { + int k = (0) + ((1) * _occa_thread_position.y); + { + int ii = (0) + ((1) * _occa_thread_position.x); + ab[i] = add(a[i], b[k]); + } + ab[i] = add(a[i], b[k]); + } + { + int k = (0) + ((1) * _occa_thread_position.y); + { + int ii = (0) + ((1) * _occa_thread_position.x); + ab[i] = add(a[i], b[k]); + } + ab[i] = add(a[i], b[k]); + } + } + { + int _occa_tiled_j = (0) + (((4) * 1) * _occa_thread_position.z); + { + int j = _occa_tiled_j + ((1) * _occa_thread_position.y); + { + { + int k = (0) + ((1) * _occa_thread_position.x); + // shared memory usage -> should be barrier, since @tile is inner, + // inner shm[j] = 0; + } + } + } + } + { + int j = (0) + ((1) * _occa_thread_position.z); + shm[j] = 0; + { + int k = (0) + ((1) * _occa_thread_position.y); + { + int ii = (0) + ((1) * _occa_thread_position.x); + ab[i] = add(a[i], b[k]); + } + ab[i] = add(a[i], b[k]); + } + { + int _occa_tiled_k = (0) + (((4) * 1) * _occa_thread_position.y); + { + int k = _occa_tiled_k + ((1) * _occa_thread_position.x); + { ab[i] = add(a[i], b[k]); } + } + } + } + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/outer_inner/outer_inner_regular_at_same_level_as_attributed.cpp b/tests/functional/data/transpiler/backends/metal/outer_inner/outer_inner_regular_at_same_level_as_attributed.cpp new file mode 100644 index 00000000..4877b67f --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/outer_inner/outer_inner_regular_at_same_level_as_attributed.cpp @@ -0,0 +1,32 @@ +@kernel void test_kernel() { + @outer for (int i = 0; i < 10; ++i) { + @outer for (int i2 = 0; i2 < 10; ++i2) { + @inner for (int j = 0; j < 10; ++j) { + } + + for (int ii = 0; ii < 10; ++ii) { + @inner for (int j = 0; j < 10; ++j) { + } + for (int j = 0; j < 10; ++j) { + } + } + } + + for (int ii = 0; ii < 10; ++ii) { + @outer for (int i = 0; i < 10; ++i) { + @inner for (int j = 0; j < 10; ++j) { + } + } + } + } + for (int ii = 0; ii < 10; ++ii) { + @outer for (int i = 0; i < 10; ++i) { + for (int i2 = 0; i2 < 10; ++i2) { + @outer for (int i2 = 0; i2 < 10; ++i2) { + @inner for (int j = 0; j < 10; ++j) { + } + } + } + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/outer_inner/outer_inner_regular_at_same_level_as_attributed_ref.cpp b/tests/functional/data/transpiler/backends/metal/outer_inner/outer_inner_regular_at_same_level_as_attributed_ref.cpp new file mode 100644 index 00000000..2daf3db7 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/outer_inner/outer_inner_regular_at_same_level_as_attributed_ref.cpp @@ -0,0 +1,40 @@ +#include +#include +using namespace metal; + +kernel void _occa_test_kernel_0(uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.y; + { + int i2 = (0) + _occa_group_position.x; + { int j = (0) + _occa_thread_position.x; } + for (int ii = 0; ii < 10; ++ii) { + { + int j = (0) + _occa_thread_position.x; + } + for (int j = 0; j < 10; ++j) { + } + } + } + for (int ii = 0; ii < 10; ++ii) { + { + int i = (0) + _occa_group_position.x; + { int j = (0) + _occa_thread_position.x; } + } + } + } +} + +kernel void _occa_test_kernel_1(uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.y; + for (int i2 = 0; i2 < 10; ++i2) { + { + int i2 = (0) + _occa_group_position.x; + { int j = (0) + _occa_thread_position.x; } + } + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/outer_inner/outer_inner_split.cpp b/tests/functional/data/transpiler/backends/metal/outer_inner/outer_inner_split.cpp new file mode 100644 index 00000000..ada5061a --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/outer_inner/outer_inner_split.cpp @@ -0,0 +1,83 @@ + +@kernel void test0(const int entries, const float* a, const float* b, float* ab) { + int before0 = 0; + @outer for (int x = 0; x < 10; ++x) { + // int before1 = 1 + before0; + int before1 = 1; + @outer for (int y = 0; y < 20; ++y) { + int before2 = 1 + before1; + @outer for (int z = 0; z < 30; ++z) { + int before3 = 1 + before2; + @inner for (int n = 0; n < 2; ++n) { + int after0 = 1 + before3; + @inner for (int m = 0; m < 3; ++m) { + int after1 = 1 + after0; + @inner for (int k = 0; k < 5; ++k) { + int after2 = 1 + after1; + ab[x] = + a[x] + b[x] + static_cast(k + m + n + z + y + x + after2); + } + @inner for (int k = 0; k < 5; ++k) { + int after2 = 1 + after1; + ab[x] = + a[x] + b[x] + static_cast(k + m + n + z + y + x + after2); + } + } + @inner for (int m = 0; m < 5; ++m) { + int after1 = 1 + after0; + @inner for (int k = 0; k < 7; ++k) { + int after2 = 1 + after1; + ab[x] = + a[x] + b[x] + static_cast(k + m + n + z + y + x + after2); + } + @inner for (int k = 0; k < 7; ++k) { + int after2 = 1 + after1; + ab[x] = + a[x] + b[x] + static_cast(k + m + n + z + y + x + after2); + } + } + } + } + } + } + int before00 = 1 + before0; + @outer for (int x = 0; x < 10; ++x) { + // int before1 = 1 + before00; + int before1 = 1; + @outer for (int y = 0; y < 20; ++y) { + int before2 = 1 + before1; + @outer for (int z = 0; z < 30; ++z) { + int before3 = 1 + before2; + @inner for (int n = 0; n < 2; ++n) { + int after0 = 1 + before3; + @inner for (int m = 0; m < 3; ++m) { + int after1 = 1 + after0; + @inner for (int k = 0; k < 5; ++k) { + int after2 = 1 + after1; + ab[x] = + a[x] + b[x] + static_cast(k + m + n + z + y + x + after2); + } + @inner for (int k = 0; k < 5; ++k) { + int after2 = 1 + after1; + ab[x] = + a[x] + b[x] + static_cast(k + m + n + z + y + x + after2); + } + } + @inner for (int m = 0; m < 5; ++m) { + int after1 = 1 + after0; + @inner for (int k = 0; k < 7; ++k) { + int after2 = 1 + after1; + ab[x] = + a[x] + b[x] + static_cast(k + m + n + z + y + x + after2); + } + @inner for (int k = 0; k < 7; ++k) { + int after2 = 1 + after1; + ab[x] = + a[x] + b[x] + static_cast(k + m + n + z + y + x + after2); + } + } + } + } + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/outer_inner/outer_inner_split_ref.cpp b/tests/functional/data/transpiler/backends/metal/outer_inner/outer_inner_split_ref.cpp new file mode 100644 index 00000000..7c91dec7 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/outer_inner/outer_inner_split_ref.cpp @@ -0,0 +1,117 @@ +#include +#include +using namespace metal; + +kernel void _occa_test0_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int x = (0) + _occa_group_position.z; + // int before1 = 1 + before0; + int before1 = 1; + { + int y = (0) + _occa_group_position.y; + int before2 = 1 + before1; + { + int z = (0) + _occa_group_position.x; + int before3 = 1 + before2; + { + int n = (0) + _occa_thread_position.z; + int after0 = 1 + before3; + { + int m = (0) + _occa_thread_position.y; + int after1 = 1 + after0; + { + int k = (0) + _occa_thread_position.x; + int after2 = 1 + after1; + ab[x] = + a[x] + b[x] + static_cast(k + m + n + z + y + x + after2); + } + { + int k = (0) + _occa_thread_position.x; + int after2 = 1 + after1; + ab[x] = + a[x] + b[x] + static_cast(k + m + n + z + y + x + after2); + } + } + { + int m = (0) + _occa_thread_position.y; + int after1 = 1 + after0; + { + int k = (0) + _occa_thread_position.x; + int after2 = 1 + after1; + ab[x] = + a[x] + b[x] + static_cast(k + m + n + z + y + x + after2); + } + { + int k = (0) + _occa_thread_position.x; + int after2 = 1 + after1; + ab[x] = + a[x] + b[x] + static_cast(k + m + n + z + y + x + after2); + } + } + } + } + } + } +} + +kernel void _occa_test0_1(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int x = (0) + _occa_group_position.z; + // int before1 = 1 + before00; + int before1 = 1; + { + int y = (0) + _occa_group_position.y; + int before2 = 1 + before1; + { + int z = (0) + _occa_group_position.x; + int before3 = 1 + before2; + { + int n = (0) + _occa_thread_position.z; + int after0 = 1 + before3; + { + int m = (0) + _occa_thread_position.y; + int after1 = 1 + after0; + { + int k = (0) + _occa_thread_position.x; + int after2 = 1 + after1; + ab[x] = + a[x] + b[x] + static_cast(k + m + n + z + y + x + after2); + } + { + int k = (0) + _occa_thread_position.x; + int after2 = 1 + after1; + ab[x] = + a[x] + b[x] + static_cast(k + m + n + z + y + x + after2); + } + } + { + int m = (0) + _occa_thread_position.y; + int after1 = 1 + after0; + { + int k = (0) + _occa_thread_position.x; + int after2 = 1 + after1; + ab[x] = + a[x] + b[x] + static_cast(k + m + n + z + y + x + after2); + } + { + int k = (0) + _occa_thread_position.x; + int after2 = 1 + after1; + ab[x] = + a[x] + b[x] + static_cast(k + m + n + z + y + x + after2); + } + } + } + } + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/restrict/restrict_builtin_types.cpp b/tests/functional/data/transpiler/backends/metal/restrict/restrict_builtin_types.cpp new file mode 100644 index 00000000..bf97a411 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/restrict/restrict_builtin_types.cpp @@ -0,0 +1,9 @@ +@kernel void function1(const int* i32Data @ restrict, + float* fp32Data @ restrict, + const double* fp64Data @ restrict) { + @outer for (int i = 0; i < 1; ++i) { + @inner for (int j = 0; j < 1; ++j) { + @ restrict float* b = &fp32Data[0]; + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/restrict/restrict_builtin_types_ref.cpp b/tests/functional/data/transpiler/backends/metal/restrict/restrict_builtin_types_ref.cpp new file mode 100644 index 00000000..366ee70d --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/restrict/restrict_builtin_types_ref.cpp @@ -0,0 +1,17 @@ +#include +#include +using namespace metal; + +kernel void _occa_function1_0(device const int* __restrict__ i32Data [[buffer(0)]], + device float* __restrict__ fp32Data [[buffer(1)]], + device const double* __restrict__ fp64Data [[buffer(2)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.x; + { + int j = (0) + _occa_thread_position.x; + float* b = &fp32Data[0]; + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/restrict/restrict_complex_types.cpp b/tests/functional/data/transpiler/backends/metal/restrict/restrict_complex_types.cpp new file mode 100644 index 00000000..834b375b --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/restrict/restrict_complex_types.cpp @@ -0,0 +1,27 @@ + +template +struct Complex { + T real; + T imaginary; +}; + +struct Configs { + unsigned int size1; + unsigned long size2; +}; + +struct Data { + @ restrict float* x; + @ restrict float* y; + unsigned long size; +}; + +@kernel void function1(const Complex* vectorData @ restrict, + unsigned int vectorSize, + const Complex** matricesData @ restrict, + const Configs* matricesSizes @ restrict) { + @outer for (int i = 0; i < 1; ++i) { + @inner for (int j = 0; j < 1; ++j) { + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/restrict/restrict_complex_types_ref.cpp b/tests/functional/data/transpiler/backends/metal/restrict/restrict_complex_types_ref.cpp new file mode 100644 index 00000000..48619d1f --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/restrict/restrict_complex_types_ref.cpp @@ -0,0 +1,32 @@ +#include +#include +using namespace metal; + +template +struct Complex { + T real; + T imaginary; +}; + +struct Configs { + unsigned int size1; + unsigned long size2; +}; + +struct Data { + float* __restrict__ x; + float* __restrict__ y; + unsigned long size; +}; + +kernel void _occa_function1_0(device const Complex* __restrict__ vectorData [[buffer(0)]], + constant unsigned int& vectorSize [[buffer(1)]], + device const Complex** __restrict__ matricesData [[buffer(2)]], + device const Configs* __restrict__ matricesSizes [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.x; + { int j = (0) + _occa_thread_position.x; } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/restrict/restrict_namespaced_types.cpp b/tests/functional/data/transpiler/backends/metal/restrict/restrict_namespaced_types.cpp new file mode 100644 index 00000000..9dfbece5 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/restrict/restrict_namespaced_types.cpp @@ -0,0 +1,38 @@ + +namespace A { +template +struct Complex { + T real; + T imaginary; +}; + +namespace B { +struct Configs { + unsigned int size1; + unsigned long size2; +}; +namespace C { +typedef int SIZE_TYPE; +typedef SIZE_TYPE SIZES; +} // namespace C +} // namespace B +} // namespace A + +@kernel void function1(const A::Complex* vectorData @ restrict, + unsigned int vectorSize, + const A::Complex** matricesData @ restrict, + const A::B::Configs* matricesSizes @ restrict) { + @outer for (int i = 0; i < 1; ++i) { + @inner for (int j = 0; j < 1; ++j) { + } + } +} + +@kernel void function2(const A::Complex* vectorData @ restrict, + const A::B::Configs* configs @ restrict, + A::B::C::SIZES* vectorSize @ restrict) { + @outer for (int i = 0; i < 1; ++i) { + @inner for (int j = 0; j < 1; ++j) { + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/restrict/restrict_namespaced_types_ref.cpp b/tests/functional/data/transpiler/backends/metal/restrict/restrict_namespaced_types_ref.cpp new file mode 100644 index 00000000..88dc1d39 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/restrict/restrict_namespaced_types_ref.cpp @@ -0,0 +1,47 @@ +#include +#include +using namespace metal; + +namespace A { +template +struct Complex { + T real; + T imaginary; +}; + +namespace B { +struct Configs { + unsigned int size1; + unsigned long size2; +}; + +namespace C { +typedef int SIZE_TYPE; +typedef SIZE_TYPE SIZES; +} // namespace C +} // namespace B +} // namespace A + +kernel void _occa_function1_0(device const A::Complex* __restrict__ vectorData [[buffer(0)]], + constant unsigned int& vectorSize [[buffer(1)]], + device const A::Complex** __restrict__ matricesData + [[buffer(2)]], + device const A::B::Configs* __restrict__ matricesSizes [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.x; + { int j = (0) + _occa_thread_position.x; } + } +} + +kernel void _occa_function2_0(device const A::Complex* __restrict__ vectorData [[buffer(0)]], + device const A::B::Configs* __restrict__ configs [[buffer(1)]], + device A::B::C::SIZES* __restrict__ vectorSize [[buffer(2)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.x; + { int j = (0) + _occa_thread_position.x; } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/restrict/restrict_return_type.cpp b/tests/functional/data/transpiler/backends/metal/restrict/restrict_return_type.cpp new file mode 100644 index 00000000..9b0f913c --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/restrict/restrict_return_type.cpp @@ -0,0 +1,14 @@ +@ restrict float* myfn(float* a) { + return a + 1; +} + +float* myfn2(float* a) { + return a + 1; +} + +@kernel void hello() { + for (int i = 0; i < 10; i++; @outer) { + for (int j = 0; j < 10; j++; @inner) { + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/restrict/restrict_return_type_ref.cpp b/tests/functional/data/transpiler/backends/metal/restrict/restrict_return_type_ref.cpp new file mode 100644 index 00000000..681884c6 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/restrict/restrict_return_type_ref.cpp @@ -0,0 +1,19 @@ +#include +#include +using namespace metal; + +float* __restrict__ myfn(float* a) { + return a + 1; +} + +float* myfn2(float* a) { + return a + 1; +} + +kernel void _occa_hello_0(uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.x; + { int j = (0) + _occa_thread_position.x; } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/shared/shared_between_tiles.cpp b/tests/functional/data/transpiler/backends/metal/shared/shared_between_tiles.cpp new file mode 100644 index 00000000..e0b67760 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/shared/shared_between_tiles.cpp @@ -0,0 +1,8 @@ +@kernel void test_kern() { + @tile(4, @outer) for (int i = 0; i < 10; ++i) { + @shared int shm[10]; + @tile(4, @inner, @inner) for (int j = 0; j < 10; ++j) { + shm[j] = j; + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/shared/shared_between_tiles_ref.cpp b/tests/functional/data/transpiler/backends/metal/shared/shared_between_tiles_ref.cpp new file mode 100644 index 00000000..391411b4 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/shared/shared_between_tiles_ref.cpp @@ -0,0 +1,25 @@ +#include +#include +using namespace metal; + +kernel void _occa_test_kern_0(uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int _occa_tiled_i = (0) + ((4) * _occa_group_position.x); + for (int i = _occa_tiled_i; i < (_occa_tiled_i + (4)); ++i) { + if (i < 10) { + threadgroup int shm[10]; + { + int _occa_tiled_j = (0) + ((4) * _occa_thread_position.y); + { + int j = _occa_tiled_j + _occa_thread_position.x; + if (j < 10) { + shm[j] = j; + } + } + } + threadgroup_barrier(mem_flags::mem_threadgroup); + } + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/shared/shared_builtin_types.cpp b/tests/functional/data/transpiler/backends/metal/shared/shared_builtin_types.cpp new file mode 100644 index 00000000..af3d5c90 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/shared/shared_builtin_types.cpp @@ -0,0 +1,39 @@ +@kernel void function1(const int* data) { + @outer for (int i = 0; i < 64; ++i) { + @shared int arr1[32]; + @shared float arr2[8][32]; + @shared double arr3[4 + 4]; + @inner for (int j = 0; j < 64; ++j) { + } + } +} + +// syncronization between @inner loops: +@kernel void function2() { + for (int i = 0; i < 10; i++; @outer) { + @shared int shm[10]; + + for (int j = 0; j < 10; j++; @inner) { + shm[i] = j; + } + // sync should be here + for (int j = 0; j < 10; j++; @inner) { + shm[i] = j; + } + // sync should not be here + } +} + +// Even if loop is last, if it is inside regular loop, syncronization is inserted +@kernel void function3() { + for (int i = 0; i < 10; i++; @outer) { + @shared int shm[10]; + + for (int q = 0; q < 5; ++q) { + for (int j = 0; j < 10; j++; @inner) { + shm[i] = j; + } + // sync should be here + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/shared/shared_builtin_types_ref.cpp b/tests/functional/data/transpiler/backends/metal/shared/shared_builtin_types_ref.cpp new file mode 100644 index 00000000..5d169bd1 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/shared/shared_builtin_types_ref.cpp @@ -0,0 +1,53 @@ +#include +#include +using namespace metal; + +kernel void _occa_function1_0(device const int* data [[buffer(0)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.x; + threadgroup int arr1[32]; + threadgroup float arr2[8][32]; + threadgroup double arr3[4 + 4]; + { int j = (0) + _occa_thread_position.x; } + } +} + +// syncronization between @inner loops: +kernel void _occa_function2_0(uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.x; + threadgroup int shm[10]; + { + int j = (0) + _occa_thread_position.x; + shm[i] = j; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + // sync should be here + { + int j = (0) + _occa_thread_position.x; + shm[i] = j; + } + // sync should not be here + } +} + +// Even if loop is last, if it is inside regular loop, syncronization is +// inserted +kernel void _occa_function3_0(uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.x; + threadgroup int shm[10]; + for (int q = 0; q < 5; ++q) { + { + int j = (0) + _occa_thread_position.x; + shm[i] = j; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + // sync should be here + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/shared/shared_in_typedecl.cpp b/tests/functional/data/transpiler/backends/metal/shared/shared_in_typedecl.cpp new file mode 100644 index 00000000..c08aced3 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/shared/shared_in_typedecl.cpp @@ -0,0 +1,10 @@ +typedef float sh_float32_t @shared; + +@kernel void test_kernel() { + @outer for (int i = 0; i < 32; ++i) { + sh_float32_t b[32]; + @inner for (int j = 0; j < 32; ++j) { + b[j] = i + j; + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/shared/shared_in_typedecl_ref.cpp b/tests/functional/data/transpiler/backends/metal/shared/shared_in_typedecl_ref.cpp new file mode 100644 index 00000000..d4e48cb1 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/shared/shared_in_typedecl_ref.cpp @@ -0,0 +1,17 @@ +#include +#include +using namespace metal; + +typedef float sh_float32_t; + +kernel void _occa_test_kernel_0(uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.x; + threadgroup sh_float32_t b[32]; + { + int j = (0) + _occa_thread_position.x; + b[j] = i + j; + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/shared/shared_struct_types.cpp b/tests/functional/data/transpiler/backends/metal/shared/shared_struct_types.cpp new file mode 100644 index 00000000..affb4fed --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/shared/shared_struct_types.cpp @@ -0,0 +1,14 @@ + +struct ComplexValueFloat { + float real; + float imaginary; +}; + +@kernel void function1(const int* data) { + @outer for (int i = 0; i < 64; ++i) { + @shared ComplexValueFloat arr2[8][32]; + @shared ComplexValueFloat arr1[32]; + @inner for (int j = 0; j < 64; ++j) { + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/shared/shared_struct_types_ref.cpp b/tests/functional/data/transpiler/backends/metal/shared/shared_struct_types_ref.cpp new file mode 100644 index 00000000..8afcfd2b --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/shared/shared_struct_types_ref.cpp @@ -0,0 +1,19 @@ +#include +#include +using namespace metal; + +struct ComplexValueFloat { + float real; + float imaginary; +}; + +kernel void _occa_function1_0(device const int* data [[buffer(0)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.x; + threadgroup ComplexValueFloat arr2[8][32]; + threadgroup ComplexValueFloat arr1[32]; + { int j = (0) + _occa_thread_position.x; } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/shared/shared_template_type.cpp b/tests/functional/data/transpiler/backends/metal/shared/shared_template_type.cpp new file mode 100644 index 00000000..61cc672c --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/shared/shared_template_type.cpp @@ -0,0 +1,15 @@ + +template +struct ComplexType { + T real; + T imaginary; +}; + +@kernel void function1(const int* data) { + @outer for (int i = 0; i < 64; ++i) { + @shared ComplexType arr1[32]; + @shared ComplexType arr2[8][32]; + @inner for (int j = 0; j < 64; ++j) { + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/shared/shared_template_type_ref.cpp b/tests/functional/data/transpiler/backends/metal/shared/shared_template_type_ref.cpp new file mode 100644 index 00000000..4893601d --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/shared/shared_template_type_ref.cpp @@ -0,0 +1,20 @@ +#include +#include +using namespace metal; + +template +struct ComplexType { + T real; + T imaginary; +}; + +kernel void _occa_function1_0(device const int* data [[buffer(0)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int i = (0) + _occa_group_position.x; + threadgroup ComplexType arr1[32]; + threadgroup ComplexType arr2[8][32]; + { int j = (0) + _occa_thread_position.x; } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/tile/outer_inner_dec.cpp b/tests/functional/data/transpiler/backends/metal/tile/outer_inner_dec.cpp new file mode 100644 index 00000000..4271f2d6 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/tile/outer_inner_dec.cpp @@ -0,0 +1,78 @@ +const int offset = 1; + +// template +float add(float a, float b) { + return a + b + offset; +} + +// Outer -> inner +@kernel void addVectors0(const int entries, const float* a, const float* b, float* ab) { + for (int i = entries - 1; i >= 0; i -= 1; @tile(4, @outer, @inner)) { + ab[i] = add(a[i], b[i]); + } +} + +// Outer -> inner non 1 increment +@kernel void addVectors1(const int entries, const float* a, const float* b, float* ab) { + for (int i = entries - 1; i >= 0; i -= 2; @tile(4, @outer, @inner)) { + ab[i] = add(a[i], b[i]); + } +} + +// Outer -> inner unary post add +@kernel void addVectors2(const int entries, const float* a, const float* b, float* ab) { + for (int i = entries - 1; i >= 0; i--; @tile(4, @outer, @inner)) { + ab[i] = add(a[i], b[i]); + } +} + +// Outer -> inner unary pre add +@kernel void addVectors3(const int entries, const float* a, const float* b, float* ab) { + for (int i = entries - 1; i >= 0; --i; @tile(4, @outer, @inner)) { + ab[i] = add(a[i], b[i]); + } +} + +// Outer -> inner, check=True +@kernel void addVectors4(const int entries, const float* a, const float* b, float* ab) { + for (int i = entries - 1; i >= 0; i -= 1; @tile(4, @outer, @inner, check = true)) { + ab[i] = add(a[i], b[i]); + } +} + +// Outer -> inner, complex range +@kernel void addVectors5(const int entries, const float* a, const float* b, float* ab) { + for (int i = (entries + 16); i >= (entries - 12 + 4); i -= (entries / 16 + 1); + @tile(4, @outer, @inner)) { + ab[i] = add(a[i], b[i]); + } +} + +// Outer -> inner, set dimension +@kernel void addVectors6(const int entries, const float* a, const float* b, float* ab) { + for (int i = entries - 1; i >= 0; i -= 1; @tile(4, @outer(1), @inner(2))) { + ab[i] = add(a[i], b[i]); + } +} + +// Outer -> inner ==> inner -> inner (nested) +@kernel void addVectors7(const int entries, const float* a, const float* b, float* ab) { + for (int i = entries - 1; i >= 0; i -= 1; @tile(4, @outer(0), @inner(0))) { + for (int j = entries - 1; j >= 0; j -= 1; @tile(4, @inner(1), @inner(2))) { + ab[i] = add(a[i], b[j]); + } + } +} + +// Outer -> inner ==> inner -> inner (nested) + complex range + check true +@kernel void addVectors8(const int entries, const float* a, const float* b, float* ab) { + for (int i = (entries + 16); i >= (entries - 12 + static_cast(*a)); + i -= (entries / 16 + 1); + @tile(4, @outer(0), @inner(0), check = true)) { + for (unsigned long long j = (entries + 16); j >= (entries - 12 + static_cast(*a)); + j -= (entries / 16 + 1); + @tile(4, @inner(1), @inner(2), check = true)) { + ab[i] = add(a[i], b[j]); + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/tile/outer_inner_dec_ref.cpp b/tests/functional/data/transpiler/backends/metal/tile/outer_inner_dec_ref.cpp new file mode 100644 index 00000000..10699a9a --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/tile/outer_inner_dec_ref.cpp @@ -0,0 +1,192 @@ +#include +#include +using namespace metal; + +const int offset = 1; + +// template +float add(float a, float b) { + return a + b + offset; +} + +// Outer -> inner +kernel void _occa_addVectors0_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int _occa_tiled_i = (entries - 1) - (((4) * 1) * _occa_group_position.x); + { + int i = _occa_tiled_i - ((1) * _occa_thread_position.x); + if (i >= 0) { + ab[i] = add(a[i], b[i]); + } + } + } +} + +// Outer -> inner non 1 increment +kernel void _occa_addVectors1_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int _occa_tiled_i = (entries - 1) - (((4) * 2) * _occa_group_position.x); + { + int i = _occa_tiled_i - ((2) * _occa_thread_position.x); + if (i >= 0) { + ab[i] = add(a[i], b[i]); + } + } + } +} + +// Outer -> inner unary post add +kernel void _occa_addVectors2_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int _occa_tiled_i = (entries - 1) - ((4) * _occa_group_position.x); + { + int i = _occa_tiled_i - _occa_thread_position.x; + if (i >= 0) { + ab[i] = add(a[i], b[i]); + } + } + } +} + +// Outer -> inner unary pre add +kernel void _occa_addVectors3_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int _occa_tiled_i = (entries - 1) - ((4) * _occa_group_position.x); + { + int i = _occa_tiled_i - _occa_thread_position.x; + if (i >= 0) { + ab[i] = add(a[i], b[i]); + } + } + } +} + +// Outer -> inner, check=True +kernel void _occa_addVectors4_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int _occa_tiled_i = (entries - 1) - (((4) * 1) * _occa_group_position.x); + { + int i = _occa_tiled_i - ((1) * _occa_thread_position.x); + if (i >= 0) { + ab[i] = add(a[i], b[i]); + } + } + } +} + +// Outer -> inner, complex range +kernel void _occa_addVectors5_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int _occa_tiled_i = + ((entries + 16)) - (((4) * (entries / 16 + 1)) * _occa_group_position.x); + { + int i = _occa_tiled_i - (((entries / 16 + 1)) * _occa_thread_position.x); + if (i >= (entries - 12 + 4)) { + ab[i] = add(a[i], b[i]); + } + } + } +} + +// Outer -> inner, set dimension +kernel void _occa_addVectors6_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int _occa_tiled_i = (entries - 1) - (((4) * 1) * _occa_group_position.y); + { + int i = _occa_tiled_i - ((1) * _occa_thread_position.z); + if (i >= 0) { + ab[i] = add(a[i], b[i]); + } + } + } +} + +// Outer -> inner ==> inner -> inner (nested) +kernel void _occa_addVectors7_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int _occa_tiled_i = (entries - 1) - (((4) * 1) * _occa_group_position.x); + { + int i = _occa_tiled_i - ((1) * _occa_thread_position.x); + if (i >= 0) { + { + int _occa_tiled_j = (entries - 1) - (((4) * 1) * _occa_thread_position.y); + { + int j = _occa_tiled_j - ((1) * _occa_thread_position.z); + if (j >= 0) { + ab[i] = add(a[i], b[j]); + } + } + } + } + } + } +} + +// Outer -> inner ==> inner -> inner (nested) + complex range + check true +kernel void _occa_addVectors8_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int _occa_tiled_i = + ((entries + 16)) - (((4) * (entries / 16 + 1)) * _occa_group_position.x); + { + int i = _occa_tiled_i - (((entries / 16 + 1)) * _occa_thread_position.x); + if (i >= (entries - 12 + static_cast(*a))) { + { + unsigned long long _occa_tiled_j = + ((entries + 16)) - (((4) * (entries / 16 + 1)) * _occa_thread_position.y); + { + unsigned long long j = + _occa_tiled_j - (((entries / 16 + 1)) * _occa_thread_position.z); + if (j >= (entries - 12 + static_cast(*a))) { + ab[i] = add(a[i], b[j]); + } + } + } + } + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/tile/outer_inner_inc.cpp b/tests/functional/data/transpiler/backends/metal/tile/outer_inner_inc.cpp new file mode 100644 index 00000000..f9238766 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/tile/outer_inner_inc.cpp @@ -0,0 +1,84 @@ +const int offset = 1; + +// template +float add(float a, float b) { + return a + b + offset; +} + +// Outer -> inner +@kernel void addVectors0(const int entries, const float* a, const float* b, float* ab) { + for (int i = 0; i < entries; i += 1; @tile(4, @outer, @inner)) { + ab[i] = add(a[i], b[i]); + } +} + +// Outer -> inner non 1 increment +@kernel void addVectors1(const int entries, const float* a, const float* b, float* ab) { + for (int i = 0; i < entries; i += 2; @tile(4, @outer, @inner)) { + ab[i] = add(a[i], b[i]); + } +} + +// Outer -> inner unary post add +@kernel void addVectors2(const int entries, const float* a, const float* b, float* ab) { + for (int i = 0; i < entries; i++; @tile(4, @outer, @inner)) { + ab[i] = add(a[i], b[i]); + } +} + +// Outer -> inner unary pre add +@kernel void addVectors3(const int entries, const float* a, const float* b, float* ab) { + for (int i = 0; i < entries; ++i; @tile(4, @outer, @inner)) { + ab[i] = add(a[i], b[i]); + } +} + +// Outer -> inner, check=True +@kernel void addVectors4(const int entries, const float* a, const float* b, float* ab) { + for (int i = 0; i < entries; i += 1; @tile(4, @outer, @inner, check = true)) { + ab[i] = add(a[i], b[i]); + } +} + +// Outer -> inner, complex range +@kernel void addVectors5(const int entries, const float* a, const float* b, float* ab) { + for (int i = (entries - 12 + 4); i < (entries + 16); i += (entries / 16 + 1); + @tile(4, @outer, @inner)) { + ab[i] = add(a[i], b[i]); + } +} + +// Outer -> inner, set dimension +@kernel void addVectors6(const int entries, const float* a, const float* b, float* ab) { + for (int i = 0; i < entries; i += 1; @tile(4, @outer(1), @inner(2))) { + ab[i] = add(a[i], b[i]); + } +} + +// Outer -> inner ==> inner -> inner (nested) +@kernel void addVectors7(const int entries, const float* a, const float* b, float* ab) { + for (int i = 0; i < entries; i += 1; @tile(4, @outer(0), @inner(0))) { + for (int j = 0; j < entries; j += 1; @tile(4, @inner(1), @inner(2))) { + ab[i] = add(a[i], b[j]); + } + } +} + +// Outer -> inner ==> inner -> inner (nested) + complex range + check true +@kernel void addVectors8(const int entries, const float* a, const float* b, float* ab) { + for (int i = (entries - 12 + static_cast(*a)); i < (entries + 16); i += (entries / 16 + 1); + @tile(4, @outer(0), @inner(0), check = true)) { + for (unsigned long long j = (entries - 12 + static_cast(*a)); j < (entries + 16); + j += (entries / 16 + 1); + @tile(4, @inner(1), @inner(2), check = true)) { + ab[i] = add(a[i], b[j]); + } + } +} + +// Outer -> inner, <= +@kernel void addVectors9(const int entries, const float* a, const float* b, float* ab) { + for (int i = 0; i <= entries; i += 1; @tile(4, @outer, @inner)) { + ab[i] = add(a[i], b[i]); + } +} diff --git a/tests/functional/data/transpiler/backends/metal/tile/outer_inner_inc_ref.cpp b/tests/functional/data/transpiler/backends/metal/tile/outer_inner_inc_ref.cpp new file mode 100644 index 00000000..82edb9ec --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/tile/outer_inner_inc_ref.cpp @@ -0,0 +1,211 @@ +#include +#include +using namespace metal; + +const int offset = 1; + +// template +float add(float a, float b) { + return a + b + offset; +} + +// Outer -> inner +kernel void _occa_addVectors0_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int _occa_tiled_i = (0) + (((4) * 1) * _occa_group_position.x); + { + int i = _occa_tiled_i + ((1) * _occa_thread_position.x); + if (i < entries) { + ab[i] = add(a[i], b[i]); + } + } + } +} + +// Outer -> inner non 1 increment +kernel void _occa_addVectors1_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int _occa_tiled_i = (0) + (((4) * 2) * _occa_group_position.x); + { + int i = _occa_tiled_i + ((2) * _occa_thread_position.x); + if (i < entries) { + ab[i] = add(a[i], b[i]); + } + } + } +} + +// Outer -> inner unary post add +kernel void _occa_addVectors2_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int _occa_tiled_i = (0) + ((4) * _occa_group_position.x); + { + int i = _occa_tiled_i + _occa_thread_position.x; + if (i < entries) { + ab[i] = add(a[i], b[i]); + } + } + } +} + +// Outer -> inner unary pre add +kernel void _occa_addVectors3_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int _occa_tiled_i = (0) + ((4) * _occa_group_position.x); + { + int i = _occa_tiled_i + _occa_thread_position.x; + if (i < entries) { + ab[i] = add(a[i], b[i]); + } + } + } +} + +// Outer -> inner, check=True +kernel void _occa_addVectors4_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int _occa_tiled_i = (0) + (((4) * 1) * _occa_group_position.x); + { + int i = _occa_tiled_i + ((1) * _occa_thread_position.x); + if (i < entries) { + ab[i] = add(a[i], b[i]); + } + } + } +} + +// Outer -> inner, complex range +kernel void _occa_addVectors5_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int _occa_tiled_i = + ((entries - 12 + 4)) + (((4) * (entries / 16 + 1)) * _occa_group_position.x); + { + int i = _occa_tiled_i + (((entries / 16 + 1)) * _occa_thread_position.x); + if (i < (entries + 16)) { + ab[i] = add(a[i], b[i]); + } + } + } +} + +// Outer -> inner, set dimension +kernel void _occa_addVectors6_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int _occa_tiled_i = (0) + (((4) * 1) * _occa_group_position.y); + { + int i = _occa_tiled_i + ((1) * _occa_thread_position.z); + if (i < entries) { + ab[i] = add(a[i], b[i]); + } + } + } +} + +// Outer -> inner ==> inner -> inner (nested) +kernel void _occa_addVectors7_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int _occa_tiled_i = (0) + (((4) * 1) * _occa_group_position.x); + { + int i = _occa_tiled_i + ((1) * _occa_thread_position.x); + if (i < entries) { + { + int _occa_tiled_j = (0) + (((4) * 1) * _occa_thread_position.y); + { + int j = _occa_tiled_j + ((1) * _occa_thread_position.z); + if (j < entries) { + ab[i] = add(a[i], b[j]); + } + } + } + } + } + } +} + +// Outer -> inner ==> inner -> inner (nested) + complex range + check true +kernel void _occa_addVectors8_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int _occa_tiled_i = ((entries - 12 + static_cast(*a))) + + (((4) * (entries / 16 + 1)) * _occa_group_position.x); + { + int i = _occa_tiled_i + (((entries / 16 + 1)) * _occa_thread_position.x); + if (i < (entries + 16)) { + { + unsigned long long _occa_tiled_j = + ((entries - 12 + static_cast(*a))) + + (((4) * (entries / 16 + 1)) * _occa_thread_position.y); + { + unsigned long long j = + _occa_tiled_j + (((entries / 16 + 1)) * _occa_thread_position.z); + if (j < (entries + 16)) { + ab[i] = add(a[i], b[j]); + } + } + } + } + } + } +} + +// Outer -> inner, <= +kernel void _occa_addVectors9_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int _occa_tiled_i = (0) + (((4) * 1) * _occa_group_position.x); + { + int i = _occa_tiled_i + ((1) * _occa_thread_position.x); + if (i <= entries) { + ab[i] = add(a[i], b[i]); + } + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/tile/outer_inner_regular_dec.cpp b/tests/functional/data/transpiler/backends/metal/tile/outer_inner_regular_dec.cpp new file mode 100644 index 00000000..f1a9e8d2 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/tile/outer_inner_regular_dec.cpp @@ -0,0 +1,59 @@ +const int offset = 1; + +float add(float a, float b) { + return a + b + offset; +} + +// Outer -> inner ==> regular -> regular +@kernel void addVectors0(const int entries, const float* a, const float* b, float* ab) { + for (int i = entries - 1; i >= 0; i -= 1; @tile(4, @outer(0), @inner(0))) { + for (int j = entries; j > 0; --j; @tile(4)) { + ab[i] = add(a[i], b[j - 1]); + } + } +} + +// Outer -> inner ==> inner -> regular +@kernel void addVectors2(const int entries, const float* a, const float* b, float* ab) { + for (int i = entries - 1; i >= 0; i -= 1; @tile(4, @outer(0), @inner(0))) { + for (int j = entries; j > 0; --j; @tile(4, @inner(1))) { + ab[i] = add(a[i], b[j - 1]); + } + } +} + +// Outer -> inner ==> inner -> inner +@kernel void addVectors3(const int entries, const float* a, const float* b, float* ab) { + for (int i = entries - 1; i >= 0; i -= 1; @tile(4, @outer(0), @inner(0))) { + for (int j = entries; j > 0; --j; @tile(4, @inner(1), @inner(1))) { + ab[i] = add(a[i], b[j - 1]); + } + } +} + +// Outer -> outer ==> inner -> regular +@kernel void addVectors4(const int entries, const float* a, const float* b, float* ab) { + for (int i = entries - 1; i >= 0; i -= 1; @tile(4, @outer(0), @outer(1))) { + for (int j = entries; j > 0; --j; @tile(4, @inner(1))) { + ab[i] = add(a[i], b[j - 1]); + } + } +} + +// Outer -> outer ==> inner -> inner +@kernel void addVectors5(const int entries, const float* a, const float* b, float* ab) { + for (int i = entries - 1; i >= 0; i -= 1; @tile(4, @outer(0), @outer(1))) { + for (int j = entries; j > 0; --j; @tile(4, @inner(1), @inner(2))) { + ab[i] = add(a[i], b[j - 1]); + } + } +} + +// Outer -> outer ==> outer -> inner +@kernel void addVectors6(const int entries, const float* a, const float* b, float* ab) { + for (int i = entries - 1; i >= 0; i -= 1; @tile(4, @outer(0), @outer(1))) { + for (int j = entries; j > 0; --j; @tile(4, @outer(2), @inner(0))) { + ab[i] = add(a[i], b[j - 1]); + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/tile/outer_inner_regular_dec_ref.cpp b/tests/functional/data/transpiler/backends/metal/tile/outer_inner_regular_dec_ref.cpp new file mode 100644 index 00000000..499acea5 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/tile/outer_inner_regular_dec_ref.cpp @@ -0,0 +1,161 @@ +#include +#include +using namespace metal; + +const int offset = 1; + +float add(float a, float b) { + return a + b + offset; +} + +// Outer -> inner ==> regular -> regular +kernel void _occa_addVectors0_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int _occa_tiled_i = (entries - 1) - (((4) * 1) * _occa_group_position.x); + { + int i = _occa_tiled_i - ((1) * _occa_thread_position.x); + if (i >= 0) { + for (int _occa_tiled_j = entries; _occa_tiled_j > 0; _occa_tiled_j -= (4)) { + for (int j = _occa_tiled_j; j > (_occa_tiled_j - (4)); --j) { + if (j > 0) { + ab[i] = add(a[i], b[j - 1]); + } + } + } + } + } + } +} + +// Outer -> inner ==> inner -> regular +kernel void _occa_addVectors2_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int _occa_tiled_i = (entries - 1) - (((4) * 1) * _occa_group_position.x); + { + int i = _occa_tiled_i - ((1) * _occa_thread_position.x); + if (i >= 0) { + { + int _occa_tiled_j = (entries) - ((4) * _occa_thread_position.y); + for (int j = _occa_tiled_j; j > (_occa_tiled_j - (4)); --j) { + if (j > 0) { + ab[i] = add(a[i], b[j - 1]); + } + } + } + } + } + } +} + +// Outer -> inner ==> inner -> inner +kernel void _occa_addVectors3_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int _occa_tiled_i = (entries - 1) - (((4) * 1) * _occa_group_position.x); + { + int i = _occa_tiled_i - ((1) * _occa_thread_position.x); + if (i >= 0) { + { + int _occa_tiled_j = (entries) - ((4) * _occa_thread_position.y); + { + int j = _occa_tiled_j - _occa_thread_position.y; + if (j > 0) { + ab[i] = add(a[i], b[j - 1]); + } + } + } + } + } + } +} + +// Outer -> outer ==> inner -> regular +kernel void _occa_addVectors4_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int _occa_tiled_i = (entries - 1) - (((4) * 1) * _occa_group_position.x); + { + int i = _occa_tiled_i - ((1) * _occa_group_position.y); + if (i >= 0) { + { + int _occa_tiled_j = (entries) - ((4) * _occa_thread_position.y); + for (int j = _occa_tiled_j; j > (_occa_tiled_j - (4)); --j) { + if (j > 0) { + ab[i] = add(a[i], b[j - 1]); + } + } + } + } + } + } +} + +// Outer -> outer ==> inner -> inner +kernel void _occa_addVectors5_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int _occa_tiled_i = (entries - 1) - (((4) * 1) * _occa_group_position.x); + { + int i = _occa_tiled_i - ((1) * _occa_group_position.y); + if (i >= 0) { + { + int _occa_tiled_j = (entries) - ((4) * _occa_thread_position.y); + { + int j = _occa_tiled_j - _occa_thread_position.z; + if (j > 0) { + ab[i] = add(a[i], b[j - 1]); + } + } + } + } + } + } +} + +// Outer -> outer ==> outer -> inner +kernel void _occa_addVectors6_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int _occa_tiled_i = (entries - 1) - (((4) * 1) * _occa_group_position.x); + { + int i = _occa_tiled_i - ((1) * _occa_group_position.y); + if (i >= 0) { + { + int _occa_tiled_j = (entries) - ((4) * _occa_group_position.z); + { + int j = _occa_tiled_j - _occa_thread_position.x; + if (j > 0) { + ab[i] = add(a[i], b[j - 1]); + } + } + } + } + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/tile/outer_inner_regular_inc.cpp b/tests/functional/data/transpiler/backends/metal/tile/outer_inner_regular_inc.cpp new file mode 100644 index 00000000..02e3be70 --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/tile/outer_inner_regular_inc.cpp @@ -0,0 +1,60 @@ +const int offset = 1; + +// template +float add(float a, float b) { + return a + b + offset; +} + +// Outer -> inner ==> regular -> regular +@kernel void addVectors0(const int entries, const float* a, const float* b, float* ab) { + for (int i = 0; i < entries; i += 1; @tile(4, @outer(0), @inner(0))) { + for (int j = 0; j < entries; ++j; @tile(4)) { + ab[i] = add(a[i], b[j]); + } + } +} + +// Outer -> inner ==> inner -> regular +@kernel void addVectors2(const int entries, const float* a, const float* b, float* ab) { + for (int i = 0; i < entries; i += 1; @tile(4, @outer(0), @inner(0))) { + for (int j = 0; j < entries; ++j; @tile(4, @inner(1))) { + ab[i] = add(a[i], b[j]); + } + } +} + +// Outer -> inner ==> inner -> inner +@kernel void addVectors3(const int entries, const float* a, const float* b, float* ab) { + for (int i = 0; i < entries; i += 1; @tile(4, @outer(0), @inner(0))) { + for (int j = 0; j < entries; ++j; @tile(4, @inner(1), @inner(1))) { + ab[i] = add(a[i], b[j]); + } + } +} + +// Outer -> outer ==> inner -> regular +@kernel void addVectors4(const int entries, const float* a, const float* b, float* ab) { + for (int i = 0; i < entries; i += 1; @tile(4, @outer(0), @outer(1))) { + for (int j = 0; j < entries; ++j; @tile(4, @inner(1))) { + ab[i] = add(a[i], b[j]); + } + } +} + +// Outer -> outer ==> inner -> inner +@kernel void addVectors5(const int entries, const float* a, const float* b, float* ab) { + for (int i = 0; i < entries; i += 1; @tile(4, @outer(0), @outer(1))) { + for (int j = 0; j < entries; ++j; @tile(4, @inner(1), @inner(2))) { + ab[i] = add(a[i], b[j]); + } + } +} + +// Outer -> outer ==> outer -> inner +@kernel void addVectors6(const int entries, const float* a, const float* b, float* ab) { + for (int i = 0; i < entries; i += 1; @tile(4, @outer(0), @outer(1))) { + for (int j = 0; j < entries; ++j; @tile(4, @outer(2), @inner(0))) { + ab[i] = add(a[i], b[j]); + } + } +} diff --git a/tests/functional/data/transpiler/backends/metal/tile/outer_inner_regular_inc_ref.cpp b/tests/functional/data/transpiler/backends/metal/tile/outer_inner_regular_inc_ref.cpp new file mode 100644 index 00000000..24691e7d --- /dev/null +++ b/tests/functional/data/transpiler/backends/metal/tile/outer_inner_regular_inc_ref.cpp @@ -0,0 +1,162 @@ +#include +#include +using namespace metal; + +const int offset = 1; + +// template +float add(float a, float b) { + return a + b + offset; +} + +// Outer -> inner ==> regular -> regular +kernel void _occa_addVectors0_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int _occa_tiled_i = (0) + (((4) * 1) * _occa_group_position.x); + { + int i = _occa_tiled_i + ((1) * _occa_thread_position.x); + if (i < entries) { + for (int _occa_tiled_j = 0; _occa_tiled_j < entries; _occa_tiled_j += (4)) { + for (int j = _occa_tiled_j; j < (_occa_tiled_j + (4)); ++j) { + if (j < entries) { + ab[i] = add(a[i], b[j]); + } + } + } + } + } + } +} + +// Outer -> inner ==> inner -> regular +kernel void _occa_addVectors2_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int _occa_tiled_i = (0) + (((4) * 1) * _occa_group_position.x); + { + int i = _occa_tiled_i + ((1) * _occa_thread_position.x); + if (i < entries) { + { + int _occa_tiled_j = (0) + ((4) * _occa_thread_position.y); + for (int j = _occa_tiled_j; j < (_occa_tiled_j + (4)); ++j) { + if (j < entries) { + ab[i] = add(a[i], b[j]); + } + } + } + } + } + } +} + +// Outer -> inner ==> inner -> inner +kernel void _occa_addVectors3_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int _occa_tiled_i = (0) + (((4) * 1) * _occa_group_position.x); + { + int i = _occa_tiled_i + ((1) * _occa_thread_position.x); + if (i < entries) { + { + int _occa_tiled_j = (0) + ((4) * _occa_thread_position.y); + { + int j = _occa_tiled_j + _occa_thread_position.y; + if (j < entries) { + ab[i] = add(a[i], b[j]); + } + } + } + } + } + } +} + +// Outer -> outer ==> inner -> regular +kernel void _occa_addVectors4_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int _occa_tiled_i = (0) + (((4) * 1) * _occa_group_position.x); + { + int i = _occa_tiled_i + ((1) * _occa_group_position.y); + if (i < entries) { + { + int _occa_tiled_j = (0) + ((4) * _occa_thread_position.y); + for (int j = _occa_tiled_j; j < (_occa_tiled_j + (4)); ++j) { + if (j < entries) { + ab[i] = add(a[i], b[j]); + } + } + } + } + } + } +} + +// Outer -> outer ==> inner -> inner +kernel void _occa_addVectors5_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int _occa_tiled_i = (0) + (((4) * 1) * _occa_group_position.x); + { + int i = _occa_tiled_i + ((1) * _occa_group_position.y); + if (i < entries) { + { + int _occa_tiled_j = (0) + ((4) * _occa_thread_position.y); + { + int j = _occa_tiled_j + _occa_thread_position.z; + if (j < entries) { + ab[i] = add(a[i], b[j]); + } + } + } + } + } + } +} + +// Outer -> outer ==> outer -> inner +kernel void _occa_addVectors6_0(constant int& entries [[buffer(0)]], + device const float* a [[buffer(1)]], + device const float* b [[buffer(2)]], + device float* ab [[buffer(3)]], + uint3 _occa_group_position [[threadgroup_position_in_grid]], + uint3 _occa_thread_position [[thread_position_in_threadgroup]]) { + { + int _occa_tiled_i = (0) + (((4) * 1) * _occa_group_position.x); + { + int i = _occa_tiled_i + ((1) * _occa_group_position.y); + if (i < entries) { + { + int _occa_tiled_j = (0) + ((4) * _occa_group_position.z); + { + int j = _occa_tiled_j + _occa_thread_position.x; + if (j < entries) { + ab[i] = add(a[i], b[j]); + } + } + } + } + } + } +}