From 790b237cb7a705c048acf9f46c32e080033fd480 Mon Sep 17 00:00:00 2001
From: Aleksandr Bezzubikov <zuban32s@gmail.com>
Date: Fri, 5 Jun 2020 03:26:00 +0300
Subject: [PATCH] Add backported function pointers patches

---
 ...LowerOpenCL-pass-to-handle-new-block.patch |  998 +++++++++++++++
 ...ntation-of-SPV_INTEL_function_pointe.patch | 1079 +++++++++++++++++
 ...ering-of-constant-expressions-vector.patch |  215 ++++
 ...nction-pointers-in-cast-instructions.patch |   98 ++
 ...-expressions-lowering-for-function-p.patch |  192 +++
 5 files changed, 2582 insertions(+)
 create mode 100644 patches/spirv/0006-Backport-Update-LowerOpenCL-pass-to-handle-new-block.patch
 create mode 100644 patches/spirv/0007-Backport-Implementation-of-SPV_INTEL_function_pointe.patch
 create mode 100644 patches/spirv/0008-Support-lowering-of-constant-expressions-vector.patch
 create mode 100644 patches/spirv/0009-Support-function-pointers-in-cast-instructions.patch
 create mode 100644 patches/spirv/0010-Improve-constant-expressions-lowering-for-function-p.patch

diff --git a/patches/spirv/0006-Backport-Update-LowerOpenCL-pass-to-handle-new-block.patch b/patches/spirv/0006-Backport-Update-LowerOpenCL-pass-to-handle-new-block.patch
new file mode 100644
index 00000000..34908ff1
--- /dev/null
+++ b/patches/spirv/0006-Backport-Update-LowerOpenCL-pass-to-handle-new-block.patch
@@ -0,0 +1,998 @@
+From 0c74ecad7f7efa134a70096ce3fbc8e833aedc9e Mon Sep 17 00:00:00 2001
+From: Aleksandr Bezzubikov <aleksandr.bezzubikov@intel.com>
+Date: Fri, 15 May 2020 23:31:10 +0300
+Subject: [PATCH 06/10] [Backport] Update LowerOpenCL pass to handle new blocks
+ represntation in LLVM IR Original commit:
+ https://github.com/KhronosGroup/SPIRV-LLVM-Translator/pull/133/commits/b7b90e0
+
+---
+ lib/SPIRV/SPIRVLowerOCLBlocks.cpp         | 249 +++-------------------
+ test/global_block.ll                      |  88 +++-----
+ test/literal-struct.ll                    |  31 +--
+ test/transcoding/block_w_struct_return.ll |  47 ++--
+ test/transcoding/enqueue_kernel.ll        | 240 ++++++++++++---------
+ 5 files changed, 244 insertions(+), 411 deletions(-)
+
+diff --git a/lib/SPIRV/SPIRVLowerOCLBlocks.cpp b/lib/SPIRV/SPIRVLowerOCLBlocks.cpp
+index 50e1838..7bdfb2f 100644
+--- a/lib/SPIRV/SPIRVLowerOCLBlocks.cpp
++++ b/lib/SPIRV/SPIRVLowerOCLBlocks.cpp
+@@ -40,207 +40,34 @@
+ // In both cases values with function type used as intermediate representation
+ // for block literal structure.
+ //
+-// This pass is designed to find such cases and simplify them to avoid any
+-// function pointer types occurrences in LLVM IR in 4 steps.
+-//
+-// 1. Find all function pointer allocas, like
+-//      %block = alloca void () *
+-//
+-//    Then find a single store to that alloca:
+-//      %blockLit = alloca <{ i32, i32, ...}>, align 4
+-//      %0 = bitcast <{ i32, i32, ... }>* %blockLit to void ()*
+-//    > store void ()* %0, void ()** %block, align 4
+-//
+-//    And replace the alloca users by new instructions which used stored value
+-//    %blockLit itself instead of function pointer alloca %block.
+-//
+-// 2. Find consecutive casts from block literal type to i8 addrspace(4)*
+-//    used function pointers as an intermediate type:
+-//      %0 = bitcast <{ i32, i32 }> %block to void() *
+-//      %1 = addrspacecast void() * %0 to i8 addrspace(4)*
+-//    And simplify them:
+-//      %2 = addrspacecast <{ i32, i32 }> %block to i8 addrspace(4)*
+-//
+-// 3. Find all unused instructions with function pointer type occured after
+-//    pp.1-2 and remove them.
+-//
+-// 4. Find unused globals with function pointer type, like
+-//    @block = constant void ()*
+-//             bitcast ({ i32, i32 }* @__block_literal_global to void ()*
+-//
+-//    And remove them.
++// In LLVM IR produced by clang, blocks are represented with the following
++// structure:
++// %struct.__opencl_block_literal_generic = type { i32, i32, i8 addrspace(4)* }
++// Pointers to block invoke functions are stored in the third field. Clang
++// replaces inderect function calls in all cases except if block is passed as a
++// function argument. Note that it is somewhat unclear if the OpenCL C spec
++// should allow passing blocks as function argumernts. This pass is not supposed
++// to work correctly with such functions.
++// Clang though has to store function pointers to this structure. Purpose of
++// this pass is to replace store of function pointers(not allowed in SPIR-V)
++// with null pointers.
+ //
+ //===----------------------------------------------------------------------===//
+ #define DEBUG_TYPE "spv-lower-ocl-blocks"
+ 
+-#include "OCLUtil.h"
+ #include "SPIRVInternal.h"
+ 
+-#include "llvm/ADT/SetVector.h"
+-#include "llvm/Analysis/ValueTracking.h"
+-#include "llvm/IR/GlobalVariable.h"
+-#include "llvm/IR/InstIterator.h"
+ #include "llvm/IR/Module.h"
+ #include "llvm/Pass.h"
+-#include "llvm/PassSupport.h"
+-#include "llvm/Support/Casting.h"
++#include "llvm/Support/Regex.h"
+ 
+ using namespace llvm;
+ 
+ namespace {
+ 
+-static void
+-removeUnusedFunctionPtrInst(Instruction *I,
+-                            SmallSetVector<Instruction *, 16> &FuncPtrInsts) {
+-  for (unsigned OpIdx = 0, Ops = I->getNumOperands(); OpIdx != Ops; ++OpIdx) {
+-    Instruction *OpI = dyn_cast<Instruction>(I->getOperand(OpIdx));
+-    I->setOperand(OpIdx, nullptr);
+-    if (OpI && OpI != I && OpI->user_empty())
+-      FuncPtrInsts.insert(OpI);
+-  }
+-  I->eraseFromParent();
+-}
+-
+-static bool isFuncPtrAlloca(const AllocaInst *AI) {
+-  auto *ET = dyn_cast<PointerType>(AI->getAllocatedType());
+-  return ET && ET->getElementType()->isFunctionTy();
+-}
+-
+-static bool hasFuncPtrType(const Value *V) {
+-  auto *PT = dyn_cast<PointerType>(V->getType());
+-  return PT && PT->getElementType()->isFunctionTy();
+-}
+-
+-static bool isFuncPtrInst(const Instruction *I) {
+-  if (auto *AI = dyn_cast<AllocaInst>(I))
+-    return isFuncPtrAlloca(AI);
+-
+-  for (auto &Op : I->operands()) {
+-    if (auto *AI = dyn_cast<AllocaInst>(Op))
+-      return isFuncPtrAlloca(AI);
+-
+-    auto *OpI = dyn_cast<Instruction>(&Op);
+-    if (OpI && OpI != I && hasFuncPtrType(OpI))
+-      return true;
+-  }
+-  return false;
+-}
+-
+-static StoreInst *findSingleStore(AllocaInst *AI) {
+-  StoreInst *Store = nullptr;
+-  for (auto *U : AI->users()) {
+-    if (!isa<StoreInst>(U))
+-      continue; // not a store
+-    if (Store)
+-      return nullptr; // there are more than one stores
+-    Store = dyn_cast<StoreInst>(U);
+-  }
+-  return Store;
+-}
+-
+-static void fixFunctionPtrAllocaUsers(AllocaInst *AI) {
+-  // Find and remove a single store to alloca
+-  auto *SingleStore = findSingleStore(AI);
+-  assert(SingleStore && "More than one store to the function pointer alloca");
+-  auto *StoredVal = SingleStore->getValueOperand();
+-  SingleStore->eraseFromParent();
+-
+-  // Find loads from the alloca and replace thier users
+-  for (auto *U : AI->users()) {
+-    auto *LI = dyn_cast<LoadInst>(U);
+-    if (!LI)
+-      continue;
+-
+-    for (auto *U : LI->users()) {
+-      auto *UInst = cast<Instruction>(U);
+-      auto *Cast = CastInst::CreatePointerBitCastOrAddrSpaceCast(
+-          StoredVal, UInst->getType(), "", UInst);
+-      UInst->replaceAllUsesWith(Cast);
+-    }
+-  }
+-}
+-
+-static int getBlockLiteralIdx(const Function &F) {
+-  StringRef FName = F.getName();
+-  if (isEnqueueKernelBI(FName))
+-    return FName.contains("events") ? 7 : 4;
+-  if (isKernelQueryBI(FName))
+-    return FName.contains("for_ndrange") ? 2 : 1;
+-  if (FName.startswith("__") && FName.contains("_block_invoke"))
+-    return F.hasStructRetAttr() ? 1 : 0;
+-
+-  return -1; // No block literal argument
+-}
+-
+-static bool hasBlockLiteralArg(const Function &F) {
+-  return getBlockLiteralIdx(F) != -1;
+-}
+-
+-static bool simplifyFunctionPtrCasts(Function &F) {
+-  bool Changed = false;
+-  int BlockLiteralIdx = getBlockLiteralIdx(F);
+-  for (auto *U : F.users()) {
+-    auto *Call = dyn_cast<CallInst>(U);
+-    if (!Call)
+-      continue;
+-    if (Call->getFunction()->getName() == F.getName().str() + "_kernel")
+-      continue; // Skip block invoke function calls inside block invoke kernels
+-
+-    const DataLayout &DL = F.getParent()->getDataLayout();
+-    auto *BlockLiteral = Call->getOperand(BlockLiteralIdx);
+-    auto *BlockLiteralVal = GetUnderlyingObject(BlockLiteral, DL);
+-    if (isa<GlobalVariable>(BlockLiteralVal))
+-      continue; // nothing to do with globals
+-
+-    auto *BlockLiteralAlloca = cast<AllocaInst>(BlockLiteralVal);
+-    assert(!BlockLiteralAlloca->getAllocatedType()->isFunctionTy() &&
+-           "Function type shouldn't be there");
+-
+-    auto *NewBlockLiteral = CastInst::CreatePointerBitCastOrAddrSpaceCast(
+-        BlockLiteralAlloca, BlockLiteral->getType(), "", Call);
+-    BlockLiteral->replaceAllUsesWith(NewBlockLiteral);
+-    Changed |= true;
+-  }
+-  return Changed;
+-}
+-
+-static void
+-findFunctionPtrAllocas(Module &M,
+-                       SmallVectorImpl<AllocaInst *> &FuncPtrAllocas) {
+-  for (auto &F : M) {
+-    if (F.isDeclaration())
+-      continue;
+-    for (auto &I : instructions(F)) {
+-      auto *AI = dyn_cast<AllocaInst>(&I);
+-      if (!AI || !isFuncPtrAlloca(AI))
+-        continue;
+-      FuncPtrAllocas.push_back(AI);
+-    }
+-  }
+-}
+-
+-static void
+-findUnusedFunctionPtrInsts(Module &M,
+-                           SmallSetVector<Instruction *, 16> &FuncPtrInsts) {
+-  for (auto &F : M) {
+-    if (F.isDeclaration())
+-      continue;
+-    for (auto &I : instructions(F))
+-      if (I.user_empty() && isFuncPtrInst(&I))
+-        FuncPtrInsts.insert(&I);
+-  }
+-}
+-
+-static void
+-findUnusedFunctionPtrGlbs(Module &M,
+-                          SmallVectorImpl<GlobalVariable *> &FuncPtrGlbs) {
+-  for (auto &GV : M.globals()) {
+-    if (!GV.user_empty())
+-      continue;
+-    auto *GVType = dyn_cast<PointerType>(GV.getType()->getElementType());
+-    if (GVType && GVType->getElementType()->isFunctionTy())
+-      FuncPtrGlbs.push_back(&GV);
+-  }
++static bool isBlockInvoke(Function &F) {
++  static Regex BlockInvokeRegex("_block_invoke_?[0-9]*$");
++  return BlockInvokeRegex.match(F.getName());
+ }
+ 
+ class SPIRVLowerOCLBlocks : public ModulePass {
+@@ -250,44 +77,24 @@ public:
+ 
+   bool runOnModule(Module &M) {
+     bool Changed = false;
+-
+-    // 1. Find function pointer allocas and fix their users
+-    SmallVector<AllocaInst *, 16> FuncPtrAllocas;
+-    findFunctionPtrAllocas(M, FuncPtrAllocas);
+-
+-    Changed |= !FuncPtrAllocas.empty();
+-    for (auto *AI : FuncPtrAllocas)
+-      fixFunctionPtrAllocaUsers(AI);
+-
+-    // 2. Simplify consecutive casts which use function pointer types
+-    for (auto &F : M)
+-      if (hasBlockLiteralArg(F))
+-        Changed |= simplifyFunctionPtrCasts(F);
+-
+-    // 3. Cleanup unused instructions with function pointer type
+-    // which are occured after pp. 1-2
+-    SmallSetVector<Instruction *, 16> FuncPtrInsts;
+-    findUnusedFunctionPtrInsts(M, FuncPtrInsts);
+-
+-    Changed |= !FuncPtrInsts.empty();
+-    while (!FuncPtrInsts.empty()) {
+-      Instruction *I = FuncPtrInsts.pop_back_val();
+-      removeUnusedFunctionPtrInst(I, FuncPtrInsts);
++    for (Function &F : M) {
++      if (!isBlockInvoke(F))
++        continue;
++      for (User *U : F.users()) {
++        if (!isa<Constant>(U))
++          continue;
++        Constant *Null = Constant::getNullValue(U->getType());
++        if (U != Null) {
++          U->replaceAllUsesWith(Null);
++          Changed = true;
++        }
++      }
+     }
+-
+-    // 4. Find and remove unused global variables with function pointer type
+-    SmallVector<GlobalVariable *, 16> FuncPtrGlbs;
+-    findUnusedFunctionPtrGlbs(M, FuncPtrGlbs);
+-
+-    Changed |= !FuncPtrGlbs.empty();
+-    for (auto *GV : FuncPtrGlbs)
+-      GV->eraseFromParent();
+-
+     return Changed;
+   }
+ 
+   static char ID;
+-}; // class SPIRVLowerOCLBlocks
++};
+ 
+ char SPIRVLowerOCLBlocks::ID = 0;
+ 
+diff --git a/test/global_block.ll b/test/global_block.ll
+index d1ede65..efb4cf3 100644
+--- a/test/global_block.ll
++++ b/test/global_block.ll
+@@ -16,84 +16,66 @@
+ ; RUN: llvm-spirv %t.bc -o %t.spv
+ ; RUN: llvm-spirv -r %t.spv -o - | llvm-dis | FileCheck %s --check-prefix=CHECK-LLVM
+ 
+-target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64"
++target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+ target triple = "spir-unknown-unknown"
+ 
+ ; CHECK-SPIRV: Name [[block_invoke:[0-9]+]] "_block_invoke"
+ ; CHECK-SPIRV: TypeInt [[int:[0-9]+]] 32
++; CHECK-SPIRV: TypeInt [[int8:[0-9]+]] 8
+ ; CHECK-SPIRV: Constant [[int]] [[five:[0-9]+]] 5
+-; CHECK-SPIRV: TypeFunction [[block_invoke_type:[0-9]+]] [[int]] [[int]]
+-;; Check that block invoke function has no block descriptor argument in SPIR-V
+-; CHECK-SPIRV-NOT: TypeFunction [[block_invoke_type]] [[int]] {{[0-9]+}} [[int]]
++; CHECK-SPIRV: TypePointer [[int8Ptr:[0-9]+]] 8 [[int8]]
++; CHECK-SPIRV: TypeFunction [[block_invoke_type:[0-9]+]] [[int]] [[int8Ptr]] [[int]]
+ 
+-;; This variable is not needed in SPIRV
+-; CHECK-SPIRV-NOT: Name {{[0-9]+}} block_kernel.b1
+-; CHECK-LLVM-NOT: @block_kernel.b1
+-@block_kernel.b1 = internal addrspace(2) constant i32 (i32) addrspace(4)* addrspacecast (i32 (i32) addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global to i32 (i32) addrspace(1)*) to i32 (i32) addrspace(4)*), align 8
++%struct.__opencl_block_literal_generic = type { i32, i32, i8 addrspace(4)* }
+ 
+-@__block_literal_global = internal addrspace(1) constant { i32, i32 } { i32 8, i32 4 }, align 4
++@block_kernel.b1 = internal addrspace(2) constant %struct.__opencl_block_literal_generic addrspace(4)* addrspacecast (%struct.__opencl_block_literal_generic addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global to %struct.__opencl_block_literal_generic addrspace(1)*) to %struct.__opencl_block_literal_generic addrspace(4)*), align 4
++@__block_literal_global = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 12, i32 4, i8 addrspace(4)* addrspacecast (i8* bitcast (i32 (i8 addrspace(4)*, i32)* @_block_invoke to i8*) to i8 addrspace(4)*) }, align 4
+ 
+-; Function Attrs: convergent nounwind
+-define spir_kernel void @block_kernel(i32 addrspace(1)* %res) #0 !kernel_arg_addr_space !4 !kernel_arg_access_qual !5 !kernel_arg_type !6 !kernel_arg_base_type !6 !kernel_arg_type_qual !7 {
++; Function Attrs: convergent noinline nounwind optnone
++define spir_kernel void @block_kernel(i32 addrspace(1)* %res) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
+ entry:
+-  %res.addr = alloca i32 addrspace(1)*, align 8
+-  store i32 addrspace(1)* %res, i32 addrspace(1)** %res.addr, align 8, !tbaa !10
+-
+-; CHECK-SPIRV: FunctionCall [[int]] {{[0-9]+}} [[block_invoke]] [[five]]
+-; CHECK-LLVM: %call = call spir_func i32 @_block_invoke(i32 5)
+-  %call = call spir_func i32 @_block_invoke(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global to i8 addrspace(1)*) to i8 addrspace(4)*), i32 5) #2
+-
+-  %0 = load i32 addrspace(1)*, i32 addrspace(1)** %res.addr, align 8, !tbaa !10
+-  store i32 %call, i32 addrspace(1)* %0, align 4, !tbaa !14
++  %res.addr = alloca i32 addrspace(1)*, align 4
++  store i32 addrspace(1)* %res, i32 addrspace(1)** %res.addr, align 4
++; CHECK-SPIRV: FunctionCall [[int]] {{[0-9]+}} [[block_invoke]] {{[0-9]+}} [[five]]
++; CHECK-LLVM: %call = call spir_func i32 @_block_invoke(i8 addrspace(4)* {{.*}}, i32 5)
++  %call = call spir_func i32 @_block_invoke(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global to i8 addrspace(1)*) to i8 addrspace(4)*), i32 5) #2
++  %0 = load i32 addrspace(1)*, i32 addrspace(1)** %res.addr, align 4
++  store i32 %call, i32 addrspace(1)* %0, align 4
+   ret void
+ }
+ 
+-; CHECK-SPIRV: 5 Function [[int]] [[block_invoke]] 0 [[block_invoke_type]]
++; CHECK-SPIRV: 5 Function [[int]] [[block_invoke]] 2 [[block_invoke_type]]
++; CHECK-SPIRV-NEXT: 3 FunctionParameter [[int8Ptr]] {{[0-9]+}}
+ ; CHECK-SPIRV-NEXT: 3 FunctionParameter [[int]] {{[0-9]+}}
+-; CHECK-LLVM: define internal spir_func i32 @_block_invoke(i32 %i)
+-; Function Attrs: convergent nounwind
++; CHECK-LLVM: define internal spir_func i32 @_block_invoke(i8 addrspace(4)* {{.*}}, i32 %{{.*}})
++; Function Attrs: convergent noinline nounwind optnone
+ define internal spir_func i32 @_block_invoke(i8 addrspace(4)* %.block_descriptor, i32 %i) #1 {
+ entry:
+-  %.block_descriptor.addr = alloca i8 addrspace(4)*, align 8
++  %.block_descriptor.addr = alloca i8 addrspace(4)*, align 4
+   %i.addr = alloca i32, align 4
+-  store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 8
+-
+-;; Instruction below is useless and should be removed.
+-; CHECK-SPIRV-NOT: Bitcast
+-; CHECK-LLVM-NOT: bitcast
+-  %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32 }> addrspace(4)*
+-  store i32 %i, i32* %i.addr, align 4, !tbaa !14
+-  %0 = load i32, i32* %i.addr, align 4, !tbaa !14
++  %block.addr = alloca <{ i32, i32, i8 addrspace(4)* }> addrspace(4)*, align 4
++  store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4
++  %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i8 addrspace(4)* }> addrspace(4)*
++  store i32 %i, i32* %i.addr, align 4
++  store <{ i32, i32, i8 addrspace(4)* }> addrspace(4)* %block, <{ i32, i32, i8 addrspace(4)* }> addrspace(4)** %block.addr, align 4
++  %0 = load i32, i32* %i.addr, align 4
+   %add = add nsw i32 %0, 1
+   ret i32 %add
+ }
+ 
+-attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+-attributes #1 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
++attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
++attributes #1 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #2 = { convergent }
+ 
+ !llvm.module.flags = !{!0}
+-!opencl.enable.FP_CONTRACT = !{}
+ !opencl.ocl.version = !{!1}
+ !opencl.spir.version = !{!1}
+-!opencl.used.extensions = !{!2}
+-!opencl.used.optional.core.features = !{!2}
+-!opencl.compiler.options = !{!2}
+-!llvm.ident = !{!3}
++!llvm.ident = !{!2}
+ 
+ !0 = !{i32 1, !"wchar_size", i32 4}
+ !1 = !{i32 2, i32 0}
+-!2 = !{}
+-!3 = !{!"clang version 7.0.0"}
+-!4 = !{i32 1}
+-!5 = !{!"none"}
+-!6 = !{!"int*"}
+-!7 = !{!""}
+-!8 = !{i1 false}
+-!9 = !{i32 0}
+-!10 = !{!11, !11, i64 0}
+-!11 = !{!"any pointer", !12, i64 0}
+-!12 = !{!"omnipotent char", !13, i64 0}
+-!13 = !{!"Simple C/C++ TBAA"}
+-!14 = !{!15, !15, i64 0}
+-!15 = !{!"int", !12, i64 0}
++!2 = !{!"clang version 9.0.0 (https://llvm.org/git/clang 04fb8964a801a5c5d7baa5a22272243a7d183896) (https://llvm.org/git/llvm 384f64397f6ad95a361b72d62c07d7bac9f24163)"}
++!3 = !{i32 1}
++!4 = !{!"none"}
++!5 = !{!"int*"}
++!6 = !{!""}
+diff --git a/test/literal-struct.ll b/test/literal-struct.ll
+index c52170a..52a731a 100644
+--- a/test/literal-struct.ll
++++ b/test/literal-struct.ll
+@@ -2,7 +2,7 @@
+ ; structs, i.e. structs whose type has no name. Typicaly clang generate such
+ ; structs if the kernel contains OpenCL 2.0 blocks. The IR was produced with
+ ; the following command:
+-; clang -cc1 -triple spir -cl-std=cl2.0 -O0 -finclude-default-header literal-struct.cl -emit-llvm -o test/literal-struct.ll
++; clang -cc1 -triple spir -cl-std=cl2.0 -O0 literal-struct.cl -emit-llvm -o test/literal-struct.ll
+ 
+ ; literal-struct.cl:
+ ; void foo()
+@@ -14,25 +14,28 @@
+ ; RUN: llvm-as < %s | llvm-spirv -spirv-text -o %t
+ ; RUN: FileCheck < %t %s
+ 
+-; CHECK-DAG: TypeInt [[Int:[0-9]+]] 32 0
+-; CHECK-DAG: TypeStruct [[StructType:[0-9]+]] [[Int]] [[Int]] {{$}}
++; CHECK: TypeInt [[Int:[0-9]+]] 32 0
++; CHECK: TypeInt [[Int8:[0-9]+]] 8 0
++; CHECK: TypePointer [[Int8Ptr:[0-9]+]] 8 [[Int8]]
++; CHECK: TypeStruct [[StructType:[0-9]+]] [[Int]] [[Int]] [[Int8Ptr]]
+ 
+ target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+ target triple = "spir"
+ 
+-@__block_literal_global = internal addrspace(1) constant { i32, i32 } { i32 8, i32 4 }, align 4
++%struct.__opencl_block_literal_generic = type { i32, i32, i8 addrspace(4)* }
++
++@__block_literal_global = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 12, i32 4, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__foo_block_invoke to i8*) to i8 addrspace(4)*) }, align 4
+ ; CHECK: ConstantComposite [[StructType]]
+ 
+-; This is artificial case is added to cover ConstantNull instrucitions with TypeStruct.
+-@__block_literal_global.1 = internal addrspace(1) constant { i32, i32 } zeroinitializer, align 4
++@__block_literal_global.1 = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } zeroinitializer, align 4
+ ; CHECK: ConstantNull [[StructType]]
+ 
+ ; Function Attrs: convergent noinline nounwind optnone
+ define spir_func void @foo() #0 {
+ entry:
+-  %myBlock = alloca void () addrspace(4)*, align 4
+-  store void () addrspace(4)* addrspacecast (void () addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global to void () addrspace(1)*) to void () addrspace(4)*), void () addrspace(4)** %myBlock, align 4
+-  call spir_func void @__foo_block_invoke(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global to i8 addrspace(1)*) to i8 addrspace(4)*)) #1
++  %myBlock = alloca %struct.__opencl_block_literal_generic addrspace(4)*, align 4
++  store %struct.__opencl_block_literal_generic addrspace(4)* addrspacecast (%struct.__opencl_block_literal_generic addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global to %struct.__opencl_block_literal_generic addrspace(1)*) to %struct.__opencl_block_literal_generic addrspace(4)*), %struct.__opencl_block_literal_generic addrspace(4)** %myBlock, align 4
++  call spir_func void @__foo_block_invoke(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global to i8 addrspace(1)*) to i8 addrspace(4)*)) #1
+   ret void
+ }
+ 
+@@ -40,14 +43,14 @@ entry:
+ define internal spir_func void @__foo_block_invoke(i8 addrspace(4)* %.block_descriptor) #0 {
+ entry:
+   %.block_descriptor.addr = alloca i8 addrspace(4)*, align 4
+-  %block.addr = alloca <{ i32, i32 }> addrspace(4)*, align 4
++  %block.addr = alloca <{ i32, i32, i8 addrspace(4)* }> addrspace(4)*, align 4
+   store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4
+-  %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32 }> addrspace(4)*
+-  store <{ i32, i32 }> addrspace(4)* %block, <{ i32, i32 }> addrspace(4)** %block.addr, align 4
++  %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i8 addrspace(4)* }> addrspace(4)*
++  store <{ i32, i32, i8 addrspace(4)* }> addrspace(4)* %block, <{ i32, i32, i8 addrspace(4)* }> addrspace(4)** %block.addr, align 4
+   ret void
+ }
+ 
+-attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
++attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #1 = { convergent }
+ 
+ !llvm.module.flags = !{!0}
+@@ -57,4 +60,4 @@ attributes #1 = { convergent }
+ 
+ !0 = !{i32 1, !"wchar_size", i32 4}
+ !1 = !{i32 2, i32 0}
+-!2 = !{!"clang version 8.0.0 "}
++!2 = !{!"clang version 9.0.0 (https://llvm.org/git/clang 04fb8964a801a5c5d7baa5a22272243a7d183896) (https://llvm.org/git/llvm 384f64397f6ad95a361b72d62c07d7bac9f24163)"}
+diff --git a/test/transcoding/block_w_struct_return.ll b/test/transcoding/block_w_struct_return.ll
+index 76e29f0..df89b13 100644
+--- a/test/transcoding/block_w_struct_return.ll
++++ b/test/transcoding/block_w_struct_return.ll
+@@ -16,6 +16,8 @@
+ ;   res[tid] = kernelBlock(aa).a - 6;
+ ; }
+ 
++; clang -cc1 -triple spir -cl-std=cl2.0 -disable-llvm-passes -finclude-default-header block_w_struct_return.cl -emit-llvm -o test/transcoding/block_w_struct_return.ll
++
+ ; RUN: llvm-as %s -o %t.bc
+ ; RUN: llvm-spirv %t.bc -spirv-text -o %t.spv.txt
+ ; RUN: FileCheck < %t.spv.txt %s --check-prefix=CHECK-SPIRV
+@@ -27,12 +29,14 @@
+ ; CHECK-SPIRV: Name [[BlockInv:[0-9]+]] "__block_ret_struct_block_invoke"
+ 
+ ; CHECK-SPIRV: 4 TypeInt [[IntTy:[0-9]+]] 32
++; CHECK-SPIRV: 4 TypeInt [[Int8Ty:[0-9]+]] 8
++; CHECK-SPIRV: 4 TypePointer [[Int8Ptr:[0-9]+]] 8 [[Int8Ty]]
+ ; CHECK-SPIRV: 3 TypeStruct [[StructTy:[0-9]+]] [[IntTy]]
+ ; CHECK-SPIRV: 4 TypePointer [[StructPtrTy:[0-9]+]] 7 [[StructTy]]
+ 
+ ; CHECK-SPIRV: 4 Variable [[StructPtrTy]] [[StructArg:[0-9]+]] 7
+ ; CHECK-SPIRV: 4 Variable [[StructPtrTy]] [[StructRet:[0-9]+]] 7
+-; CHECK-SPIRV: 4 PtrCastToGeneric {{[0-9]+}} [[BlockLit:[0-9]+]] {{[0-9]+}}
++; CHECK-SPIRV: 4 PtrCastToGeneric [[Int8Ptr]] [[BlockLit:[0-9]+]] {{[0-9]+}}
+ ; CHECK-SPIRV: 7 FunctionCall {{[0-9]+}} {{[0-9]+}} [[BlockInv]] [[StructRet]] [[BlockLit]] [[StructArg]]
+ 
+ ; CHECK-LLVM: %[[StructA:.*]] = type { i32 }
+@@ -41,20 +45,21 @@
+ target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+ target triple = "spir64-unknown-unknown"
+ 
++%struct.__opencl_block_literal_generic = type { i32, i32, i8 addrspace(4)* }
+ %struct.A = type { i32 }
+ 
+-@__block_literal_global = internal addrspace(1) constant { i32, i32 } { i32 8, i32 4 }, align 4
++@__block_literal_global = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 16, i32 8, i8 addrspace(4)* addrspacecast (i8* bitcast (void (%struct.A*, i8 addrspace(4)*, %struct.A*)* @__block_ret_struct_block_invoke to i8*) to i8 addrspace(4)*) }, align 8
+ 
+ ; Function Attrs: convergent noinline nounwind optnone
+-define spir_kernel void @block_ret_struct(i32 addrspace(1)* %res) #0 !kernel_arg_addr_space !4 !kernel_arg_access_qual !5 !kernel_arg_type !6 !kernel_arg_base_type !6 !kernel_arg_type_qual !7 !kernel_arg_host_accessible !8 !kernel_arg_pipe_depth !9 !kernel_arg_pipe_io !7 !kernel_arg_buffer_location !7 {
++define spir_kernel void @block_ret_struct(i32 addrspace(1)* %res) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
+ entry:
+   %res.addr = alloca i32 addrspace(1)*, align 8
+-  %kernelBlock = alloca void (%struct.A*, %struct.A*) addrspace(4)*, align 8
++  %kernelBlock = alloca %struct.__opencl_block_literal_generic addrspace(4)*, align 8
+   %tid = alloca i64, align 8
+   %aa = alloca %struct.A, align 4
+   %tmp = alloca %struct.A, align 4
+   store i32 addrspace(1)* %res, i32 addrspace(1)** %res.addr, align 8
+-  store void (%struct.A*, %struct.A*) addrspace(4)* addrspacecast (void (%struct.A*, %struct.A*) addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global to void (%struct.A*, %struct.A*) addrspace(1)*) to void (%struct.A*, %struct.A*) addrspace(4)*), void (%struct.A*, %struct.A*) addrspace(4)** %kernelBlock, align 8
++  store %struct.__opencl_block_literal_generic addrspace(4)* addrspacecast (%struct.__opencl_block_literal_generic addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global to %struct.__opencl_block_literal_generic addrspace(1)*) to %struct.__opencl_block_literal_generic addrspace(4)*), %struct.__opencl_block_literal_generic addrspace(4)** %kernelBlock, align 8
+   %call = call spir_func i64 @_Z13get_global_idj(i32 0) #4
+   store i64 %call, i64* %tid, align 8
+   %0 = load i32 addrspace(1)*, i32 addrspace(1)** %res.addr, align 8
+@@ -63,7 +68,7 @@ entry:
+   store i32 -1, i32 addrspace(1)* %arrayidx, align 4
+   %a = getelementptr inbounds %struct.A, %struct.A* %aa, i32 0, i32 0
+   store i32 5, i32* %a, align 4
+-  call spir_func void @__block_ret_struct_block_invoke(%struct.A* sret %tmp, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global to i8 addrspace(1)*) to i8 addrspace(4)*), %struct.A* byval align 4 %aa) #5
++  call spir_func void @__block_ret_struct_block_invoke(%struct.A* sret %tmp, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global to i8 addrspace(1)*) to i8 addrspace(4)*), %struct.A* byval align 4 %aa) #5
+   %a1 = getelementptr inbounds %struct.A, %struct.A* %tmp, i32 0, i32 0
+   %2 = load i32, i32* %a1, align 4
+   %sub = sub nsw i32 %2, 6
+@@ -78,10 +83,10 @@ entry:
+ define internal spir_func void @__block_ret_struct_block_invoke(%struct.A* noalias sret %agg.result, i8 addrspace(4)* %.block_descriptor, %struct.A* byval align 4 %a) #1 {
+ entry:
+   %.block_descriptor.addr = alloca i8 addrspace(4)*, align 8
+-  %block.addr = alloca <{ i32, i32 }> addrspace(4)*, align 8
++  %block.addr = alloca <{ i32, i32, i8 addrspace(4)* }> addrspace(4)*, align 8
+   store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 8
+-  %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32 }> addrspace(4)*
+-  store <{ i32, i32 }> addrspace(4)* %block, <{ i32, i32 }> addrspace(4)** %block.addr, align 8
++  %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i8 addrspace(4)* }> addrspace(4)*
++  store <{ i32, i32, i8 addrspace(4)* }> addrspace(4)* %block, <{ i32, i32, i8 addrspace(4)* }> addrspace(4)** %block.addr, align 8
+   %a1 = getelementptr inbounds %struct.A, %struct.A* %a, i32 0, i32 0
+   store i32 6, i32* %a1, align 4
+   %0 = bitcast %struct.A* %agg.result to i8*
+@@ -96,30 +101,22 @@ declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture r
+ ; Function Attrs: convergent nounwind readnone
+ declare spir_func i64 @_Z13get_global_idj(i32) #3
+ 
+-attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+-attributes #1 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
++attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
++attributes #1 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #2 = { argmemonly nounwind }
+ attributes #3 = { convergent nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #4 = { convergent nounwind readnone }
+ attributes #5 = { convergent }
+ 
+ !llvm.module.flags = !{!0}
+-!opencl.enable.FP_CONTRACT = !{}
+ !opencl.ocl.version = !{!1}
+ !opencl.spir.version = !{!1}
+-!opencl.used.extensions = !{!2}
+-!opencl.used.optional.core.features = !{!2}
+-!opencl.compiler.options = !{!2}
+-!llvm.ident = !{!3}
++!llvm.ident = !{!2}
+ 
+ !0 = !{i32 1, !"wchar_size", i32 4}
+ !1 = !{i32 2, i32 0}
+-!2 = !{}
+-!3 = !{!"clang version 7.0.0"}
+-!4 = !{i32 1}
+-!5 = !{!"none"}
+-!6 = !{!"int*"}
+-!7 = !{!""}
+-!8 = !{i1 false}
+-!9 = !{i32 0}
+-
++!2 = !{!"clang version 9.0.0 (https://llvm.org/git/clang 04fb8964a801a5c5d7baa5a22272243a7d183896) (https://llvm.org/git/llvm 384f64397f6ad95a361b72d62c07d7bac9f24163)"}
++!3 = !{i32 1}
++!4 = !{!"none"}
++!5 = !{!"int*"}
++!6 = !{!""}
+diff --git a/test/transcoding/enqueue_kernel.ll b/test/transcoding/enqueue_kernel.ll
+index 0d29c71..64fe5b0 100644
+--- a/test/transcoding/enqueue_kernel.ll
++++ b/test/transcoding/enqueue_kernel.ll
+@@ -51,11 +51,12 @@
+ ; ModuleID = 'enqueue_kernel.cl'
+ source_filename = "enqueue_kernel.cl"
+ target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+-target triple = "spir-unknown-unknown"
++target triple = "spir"
+ 
+ %opencl.queue_t = type opaque
+ %struct.ndrange_t = type { i32 }
+ %opencl.clk_event_t = type opaque
++%struct.__opencl_block_literal_generic = type { i32, i32, i8 addrspace(4)* }
+ 
+ ; CHECK-SPIRV: EntryPoint {{[0-9]+}} [[BlockKer1:[0-9]+]] "__device_side_enqueue_block_invoke_kernel"
+ ; CHECK-SPIRV: EntryPoint {{[0-9]+}} [[BlockKer2:[0-9]+]] "__device_side_enqueue_block_invoke_2_kernel"
+@@ -66,89 +67,123 @@ target triple = "spir-unknown-unknown"
+ 
+ ; CHECK-SPIRV: TypeInt [[Int32Ty:[0-9]+]] 32
+ ; CHECK-SPIRV: TypeInt [[Int8Ty:[0-9]+]] 8
+-; CHECK-SPIRV: Constant [[Int32Ty]] [[ConstInt8:[0-9]+]] 8
+ ; CHECK-SPIRV: Constant [[Int32Ty]] [[ConstInt0:[0-9]+]] 0
+-; CHECK-SPIRV: Constant [[Int32Ty]] [[ConstInt17:[0-9]+]] 17
++; CHECK-SPIRV: Constant [[Int32Ty]] [[ConstInt17:[0-9]+]] 21
+ ; CHECK-SPIRV: Constant [[Int32Ty]] [[ConstInt2:[0-9]+]] 2
+-; CHECK-SPIRV: Constant [[Int32Ty]] [[ConstInt20:[0-9]+]] 20
+-; CHECK-SPIRV: TypeVoid [[VoidTy:[0-9]+]]
++; CHECK-SPIRV: Constant [[Int32Ty]] [[ConstInt8:[0-9]+]] 8
++; CHECK-SPIRV: Constant [[Int32Ty]] [[ConstInt20:[0-9]+]] 24
+ 
+-; CHECK-SPIRV: TypePointer {{[0-9]+}} 7 {{[0-9]+}}
++; CHECK-SPIRV-DAG: TypePointer {{[0-9]+}} 7 {{[0-9]+}}
++; CHECK-SPIRV-DAG: TypePointer [[Int8PtrGenTy:[0-9]+]] 8 [[Int8Ty]]
++; CHECK-SPIRV: TypeVoid [[VoidTy:[0-9]+]]
+ ; CHECK-SPIRV: TypePointer [[Int32LocPtrTy:[0-9]+]] 7 [[Int32Ty]]
+ ; CHECK-SPIRV: TypeDeviceEvent [[EventTy:[0-9]+]]
+-; CHECK-SPIRV: TypePointer [[Int8PtrGenTy:[0-9]+]] 8 [[Int8Ty]]
+ ; CHECK-SPIRV: TypePointer [[EventPtrTy:[0-9]+]] 8 [[EventTy]]
+ ; CHECK-SPIRV: TypeFunction [[BlockTy1:[0-9]+]] [[VoidTy]] [[Int8PtrGenTy]]
+ ; CHECK-SPIRV: TypeFunction [[BlockTy2:[0-9]+]] [[VoidTy]] [[Int8PtrGenTy]]
+ ; CHECK-SPIRV: TypeFunction [[BlockTy3:[0-9]+]] [[VoidTy]] [[Int8PtrGenTy]]
+ ; CHECK-SPIRV: ConstantNull [[EventPtrTy]] [[EventNull:[0-9]+]]
+ 
+-; CHECK-LLVM: [[BlockTy1:%[0-9]+]] = type { i32, i32 }
+-; CHECK-LLVM: [[BlockTy2:%[0-9]+]] = type <{ i32, i32, i32 addrspace(1)*, i32, i8 }>
+-; CHECK-LLVM: [[BlockTy3:%[0-9]+]] = type <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>
+-; CHECK-LLVM: [[BlockTy4:%[0-9]+]] = type <{ i32, i32 }>
++; CHECK-LLVM: [[BlockTy1:%[0-9a-z\.]+]] = type { i32, i32, i8 addrspace(4)* }
++; CHECK-LLVM: [[BlockTy2:%[0-9a-z\.]+]] = type <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>
++; CHECK-LLVM: [[BlockTy3:%[0-9a-z\.]+]] = type <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>
++; CHECK-LLVM: [[BlockTy4:%[0-9a-z\.]+]] = type <{ i32, i32, i8 addrspace(4)* }>
+ 
+-; CHECK-LLVM: @__block_literal_global = internal addrspace(1) constant [[BlockTy1]] { i32 8, i32 4 }, align 4
+-; CHECK-LLVM: @__block_literal_global.1 = internal addrspace(1) constant [[BlockTy1]] { i32 8, i32 4 }, align 4
++; CHECK-LLVM: @__block_literal_global = internal addrspace(1) constant [[BlockTy1]] { i32 12, i32 4, i8 addrspace(4)* addrspacecast (i8* null to i8 addrspace(4)*) }, align 4
++; CHECK-LLVM: @__block_literal_global.1 = internal addrspace(1) constant [[BlockTy1]] { i32 12, i32 4, i8 addrspace(4)* addrspacecast (i8* null to i8 addrspace(4)*) }, align 4
+ 
+-@__block_literal_global = internal addrspace(1) constant { i32, i32 } { i32 8, i32 4 }, align 4
+-@__block_literal_global.1 = internal addrspace(1) constant { i32, i32 } { i32 8, i32 4 }, align 4
++@__block_literal_global = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 12, i32 4, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* @__device_side_enqueue_block_invoke_3 to i8*) to i8 addrspace(4)*) }, align 4
++@__block_literal_global.1 = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 12, i32 4, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*, i8 addrspace(3)*, i8 addrspace(3)*)* @__device_side_enqueue_block_invoke_4 to i8*) to i8 addrspace(4)*) }, align 4
+ 
+ ; Function Attrs: convergent noinline nounwind optnone
+-define spir_kernel void @device_side_enqueue(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 %i, i8 signext %c0) #0 !kernel_arg_addr_space !4 !kernel_arg_access_qual !5 !kernel_arg_type !6 !kernel_arg_base_type !6 !kernel_arg_type_qual !7 {
++define spir_kernel void @device_side_enqueue(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 %i, i8 signext %c0) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
+ entry:
++  %a.addr = alloca i32 addrspace(1)*, align 4
++  %b.addr = alloca i32 addrspace(1)*, align 4
++  %i.addr = alloca i32, align 4
++  %c0.addr = alloca i8, align 1
+   %default_queue = alloca %opencl.queue_t*, align 4
+   %flags = alloca i32, align 4
+   %ndrange = alloca %struct.ndrange_t, align 4
+   %clk_event = alloca %opencl.clk_event_t*, align 4
+   %event_wait_list = alloca %opencl.clk_event_t*, align 4
+   %event_wait_list2 = alloca [1 x %opencl.clk_event_t*], align 4
+-  %block = alloca <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, align 4
+-  %block3 = alloca <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, align 4
++  %block = alloca <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, align 4
++  %tmp = alloca %struct.ndrange_t, align 4
++  %block3 = alloca <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, align 4
++  %tmp4 = alloca %struct.ndrange_t, align 4
+   %c = alloca i8, align 1
++  %tmp11 = alloca %struct.ndrange_t, align 4
++  %block_sizes = alloca [1 x i32], align 4
++  %tmp12 = alloca %struct.ndrange_t, align 4
++  %block_sizes13 = alloca [3 x i32], align 4
++  store i32 addrspace(1)* %a, i32 addrspace(1)** %a.addr, align 4
++  store i32 addrspace(1)* %b, i32 addrspace(1)** %b.addr, align 4
++  store i32 %i, i32* %i.addr, align 4
++  store i8 %c0, i8* %c0.addr, align 1
+   store i32 0, i32* %flags, align 4
+   %arrayinit.begin = getelementptr inbounds [1 x %opencl.clk_event_t*], [1 x %opencl.clk_event_t*]* %event_wait_list2, i32 0, i32 0
+   %0 = load %opencl.clk_event_t*, %opencl.clk_event_t** %clk_event, align 4
+   store %opencl.clk_event_t* %0, %opencl.clk_event_t** %arrayinit.begin, align 4
+   %1 = load %opencl.queue_t*, %opencl.queue_t** %default_queue, align 4
+   %2 = load i32, i32* %flags, align 4
+-  %block.size = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 0
+-  store i32 17, i32* %block.size, align 4
+-  %block.align = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 1
++  %3 = bitcast %struct.ndrange_t* %tmp to i8*
++  %4 = bitcast %struct.ndrange_t* %ndrange to i8*
++  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %3, i8* align 4 %4, i32 4, i1 false)
++  %block.size = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 0
++  store i32 21, i32* %block.size, align 4
++  %block.align = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 1
+   store i32 4, i32* %block.align, align 4
+-  %block.captured = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 2
+-  store i32 addrspace(1)* %a, i32 addrspace(1)** %block.captured, align 4
+-  %block.captured1 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 3
+-  store i32 %i, i32* %block.captured1, align 4
+-  %block.captured2 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 4
+-  store i8 %c0, i8* %block.captured2, align 4
+-  %3 = bitcast <{ i32, i32, i32 addrspace(1)*, i32, i8 }>* %block to void ()*
+-  %4 = addrspacecast void ()* %3 to i8 addrspace(4)*
++  %block.invoke = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 2
++  store i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke to i8*) to i8 addrspace(4)*), i8 addrspace(4)** %block.invoke, align 4
++  %block.captured = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 3
++  %5 = load i32 addrspace(1)*, i32 addrspace(1)** %a.addr, align 4
++  store i32 addrspace(1)* %5, i32 addrspace(1)** %block.captured, align 4
++  %block.captured1 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 4
++  %6 = load i32, i32* %i.addr, align 4
++  store i32 %6, i32* %block.captured1, align 4
++  %block.captured2 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 5
++  %7 = load i8, i8* %c0.addr, align 1
++  store i8 %7, i8* %block.captured2, align 4
++  %8 = bitcast <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>* %block to %struct.__opencl_block_literal_generic*
++  %9 = addrspacecast %struct.__opencl_block_literal_generic* %8 to i8 addrspace(4)*
+ 
+ ; CHECK-SPIRV: PtrCastToGeneric [[Int8PtrGenTy]] [[BlockLit1:[0-9]+]]
+ ; CHECK-SPIRV: EnqueueKernel [[Int32Ty]] {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} {{[0-9]+}}
+ ;                            [[ConstInt0]] [[EventNull]] [[EventNull]]
+ ;                            [[BlockKer1]] [[BlockLit1]] [[ConstInt17]] [[ConstInt8]]
+ 
+-; CHECK-LLVM: [[Block2:%[0-9]+]] = addrspacecast [[BlockTy2]]* %block to i8 addrspace(4)*
++; CHECK-LLVM: [[Block2:%[0-9]+]] = bitcast [[BlockTy2]]* %block to %struct.__opencl_block_literal_generic*
++; CHECK-LLVM: [[Block2Ptr:%[0-9]+]] = addrspacecast %struct.__opencl_block_literal_generic* [[Block2]] to i8 addrspace(4)*
+ ; CHECK-LLVM: [[BlockInv2:%[0-9]+]] = addrspacecast void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_kernel to i8 addrspace(4)*
+-; CHECK-LLVM: call i32 @__enqueue_kernel_basic_events(%opencl.queue_t* {{.*}}, i32 {{.*}}, %struct.ndrange_t* {{.*}}, i32 0, %opencl.clk_event_t* addrspace(4)* null, %opencl.clk_event_t* addrspace(4)* null, i8 addrspace(4)* [[BlockInv2]], i8 addrspace(4)* [[Block2]])
+-
+-  %5 = call i32 @__enqueue_kernel_basic(%opencl.queue_t* %1, i32 %2, %struct.ndrange_t* byval %ndrange, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* %4)
+-  %6 = addrspacecast %opencl.clk_event_t** %event_wait_list to %opencl.clk_event_t* addrspace(4)*
+-  %7 = addrspacecast %opencl.clk_event_t** %clk_event to %opencl.clk_event_t* addrspace(4)*
+-  %block.size5 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3, i32 0, i32 0
+-  store i32 20, i32* %block.size5, align 4
+-  %block.align6 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3, i32 0, i32 1
++; CHECK-LLVM: call i32 @__enqueue_kernel_basic_events(%opencl.queue_t* {{.*}}, i32 {{.*}}, %struct.ndrange_t* {{.*}}, i32 0, %opencl.clk_event_t* addrspace(4)* null, %opencl.clk_event_t* addrspace(4)* null, i8 addrspace(4)* [[BlockInv2]], i8 addrspace(4)* [[Block2Ptr]])
++
++  %10 = call i32 @__enqueue_kernel_basic(%opencl.queue_t* %1, i32 %2, %struct.ndrange_t* byval %tmp, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* %9)
++  %11 = load %opencl.queue_t*, %opencl.queue_t** %default_queue, align 4
++  %12 = load i32, i32* %flags, align 4
++  %13 = bitcast %struct.ndrange_t* %tmp4 to i8*
++  %14 = bitcast %struct.ndrange_t* %ndrange to i8*
++  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %13, i8* align 4 %14, i32 4, i1 false)
++  %15 = addrspacecast %opencl.clk_event_t** %event_wait_list to %opencl.clk_event_t* addrspace(4)*
++  %16 = addrspacecast %opencl.clk_event_t** %clk_event to %opencl.clk_event_t* addrspace(4)*
++  %block.size5 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3, i32 0, i32 0
++  store i32 24, i32* %block.size5, align 4
++  %block.align6 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3, i32 0, i32 1
+   store i32 4, i32* %block.align6, align 4
+-  %block.captured7 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3, i32 0, i32 2
+-  store i32 addrspace(1)* %a, i32 addrspace(1)** %block.captured7, align 4
+-  %block.captured8 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3, i32 0, i32 3
+-  store i32 %i, i32* %block.captured8, align 4
+-  %block.captured9 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3, i32 0, i32 4
+-  store i32 addrspace(1)* %b, i32 addrspace(1)** %block.captured9, align 4
+-  %8 = bitcast <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3 to void ()*
+-  %9 = addrspacecast void ()* %8 to i8 addrspace(4)*
++  %block.invoke7 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3, i32 0, i32 2
++  store i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_2 to i8*) to i8 addrspace(4)*), i8 addrspace(4)** %block.invoke7, align 4
++  %block.captured8 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3, i32 0, i32 3
++  %17 = load i32 addrspace(1)*, i32 addrspace(1)** %a.addr, align 4
++  store i32 addrspace(1)* %17, i32 addrspace(1)** %block.captured8, align 4
++  %block.captured9 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3, i32 0, i32 4
++  %18 = load i32, i32* %i.addr, align 4
++  store i32 %18, i32* %block.captured9, align 4
++  %block.captured10 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3, i32 0, i32 5
++  %19 = load i32 addrspace(1)*, i32 addrspace(1)** %b.addr, align 4
++  store i32 addrspace(1)* %19, i32 addrspace(1)** %block.captured10, align 4
++  %20 = bitcast <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3 to %struct.__opencl_block_literal_generic*
++  %21 = addrspacecast %struct.__opencl_block_literal_generic* %20 to i8 addrspace(4)*
++
+ 
+ ; CHECK-SPIRV: PtrCastToGeneric [[EventPtrTy]] [[Event1:[0-9]+]]
+ ; CHECK-SPIRV: PtrCastToGeneric [[EventPtrTy]] [[Event2:[0-9]+]]
+@@ -158,16 +193,24 @@ entry:
+ ;                            [[ConstInt2]] [[Event1]] [[Event2]]
+ ;                            [[BlockKer2]] [[BlockLit2]] [[ConstInt20]] [[ConstInt8]]
+ 
+-; CHECK-LLVM: [[Block3:%[0-9]+]] = addrspacecast [[BlockTy3]]* %block3 to i8 addrspace(4)*
++; CHECK-LLVM: [[Block3:%[0-9]+]] = bitcast [[BlockTy3]]* %block3 to %struct.__opencl_block_literal_generic*
++; CHECK-LLVM: [[Block3Ptr:%[0-9]+]] = addrspacecast %struct.__opencl_block_literal_generic* [[Block3]] to i8 addrspace(4)
+ ; CHECK-LLVM: [[BlockInv3:%[0-9]+]] = addrspacecast void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_2_kernel to i8 addrspace(4)*
+-; CHECK-LLVM: call i32 @__enqueue_kernel_basic_events(%opencl.queue_t* {{.*}}, i32 {{.*}}, %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t* addrspace(4)* {{.*}}, %opencl.clk_event_t* addrspace(4)* {{.*}}, i8 addrspace(4)* [[BlockInv3]], i8 addrspace(4)* [[Block3]])
+-
+-  %10 = call i32 @__enqueue_kernel_basic_events(%opencl.queue_t* %1, i32 %2, %struct.ndrange_t* %ndrange, i32 2, %opencl.clk_event_t* addrspace(4)* %6, %opencl.clk_event_t* addrspace(4)* %7, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_2_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* %9)
+-  %11 = alloca [1 x i32]
+-  %12 = getelementptr [1 x i32], [1 x i32]* %11, i32 0, i32 0
+-  %13 = load i8, i8* %c, align 1
+-  %14 = zext i8 %13 to i32
+-  store i32 %14, i32* %12, align 4
++; CHECK-LLVM: call i32 @__enqueue_kernel_basic_events(%opencl.queue_t* {{.*}}, i32 {{.*}}, %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t* addrspace(4)* {{.*}}, %opencl.clk_event_t* addrspace(4)* {{.*}}, i8 addrspace(4)* [[BlockInv3]], i8 addrspace(4)* [[Block3Ptr]])
++
++  %22 = call i32 @__enqueue_kernel_basic_events(%opencl.queue_t* %11, i32 %12, %struct.ndrange_t* %tmp4, i32 2, %opencl.clk_event_t* addrspace(4)* %15, %opencl.clk_event_t* addrspace(4)* %16, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_2_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* %21)
++  %23 = load %opencl.queue_t*, %opencl.queue_t** %default_queue, align 4
++  %24 = load i32, i32* %flags, align 4
++  %25 = bitcast %struct.ndrange_t* %tmp11 to i8*
++  %26 = bitcast %struct.ndrange_t* %ndrange to i8*
++  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %25, i8* align 4 %26, i32 4, i1 false)
++  %arraydecay = getelementptr inbounds [1 x %opencl.clk_event_t*], [1 x %opencl.clk_event_t*]* %event_wait_list2, i32 0, i32 0
++  %27 = addrspacecast %opencl.clk_event_t** %arraydecay to %opencl.clk_event_t* addrspace(4)*
++  %28 = addrspacecast %opencl.clk_event_t** %clk_event to %opencl.clk_event_t* addrspace(4)*
++  %29 = getelementptr [1 x i32], [1 x i32]* %block_sizes, i32 0, i32 0
++  %30 = load i8, i8* %c, align 1
++  %31 = zext i8 %30 to i32
++  store i32 %31, i32* %29, align 4
+ 
+ ; CHECK-SPIRV: PtrAccessChain [[Int32LocPtrTy]] [[LocalBuf31:[0-9]+]]
+ ; CHECK-SPIRV: Bitcast {{[0-9]+}} [[BlockLit3Tmp:[0-9]+]] [[BlockGlb1:[0-9]+]]
+@@ -182,14 +225,18 @@ entry:
+ ; CHECK-LLVM: [[BlockInv0:%[0-9]+]] = addrspacecast void (i8 addrspace(4)*, i8 addrspace(3)*)* @__device_side_enqueue_block_invoke_3_kernel to i8 addrspace(4)*
+ ; CHECK-LLVM: call i32 @__enqueue_kernel_events_varargs(%opencl.queue_t* {{.*}}, i32 {{.*}}, %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t* addrspace(4)* {{.*}}, %opencl.clk_event_t* addrspace(4)* {{.*}}, i8 addrspace(4)* [[BlockInv0]], i8 addrspace(4)* [[Block0]], i32 1, i32* {{.*}})
+ 
+-  %15 = call i32 @__enqueue_kernel_events_varargs(%opencl.queue_t* %1, i32 %2, %struct.ndrange_t* %ndrange, i32 2, %opencl.clk_event_t* addrspace(4)* %6, %opencl.clk_event_t* addrspace(4)* %7, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* @__device_side_enqueue_block_invoke_3_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %12)
+-  %16 = alloca [3 x i32]
+-  %17 = getelementptr [3 x i32], [3 x i32]* %16, i32 0, i32 0
+-  store i32 1, i32* %17, align 4
+-  %18 = getelementptr [3 x i32], [3 x i32]* %16, i32 0, i32 1
+-  store i32 2, i32* %18, align 4
+-  %19 = getelementptr [3 x i32], [3 x i32]* %16, i32 0, i32 2
+-  store i32 4, i32* %19, align 4
++  %32 = call i32 @__enqueue_kernel_events_varargs(%opencl.queue_t* %23, i32 %24, %struct.ndrange_t* %tmp11, i32 2, %opencl.clk_event_t* addrspace(4)* %27, %opencl.clk_event_t* addrspace(4)* %28, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* @__device_side_enqueue_block_invoke_3_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %29)
++  %33 = load %opencl.queue_t*, %opencl.queue_t** %default_queue, align 4
++  %34 = load i32, i32* %flags, align 4
++  %35 = bitcast %struct.ndrange_t* %tmp12 to i8*
++  %36 = bitcast %struct.ndrange_t* %ndrange to i8*
++  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %35, i8* align 4 %36, i32 4, i1 false)
++  %37 = getelementptr [3 x i32], [3 x i32]* %block_sizes13, i32 0, i32 0
++  store i32 1, i32* %37, align 4
++  %38 = getelementptr [3 x i32], [3 x i32]* %block_sizes13, i32 0, i32 1
++  store i32 2, i32* %38, align 4
++  %39 = getelementptr [3 x i32], [3 x i32]* %block_sizes13, i32 0, i32 2
++  store i32 4, i32* %39, align 4
+ 
+ ; CHECK-SPIRV: PtrAccessChain [[Int32LocPtrTy]] [[LocalBuf41:[0-9]+]]
+ ; CHECK-SPIRV: PtrAccessChain [[Int32LocPtrTy]] [[LocalBuf42:[0-9]+]]
+@@ -206,24 +253,28 @@ entry:
+ ; CHECK-LLVM: [[BlockInv1:%[0-9]+]] = addrspacecast void (i8 addrspace(4)*, i8 addrspace(3)*, i8 addrspace(3)*, i8 addrspace(3)*)* @__device_side_enqueue_block_invoke_4_kernel to i8 addrspace(4)*
+ ; CHECK-LLVM: call i32 @__enqueue_kernel_events_varargs(%opencl.queue_t* {{.*}}, i32 {{.*}}, %struct.ndrange_t* {{.*}}, i32 0, %opencl.clk_event_t* addrspace(4)* null, %opencl.clk_event_t* addrspace(4)* null, i8 addrspace(4)* [[BlockInv1]], i8 addrspace(4)* [[Block1]], i32 3, i32* {{.*}})
+ 
+-  %20 = call i32 @__enqueue_kernel_varargs(%opencl.queue_t* %1, i32 %2, %struct.ndrange_t* %ndrange, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*, i8 addrspace(3)*, i8 addrspace(3)*)* @__device_side_enqueue_block_invoke_4_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global.1 to i8 addrspace(1)*) to i8 addrspace(4)*), i32 3, i32* %17)
++  %40 = call i32 @__enqueue_kernel_varargs(%opencl.queue_t* %33, i32 %34, %struct.ndrange_t* %tmp12, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*, i8 addrspace(3)*, i8 addrspace(3)*)* @__device_side_enqueue_block_invoke_4_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global.1 to i8 addrspace(1)*) to i8 addrspace(4)*), i32 3, i32* %37)
++
+   ret void
+ }
+ 
++; Function Attrs: argmemonly nounwind
++declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture writeonly, i8* nocapture readonly, i32, i1) #1
++
+ ; Function Attrs: convergent noinline nounwind optnone
+ define internal spir_func void @__device_side_enqueue_block_invoke(i8 addrspace(4)* %.block_descriptor) #2 {
+ entry:
+   %.block_descriptor.addr = alloca i8 addrspace(4)*, align 4
+-  %block.addr = alloca <{ i32, i32, i32 addrspace(1)*, i32, i8 }> addrspace(4)*, align 4
++  %block.addr = alloca <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }> addrspace(4)*, align 4
+   store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4
+-  %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i32 addrspace(1)*, i32, i8 }> addrspace(4)*
+-  store <{ i32, i32, i32 addrspace(1)*, i32, i8 }> addrspace(4)* %block, <{ i32, i32, i32 addrspace(1)*, i32, i8 }> addrspace(4)** %block.addr, align 4
+-  %block.capture.addr = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i32 addrspace(1)*, i32, i8 }> addrspace(4)* %block, i32 0, i32 4
++  %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }> addrspace(4)*
++  store <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }> addrspace(4)* %block, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }> addrspace(4)** %block.addr, align 4
++  %block.capture.addr = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }> addrspace(4)* %block, i32 0, i32 5
+   %0 = load i8, i8 addrspace(4)* %block.capture.addr, align 4
+   %conv = sext i8 %0 to i32
+-  %block.capture.addr1 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i32 addrspace(1)*, i32, i8 }> addrspace(4)* %block, i32 0, i32 2
++  %block.capture.addr1 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }> addrspace(4)* %block, i32 0, i32 3
+   %1 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %block.capture.addr1, align 4
+-  %block.capture.addr2 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i32 addrspace(1)*, i32, i8 }> addrspace(4)* %block, i32 0, i32 3
++  %block.capture.addr2 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }> addrspace(4)* %block, i32 0, i32 4
+   %2 = load i32, i32 addrspace(4)* %block.capture.addr2, align 4
+   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %1, i32 %2
+   store i32 %conv, i32 addrspace(1)* %arrayidx, align 4
+@@ -243,19 +294,19 @@ declare i32 @__enqueue_kernel_basic(%opencl.queue_t*, i32, %struct.ndrange_t*, i
+ define internal spir_func void @__device_side_enqueue_block_invoke_2(i8 addrspace(4)* %.block_descriptor) #2 {
+ entry:
+   %.block_descriptor.addr = alloca i8 addrspace(4)*, align 4
+-  %block.addr = alloca <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)*, align 4
++  %block.addr = alloca <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)*, align 4
+   store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4
+-  %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)*
+-  store <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)** %block.addr, align 4
+-  %block.capture.addr = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 4
++  %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)*
++  store <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)** %block.addr, align 4
++  %block.capture.addr = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 5
+   %0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %block.capture.addr, align 4
+-  %block.capture.addr1 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 3
++  %block.capture.addr1 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 4
+   %1 = load i32, i32 addrspace(4)* %block.capture.addr1, align 4
+   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %0, i32 %1
+   %2 = load i32, i32 addrspace(1)* %arrayidx, align 4
+-  %block.capture.addr2 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 2
++  %block.capture.addr2 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 3
+   %3 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %block.capture.addr2, align 4
+-  %block.capture.addr3 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 3
++  %block.capture.addr3 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 4
+   %4 = load i32, i32 addrspace(4)* %block.capture.addr3, align 4
+   %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %3, i32 %4
+   store i32 %2, i32 addrspace(1)* %arrayidx4, align 4
+@@ -276,11 +327,11 @@ define internal spir_func void @__device_side_enqueue_block_invoke_3(i8 addrspac
+ entry:
+   %.block_descriptor.addr = alloca i8 addrspace(4)*, align 4
+   %p.addr = alloca i8 addrspace(3)*, align 4
+-  %block.addr = alloca <{ i32, i32 }> addrspace(4)*, align 4
++  %block.addr = alloca <{ i32, i32, i8 addrspace(4)* }> addrspace(4)*, align 4
+   store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4
+-  %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32 }> addrspace(4)*
++  %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i8 addrspace(4)* }> addrspace(4)*
+   store i8 addrspace(3)* %p, i8 addrspace(3)** %p.addr, align 4
+-  store <{ i32, i32 }> addrspace(4)* %block, <{ i32, i32 }> addrspace(4)** %block.addr, align 4
++  store <{ i32, i32, i8 addrspace(4)* }> addrspace(4)* %block, <{ i32, i32, i8 addrspace(4)* }> addrspace(4)** %block.addr, align 4
+   ret void
+ }
+ 
+@@ -300,13 +351,13 @@ entry:
+   %p1.addr = alloca i8 addrspace(3)*, align 4
+   %p2.addr = alloca i8 addrspace(3)*, align 4
+   %p3.addr = alloca i8 addrspace(3)*, align 4
+-  %block.addr = alloca <{ i32, i32 }> addrspace(4)*, align 4
++  %block.addr = alloca <{ i32, i32, i8 addrspace(4)* }> addrspace(4)*, align 4
+   store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4
+-  %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32 }> addrspace(4)*
++  %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i8 addrspace(4)* }> addrspace(4)*
+   store i8 addrspace(3)* %p1, i8 addrspace(3)** %p1.addr, align 4
+   store i8 addrspace(3)* %p2, i8 addrspace(3)** %p2.addr, align 4
+   store i8 addrspace(3)* %p3, i8 addrspace(3)** %p3.addr, align 4
+-  store <{ i32, i32 }> addrspace(4)* %block, <{ i32, i32 }> addrspace(4)** %block.addr, align 4
++  store <{ i32, i32, i8 addrspace(4)* }> addrspace(4)* %block, <{ i32, i32, i8 addrspace(4)* }> addrspace(4)** %block.addr, align 4
+   ret void
+ }
+ 
+@@ -329,27 +380,20 @@ declare i32 @__enqueue_kernel_varargs(%opencl.queue_t*, i32, %struct.ndrange_t*,
+ ; CHECK-LLVM-DAG: define spir_kernel void @__device_side_enqueue_block_invoke_3_kernel(i8 addrspace(4)*, i8 addrspace(3)*)
+ ; CHECK-LLVM-DAG: define spir_kernel void @__device_side_enqueue_block_invoke_4_kernel(i8 addrspace(4)*, i8 addrspace(3)*, i8 addrspace(3)*, i8 addrspace(3)*)
+ 
+-attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
++attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #1 = { argmemonly nounwind }
+-attributes #2 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
++attributes #2 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #3 = { nounwind }
+ 
+ !llvm.module.flags = !{!0}
+-!opencl.enable.FP_CONTRACT = !{}
+ !opencl.ocl.version = !{!1}
+ !opencl.spir.version = !{!1}
+-!opencl.used.extensions = !{!2}
+-!opencl.used.optional.core.features = !{!2}
+-!opencl.compiler.options = !{!2}
+-!llvm.ident = !{!3}
++!llvm.ident = !{!2}
+ 
+ !0 = !{i32 1, !"wchar_size", i32 4}
+ !1 = !{i32 2, i32 0}
+-!2 = !{}
+-!3 = !{!"clang version 7.0.0"}
+-!4 = !{i32 1, i32 1, i32 0, i32 0}
+-!5 = !{!"none", !"none", !"none", !"none"}
+-!6 = !{!"int*", !"int*", !"int", !"char"}
+-!7 = !{!"", !"", !"", !""}
+-!8 = !{i1 false, i1 false, i1 false, i1 false}
+-!9 = !{i32 0, i32 0, i32 0, i32 0}
++!2 = !{!"clang version 9.0.0 (https://llvm.org/git/clang 04fb8964a801a5c5d7baa5a22272243a7d183896) (https://llvm.org/git/llvm 384f64397f6ad95a361b72d62c07d7bac9f24163)"}
++!3 = !{i32 1, i32 1, i32 0, i32 0}
++!4 = !{!"none", !"none", !"none", !"none"}
++!5 = !{!"int*", !"int*", !"int", !"char"}
++!6 = !{!"", !"", !"", !""}
+-- 
+2.17.1
+
diff --git a/patches/spirv/0007-Backport-Implementation-of-SPV_INTEL_function_pointe.patch b/patches/spirv/0007-Backport-Implementation-of-SPV_INTEL_function_pointe.patch
new file mode 100644
index 00000000..87c10f6b
--- /dev/null
+++ b/patches/spirv/0007-Backport-Implementation-of-SPV_INTEL_function_pointe.patch
@@ -0,0 +1,1079 @@
+From 245c9891ddc5345e651c7ce1d591e4d721abcd63 Mon Sep 17 00:00:00 2001
+From: Aleksandr Bezzubikov <aleksandr.bezzubikov@intel.com>
+Date: Fri, 15 May 2020 23:32:08 +0300
+Subject: [PATCH 07/10] [Backport] Implementation of
+ SPV_INTEL_function_pointers extension Original commit:
+ https://github.com/KhronosGroup/SPIRV-LLVM-Translator/commit/934d50e
+
+---
+ lib/SPIRV/OCLUtil.h                           |   7 +
+ lib/SPIRV/SPIRVReader.cpp                     |  39 +++-
+ lib/SPIRV/SPIRVRegularizeLLVM.cpp             |   4 +-
+ lib/SPIRV/SPIRVWriter.cpp                     |  48 ++++-
+ lib/SPIRV/SPIRVWriter.h                       |   2 +
+ lib/SPIRV/libSPIRV/SPIRVEnum.h                |   2 +
+ lib/SPIRV/libSPIRV/SPIRVErrorEnum.h           |   1 +
+ lib/SPIRV/libSPIRV/SPIRVInstruction.cpp       |  25 +++
+ lib/SPIRV/libSPIRV/SPIRVInstruction.h         |  41 ++++
+ lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h         |   1 +
+ lib/SPIRV/libSPIRV/SPIRVModule.cpp            |  20 ++
+ lib/SPIRV/libSPIRV/SPIRVModule.h              |   6 +
+ lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h         |   3 +
+ lib/SPIRV/libSPIRV/SPIRVOpCodeEnum.h          |   2 +
+ lib/SPIRV/libSPIRV/spirv.hpp                  |   5 +
+ .../fp-from-host.ll                           |  69 +++++++
+ .../function-pointer-as-function-arg.ll       | 176 ++++++++++++++++++
+ .../function-pointer.ll                       |  91 +++++++++
+ .../non-uniform-function-pointer.ll           | 138 ++++++++++++++
+ .../referenced-indirectly.ll                  |  81 ++++++++
+ 20 files changed, 753 insertions(+), 8 deletions(-)
+ create mode 100644 test/transcoding/SPV_INTEL_function_pointers/fp-from-host.ll
+ create mode 100644 test/transcoding/SPV_INTEL_function_pointers/function-pointer-as-function-arg.ll
+ create mode 100644 test/transcoding/SPV_INTEL_function_pointers/function-pointer.ll
+ create mode 100644 test/transcoding/SPV_INTEL_function_pointers/non-uniform-function-pointer.ll
+ create mode 100644 test/transcoding/SPV_INTEL_function_pointers/referenced-indirectly.ll
+
+diff --git a/lib/SPIRV/OCLUtil.h b/lib/SPIRV/OCLUtil.h
+index 37d21d9..cc0fba5 100644
+--- a/lib/SPIRV/OCLUtil.h
++++ b/lib/SPIRV/OCLUtil.h
+@@ -416,6 +416,13 @@ bool isSamplerTy(Type *Ty);
+ // If so, it applies ContractionOff ExecutionMode to the kernel.
+ void checkFpContract(BinaryOperator *B, SPIRVBasicBlock *BB);
+ 
++template <typename T> std::string toString(const T *Object) {
++  std::string S;
++  llvm::raw_string_ostream RSOS(S);
++  Object->print(RSOS);
++  RSOS.flush();
++  return S;
++}
+ } // namespace OCLUtil
+ 
+ ///////////////////////////////////////////////////////////////////////////////
+diff --git a/lib/SPIRV/SPIRVReader.cpp b/lib/SPIRV/SPIRVReader.cpp
+index 2a8bded..aa3013d 100644
+--- a/lib/SPIRV/SPIRVReader.cpp
++++ b/lib/SPIRV/SPIRVReader.cpp
+@@ -508,8 +508,19 @@ std::string SPIRVToLLVM::transTypeToOCLTypeName(SPIRVType *T, bool IsSigned) {
+     break;
+   case OpTypeArray:
+     return "array";
+-  case OpTypePointer:
+-    return transTypeToOCLTypeName(T->getPointerElementType()) + "*";
++  case OpTypePointer: {
++    SPIRVType *ET = T->getPointerElementType();
++    if (isa<OpTypeFunction>(ET)) {
++      SPIRVTypeFunction *TF = static_cast<SPIRVTypeFunction *>(ET);
++      std::string name = transTypeToOCLTypeName(TF->getReturnType());
++      name += " (*)(";
++      for (unsigned I = 0, E = TF->getNumParameters(); I < E; ++I)
++        name += transTypeToOCLTypeName(TF->getParameterType(I)) + ',';
++      name.back() = ')'; // replace the last comma with a closing brace.
++      return name;
++    }
++    return transTypeToOCLTypeName(ET) + "*";
++  }
+   case OpTypeVector:
+     return transTypeToOCLTypeName(T->getVectorComponentType()) +
+            T->getVectorComponentCount();
+@@ -1574,6 +1585,26 @@ Value *SPIRVToLLVM::transValueWithoutDecoration(SPIRVValue *BV, Function *F,
+   case OpAsmCallINTEL:
+     return mapValue(
+         BV, transAsmCallINTEL(static_cast<SPIRVAsmCallINTEL *>(BV), F, BB));
++  case OpFunctionPointerCallINTEL: {
++    SPIRVFunctionPointerCallINTEL *BC =
++        static_cast<SPIRVFunctionPointerCallINTEL *>(BV);
++    auto Call = CallInst::Create(transValue(BC->getCalledValue(), F, BB),
++                                 transValue(BC->getArgumentValues(), F, BB),
++                                 BC->getName(), BB);
++    // Assuming we are calling a regular device function
++    Call->setCallingConv(CallingConv::SPIR_FUNC);
++    // Don't set attributes, because at translation time we don't know which
++    // function exactly we are calling.
++    return mapValue(BV, Call);
++  }
++
++  case OpFunctionPointerINTEL: {
++    SPIRVFunctionPointerINTEL *BC =
++        static_cast<SPIRVFunctionPointerINTEL *>(BV);
++    SPIRVFunction *F = BC->getFunction();
++    BV->setName(F->getName());
++    return mapValue(BV, transFunction(F));
++  }
+ 
+   case OpExtInst: {
+     auto *ExtInst = static_cast<SPIRVExtInst *>(BV);
+@@ -1715,6 +1746,10 @@ Function *SPIRVToLLVM::transFunction(SPIRVFunction *BF) {
+   Function *F = cast<Function>(
+       mapValue(BF, Function::Create(FT, Linkage, BF->getName(), M)));
+   mapFunction(BF, F);
++
++  if (BF->hasDecorate(DecorationReferencedIndirectlyINTEL))
++    F->addFnAttr("referenced-indirectly");
++
+   if (!F->isIntrinsic()) {
+     F->setCallingConv(IsKernel ? CallingConv::SPIR_KERNEL
+                                : CallingConv::SPIR_FUNC);
+diff --git a/lib/SPIRV/SPIRVRegularizeLLVM.cpp b/lib/SPIRV/SPIRVRegularizeLLVM.cpp
+index 4a1837d..38a4089 100644
+--- a/lib/SPIRV/SPIRVRegularizeLLVM.cpp
++++ b/lib/SPIRV/SPIRVRegularizeLLVM.cpp
+@@ -197,8 +197,8 @@ void SPIRVRegularizeLLVM::lowerFuncPtr(Module *M) {
+     auto AI = F.arg_begin();
+     if (hasFunctionPointerArg(&F, AI)) {
+       auto OC = getSPIRVFuncOC(F.getName());
+-      assert(OC != OpNop && "Invalid function pointer usage");
+-      Work.push_back(std::make_pair(&F, OC));
++      if (OC != OpNop) // builtin with a function pointer argument
++        Work.push_back(std::make_pair(&F, OC));
+     }
+   }
+   for (auto &I : Work)
+diff --git a/lib/SPIRV/SPIRVWriter.cpp b/lib/SPIRV/SPIRVWriter.cpp
+index 9afc391..df6f070 100644
+--- a/lib/SPIRV/SPIRVWriter.cpp
++++ b/lib/SPIRV/SPIRVWriter.cpp
+@@ -59,6 +59,7 @@
+ #include "llvm/ADT/StringSwitch.h"
+ #include "llvm/ADT/Triple.h"
+ #include "llvm/Bitcode/BitcodeWriter.h"
++#include "llvm/IR/CallSite.h"
+ #include "llvm/IR/Constants.h"
+ #include "llvm/IR/DebugInfo.h"
+ #include "llvm/IR/DerivedTypes.h"
+@@ -285,7 +286,6 @@ SPIRVType *LLVMToSPIRV::transType(Type *T) {
+   // sampler or pipe type.
+   if (T->isPointerTy()) {
+     auto ET = T->getPointerElementType();
+-    assert(!ET->isFunctionTy() && "Function pointer type is not allowed");
+     auto ST = dyn_cast<StructType>(ET);
+     auto AddrSpc = T->getPointerAddressSpace();
+     if (ST && !ST->isSized()) {
+@@ -520,6 +520,11 @@ SPIRVFunction *LLVMToSPIRV::transFunctionDecl(Function *F) {
+     BF->addDecorate(DecorationFuncParamAttr, FunctionParameterAttributeZext);
+   if (Attrs.hasAttribute(AttributeList::ReturnIndex, Attribute::SExt))
+     BF->addDecorate(DecorationFuncParamAttr, FunctionParameterAttributeSext);
++  if (Attrs.hasFnAttribute("referenced-indirectly")) {
++    assert(!oclIsKernel(F) &&
++           "kernel function was marked as referenced-indirectly");
++    BF->addDecorate(DecorationReferencedIndirectlyINTEL);
++  }
+   SPIRVDBG(dbgs() << "[transFunction] " << *F << " => ";
+            spvdbgs() << *BF << '\n';)
+   return BF;
+@@ -783,9 +788,19 @@ SPIRVValue *LLVMToSPIRV::transValueWithoutDecoration(Value *V,
+       MemoryAccess[0] |= MemoryAccessNontemporalMask;
+     if (MemoryAccess.front() == 0)
+       MemoryAccess.clear();
++
++    SPIRVValue *BSV = nullptr;
++    if (Function *F = dyn_cast<Function>(ST->getValueOperand())) {
++      // store of function pointer
++      BSV = BM->addFunctionPointerINTELInst(
++          transType(F->getType()),
++          static_cast<SPIRVFunction *>(transValue(F, BB)), BB);
++    } else {
++      BSV = transValue(ST->getValueOperand(), BB);
++    }
++
+     return mapValue(V, BM->addStoreInst(transValue(ST->getPointerOperand(), BB),
+-                                        transValue(ST->getValueOperand(), BB),
+-                                        MemoryAccess, BB));
++                                        BSV, MemoryAccess, BB));
+   }
+ 
+   if (LoadInst *LD = dyn_cast<LoadInst>(V)) {
+@@ -909,8 +924,17 @@ SPIRVValue *LLVMToSPIRV::transValueWithoutDecoration(Value *V,
+ 
+   if (auto Phi = dyn_cast<PHINode>(V)) {
+     std::vector<SPIRVValue *> IncomingPairs;
++
+     for (size_t I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
+-      IncomingPairs.push_back(transValue(Phi->getIncomingValue(I), BB));
++      SPIRVValue *BV = nullptr;
++      if (Function *F = dyn_cast<Function>(Phi->getIncomingValue(I))) {
++        BV = BM->addFunctionPointerINTELInst(
++            transType(F->getType()),
++            static_cast<SPIRVFunction *>(transValue(F, BB)), BB);
++      } else {
++        BV = transValue(Phi->getIncomingValue(I), BB);
++      }
++      IncomingPairs.push_back(BV);
+       IncomingPairs.push_back(transValue(Phi->getIncomingBlock(I), nullptr));
+     }
+     return mapValue(
+@@ -1201,6 +1225,14 @@ SPIRVValue *LLVMToSPIRV::transCallInst(CallInst *CI, SPIRVBasicBlock *BB) {
+   if (isa<InlineAsm>(CI->getCalledValue()))
+     return transAsmCallINTEL(CI, BB);
+ 
++  llvm::CallSite CS(CI);
++  if (CS.isIndirectCall())
++    return transIndirectCallInst(CI, BB);
++  return transDirectCallInst(CI, BB);
++}
++
++SPIRVValue *LLVMToSPIRV::transDirectCallInst(CallInst *CI,
++                                             SPIRVBasicBlock *BB) {
+   SPIRVExtInstSetKind ExtSetKind = SPIRVEIS_Count;
+   SPIRVWord ExtOp = SPIRVWORD_MAX;
+   llvm::Function *F = CI->getCalledFunction();
+@@ -1258,6 +1290,14 @@ SPIRVValue *LLVMToSPIRV::transAsmCallINTEL(CallInst *CI, SPIRVBasicBlock *BB) {
+       BB);
+ }
+ 
++SPIRVValue *LLVMToSPIRV::transIndirectCallInst(CallInst *CI,
++                                               SPIRVBasicBlock *BB) {
++  return BM->addIndirectCallInst(
++      transValue(CI->getCalledValue(), BB), transType(CI->getType()),
++      transArguments(CI, BB, SPIRVEntry::createUnique(OpFunctionCall).get()),
++      BB);
++}
++
+ bool LLVMToSPIRV::transAddressingMode() {
+   Triple TargetTriple(M->getTargetTriple());
+   Triple::ArchType Arch = TargetTriple.getArch();
+diff --git a/lib/SPIRV/SPIRVWriter.h b/lib/SPIRV/SPIRVWriter.h
+index 26b1a4a..904fd48 100644
+--- a/lib/SPIRV/SPIRVWriter.h
++++ b/lib/SPIRV/SPIRVWriter.h
+@@ -101,6 +101,8 @@ public:
+   SPIRVValue *transCallInst(CallInst *Call, SPIRVBasicBlock *BB);
+   SPIRVValue *transAsmINTEL(InlineAsm *Asm);
+   SPIRVValue *transAsmCallINTEL(CallInst *Call, SPIRVBasicBlock *BB);
++  SPIRVValue *transDirectCallInst(CallInst *Call, SPIRVBasicBlock *BB);
++  SPIRVValue *transIndirectCallInst(CallInst *Call, SPIRVBasicBlock *BB);
+   bool transDecoration(Value *V, SPIRVValue *BV);
+   SPIRVWord transFunctionControlMask(CallInst *);
+   SPIRVWord transFunctionControlMask(Function *);
+diff --git a/lib/SPIRV/libSPIRV/SPIRVEnum.h b/lib/SPIRV/libSPIRV/SPIRVEnum.h
+index 0b4a295..f4efccb 100644
+--- a/lib/SPIRV/libSPIRV/SPIRVEnum.h
++++ b/lib/SPIRV/libSPIRV/SPIRVEnum.h
+@@ -339,6 +339,8 @@ template <> inline void SPIRVMap<Decoration, SPIRVCapVec>::init() {
+   ADD_VEC_INIT(DecorationInputAttachmentIndex, {CapabilityInputAttachment});
+   ADD_VEC_INIT(DecorationAlignment, {CapabilityKernel});
+   ADD_VEC_INIT(DecorationSideEffectsINTEL, {CapabilityAsmINTEL});
++  ADD_VEC_INIT(DecorationReferencedIndirectlyINTEL,
++               {CapabilityIndirectReferencesINTEL});
+ }
+ 
+ template <> inline void SPIRVMap<BuiltIn, SPIRVCapVec>::init() {
+diff --git a/lib/SPIRV/libSPIRV/SPIRVErrorEnum.h b/lib/SPIRV/libSPIRV/SPIRVErrorEnum.h
+index 6df2fb2..d45b88a 100644
+--- a/lib/SPIRV/libSPIRV/SPIRVErrorEnum.h
++++ b/lib/SPIRV/libSPIRV/SPIRVErrorEnum.h
+@@ -10,5 +10,6 @@ _SPIRV_OP(InvalidMemoryModel, "Expects 0-3.")
+ _SPIRV_OP(InvalidFunctionControlMask, "")
+ _SPIRV_OP(InvalidBuiltinSetName, "Expects OpenCL.std.")
+ _SPIRV_OP(InvalidFunctionCall, "Unexpected llvm intrinsic:")
++_SPIRV_OP(FunctionPointers, "Can't translate function pointer:\n")
+ 
+ #endif
+diff --git a/lib/SPIRV/libSPIRV/SPIRVInstruction.cpp b/lib/SPIRV/libSPIRV/SPIRVInstruction.cpp
+index 2172bc6..ff97b3b 100644
+--- a/lib/SPIRV/libSPIRV/SPIRVInstruction.cpp
++++ b/lib/SPIRV/libSPIRV/SPIRVInstruction.cpp
+@@ -119,6 +119,31 @@ void SPIRVFunctionCall::validate() const {
+   SPIRVFunctionCallGeneric::validate();
+ }
+ 
++SPIRVFunctionPointerCallINTEL::SPIRVFunctionPointerCallINTEL(
++    SPIRVId TheId, SPIRVValue *TheCalledValue, SPIRVType *TheReturnType,
++    const std::vector<SPIRVWord> &TheArgs, SPIRVBasicBlock *BB)
++    : SPIRVFunctionCallGeneric(TheReturnType, TheId, TheArgs, BB),
++      CalledValueId(TheCalledValue->getId()) {
++  validate();
++}
++
++void SPIRVFunctionPointerCallINTEL::validate() const {
++  SPIRVFunctionCallGeneric::validate();
++}
++
++SPIRVFunctionPointerINTEL::SPIRVFunctionPointerINTEL(SPIRVId TheId,
++                                                     SPIRVType *TheType,
++                                                     SPIRVFunction *TheFunction,
++                                                     SPIRVBasicBlock *BB)
++    : SPIRVInstruction(FixedWordCount, OC, TheType, TheId, BB),
++      TheFunction(TheFunction->getId()) {
++  validate();
++}
++
++void SPIRVFunctionPointerINTEL::validate() const {
++  SPIRVInstruction::validate();
++}
++
+ // ToDo: Each instruction should implement this function
+ std::vector<SPIRVValue *> SPIRVInstruction::getOperands() {
+   std::vector<SPIRVValue *> Empty;
+diff --git a/lib/SPIRV/libSPIRV/SPIRVInstruction.h b/lib/SPIRV/libSPIRV/SPIRVInstruction.h
+index 508c54c..55e38d8 100644
+--- a/lib/SPIRV/libSPIRV/SPIRVInstruction.h
++++ b/lib/SPIRV/libSPIRV/SPIRVInstruction.h
+@@ -1375,6 +1375,47 @@ protected:
+   SPIRVId FunctionId;
+ };
+ 
++class SPIRVFunctionPointerCallINTEL
++    : public SPIRVFunctionCallGeneric<OpFunctionPointerCallINTEL, 4> {
++public:
++  SPIRVFunctionPointerCallINTEL(SPIRVId TheId, SPIRVValue *TheCalledValue,
++                                SPIRVType *TheReturnType,
++                                const std::vector<SPIRVWord> &TheArgs,
++                                SPIRVBasicBlock *BB);
++  SPIRVFunctionPointerCallINTEL() : CalledValueId(SPIRVID_INVALID) {}
++  SPIRVValue *getCalledValue() const { return get<SPIRVValue>(CalledValueId); }
++  _SPIRV_DEF_ENCDEC4(Type, Id, CalledValueId, Args)
++  void validate() const override;
++  bool isOperandLiteral(unsigned Index) const override { return false; }
++  SPIRVCapVec getRequiredCapability() const override {
++    return getVec(CapabilityFunctionPointersINTEL);
++  }
++
++protected:
++  SPIRVId CalledValueId;
++};
++
++class SPIRVFunctionPointerINTEL : public SPIRVInstruction {
++  const static Op OC = OpFunctionPointerINTEL;
++  const static SPIRVWord FixedWordCount = 4;
++
++public:
++  SPIRVFunctionPointerINTEL(SPIRVId TheId, SPIRVType *TheType,
++                            SPIRVFunction *TheFunction, SPIRVBasicBlock *BB);
++  SPIRVFunctionPointerINTEL()
++      : SPIRVInstruction(OC), TheFunction(SPIRVID_INVALID) {}
++  SPIRVFunction *getFunction() const { return get<SPIRVFunction>(TheFunction); }
++  _SPIRV_DEF_ENCDEC3(Type, Id, TheFunction)
++  void validate() const override;
++  bool isOperandLiteral(unsigned Index) const override { return false; }
++  SPIRVCapVec getRequiredCapability() const override {
++    return getVec(CapabilityFunctionPointersINTEL);
++  }
++
++protected:
++  SPIRVId TheFunction;
++};
++
+ class SPIRVExtInst : public SPIRVFunctionCallGeneric<OpExtInst, 5> {
+ public:
+   SPIRVExtInst(SPIRVType *TheType, SPIRVId TheId, SPIRVId TheBuiltinSet,
+diff --git a/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h b/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h
+index 0423951..97d2239 100644
+--- a/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h
++++ b/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h
+@@ -397,6 +397,7 @@ inline bool isValid(spv::Decoration V) {
+   case DecorationInputAttachmentIndex:
+   case DecorationAlignment:
+   case DecorationMaxByteOffset:
++  case DecorationReferencedIndirectlyINTEL:
+     return true;
+   default:
+     return false;
+diff --git a/lib/SPIRV/libSPIRV/SPIRVModule.cpp b/lib/SPIRV/libSPIRV/SPIRVModule.cpp
+index 7308992..b2c64a3 100644
+--- a/lib/SPIRV/libSPIRV/SPIRVModule.cpp
++++ b/lib/SPIRV/libSPIRV/SPIRVModule.cpp
+@@ -281,6 +281,11 @@ public:
+   SPIRVInstruction *addAsmCallINTELInst(SPIRVAsmINTEL *,
+                                         const std::vector<SPIRVWord> &,
+                                         SPIRVBasicBlock *) override;
++  SPIRVInstruction *addIndirectCallInst(SPIRVValue *, SPIRVType *,
++                                        const std::vector<SPIRVWord> &,
++                                        SPIRVBasicBlock *) override;
++  SPIRVInstruction *addFunctionPointerINTELInst(SPIRVType *, SPIRVFunction *,
++                                                SPIRVBasicBlock *) override;
+   SPIRVInstruction *addCmpInst(Op, SPIRVType *, SPIRVValue *, SPIRVValue *,
+                                SPIRVBasicBlock *) override;
+   SPIRVInstruction *addLoadInst(SPIRVValue *, const std::vector<SPIRVWord> &,
+@@ -1134,6 +1139,21 @@ SPIRVModuleImpl::addAsmCallINTELInst(SPIRVAsmINTEL *TheAsm,
+       new SPIRVAsmCallINTEL(getId(), TheAsm, TheArguments, BB), BB);
+ }
+ 
++SPIRVInstruction *SPIRVModuleImpl::addIndirectCallInst(
++    SPIRVValue *TheCalledValue, SPIRVType *TheReturnType,
++    const std::vector<SPIRVWord> &TheArguments, SPIRVBasicBlock *BB) {
++  return addInstruction(
++      new SPIRVFunctionPointerCallINTEL(getId(), TheCalledValue, TheReturnType,
++                                        TheArguments, BB),
++      BB);
++}
++
++SPIRVInstruction *SPIRVModuleImpl::addFunctionPointerINTELInst(
++    SPIRVType *TheType, SPIRVFunction *TheFunction, SPIRVBasicBlock *BB) {
++  return addInstruction(
++      new SPIRVFunctionPointerINTEL(getId(), TheType, TheFunction, BB), BB);
++}
++
+ SPIRVInstruction *SPIRVModuleImpl::addBinaryInst(Op TheOpCode, SPIRVType *Type,
+                                                  SPIRVValue *Op1,
+                                                  SPIRVValue *Op2,
+diff --git a/lib/SPIRV/libSPIRV/SPIRVModule.h b/lib/SPIRV/libSPIRV/SPIRVModule.h
+index 72d3dd1..9043346 100644
+--- a/lib/SPIRV/libSPIRV/SPIRVModule.h
++++ b/lib/SPIRV/libSPIRV/SPIRVModule.h
+@@ -293,6 +293,12 @@ public:
+   virtual SPIRVInstruction *addAsmCallINTELInst(SPIRVAsmINTEL *,
+                                                 const std::vector<SPIRVWord> &,
+                                                 SPIRVBasicBlock *) = 0;
++  virtual SPIRVInstruction *addIndirectCallInst(SPIRVValue *, SPIRVType *,
++                                                const std::vector<SPIRVWord> &,
++                                                SPIRVBasicBlock *) = 0;
++  virtual SPIRVInstruction *addFunctionPointerINTELInst(SPIRVType *,
++                                                        SPIRVFunction *,
++                                                        SPIRVBasicBlock *) = 0;
+   virtual SPIRVInstruction *
+   addCompositeConstructInst(SPIRVType *, const std::vector<SPIRVId> &,
+                             SPIRVBasicBlock *) = 0;
+diff --git a/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h b/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h
+index 13c3813..4c03b8f 100644
+--- a/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h
++++ b/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h
+@@ -330,6 +330,7 @@ template <> inline void SPIRVMap<Decoration, std::string>::init() {
+   add(DecorationAlignment, "Alignment");
+   add(DecorationMaxByteOffset, "MaxByteOffset");
+   add(DecorationSideEffectsINTEL, "SideEffectsINTEL");
++  add(DecorationReferencedIndirectlyINTEL, "ReferencedIndirectlyINTEL");
+ }
+ SPIRV_DEF_NAMEMAP(Decoration, SPIRVDecorationNameMap)
+ 
+@@ -476,6 +477,8 @@ template <> inline void SPIRVMap<Capability, std::string>::init() {
+       "SubgroupAvcMotionEstimationIntraINTEL");
+   add(CapabilitySubgroupAvcMotionEstimationChromaINTEL,
+       "SubgroupAvcMotionEstimationChromaINTEL");
++  add(CapabilityFunctionPointersINTEL, "FunctionPointersINTEL");
++  add(CapabilityIndirectReferencesINTEL, "IndirectReferencesINTEL");
+ }
+ SPIRV_DEF_NAMEMAP(Capability, SPIRVCapabilityNameMap)
+ 
+diff --git a/lib/SPIRV/libSPIRV/SPIRVOpCodeEnum.h b/lib/SPIRV/libSPIRV/SPIRVOpCodeEnum.h
+index 65f5e71..d822b80 100644
+--- a/lib/SPIRV/libSPIRV/SPIRVOpCodeEnum.h
++++ b/lib/SPIRV/libSPIRV/SPIRVOpCodeEnum.h
+@@ -305,6 +305,8 @@ _SPIRV_OP(SubgroupImageBlockReadINTEL, 5577)
+ _SPIRV_OP(SubgroupImageBlockWriteINTEL, 5578)
+ _SPIRV_OP(SubgroupImageMediaBlockReadINTEL, 5580)
+ _SPIRV_OP(SubgroupImageMediaBlockWriteINTEL, 5581)
++_SPIRV_OP(FunctionPointerINTEL, 5600)
++_SPIRV_OP(FunctionPointerCallINTEL, 5601)
+ _SPIRV_OP(AsmTargetINTEL, 5609)
+ _SPIRV_OP(AsmINTEL, 5610)
+ _SPIRV_OP(AsmCallINTEL, 5611)
+diff --git a/lib/SPIRV/libSPIRV/spirv.hpp b/lib/SPIRV/libSPIRV/spirv.hpp
+index a45eb9d..028e6ea 100644
+--- a/lib/SPIRV/libSPIRV/spirv.hpp
++++ b/lib/SPIRV/libSPIRV/spirv.hpp
+@@ -386,6 +386,7 @@ enum Decoration {
+   DecorationPassthroughNV = 5250,
+   DecorationViewportRelativeNV = 5252,
+   DecorationSecondaryViewportRelativeNV = 5256,
++  DecorationReferencedIndirectlyINTEL = 5602,
+   DecorationSideEffectsINTEL = 5608,
+   DecorationMax = 0x7fffffff,
+ };
+@@ -653,6 +654,8 @@ enum Capability {
+   CapabilitySubgroupImageBlockIOINTEL = 5570,
+   CapabilitySubgroupImageMediaBlockIOINTEL = 5579,
+   CapabilityAsmINTEL = 5606,
++  CapabilityFunctionPointersINTEL = 5603,
++  CapabilityIndirectReferencesINTEL = 5604,
+   CapabilitySubgroupAvcMotionEstimationINTEL = 5696,
+   CapabilitySubgroupAvcMotionEstimationIntraINTEL = 5697,
+   CapabilitySubgroupAvcMotionEstimationChromaINTEL = 5698,
+@@ -984,6 +987,8 @@ enum Op {
+   OpAsmTargetINTEL = 5609,
+   OpAsmINTEL = 5610,
+   OpAsmCallINTEL = 5611,
++  OpFunctionPointerINTEL = 5600,
++  OpFunctionPointerCallINTEL = 5601,
+   OpVmeImageINTEL = 5699,
+   OpTypeVmeImageINTEL = 5700,
+   OpTypeAvcImePayloadINTEL = 5701,
+diff --git a/test/transcoding/SPV_INTEL_function_pointers/fp-from-host.ll b/test/transcoding/SPV_INTEL_function_pointers/fp-from-host.ll
+new file mode 100644
+index 0000000..5c08060
+--- /dev/null
++++ b/test/transcoding/SPV_INTEL_function_pointers/fp-from-host.ll
+@@ -0,0 +1,69 @@
++; RUN: llvm-as %s -o %t.bc
++; RUN: llvm-spirv %t.bc -spirv-text -o %t.spt
++; RUN: FileCheck < %t.spt %s --check-prefix=CHECK-SPIRV
++; RUN: llvm-spirv %t.bc -o %t.spv
++; RUN: llvm-spirv -r %t.spv -o %t.r.bc
++; RUN: llvm-dis %t.r.bc -o %t.r.ll
++; RUN: FileCheck < %t.r.ll %s --check-prefix=CHECK-LLVM
++;
++; Generated from:
++; typedef int (*fp_t)(int);
++;
++; __kernel void test(__global int *fp, __global int *data) {
++;
++;   data[0] = ((fp_t)(*fp))(data[1]);
++; }
++;
++; CHECK-SPIRV: Capability FunctionPointersINTEL
++;
++; CHECK-SPIRV: EntryPoint {{[0-9]+}} [[KERNEL_ID:[0-9]+]] "test"
++; CHECK-SPIRV: TypeInt [[INT32_TYPE_ID:[0-9]+]] 32
++; CHECK-SPIRV: TypePointer [[INT_PTR:[0-9]+]] 5 [[INT32_TYPE_ID]]
++; CHECK-SPIRV: TypeFunction [[FOO_TYPE_ID:[0-9]+]] [[INT32_TYPE_ID]] [[INT32_TYPE_ID]]
++; CHECK-SPIRV: TypePointer [[FOO_TYPE_PTR_ID:[0-9]+]] {{[0-9]+}} [[FOO_TYPE_ID]]
++;
++; CHECK-SPIRV: Function {{[0-9]+}} [[KERNEL_ID]]
++; CHECK-SPIRV: FunctionParameter [[INT_PTR]] [[FP:[0-9]+]]
++; CHECK-SPIRV: Load [[INT32_TYPE_ID]] [[FUNC_ADDR:[0-9]+]] [[FP]]
++; CHECK-SPIRV: ConvertUToPtr [[FOO_TYPE_PTR_ID]] [[FOO_PTR:[0-9]+]] [[FUNC_ADDR]]
++; CHECK-SPIRV: FunctionPointerCallINTEL [[INT32_TYPE_ID]] {{[0-9]+}} [[FOO_PTR]]
++;
++; CHECK-LLVM: define spir_kernel void @test(i32 addrspace(1)*
++; CHECK-LLVM: %{{.*}} = call spir_func i32 %{{.*}}(i32
++
++target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
++target triple = "spir64-unknown-unknown"
++
++; Function Attrs: convergent nounwind
++define spir_kernel void @test(i32 addrspace(1)* %fp, i32 addrspace(1)* %data) #0 !kernel_arg_addr_space !4 !kernel_arg_access_qual !5 !kernel_arg_type !6 !kernel_arg_base_type !6 !kernel_arg_type_qual !7 {
++entry:
++  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %data, i64 1
++  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4, !tbaa !8
++  %1 = load i32, i32 addrspace(1)* %fp, align 4, !tbaa !8
++  %2 = inttoptr i32 %1 to i32 (i32)*
++  %call = call spir_func i32 %2(i32 %0) #1
++  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %data, i64 0
++  store i32 %call, i32 addrspace(1)* %arrayidx1, align 4, !tbaa !8
++  ret void
++}
++
++attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
++attributes #1 = { convergent }
++
++!llvm.module.flags = !{!0}
++!opencl.ocl.version = !{!1}
++!opencl.spir.version = !{!2}
++!llvm.ident = !{!3}
++
++!0 = !{i32 1, !"wchar_size", i32 4}
++!1 = !{i32 1, i32 0}
++!2 = !{i32 1, i32 2}
++!3 = !{!"clang version 7.1.0 "}
++!4 = !{i32 1, i32 1}
++!5 = !{!"none", !"none"}
++!6 = !{!"int*", !"int*"}
++!7 = !{!"", !""}
++!8 = !{!9, !9, i64 0}
++!9 = !{!"int", !10, i64 0}
++!10 = !{!"omnipotent char", !11, i64 0}
++!11 = !{!"Simple C/C++ TBAA"}
+diff --git a/test/transcoding/SPV_INTEL_function_pointers/function-pointer-as-function-arg.ll b/test/transcoding/SPV_INTEL_function_pointers/function-pointer-as-function-arg.ll
+new file mode 100644
+index 0000000..d1c3f1e
+--- /dev/null
++++ b/test/transcoding/SPV_INTEL_function_pointers/function-pointer-as-function-arg.ll
+@@ -0,0 +1,176 @@
++; RUN: llvm-as %s -o %t.bc
++; RUN: llvm-spirv %t.bc -spirv-text -o %t.spt
++; RUN: FileCheck < %t.spt %s --check-prefix=CHECK-SPIRV
++; RUN: llvm-spirv %t.bc -o %t.spv
++; RUN: llvm-spirv -r %t.spv -o %t.r.bc
++; RUN: llvm-dis %t.r.bc -o %t.r.ll
++; RUN: FileCheck < %t.r.ll %s --check-prefix=CHECK-LLVM
++;
++; Generated from:
++; int helper(int (*f)(int), int arg) {
++;   return f(arg);
++; }
++;
++; int foo(int v) {
++;   return v + 1;
++; }
++;
++; int bar(int v) {
++;   return v + 2;
++; }
++;
++; __kernel void test(__global int *data, int control) {
++;   int (*fp)(int) = 0;
++;
++;   if (get_global_id(0) % control == 0)
++;     fp = &foo;
++;   else
++;     fp = &bar;
++;
++;   data[get_global_id(0)] = helper(fp, data[get_global_id(0)]);
++; }
++;
++; CHECK-SPIRV: Capability FunctionPointersINTEL
++;
++; CHECK-SPIRV: EntryPoint 6 [[KERNEL_ID:[0-9]+]] "test"
++; CHECK-SPIRV: TypeInt [[TYPE_INT32_ID:[0-9]+]] 32
++; CHECK-SPIRV: TypeFunction [[FOO_TYPE_ID:[0-9]+]] [[TYPE_INT32_ID]] [[TYPE_INT32_ID]]
++; CHECK-SPIRV: TypePointer [[FOO_PTR_TYPE_ID:[0-9]+]] {{[0-9]+}} [[FOO_TYPE_ID]]
++; CHECK-SPIRV: TypeFunction [[HELPER_TYPE_ID:[0-9]+]] [[TYPE_INT32_ID]] [[FOO_PTR_TYPE_ID]] [[TYPE_INT32_ID]]
++; CHECK-SPIRV: TypePointer [[FOO_PTR_ALLOCA_TYPE_ID:[0-9]+]] {{[0-9]+}} [[FOO_PTR_TYPE_ID]]
++; CHECK-SPIRV: TypePointer [[TYPE_INT32_ALLOCA_ID:[0-9]+]] {{[0-9]+}} [[TYPE_INT32_ID]]
++;
++; CHECK-SPIRV: Function {{[0-9]+}} [[HELPER_ID:[0-9]+]] {{[0-9]+}} [[HELPER_TYPE_ID]]
++; CHECK-SPIRV: FunctionParameter [[FOO_PTR_TYPE_ID]] [[T_PTR_ARG_ID:[0-9]+]]
++; CHECK-SPIRV: FunctionParameter [[TYPE_INT32_ID:[0-9]+]] [[INT_ARG_ID:[0-9]+]]
++; CHECK-SPIRV: Variable [[FOO_PTR_ALLOCA_TYPE_ID]] [[T_PTR_ALLOCA_ID:[0-9]+]]
++; CHECK-SPIRV: Variable [[TYPE_INT32_ALLOCA_ID]] [[INT_ALLOCA_ID:[0-9]+]]
++; CHECK-SPIRV: Store [[T_PTR_ALLOCA_ID]] [[T_PTR_ARG_ID]]
++; CHECK-SPIRV: Store [[INT_ALLOCA_ID]] [[INT_ARG_ID]]
++; CHECK-SPIRV: Load [[FOO_PTR_TYPE_ID]] [[LOADED_T_PTR:[0-9]+]] [[T_PTR_ALLOCA_ID]]
++; CHECK-SPIRV: Load [[TYPE_INT32_ID]] [[LOADED_INT:[0-9]+]] [[INT_ALLOCA_ID]]
++; CHECK-SPIRV: FunctionPointerCallINTEL [[TYPE_INT32_ID]] [[RESULT:[0-9]+]] [[LOADED_T_PTR]] [[LOADED_INT]]
++; CHECK-SPIRV: ReturnValue [[RESULT]]
++;
++; CHECK-SPIRV: Function {{[0-9]+}} [[FOO_ID:[0-9]+]] {{[0-9]+}} [[FOO_TYPE_ID]]
++; CHECK-SPIRV: Function {{[0-9]+}} [[BAR_ID:[0-9]+]] {{[0-9]+}} [[FOO_TYPE_ID]]
++;
++; CHECK-SPIRV: Function {{[0-9]+}} [[KERNEL_ID]]
++; CHECK-SPIRV: Variable [[FOO_PTR_ALLOCA_TYPE_ID]] [[F_PTR_ALLOCA_ID:[0-9]+]]
++; CHECK-SPIRV: FunctionPointerINTEL [[FOO_PTR_TYPE_ID]] [[FOO_PTR_ID:[0-9]+]] [[FOO_ID]]
++; CHECK-SPIRV: Store [[F_PTR_ALLOCA_ID]] [[FOO_PTR_ID]]
++; CHECK-SPIRV: FunctionPointerINTEL [[FOO_PTR_TYPE_ID]] [[BAR_PTR_ID:[0-9]+]] [[BAR_ID]]
++; CHECK-SPIRV: Store [[F_PTR_ALLOCA_ID]] [[BAR_PTR_ID]]
++; CHECK-SPIRV: Load [[FOO_PTR_TYPE_ID]] [[LOADED_F_PTR:[0-9]+]] [[F_PTR_ALLOCA_ID]]
++; CHECK-SPIRV: FunctionCall {{[0-9]+}} {{[0-9]+}} [[HELPER_ID]] [[LOADED_F_PTR]]
++;
++; CHECK-LLVM: define spir_func i32 @helper(i32 (i32)* %[[F:.*]],
++; CHECK-LLVM: %[[F_ADDR:.*]] = alloca i32 (i32)*
++; CHECK-LLVM: store i32 (i32)* %[[F]], i32 (i32)** %[[F_ADDR]]
++; CHECK-LLVM: %[[F_LOADED:.*]] = load i32 (i32)*, i32 (i32)** %[[F_ADDR]]
++; CHECK-LLVM: %[[CALL:.*]] = call spir_func i32 %[[F_LOADED]]
++; CHECK-LLVM: ret i32 %[[CALL]]
++;
++; CHECK-LLVM: define spir_kernel void @test
++; CHECK-LLVM: %[[FP:.*]] = alloca i32 (i32)*
++; CHECK-LLVM: store i32 (i32)* @foo, i32 (i32)** %[[FP]]
++; CHECK-LLVM: store i32 (i32)* @bar, i32 (i32)** %[[FP]]
++; CHECK-LLVM: %[[FP_LOADED:.*]] = load i32 (i32)*, i32 (i32)** %[[FP]]
++; CHECK-LLVM: call spir_func i32 @helper(i32 (i32)* %[[FP_LOADED]]
++
++
++target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
++target triple = "spir64-unknown-unknown"
++
++; Function Attrs: convergent noinline nounwind optnone
++define spir_func i32 @helper(i32 (i32)* %f, i32 %arg) #0 {
++entry:
++  %f.addr = alloca i32 (i32)*, align 8
++  %arg.addr = alloca i32, align 4
++  store i32 (i32)* %f, i32 (i32)** %f.addr, align 8
++  store i32 %arg, i32* %arg.addr, align 4
++  %0 = load i32 (i32)*, i32 (i32)** %f.addr, align 8
++  %1 = load i32, i32* %arg.addr, align 4
++  %call = call spir_func i32 %0(i32 %1) #3
++  ret i32 %call
++}
++
++; Function Attrs: convergent noinline nounwind optnone
++define spir_func i32 @foo(i32 %v) #0 {
++entry:
++  %v.addr = alloca i32, align 4
++  store i32 %v, i32* %v.addr, align 4
++  %0 = load i32, i32* %v.addr, align 4
++  %add = add nsw i32 %0, 1
++  ret i32 %add
++}
++
++; Function Attrs: convergent noinline nounwind optnone
++define spir_func i32 @bar(i32 %v) #0 {
++entry:
++  %v.addr = alloca i32, align 4
++  store i32 %v, i32* %v.addr, align 4
++  %0 = load i32, i32* %v.addr, align 4
++  %add = add nsw i32 %0, 2
++  ret i32 %add
++}
++
++; Function Attrs: convergent noinline nounwind optnone
++define spir_kernel void @test(i32 addrspace(1)* %data, i32 %control) #1 !kernel_arg_addr_space !1 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
++entry:
++  %data.addr = alloca i32 addrspace(1)*, align 8
++  %control.addr = alloca i32, align 4
++  %fp = alloca i32 (i32)*, align 8
++  store i32 addrspace(1)* %data, i32 addrspace(1)** %data.addr, align 8
++  store i32 %control, i32* %control.addr, align 4
++  store i32 (i32)* null, i32 (i32)** %fp, align 8
++  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #4
++  %0 = load i32, i32* %control.addr, align 4
++  %conv = sext i32 %0 to i64
++  %rem = urem i64 %call, %conv
++  %cmp = icmp eq i64 %rem, 0
++  br i1 %cmp, label %if.then, label %if.else
++
++if.then:                                          ; preds = %entry
++  store i32 (i32)* @foo, i32 (i32)** %fp, align 8
++  br label %if.end
++
++if.else:                                          ; preds = %entry
++  store i32 (i32)* @bar, i32 (i32)** %fp, align 8
++  br label %if.end
++
++if.end:                                           ; preds = %if.else, %if.then
++  %1 = load i32 (i32)*, i32 (i32)** %fp, align 8
++  %2 = load i32 addrspace(1)*, i32 addrspace(1)** %data.addr, align 8
++  %call2 = call spir_func i64 @_Z13get_global_idj(i32 0) #4
++  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %2, i64 %call2
++  %3 = load i32, i32 addrspace(1)* %arrayidx, align 4
++  %call3 = call spir_func i32 @helper(i32 (i32)* %1, i32 %3) #3
++  %4 = load i32 addrspace(1)*, i32 addrspace(1)** %data.addr, align 8
++  %call4 = call spir_func i64 @_Z13get_global_idj(i32 0) #4
++  %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %4, i64 %call4
++  store i32 %call3, i32 addrspace(1)* %arrayidx5, align 4
++  ret void
++}
++
++; Function Attrs: convergent nounwind readnone
++declare spir_func i64 @_Z13get_global_idj(i32) #2
++
++attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
++attributes #1 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
++attributes #2 = { convergent nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
++attributes #3 = { convergent }
++attributes #4 = { convergent nounwind readnone }
++
++!llvm.module.flags = !{!0}
++!opencl.ocl.version = !{!1}
++!opencl.spir.version = !{!2}
++!llvm.ident = !{!3}
++
++!0 = !{i32 1, !"wchar_size", i32 4}
++!1 = !{i32 1, i32 0}
++!2 = !{i32 1, i32 2}
++!3 = !{!"clang version 7.1.0 "}
++!4 = !{!"none", !"none"}
++!5 = !{!"int*", !"int"}
++!6 = !{!"", !""}
+diff --git a/test/transcoding/SPV_INTEL_function_pointers/function-pointer.ll b/test/transcoding/SPV_INTEL_function_pointers/function-pointer.ll
+new file mode 100644
+index 0000000..febaa3c
+--- /dev/null
++++ b/test/transcoding/SPV_INTEL_function_pointers/function-pointer.ll
+@@ -0,0 +1,91 @@
++; RUN: llvm-as %s -o %t.bc
++; RUN: llvm-spirv %t.bc -spirv-text -o %t.spt
++; RUN: FileCheck < %t.spt %s --check-prefix=CHECK-SPIRV
++; RUN: llvm-spirv %t.bc -o %t.spv
++; RUN: llvm-spirv -r %t.spv -o %t.r.bc
++; RUN: llvm-dis %t.r.bc -o %t.r.ll
++; RUN: FileCheck < %t.r.ll %s --check-prefix=CHECK-LLVM
++;
++; Generated from:
++; int foo(int arg) {
++;   return arg + 10;
++; }
++;
++; void __kernel test(__global int *data, int input) {
++;   int (__constant *fp)(int) = &foo;
++;
++;   *data = fp(input);
++; }
++;
++; CHECK-SPIRV: Capability FunctionPointersINTEL
++; CHECK-SPIRV: EntryPoint 6 [[KERNEL_ID:[0-9]+]] "test"
++; CHECK-SPIRV: TypeInt [[TYPE_INT_ID:[0-9]+]]
++; CHECK-SPIRV: TypeFunction [[FOO_TYPE_ID:[0-9]+]] [[TYPE_INT_ID]] [[TYPE_INT_ID]]
++; CHECK-SPIRV: TypePointer [[FOO_PTR_ID:[0-9]+]] {{[0-9]+}} [[FOO_TYPE_ID]]
++; CHECK-SPIRV: TypePointer [[FOO_PTR_ALLOCA_ID:[0-9]+]] 7 [[FOO_PTR_ID]]
++;
++; CHECK-SPIRV: Function {{[0-9]+}} [[FOO_ID:[0-9]+]] {{[0-9]+}} [[FOO_TYPE_ID]]
++; CHECK-SPIRV: Function {{[0-9]+}} [[KERNEL_ID]]
++; CHECK-SPIRV: Variable [[FOO_PTR_ALLOCA_ID]] [[FOO_PTR_ALLOCA:[0-9]+]]
++; CHECK-SPIRV: FunctionPointerINTEL [[FOO_PTR_ID]] [[FOO_PTR:[0-9]+]] [[FOO_ID]]
++; CHECK-SPIRV: Store [[FOO_PTR_ALLOCA]] [[FOO_PTR]]
++; CHECK-SPIRV: Load [[FOO_PTR_ID]] [[LOADED_FOO_PTR:[0-9]+]] [[FOO_PTR_ALLOCA]]
++; CHECK-SPIRV: FunctionPointerCallINTEL 2 {{[0-9]+}} [[LOADED_FOO_PTR]]
++;
++; CHECK-LLVM: define spir_kernel void @test
++; CHECK-LLVM: %fp = alloca i32 (i32)*
++; CHECK-LLVM: store i32 (i32)* @foo, i32 (i32)** %fp
++; CHECK-LLVM: %0 = load i32 (i32)*, i32 (i32)** %fp
++; CHECK-LLVM: %call = call spir_func i32 %0(i32 %1)
++
++target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
++target triple = "spir64-unknown-unknown"
++
++; Function Attrs: convergent noinline nounwind optnone
++define spir_func i32 @foo(i32 %arg) #0 {
++entry:
++  %arg.addr = alloca i32, align 4
++  store i32 %arg, i32* %arg.addr, align 4
++  %0 = load i32, i32* %arg.addr, align 4
++  %add = add nsw i32 %0, 10
++  ret i32 %add
++}
++
++; Function Attrs: convergent noinline nounwind optnone
++define spir_kernel void @test(i32 addrspace(1)* %data, i32 %input) #1 !kernel_arg_addr_space !1 !kernel_arg_access_qual !5 !kernel_arg_type !6 !kernel_arg_base_type !6 !kernel_arg_type_qual !7 {
++entry:
++  %data.addr = alloca i32 addrspace(1)*, align 8
++  %input.addr = alloca i32, align 4
++  %fp = alloca i32 (i32)*, align 8
++  store i32 addrspace(1)* %data, i32 addrspace(1)** %data.addr, align 8
++  store i32 %input, i32* %input.addr, align 4
++  store i32 (i32)* @foo, i32 (i32)** %fp, align 8
++  %0 = load i32 (i32)*, i32 (i32)** %fp, align 8
++  %1 = load i32, i32* %input.addr, align 4
++  %call = call spir_func i32 %0(i32 %1) #2
++  %2 = load i32 addrspace(1)*, i32 addrspace(1)** %data.addr, align 8
++  store i32 %call, i32 addrspace(1)* %2, align 4
++  ret void
++}
++
++attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
++attributes #1 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
++attributes #2 = { convergent }
++
++!llvm.module.flags = !{!0}
++!opencl.ocl.version = !{!1}
++!opencl.spir.version = !{!2}
++!opencl.used.extensions = !{!3}
++!opencl.used.optional.core.features = !{!3}
++!opencl.compiler.options = !{!3}
++!llvm.ident = !{!4}
++
++!0 = !{i32 1, !"wchar_size", i32 4}
++!1 = !{i32 1, i32 0}
++!2 = !{i32 1, i32 2}
++!3 = !{}
++!4 = !{!"clang version 7.0.0 "}
++!5 = !{!"none", !"none"}
++!6 = !{!"int*", !"int"}
++!7 = !{!"", !""}
++
+diff --git a/test/transcoding/SPV_INTEL_function_pointers/non-uniform-function-pointer.ll b/test/transcoding/SPV_INTEL_function_pointers/non-uniform-function-pointer.ll
+new file mode 100644
+index 0000000..30521cd
+--- /dev/null
++++ b/test/transcoding/SPV_INTEL_function_pointers/non-uniform-function-pointer.ll
+@@ -0,0 +1,138 @@
++; RUN: llvm-as %s -o %t.bc
++; RUN: llvm-spirv %t.bc -spirv-text -o %t.spt
++; RUN: FileCheck < %t.spt %s --check-prefix=CHECK-SPIRV
++; RUN: llvm-spirv %t.bc -o %t.spv
++; RUN: llvm-spirv -r %t.spv -o %t.r.bc
++; RUN: llvm-dis %t.r.bc -o %t.r.ll
++; RUN: FileCheck < %t.r.ll %s --check-prefix=CHECK-LLVM
++;
++; Generated from:
++; int foo(int v) {
++;   return v + 1;
++; }
++;
++; int bar(int v) {
++;   return v + 2;
++; }
++;
++; __kernel void test(__global int *data, int control) {
++;   int (*fp)(int) = 0;
++;
++;   if (get_global_id(0) % control == 0)
++;     fp = &foo;
++;   else
++;     fp = &bar;
++;
++;   data[get_global_id(0)] = fp(data[get_global_id(0)]);
++; }
++;
++; CHECK-SPIRV: Capability FunctionPointersINTEL
++;
++; CHECK-SPIRV: EntryPoint 6 [[KERNEL_ID:[0-9]+]] "test"
++; CHECK-SPIRV: TypeInt [[TYPE_INT32_ID:[0-9+]]] 32
++; CHECK-SPIRV: TypeFunction [[FOO_TYPE_ID:[0-9]+]] [[TYPE_INT32_ID]] [[TYPE_INT32_ID]]
++; CHECK-SPIRV: TypePointer [[FOO_PTR_TYPE_ID:[0-9]+]] {{[0-9]+}} [[FOO_TYPE_ID]]
++; CHECK-SPIRV: TypePointer [[FOO_PTR_ALLOCA_TYPE_ID:[0-9]+]] 7 [[FOO_PTR_TYPE_ID]]
++;
++; CHECK-SPIRV: Function {{[0-9]+}} [[FOO_ID:[0-9]+]] {{[0-9]+}} [[FOO_TYPE_ID]]
++; CHECK-SPIRV: Function {{[0-9]+}} [[BAR_ID:[0-9]+]] {{[0-9]+}} [[FOO_TYPE_ID]]
++;
++; CHECK-SPIRV: Function {{[0-9]+}} [[KERNEL_ID]]
++; CHECK-SPIRV: Variable [[FOO_PTR_ALLOCA_TYPE_ID]] [[FOO_PTR_ALLOCA_ID:[0-9]+]]
++; CHECK-SPIRV: FunctionPointerINTEL [[FOO_PTR_TYPE_ID]] [[FOO_PTR_ID:[0-9]+]] [[FOO_ID]]
++; CHECK-SPIRV: Store [[FOO_PTR_ALLOCA_ID]] [[FOO_PTR_ID]]
++; CHECK-SPIRV: FunctionPointerINTEL [[FOO_PTR_TYPE_ID]] [[BAR_PTR_ID:[0-9]+]] [[BAR_ID]]
++; CHECK-SPIRV: Store [[FOO_PTR_ALLOCA_ID]] [[BAR_PTR_ID]]
++; CHECK-SPIRV: Load [[FOO_PTR_TYPE_ID]] [[LOADED_FOO_PTR:[0-9]+]] [[FOO_PTR_ALLOCA_ID]]
++; CHECK-SPIRV: FunctionPointerCallINTEL {{[0-9]+}} {{[0-9]+}} [[LOADED_FOO_PTR]]
++;
++; CHECK-LLVM: define spir_kernel void @test
++; CHECK-LLVM: %fp = alloca i32 (i32)*
++; CHECK-LLVM: store i32 (i32)* @foo, i32 (i32)** %fp
++; CHECK-LLVM: store i32 (i32)* @bar, i32 (i32)** %fp
++; CHECK-LLVM: %[[FP:.*]] = load i32 (i32)*, i32 (i32)** %fp
++; CHECK-LLVM: call spir_func i32 %[[FP]](i32 %{{.*}})
++
++
++target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
++target triple = "spir64-unknown-unknown"
++
++; Function Attrs: convergent noinline nounwind optnone
++define spir_func i32 @foo(i32 %v) #0 {
++entry:
++  %v.addr = alloca i32, align 4
++  store i32 %v, i32* %v.addr, align 4
++  %0 = load i32, i32* %v.addr, align 4
++  %add = add nsw i32 %0, 1
++  ret i32 %add
++}
++
++; Function Attrs: convergent noinline nounwind optnone
++define spir_func i32 @bar(i32 %v) #0 {
++entry:
++  %v.addr = alloca i32, align 4
++  store i32 %v, i32* %v.addr, align 4
++  %0 = load i32, i32* %v.addr, align 4
++  %add = add nsw i32 %0, 2
++  ret i32 %add
++}
++
++; Function Attrs: convergent noinline nounwind optnone
++define spir_kernel void @test(i32 addrspace(1)* %data, i32 %control) #1 !kernel_arg_addr_space !1 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
++entry:
++  %data.addr = alloca i32 addrspace(1)*, align 8
++  %control.addr = alloca i32, align 4
++  %fp = alloca i32 (i32)*, align 8
++  store i32 addrspace(1)* %data, i32 addrspace(1)** %data.addr, align 8
++  store i32 %control, i32* %control.addr, align 4
++  store i32 (i32)* null, i32 (i32)** %fp, align 8
++  %call = call spir_func i64 @_Z13get_global_idj(i32 0) #3
++  %0 = load i32, i32* %control.addr, align 4
++  %conv = sext i32 %0 to i64
++  %rem = urem i64 %call, %conv
++  %cmp = icmp eq i64 %rem, 0
++  br i1 %cmp, label %if.then, label %if.else
++
++if.then:                                          ; preds = %entry
++  store i32 (i32)* @foo, i32 (i32)** %fp, align 8
++  br label %if.end
++
++if.else:                                          ; preds = %entry
++  store i32 (i32)* @bar, i32 (i32)** %fp, align 8
++  br label %if.end
++
++if.end:                                           ; preds = %if.else, %if.then
++  %1 = load i32 (i32)*, i32 (i32)** %fp, align 8
++  %2 = load i32 addrspace(1)*, i32 addrspace(1)** %data.addr, align 8
++  %call2 = call spir_func i64 @_Z13get_global_idj(i32 0) #3
++  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %2, i64 %call2
++  %3 = load i32, i32 addrspace(1)* %arrayidx, align 4
++  %call3 = call spir_func i32 %1(i32 %3) #4
++  %4 = load i32 addrspace(1)*, i32 addrspace(1)** %data.addr, align 8
++  %call4 = call spir_func i64 @_Z13get_global_idj(i32 0) #3
++  %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %4, i64 %call4
++  store i32 %call3, i32 addrspace(1)* %arrayidx5, align 4
++  ret void
++}
++
++; Function Attrs: convergent nounwind readnone
++declare spir_func i64 @_Z13get_global_idj(i32) #2
++
++attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
++attributes #1 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
++attributes #2 = { convergent nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
++attributes #3 = { convergent nounwind readnone }
++attributes #4 = { convergent }
++
++!llvm.module.flags = !{!0}
++!opencl.ocl.version = !{!1}
++!opencl.spir.version = !{!2}
++!llvm.ident = !{!3}
++
++!0 = !{i32 1, !"wchar_size", i32 4}
++!1 = !{i32 1, i32 0}
++!2 = !{i32 1, i32 2}
++!3 = !{!"clang version 7.1.0 "}
++!4 = !{!"none", !"none"}
++!5 = !{!"int*", !"int"}
++!6 = !{!"", !""}
+diff --git a/test/transcoding/SPV_INTEL_function_pointers/referenced-indirectly.ll b/test/transcoding/SPV_INTEL_function_pointers/referenced-indirectly.ll
+new file mode 100644
+index 0000000..bd435c1
+--- /dev/null
++++ b/test/transcoding/SPV_INTEL_function_pointers/referenced-indirectly.ll
+@@ -0,0 +1,81 @@
++; RUN: llvm-as %s -o %t.bc
++; RUN: llvm-spirv %t.bc -spirv-text -o %t.spt
++; RUN: FileCheck < %t.spt %s --check-prefix=CHECK-SPIRV
++; RUN: llvm-spirv %t.bc -o %t.spv
++; RUN: llvm-spirv -r %t.spv -o %t.r.bc
++; RUN: llvm-dis %t.r.bc -o %t.r.ll
++; RUN: FileCheck < %t.r.ll %s --check-prefix=CHECK-LLVM
++;
++; Generated from:
++; __attribute__((referenced_indirectly))
++; int foo(int arg) {
++;   return arg + 10;
++; }
++;
++; void __kernel test(__global int *data, int input) {
++;   int (__constant *fp)(int) = &foo;
++;
++;   *data = fp(input);
++; }
++;
++; CHECK-SPIRV: Capability FunctionPointersINTEL
++; CHECK-SPIRV: Capability IndirectReferencesINTEL
++;
++; CHECK-SPIRV: Name [[FOO_ID:[0-9]+]] "foo"
++; CHECK-SPIRV: Decorate [[FOO_ID]] ReferencedIndirectlyINTEL
++; CHECK-SPIRV: Function {{[0-9]+}} [[FOO_ID]]
++;
++; CHECK-LLVM: define spir_func i32 @foo(i32 %arg) #[[ATTRS:[0-9]+]]
++; CHECK-LLVM: attributes #[[ATTRS]] = {{.*}} "referenced-indirectly"
++
++target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
++target triple = "spir64-unknown-unknown"
++
++; Function Attrs: convergent noinline nounwind optnone
++define spir_func i32 @foo(i32 %arg) #0 {
++entry:
++  %arg.addr = alloca i32, align 4
++  store i32 %arg, i32* %arg.addr, align 4
++  %0 = load i32, i32* %arg.addr, align 4
++  %add = add nsw i32 %0, 10
++  ret i32 %add
++}
++
++; Function Attrs: convergent noinline nounwind optnone
++define spir_kernel void @test(i32 addrspace(1)* %data, i32 %input) #1 !kernel_arg_addr_space !1 !kernel_arg_access_qual !5 !kernel_arg_type !6 !kernel_arg_base_type !6 !kernel_arg_type_qual !7 {
++entry:
++  %data.addr = alloca i32 addrspace(1)*, align 8
++  %input.addr = alloca i32, align 4
++  %fp = alloca i32 (i32)*, align 8
++  store i32 addrspace(1)* %data, i32 addrspace(1)** %data.addr, align 8
++  store i32 %input, i32* %input.addr, align 4
++  store i32 (i32)* @foo, i32 (i32)** %fp, align 8
++  %0 = load i32 (i32)*, i32 (i32)** %fp, align 8
++  %1 = load i32, i32* %input.addr, align 4
++  %call = call spir_func i32 %0(i32 %1) #2
++  %2 = load i32 addrspace(1)*, i32 addrspace(1)** %data.addr, align 8
++  store i32 %call, i32 addrspace(1)* %2, align 4
++  ret void
++}
++
++attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" "referenced-indirectly" }
++attributes #1 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
++attributes #2 = { convergent }
++
++!llvm.module.flags = !{!0}
++!opencl.ocl.version = !{!1}
++!opencl.spir.version = !{!2}
++!opencl.used.extensions = !{!3}
++!opencl.used.optional.core.features = !{!3}
++!opencl.compiler.options = !{!3}
++!llvm.ident = !{!4}
++
++!0 = !{i32 1, !"wchar_size", i32 4}
++!1 = !{i32 1, i32 0}
++!2 = !{i32 1, i32 2}
++!3 = !{}
++!4 = !{!"clang version 7.0.0 "}
++!5 = !{!"none", !"none"}
++!6 = !{!"int*", !"int"}
++!7 = !{!"", !""}
++
+-- 
+2.17.1
+
diff --git a/patches/spirv/0008-Support-lowering-of-constant-expressions-vector.patch b/patches/spirv/0008-Support-lowering-of-constant-expressions-vector.patch
new file mode 100644
index 00000000..c92bd41c
--- /dev/null
+++ b/patches/spirv/0008-Support-lowering-of-constant-expressions-vector.patch
@@ -0,0 +1,215 @@
+From a8176d6a4172c7233055cd1b62bb2e606e4edcd4 Mon Sep 17 00:00:00 2001
+From: Aleksandr Bezzubikov <aleksandr.bezzubikov@intel.com>
+Date: Sat, 18 Apr 2020 00:44:26 +0300
+Subject: [PATCH 08/10] Support lowering of constant expressions vector
+
+Since the translator has to lower LLVM constant expressions to be able to represent them in SPIR-V,
+we now need to support not only single constexprs but vectors of constexprs as well.
+---
+ lib/SPIRV/SPIRVLowerConstExpr.cpp |  64 +++++++++++------
+ test/constexpr_vector.ll          | 115 ++++++++++++++++++++++++++++++
+ 2 files changed, 157 insertions(+), 22 deletions(-)
+ create mode 100644 test/constexpr_vector.ll
+
+diff --git a/lib/SPIRV/SPIRVLowerConstExpr.cpp b/lib/SPIRV/SPIRVLowerConstExpr.cpp
+index 730f0d9..0c7d04d 100644
+--- a/lib/SPIRV/SPIRVLowerConstExpr.cpp
++++ b/lib/SPIRV/SPIRVLowerConstExpr.cpp
+@@ -125,31 +125,51 @@ void SPIRVLowerConstExpr::visit(Module *M) {
+     auto FBegin = I.begin();
+     while (!WorkList.empty()) {
+       auto II = WorkList.front();
+-      WorkList.pop_front();
+-      for (unsigned OI = 0, OE = II->getNumOperands(); OI != OE; ++OI) {
+-        auto Op = II->getOperand(OI);
+ 
+-        if (auto CE = dyn_cast<ConstantExpr>(Op)) {
+-          SPIRVDBG(dbgs() << "[lowerConstantExpressions] " << *CE;)
+-          auto ReplInst = CE->getAsInstruction();
+-          auto InsPoint = II->getParent() == &*FBegin ? II : &FBegin->back();
+-          ReplInst->insertBefore(InsPoint);
+-          SPIRVDBG(dbgs() << " -> " << *ReplInst << '\n';)
+-          WorkList.push_front(ReplInst);
+-          std::vector<Instruction *> Users;
+-          // Do not replace use during iteration of use. Do it in another loop
+-          for (auto U : CE->users()) {
+-            SPIRVDBG(dbgs()
+-                         << "[lowerConstantExpressions] Use: " << *U << '\n';)
+-            if (auto InstUser = dyn_cast<Instruction>(U)) {
+-              // Only replace users in scope of current function
+-              if (InstUser->getParent()->getParent() == &I)
+-                Users.push_back(InstUser);
+-            }
++      auto LowerOp = [&II, &FBegin, &I](ConstantExpr *CE) {
++        SPIRVDBG(dbgs() << "[lowerConstantExpressions] " << *CE;)
++        auto ReplInst = CE->getAsInstruction();
++        auto InsPoint = II->getParent() == &*FBegin ? II : &FBegin->back();
++        ReplInst->insertBefore(InsPoint);
++        SPIRVDBG(dbgs() << " -> " << *ReplInst << '\n';)
++        std::vector<Instruction *> Users;
++        // Do not replace use during iteration of use. Do it in another loop
++        for (auto U : CE->users()) {
++          SPIRVDBG(dbgs() << "[lowerConstantExpressions] Use: " << *U << '\n';)
++          if (auto InstUser = dyn_cast<Instruction>(U)) {
++            // Only replace users in scope of current function
++            if (InstUser->getParent()->getParent() == &I)
++              Users.push_back(InstUser);
+           }
+-          for (auto &User : Users)
+-            User->replaceUsesOfWith(CE, ReplInst);
+         }
++        for (auto &User : Users)
++          User->replaceUsesOfWith(CE, ReplInst);
++        return ReplInst;
++      };
++
++      WorkList.pop_front();
++      for (unsigned OI = 0, OE = II->getNumOperands(); OI != OE; ++OI) {
++        auto Op = II->getOperand(OI);
++        auto *Vec = dyn_cast<ConstantVector>(Op);
++        if (Vec && std::all_of(Vec->op_begin(), Vec->op_end(),
++                               [](Value *V) { return isa<ConstantExpr>(V); })) {
++          // Expand a vector of constexprs and construct it back with series of
++          // insertelement instructions
++          std::list<Instruction *> ReplList;
++          std::transform(
++              Vec->op_begin(), Vec->op_end(), std::back_inserter(ReplList),
++              [LowerOp](Value *V) { return LowerOp(cast<ConstantExpr>(V)); });
++          Value *Repl = nullptr;
++          unsigned Idx = 0;
++          for (auto V : ReplList)
++            Repl = InsertElementInst::Create(
++                (Repl ? Repl : UndefValue::get(Vec->getType())), V,
++                ConstantInt::get(Type::getInt32Ty(M->getContext()), Idx++), "",
++                II);
++          II->replaceUsesOfWith(Op, Repl);
++          WorkList.splice(WorkList.begin(), ReplList);
++        } else if (auto CE = dyn_cast<ConstantExpr>(Op))
++          WorkList.push_front(LowerOp(CE));
+       }
+     }
+   }
+diff --git a/test/constexpr_vector.ll b/test/constexpr_vector.ll
+new file mode 100644
+index 0000000..6df71e2
+--- /dev/null
++++ b/test/constexpr_vector.ll
+@@ -0,0 +1,115 @@
++; RUN: llvm-as < %s | llvm-spirv -spirv-mem2reg=false -s | llvm-dis | FileCheck %s --check-prefix=CHECK-LLVM
++
++; CHECK-LLVM: define dllexport void @vadd() {
++; CHECK-LLVM-NEXT: entry:
++; CHECK-LLVM-NEXT:   %Funcs = alloca <16 x i8>, align 16
++; CHECK-LLVM-NEXT:   %0 = ptrtoint i32 (i32)* @_Z2f1u2CMvb32_j to i64
++; CHECK-LLVM-NEXT:   %1 = bitcast i64 %0 to <8 x i8>
++; CHECK-LLVM-NEXT:   %2 = extractelement <8 x i8> %1, i32 0
++; CHECK-LLVM-NEXT:   %3 = extractelement <8 x i8> %1, i32 1
++; CHECK-LLVM-NEXT:   %4 = extractelement <8 x i8> %1, i32 2
++; CHECK-LLVM-NEXT:   %5 = extractelement <8 x i8> %1, i32 3
++; CHECK-LLVM-NEXT:   %6 = extractelement <8 x i8> %1, i32 4
++; CHECK-LLVM-NEXT:   %7 = extractelement <8 x i8> %1, i32 5
++; CHECK-LLVM-NEXT:   %8 = extractelement <8 x i8> %1, i32 6
++; CHECK-LLVM-NEXT:   %9 = extractelement <8 x i8> %1, i32 7
++; CHECK-LLVM-NEXT:   %10 = ptrtoint i32 (i32)* @_Z2f2u2CMvb32_j to i64
++; CHECK-LLVM-NEXT:   %11 = bitcast i64 %10 to <8 x i8>
++; CHECK-LLVM-NEXT:   %12 = extractelement <8 x i8> %11, i32 0
++; CHECK-LLVM-NEXT:   %13 = extractelement <8 x i8> %11, i32 1
++; CHECK-LLVM-NEXT:   %14 = extractelement <8 x i8> %11, i32 2
++; CHECK-LLVM-NEXT:   %15 = extractelement <8 x i8> %11, i32 3
++; CHECK-LLVM-NEXT:   %16 = extractelement <8 x i8> %11, i32 4
++; CHECK-LLVM-NEXT:   %17 = extractelement <8 x i8> %11, i32 5
++; CHECK-LLVM-NEXT:   %18 = extractelement <8 x i8> %11, i32 6
++; CHECK-LLVM-NEXT:   %19 = extractelement <8 x i8> %11, i32 7
++; CHECK-LLVM-NEXT:   %20 = insertelement <16 x i8> undef, i8 %2, i32 0
++; CHECK-LLVM-NEXT:   %21 = insertelement <16 x i8> %20, i8 %3, i32 1
++; CHECK-LLVM-NEXT:   %22 = insertelement <16 x i8> %21, i8 %4, i32 2
++; CHECK-LLVM-NEXT:   %23 = insertelement <16 x i8> %22, i8 %5, i32 3
++; CHECK-LLVM-NEXT:   %24 = insertelement <16 x i8> %23, i8 %6, i32 4
++; CHECK-LLVM-NEXT:   %25 = insertelement <16 x i8> %24, i8 %7, i32 5
++; CHECK-LLVM-NEXT:   %26 = insertelement <16 x i8> %25, i8 %8, i32 6
++; CHECK-LLVM-NEXT:   %27 = insertelement <16 x i8> %26, i8 %9, i32 7
++; CHECK-LLVM-NEXT:   %28 = insertelement <16 x i8> %27, i8 %12, i32 8
++; CHECK-LLVM-NEXT:   %29 = insertelement <16 x i8> %28, i8 %13, i32 9
++; CHECK-LLVM-NEXT:   %30 = insertelement <16 x i8> %29, i8 %14, i32 10
++; CHECK-LLVM-NEXT:   %31 = insertelement <16 x i8> %30, i8 %15, i32 11
++; CHECK-LLVM-NEXT:   %32 = insertelement <16 x i8> %31, i8 %16, i32 12
++; CHECK-LLVM-NEXT:   %33 = insertelement <16 x i8> %32, i8 %17, i32 13
++; CHECK-LLVM-NEXT:   %34 = insertelement <16 x i8> %33, i8 %18, i32 14
++; CHECK-LLVM-NEXT:   %35 = insertelement <16 x i8> %34, i8 %19, i32 15
++; CHECK-LLVM-NEXT:   store <16 x i8> %35, <16 x i8>* %Funcs, align 16
++; CHECK-LLVM-NEXT:   ret void
++; CHECK-LLVM-NEXT: }
++
++; RUN: llvm-as < %s | llvm-spirv -spirv-text -spirv-mem2reg=false | FileCheck %s --check-prefix=CHECK-SPIRV
++
++; CHECK-SPIRV: 4 Name [[Funcs:[0-9]+]] "Funcs"
++
++; CHECK-SPIRV-DAG: 4 TypeInt [[TypeInt8:[0-9]+]] 8 0
++; CHECK-SPIRV-DAG: 4 TypeInt [[TypeInt32:[0-9]+]] 32 0
++; CHECK-SPIRV-DAG: 4 TypeInt [[TypeInt64:[0-9]+]] 64 0
++; CHECK-SPIRV-DAG: 4 TypeVector [[TypeVec16:[0-9]+]] [[TypeInt8]] 16
++; CHECK-SPIRV-DAG: 4 TypeVector [[TypeVec8:[0-9]+]] [[TypeInt8]] 8
++; CHECK-SPIRV-DAG: 4 TypePointer [[StorePtr:[0-9]+]] 7 [[TypeVec16]]
++; CHECK-SPIRV-DAG: 3 Undef [[TypeVec16]] [[TypeUndef:[0-9]+]]
++
++; CHECK-SPIRV: 4 ConvertPtrToU [[TypeInt64]] [[Ptr1:[0-9]+]] {{[0-9]+}}
++; CHECK-SPIRV-NEXT: 4 Bitcast [[TypeVec8]] [[Vec1:[0-9]+]] [[Ptr1]]
++; CHECK-SPIRV-NEXT: 5 CompositeExtract [[TypeInt8]] [[v00:[0-9]+]] [[Vec1]] 0
++; CHECK-SPIRV-NEXT: 5 CompositeExtract [[TypeInt8]] [[v01:[0-9]+]] [[Vec1]] 1
++; CHECK-SPIRV-NEXT: 5 CompositeExtract [[TypeInt8]] [[v02:[0-9]+]] [[Vec1]] 2
++; CHECK-SPIRV-NEXT: 5 CompositeExtract [[TypeInt8]] [[v03:[0-9]+]] [[Vec1]] 3
++; CHECK-SPIRV-NEXT: 5 CompositeExtract [[TypeInt8]] [[v04:[0-9]+]] [[Vec1]] 4
++; CHECK-SPIRV-NEXT: 5 CompositeExtract [[TypeInt8]] [[v05:[0-9]+]] [[Vec1]] 5
++; CHECK-SPIRV-NEXT: 5 CompositeExtract [[TypeInt8]] [[v06:[0-9]+]] [[Vec1]] 6
++; CHECK-SPIRV-NEXT: 5 CompositeExtract [[TypeInt8]] [[v07:[0-9]+]] [[Vec1]] 7
++; CHECK-SPIRV-NEXT: 4 ConvertPtrToU [[TypeInt64]] [[Ptr2:[0-9]+]] {{[0-9]+}}
++; CHECK-SPIRV-NEXT: 4 Bitcast [[TypeVec8]] [[Vec2:[0-9]+]] [[Ptr2]]
++; CHECK-SPIRV-NEXT: 5 CompositeExtract [[TypeInt8]] [[v10:[0-9]+]] [[Vec2]] 0
++; CHECK-SPIRV-NEXT: 5 CompositeExtract [[TypeInt8]] [[v11:[0-9]+]] [[Vec2]] 1
++; CHECK-SPIRV-NEXT: 5 CompositeExtract [[TypeInt8]] [[v12:[0-9]+]] [[Vec2]] 2
++; CHECK-SPIRV-NEXT: 5 CompositeExtract [[TypeInt8]] [[v13:[0-9]+]] [[Vec2]] 3
++; CHECK-SPIRV-NEXT: 5 CompositeExtract [[TypeInt8]] [[v14:[0-9]+]] [[Vec2]] 4
++; CHECK-SPIRV-NEXT: 5 CompositeExtract [[TypeInt8]] [[v15:[0-9]+]] [[Vec2]] 5
++; CHECK-SPIRV-NEXT: 5 CompositeExtract [[TypeInt8]] [[v16:[0-9]+]] [[Vec2]] 6
++; CHECK-SPIRV-NEXT: 5 CompositeExtract [[TypeInt8]] [[v17:[0-9]+]] [[Vec2]] 7
++; CHECK-SPIRV-NEXT: 6 CompositeInsert [[TypeVec16]] [[NewVec0:[0-9]+]] [[v00]] [[TypeUndef]] 0
++; CHECK-SPIRV-NEXT: 6 CompositeInsert [[TypeVec16]] [[NewVec1:[0-9]+]] [[v01]] [[NewVec0]] 1
++; CHECK-SPIRV-NEXT: 6 CompositeInsert [[TypeVec16]] [[NewVec2:[0-9]+]] [[v02]] [[NewVec1]] 2
++; CHECK-SPIRV-NEXT: 6 CompositeInsert [[TypeVec16]] [[NewVec3:[0-9]+]] [[v03]] [[NewVec2]] 3
++; CHECK-SPIRV-NEXT: 6 CompositeInsert [[TypeVec16]] [[NewVec4:[0-9]+]] [[v04]] [[NewVec3]] 4
++; CHECK-SPIRV-NEXT: 6 CompositeInsert [[TypeVec16]] [[NewVec5:[0-9]+]] [[v05]] [[NewVec4]] 5
++; CHECK-SPIRV-NEXT: 6 CompositeInsert [[TypeVec16]] [[NewVec6:[0-9]+]] [[v06]] [[NewVec5]] 6
++; CHECK-SPIRV-NEXT: 6 CompositeInsert [[TypeVec16]] [[NewVec7:[0-9]+]] [[v07]] [[NewVec6]] 7
++; CHECK-SPIRV-NEXT: 6 CompositeInsert [[TypeVec16]] [[NewVec8:[0-9]+]] [[v10]] [[NewVec7]] 8
++; CHECK-SPIRV-NEXT: 6 CompositeInsert [[TypeVec16]] [[NewVec9:[0-9]+]] [[v11]] [[NewVec8]] 9
++; CHECK-SPIRV-NEXT: 6 CompositeInsert [[TypeVec16]] [[NewVec10:[0-9]+]] [[v12]] [[NewVec9]] 10
++; CHECK-SPIRV-NEXT: 6 CompositeInsert [[TypeVec16]] [[NewVec11:[0-9]+]] [[v13]] [[NewVec10]] 11
++; CHECK-SPIRV-NEXT: 6 CompositeInsert [[TypeVec16]] [[NewVec12:[0-9]+]] [[v14]] [[NewVec11]] 12
++; CHECK-SPIRV-NEXT: 6 CompositeInsert [[TypeVec16]] [[NewVec13:[0-9]+]] [[v15]] [[NewVec12]] 13
++; CHECK-SPIRV-NEXT: 6 CompositeInsert [[TypeVec16]] [[NewVec14:[0-9]+]] [[v16]] [[NewVec13]] 14
++; CHECK-SPIRV-NEXT: 6 CompositeInsert [[TypeVec16]] [[NewVec15:[0-9]+]] [[v17]] [[NewVec14]] 15
++; CHECK-SPIRV-NEXT: 5 Store [[Funcs]] [[NewVec15]] [[TypeInt32]] [[StorePtr]]
++
++target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
++target triple = "spir-unknown-unknown"
++
++; Function Attrs: noinline norecurse nounwind readnone
++define internal i32 @_Z2f1u2CMvb32_j(i32 %x) {
++entry:
++  ret i32 %x
++}
++; Function Attrs: noinline norecurse nounwind readnone
++define internal i32 @_Z2f2u2CMvb32_j(i32 %x) {
++entry:
++  ret i32 %x
++}
++; Function Attrs: noinline nounwind
++define dllexport void @vadd() {
++entry:
++  %Funcs = alloca <16 x i8>, align 16
++  store <16 x i8> <i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (i32 (i32)* @_Z2f1u2CMvb32_j to i64) to <8 x i8>), i32 0), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (i32 (i32)* @_Z2f1u2CMvb32_j to i64) to <8 x i8>), i32 1), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (i32 (i32)* @_Z2f1u2CMvb32_j to i64) to <8 x i8>), i32 2), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (i32 (i32)* @_Z2f1u2CMvb32_j to i64) to <8 x i8>), i32 3), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (i32 (i32)* @_Z2f1u2CMvb32_j to i64) to <8 x i8>), i32 4), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (i32 (i32)* @_Z2f1u2CMvb32_j to i64) to <8 x i8>), i32 5), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (i32 (i32)* @_Z2f1u2CMvb32_j to i64) to <8 x i8>), i32 6), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (i32 (i32)* @_Z2f1u2CMvb32_j to i64) to <8 x i8>), i32 7), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (i32 (i32)* @_Z2f2u2CMvb32_j to i64) to <8 x i8>), i32 0), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (i32 (i32)* @_Z2f2u2CMvb32_j to i64) to <8 x i8>), i32 1), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (i32 (i32)* @_Z2f2u2CMvb32_j to i64) to <8 x i8>), i32 2), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (i32 (i32)* @_Z2f2u2CMvb32_j to i64) to <8 x i8>), i32 3), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (i32 (i32)* @_Z2f2u2CMvb32_j to i64) to <8 x i8>), i32 4), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (i32 (i32)* @_Z2f2u2CMvb32_j to i64) to <8 x i8>), i32 5), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (i32 (i32)* @_Z2f2u2CMvb32_j to i64) to <8 x i8>), i32 6), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (i32 (i32)* @_Z2f2u2CMvb32_j to i64) to <8 x i8>), i32 7)>, <16 x i8>* %Funcs, align 16
++  ret void
++}
+-- 
+2.17.1
+
diff --git a/patches/spirv/0009-Support-function-pointers-in-cast-instructions.patch b/patches/spirv/0009-Support-function-pointers-in-cast-instructions.patch
new file mode 100644
index 00000000..0f8046d1
--- /dev/null
+++ b/patches/spirv/0009-Support-function-pointers-in-cast-instructions.patch
@@ -0,0 +1,98 @@
+From 4f791674ee90444075d1b05c1553da23536c902a Mon Sep 17 00:00:00 2001
+From: Aleksandr Bezzubikov <zuban32s@gmail.com>
+Date: Thu, 30 Apr 2020 22:19:20 +0300
+Subject: [PATCH 09/10] Support function pointers in cast instructions
+
+---
+ lib/SPIRV/SPIRVWriter.cpp           | 12 +++++++++++-
+ test/DebugInfo/DebugValueInvalid.ll |  6 ------
+ test/constexpr_vector.ll            | 10 +++++++---
+ 3 files changed, 18 insertions(+), 10 deletions(-)
+
+diff --git a/lib/SPIRV/SPIRVWriter.cpp b/lib/SPIRV/SPIRVWriter.cpp
+index df6f070..7fccf34 100644
+--- a/lib/SPIRV/SPIRVWriter.cpp
++++ b/lib/SPIRV/SPIRVWriter.cpp
+@@ -684,6 +684,7 @@ SPIRVInstruction *LLVMToSPIRV::transCmpInst(CmpInst *Cmp, SPIRVBasicBlock *BB) {
+ SPIRV::SPIRVInstruction *LLVMToSPIRV::transUnaryInst(UnaryInstruction *U,
+                                                      SPIRVBasicBlock *BB) {
+   Op BOC = OpNop;
++  SPIRVValue *Op = nullptr;
+   if (auto Cast = dyn_cast<AddrSpaceCastInst>(U)) {
+     if (Cast->getDestTy()->getPointerAddressSpace() == SPIRAS_Generic) {
+       assert(Cast->getSrcTy()->getPointerAddressSpace() != SPIRAS_Constant &&
+@@ -698,9 +699,18 @@ SPIRV::SPIRVInstruction *LLVMToSPIRV::transUnaryInst(UnaryInstruction *U,
+   } else {
+     auto OpCode = U->getOpcode();
+     BOC = OpCodeMap::map(OpCode);
++
++    if (Function *F = dyn_cast<Function>(U->getOperand(0))) {
++      assert(BOC == OpConvertPtrToU &&
++             "Illegal unary operation on function pointer");
++      Op = BM->addFunctionPointerINTELInst(
++          transType(F->getType()),
++          static_cast<SPIRVFunction *>(transValue(F, BB)), BB);
++    }
+   }
+ 
+-  auto Op = transValue(U->getOperand(0), BB);
++  if (!Op)
++    Op = transValue(U->getOperand(0), BB);
+   return BM->addUnaryInst(transBoolOpCode(Op, BOC), transType(U->getType()), Op,
+                           BB);
+ }
+diff --git a/test/DebugInfo/DebugValueInvalid.ll b/test/DebugInfo/DebugValueInvalid.ll
+index d58eece..01dd4d1 100644
+--- a/test/DebugInfo/DebugValueInvalid.ll
++++ b/test/DebugInfo/DebugValueInvalid.ll
+@@ -54,12 +54,6 @@ entry:
+   %call = call spir_func %opencl.queue_t* @_Z17get_default_queuev() #6, !dbg !54
+ ; CHECK: InBoundsPtrAccessChain
+ ; CHECK: Store
+-; SPIRVLowerOCLBlocks pass has removed bitcast and addrspacecast, because their operands are function pointers.
+-; CHECK-NOT: BitCast
+-; CHECK-NOT: PtrCastToGeneric
+-; The result of these casts was used by llvm.dbg.value. This intrinsic becomes invalid,
+-; so we do not translate it and its debug location
+-; CHECK-NOT: DebugValue
+ ; CHECK: GetDefaultQueue
+ 
+   %3 = bitcast %struct.ndrange_t* %tmp to i8*, !dbg !55
+diff --git a/test/constexpr_vector.ll b/test/constexpr_vector.ll
+index 6df71e2..d441429 100644
+--- a/test/constexpr_vector.ll
++++ b/test/constexpr_vector.ll
+@@ -45,7 +45,9 @@
+ 
+ ; RUN: llvm-as < %s | llvm-spirv -spirv-text -spirv-mem2reg=false | FileCheck %s --check-prefix=CHECK-SPIRV
+ 
+-; CHECK-SPIRV: 4 Name [[Funcs:[0-9]+]] "Funcs"
++; CHECK-SPIRV-DAG: 4 Name [[Funcs:[0-9]+]] "Funcs"
++; CHECK-SPIRV-DAG: 6 Name [[F1:[0-9+]]] "_Z2f1u2CMvb32_j"
++; CHECK-SPIRV-DAG: 6 Name [[F2:[0-9+]]] "_Z2f2u2CMvb32_j"
+ 
+ ; CHECK-SPIRV-DAG: 4 TypeInt [[TypeInt8:[0-9]+]] 8 0
+ ; CHECK-SPIRV-DAG: 4 TypeInt [[TypeInt32:[0-9]+]] 32 0
+@@ -55,7 +57,8 @@
+ ; CHECK-SPIRV-DAG: 4 TypePointer [[StorePtr:[0-9]+]] 7 [[TypeVec16]]
+ ; CHECK-SPIRV-DAG: 3 Undef [[TypeVec16]] [[TypeUndef:[0-9]+]]
+ 
+-; CHECK-SPIRV: 4 ConvertPtrToU [[TypeInt64]] [[Ptr1:[0-9]+]] {{[0-9]+}}
++; CHECK-SPIRV: 4 FunctionPointerINTEL [[FuncPtrTy:[0-9]+]] [[F1Ptr:[0-9]+]] [[F1]]
++; CHECK-SPIRV-NEXT: 4 ConvertPtrToU [[TypeInt64]] [[Ptr1:[0-9]+]] [[F1Ptr]]
+ ; CHECK-SPIRV-NEXT: 4 Bitcast [[TypeVec8]] [[Vec1:[0-9]+]] [[Ptr1]]
+ ; CHECK-SPIRV-NEXT: 5 CompositeExtract [[TypeInt8]] [[v00:[0-9]+]] [[Vec1]] 0
+ ; CHECK-SPIRV-NEXT: 5 CompositeExtract [[TypeInt8]] [[v01:[0-9]+]] [[Vec1]] 1
+@@ -65,7 +68,8 @@
+ ; CHECK-SPIRV-NEXT: 5 CompositeExtract [[TypeInt8]] [[v05:[0-9]+]] [[Vec1]] 5
+ ; CHECK-SPIRV-NEXT: 5 CompositeExtract [[TypeInt8]] [[v06:[0-9]+]] [[Vec1]] 6
+ ; CHECK-SPIRV-NEXT: 5 CompositeExtract [[TypeInt8]] [[v07:[0-9]+]] [[Vec1]] 7
+-; CHECK-SPIRV-NEXT: 4 ConvertPtrToU [[TypeInt64]] [[Ptr2:[0-9]+]] {{[0-9]+}}
++; CHECK-SPIRV-NEXT: 4 FunctionPointerINTEL [[FuncPtrTy]] [[F2Ptr:[0-9]+]] [[F2]]
++; CHECK-SPIRV-NEXT: 4 ConvertPtrToU [[TypeInt64]] [[Ptr2:[0-9]+]] [[F2Ptr]]
+ ; CHECK-SPIRV-NEXT: 4 Bitcast [[TypeVec8]] [[Vec2:[0-9]+]] [[Ptr2]]
+ ; CHECK-SPIRV-NEXT: 5 CompositeExtract [[TypeInt8]] [[v10:[0-9]+]] [[Vec2]] 0
+ ; CHECK-SPIRV-NEXT: 5 CompositeExtract [[TypeInt8]] [[v11:[0-9]+]] [[Vec2]] 1
+-- 
+2.17.1
+
diff --git a/patches/spirv/0010-Improve-constant-expressions-lowering-for-function-p.patch b/patches/spirv/0010-Improve-constant-expressions-lowering-for-function-p.patch
new file mode 100644
index 00000000..48c38403
--- /dev/null
+++ b/patches/spirv/0010-Improve-constant-expressions-lowering-for-function-p.patch
@@ -0,0 +1,192 @@
+From d13e7857c08beee2143f35b4222e8f5a9f78570e Mon Sep 17 00:00:00 2001
+From: Aleksandr Bezzubikov <zuban32s@gmail.com>
+Date: Wed, 20 May 2020 01:16:22 +0300
+Subject: [PATCH] Improve constant expressions lowering for function pointers.
+
+Extend constexprs lowering support to lower constant vector of pure function pointers
+w/o any transformations inside.
+---
+ lib/SPIRV/SPIRVLowerConstExpr.cpp             | 27 ++++++++-----
+ lib/SPIRV/SPIRVWriter.cpp                     | 14 +++++--
+ test/constexpr_vector.ll                      | 13 ++++--
+ .../vector_elem.ll                            | 40 +++++++++++++++++++
+ 4 files changed, 76 insertions(+), 18 deletions(-)
+ create mode 100644 test/transcoding/SPV_INTEL_function_pointers/vector_elem.ll
+
+diff --git a/lib/SPIRV/SPIRVLowerConstExpr.cpp b/lib/SPIRV/SPIRVLowerConstExpr.cpp
+index 0c7d04d..74d73fd 100644
+--- a/lib/SPIRV/SPIRVLowerConstExpr.cpp
++++ b/lib/SPIRV/SPIRVLowerConstExpr.cpp
+@@ -115,7 +115,6 @@ bool SPIRVLowerConstExpr::runOnModule(Module &Module) {
+ 
+ void SPIRVLowerConstExpr::visit(Module *M) {
+   for (auto &I : M->functions()) {
+-    std::map<ConstantExpr *, Instruction *> CMap;
+     std::list<Instruction *> WorkList;
+     for (auto &BI : I) {
+       for (auto &II : BI) {
+@@ -126,7 +125,10 @@ void SPIRVLowerConstExpr::visit(Module *M) {
+     while (!WorkList.empty()) {
+       auto II = WorkList.front();
+ 
+-      auto LowerOp = [&II, &FBegin, &I](ConstantExpr *CE) {
++      auto LowerOp = [&II, &FBegin, &I](Value *V) -> Value * {
++        if (isa<Function>(V))
++          return V;
++        auto *CE = cast<ConstantExpr>(V);
+         SPIRVDBG(dbgs() << "[lowerConstantExpressions] " << *CE;)
+         auto ReplInst = CE->getAsInstruction();
+         auto InsPoint = II->getParent() == &*FBegin ? II : &FBegin->back();
+@@ -151,25 +153,30 @@ void SPIRVLowerConstExpr::visit(Module *M) {
+       for (unsigned OI = 0, OE = II->getNumOperands(); OI != OE; ++OI) {
+         auto Op = II->getOperand(OI);
+         auto *Vec = dyn_cast<ConstantVector>(Op);
+-        if (Vec && std::all_of(Vec->op_begin(), Vec->op_end(),
+-                               [](Value *V) { return isa<ConstantExpr>(V); })) {
++        if (Vec && std::all_of(Vec->op_begin(), Vec->op_end(), [](Value *V) {
++              return isa<ConstantExpr>(V) || isa<Function>(V);
++            })) {
+           // Expand a vector of constexprs and construct it back with series of
+           // insertelement instructions
+-          std::list<Instruction *> ReplList;
+-          std::transform(
+-              Vec->op_begin(), Vec->op_end(), std::back_inserter(ReplList),
+-              [LowerOp](Value *V) { return LowerOp(cast<ConstantExpr>(V)); });
++          std::list<Value *> OpList;
++          std::transform(Vec->op_begin(), Vec->op_end(),
++                         std::back_inserter(OpList),
++                         [LowerOp](Value *V) { return LowerOp(V); });
+           Value *Repl = nullptr;
+           unsigned Idx = 0;
+-          for (auto V : ReplList)
++          std::list<Instruction *> ReplList;
++          for (auto V : OpList) {
++            if (auto *Inst = dyn_cast<Instruction>(V))
++              ReplList.push_back(Inst);
+             Repl = InsertElementInst::Create(
+                 (Repl ? Repl : UndefValue::get(Vec->getType())), V,
+                 ConstantInt::get(Type::getInt32Ty(M->getContext()), Idx++), "",
+                 II);
++          }
+           II->replaceUsesOfWith(Op, Repl);
+           WorkList.splice(WorkList.begin(), ReplList);
+         } else if (auto CE = dyn_cast<ConstantExpr>(Op))
+-          WorkList.push_front(LowerOp(CE));
++          WorkList.push_front(cast<Instruction>(LowerOp(CE)));
+       }
+     }
+   }
+diff --git a/lib/SPIRV/SPIRVWriter.cpp b/lib/SPIRV/SPIRVWriter.cpp
+index 7fccf34..8a0ca47 100644
+--- a/lib/SPIRV/SPIRVWriter.cpp
++++ b/lib/SPIRV/SPIRVWriter.cpp
+@@ -997,13 +997,19 @@ SPIRVValue *LLVMToSPIRV::transValueWithoutDecoration(Value *V,
+ 
+   if (auto Ins = dyn_cast<InsertElementInst>(V)) {
+     auto Index = Ins->getOperand(2);
+-    if (auto Const = dyn_cast<ConstantInt>(Index))
++    if (auto Const = dyn_cast<ConstantInt>(Index)) {
++      SPIRVValue *InsVal = nullptr;
++      if (auto *F = dyn_cast<Function>(Ins->getOperand(1))) {
++        InsVal = BM->addFunctionPointerINTELInst(
++            transType(F->getType()),
++            static_cast<SPIRVFunction *>(transValue(F, BB)), BB);
++      } else
++        InsVal = transValue(Ins->getOperand(1), BB);
+       return mapValue(V, BM->addCompositeInsertInst(
+-                             transValue(Ins->getOperand(1), BB),
+-                             transValue(Ins->getOperand(0), BB),
++                             InsVal, transValue(Ins->getOperand(0), BB),
+                              std::vector<SPIRVWord>(1, Const->getZExtValue()),
+                              BB));
+-    else
++    } else
+       return mapValue(
+           V, BM->addVectorInsertDynamicInst(transValue(Ins->getOperand(0), BB),
+                                             transValue(Ins->getOperand(1), BB),
+diff --git a/test/constexpr_vector.ll b/test/constexpr_vector.ll
+index d441429..d74b334 100644
+--- a/test/constexpr_vector.ll
++++ b/test/constexpr_vector.ll
+@@ -1,8 +1,7 @@
+ ; RUN: llvm-as < %s | llvm-spirv -spirv-mem2reg=false -s | llvm-dis | FileCheck %s --check-prefix=CHECK-LLVM
+ 
+ ; CHECK-LLVM: define dllexport void @vadd() {
+-; CHECK-LLVM-NEXT: entry:
+-; CHECK-LLVM-NEXT:   %Funcs = alloca <16 x i8>, align 16
++; CHECK-LLVM:   %Funcs = alloca <16 x i8>, align 16
+ ; CHECK-LLVM-NEXT:   %0 = ptrtoint i32 (i32)* @_Z2f1u2CMvb32_j to i64
+ ; CHECK-LLVM-NEXT:   %1 = bitcast i64 %0 to <8 x i8>
+ ; CHECK-LLVM-NEXT:   %2 = extractelement <8 x i8> %1, i32 0
+@@ -40,8 +39,12 @@
+ ; CHECK-LLVM-NEXT:   %34 = insertelement <16 x i8> %33, i8 %18, i32 14
+ ; CHECK-LLVM-NEXT:   %35 = insertelement <16 x i8> %34, i8 %19, i32 15
+ ; CHECK-LLVM-NEXT:   store <16 x i8> %35, <16 x i8>* %Funcs, align 16
+-; CHECK-LLVM-NEXT:   ret void
+-; CHECK-LLVM-NEXT: }
++; CHECK-LLVM:        %Funcs1 = alloca <2 x i64>, align 16
++; CHECK-LLVM-NEXT:   %36 = ptrtoint i32 (i32)* @_Z2f1u2CMvb32_j to i64
++; CHECK-LLVM-NEXT:   %37 = ptrtoint i32 (i32)* @_Z2f2u2CMvb32_j to i64
++; CHECK-LLVM-NEXT:   %38 = insertelement <2 x i64> undef, i64 %36, i32 0
++; CHECK-LLVM-NEXT:   %39 = insertelement <2 x i64> %38, i64 %37, i32 1
++; CHECK-LLVM-NEXT:   store <2 x i64> %39, <2 x i64>* %Funcs1, align 16
+ 
+ ; RUN: llvm-as < %s | llvm-spirv -spirv-text -spirv-mem2reg=false | FileCheck %s --check-prefix=CHECK-SPIRV
+ 
+@@ -115,5 +118,7 @@ define dllexport void @vadd() {
+ entry:
+   %Funcs = alloca <16 x i8>, align 16
+   store <16 x i8> <i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (i32 (i32)* @_Z2f1u2CMvb32_j to i64) to <8 x i8>), i32 0), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (i32 (i32)* @_Z2f1u2CMvb32_j to i64) to <8 x i8>), i32 1), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (i32 (i32)* @_Z2f1u2CMvb32_j to i64) to <8 x i8>), i32 2), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (i32 (i32)* @_Z2f1u2CMvb32_j to i64) to <8 x i8>), i32 3), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (i32 (i32)* @_Z2f1u2CMvb32_j to i64) to <8 x i8>), i32 4), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (i32 (i32)* @_Z2f1u2CMvb32_j to i64) to <8 x i8>), i32 5), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (i32 (i32)* @_Z2f1u2CMvb32_j to i64) to <8 x i8>), i32 6), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (i32 (i32)* @_Z2f1u2CMvb32_j to i64) to <8 x i8>), i32 7), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (i32 (i32)* @_Z2f2u2CMvb32_j to i64) to <8 x i8>), i32 0), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (i32 (i32)* @_Z2f2u2CMvb32_j to i64) to <8 x i8>), i32 1), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (i32 (i32)* @_Z2f2u2CMvb32_j to i64) to <8 x i8>), i32 2), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (i32 (i32)* @_Z2f2u2CMvb32_j to i64) to <8 x i8>), i32 3), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (i32 (i32)* @_Z2f2u2CMvb32_j to i64) to <8 x i8>), i32 4), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (i32 (i32)* @_Z2f2u2CMvb32_j to i64) to <8 x i8>), i32 5), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (i32 (i32)* @_Z2f2u2CMvb32_j to i64) to <8 x i8>), i32 6), i8 extractelement (<8 x i8> bitcast (i64 ptrtoint (i32 (i32)* @_Z2f2u2CMvb32_j to i64) to <8 x i8>), i32 7)>, <16 x i8>* %Funcs, align 16
++  %Funcs1 = alloca <2 x i64>, align 16
++  store <2 x i64> <i64 ptrtoint (i32 (i32)* @_Z2f1u2CMvb32_j to i64), i64 ptrtoint (i32 (i32)* @_Z2f2u2CMvb32_j to i64)>, <2 x i64>* %Funcs1, align 16
+   ret void
+ }
+diff --git a/test/transcoding/SPV_INTEL_function_pointers/vector_elem.ll b/test/transcoding/SPV_INTEL_function_pointers/vector_elem.ll
+new file mode 100644
+index 0000000..85fa224
+--- /dev/null
++++ b/test/transcoding/SPV_INTEL_function_pointers/vector_elem.ll
+@@ -0,0 +1,40 @@
++; RUN: llvm-as < %s | llvm-spirv -spirv-text | FileCheck %s --check-prefix=CHECK-SPIRV
++
++; CHECK-SPIRV-DAG: 6 Name [[F1:[0-9+]]] "_Z2f1u2CMvb32_j"
++; CHECK-SPIRV-DAG: 6 Name [[F2:[0-9+]]] "_Z2f2u2CMvb32_j"
++
++; CHECK-SPIRV: 4 TypeInt [[TypeInt32:[0-9]+]] 32 0
++; CHECK-SPIRV: 4 TypeFunction [[TypeFunc:[0-9]+]] [[TypeInt32]] [[TypeInt32]]
++; CHECK-SPIRV: 4 TypePointer [[TypePtr:[0-9]+]] {{[0-9]+}} [[TypeFunc]]
++; CHECK-SPIRV: 4 TypeVector [[TypeVec:[0-9]+]] [[TypePtr]] [[TypeInt32]]
++; CHECK-SPIRV: 3 Undef [[TypeVec]] [[TypeUndef:[0-9]+]]
++
++; CHECK-SPIRV: 4 FunctionPointerINTEL [[TypePtr]] [[F1Ptr:[0-9]+]] [[F1]]
++; CHECK-SPIRV: 6 CompositeInsert [[TypeVec]] [[NewVec0:[0-9]+]] [[F1Ptr]] [[TypeUndef]] 0
++; CHECK-SPIRV: 4 FunctionPointerINTEL [[TypePtr]] [[F2Ptr:[0-9]+]] [[F2]]
++; CHECK-SPIRV: 6 CompositeInsert [[TypeVec]] [[NewVec1:[0-9]+]] [[F2Ptr]] [[NewVec0]] 1
++
++
++target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
++target triple = "spir-unknown-unknown"
++
++; Function Attrs: noinline norecurse nounwind readnone
++define internal i32 @_Z2f1u2CMvb32_j(i32 %x) {
++entry:
++  ret i32 %x
++}
++; Function Attrs: noinline norecurse nounwind readnone
++define internal i32 @_Z2f2u2CMvb32_j(i32 %x) {
++entry:
++  ret i32 %x
++}
++
++; Function Attrs: noinline nounwind
++define dllexport void @vadd() {
++entry:
++  %Funcs = alloca <2 x i32 (i32)*>, align 16
++  %0 = insertelement <2 x i32 (i32)*> undef, i32 (i32)* @_Z2f1u2CMvb32_j, i32 0
++  %1 = insertelement <2 x i32 (i32)*> %0, i32 (i32)* @_Z2f2u2CMvb32_j, i32 1
++  store <2 x i32 (i32)*> %1, <2 x i32 (i32)*>* %Funcs, align 16
++  ret void
++}
+-- 
+2.17.1
+