From 25931d6471890692ceb384cd5489d2319015f2c1 Mon Sep 17 00:00:00 2001 From: Memo Akten Date: Wed, 17 May 2017 22:23:44 +0100 Subject: [PATCH] update headers to tensorflow r1.1 --- libs/tensorflow/include/Eigen/PardisoSupport | 0 .../include/Eigen/src/Cholesky/LDLT.h | 1 - .../include/Eigen/src/Cholesky/LLT.h | 1 - .../include/Eigen/src/Core/ArrayBase.h | 8 +- .../include/Eigen/src/Core/Assign.h | 2 +- .../include/Eigen/src/Core/Assign_MKL.h | 0 .../include/Eigen/src/Core/BooleanRedux.h | 6 +- .../include/Eigen/src/Core/CommaInitializer.h | 4 +- .../include/Eigen/src/Core/CoreEvaluators.h | 8 +- .../include/Eigen/src/Core/CwiseBinaryOp.h | 5 +- .../include/Eigen/src/Core/CwiseNullaryOp.h | 84 +- .../include/Eigen/src/Core/DenseBase.h | 8 +- .../include/Eigen/src/Core/DenseStorage.h | 29 +- .../include/Eigen/src/Core/Diagonal.h | 10 +- .../include/Eigen/src/Core/DiagonalMatrix.h | 4 +- .../include/Eigen/src/Core/DiagonalProduct.h | 2 +- libs/tensorflow/include/Eigen/src/Core/Dot.h | 16 +- .../include/Eigen/src/Core/EigenBase.h | 3 + .../tensorflow/include/Eigen/src/Core/Fuzzy.h | 6 +- .../include/Eigen/src/Core/GeneralProduct.h | 2 +- .../Eigen/src/Core/GenericPacketMath.h | 6 +- .../include/Eigen/src/Core/MatrixBase.h | 4 +- .../include/Eigen/src/Core/NestByValue.h | 10 +- .../include/Eigen/src/Core/PlainObjectBase.h | 10 +- .../include/Eigen/src/Core/Random.h | 2 +- .../tensorflow/include/Eigen/src/Core/Redux.h | 14 +- .../include/Eigen/src/Core/Replicate.h | 4 +- .../include/Eigen/src/Core/ReturnByValue.h | 2 +- .../include/Eigen/src/Core/Reverse.h | 6 +- .../include/Eigen/src/Core/SelfAdjointView.h | 4 +- .../Eigen/src/Core/SelfCwiseBinaryOp.h | 8 +- .../include/Eigen/src/Core/SolveTriangular.h | 2 +- .../include/Eigen/src/Core/StableNorm.h | 3 +- .../include/Eigen/src/Core/Transpose.h | 10 +- .../include/Eigen/src/Core/TriangularMatrix.h | 15 +- .../include/Eigen/src/Core/VectorwiseOp.h | 4 +- .../Eigen/src/Core/arch/AltiVec/PacketMath.h | 0 
.../include/Eigen/src/Core/arch/CUDA/Half.h | 2 +- .../Eigen/src/Core/arch/CUDA/PacketMath.h | 6 +- .../Eigen/src/Core/arch/NEON/PacketMath.h | 52 +- .../Eigen/src/Core/arch/SSE/PacketMath.h | 0 .../Eigen/src/Core/arch/ZVector/PacketMath.h | 0 .../products/GeneralMatrixMatrixTriangular.h | 8 +- .../src/Core/products/SelfadjointProduct.h | 2 +- .../Core/products/SelfadjointRank2Update.h | 2 +- .../Core/products/TriangularMatrixMatrix.h | 4 +- .../include/Eigen/src/Core/util/BlasUtil.h | 0 .../src/Core/util/DisableStupidWarnings.h | 0 .../Eigen/src/Core/util/IntegralConstant.h | 4 +- .../include/Eigen/src/Core/util/MKL_support.h | 0 .../include/Eigen/src/Core/util/Macros.h | 6 +- .../include/Eigen/src/Core/util/Meta.h | 0 .../src/Eigenvalues/MatrixBaseEigenvalues.h | 4 +- .../src/Eigenvalues/SelfAdjointEigenSolver.h | 3 +- .../include/Eigen/src/Geometry/Scaling.h | 0 .../Eigen/src/Householder/BlockHouseholder.h | 3 +- .../include/Eigen/src/LU/FullPivLU.h | 2 - .../Eigen/src/QR/ColPivHouseholderQR.h | 1 - .../src/QR/CompleteOrthogonalDecomposition.h | 2 +- .../Eigen/src/QR/FullPivHouseholderQR.h | 9 +- .../include/Eigen/src/QR/HouseholderQR.h | 9 +- .../include/Eigen/src/SVD/JacobiSVD.h | 6 +- .../include/Eigen/src/SVD/SVDBase.h | 1 - .../include/Eigen/src/misc/lapacke.h | 9 +- libs/tensorflow/include/tensorflow/cc/BUILD | 11 + .../tensorflow/cc/client/client_session.h | 20 +- .../include/tensorflow/cc/framework/scope.h | 91 +- .../include/tensorflow/cc/gradients/README.md | 52 ++ .../include/tensorflow/cc/ops/array_ops.h | 363 ++------ .../tensorflow/cc/ops/array_ops_internal.h | 0 .../cc/ops/candidate_sampling_ops_internal.h | 0 .../cc/ops/control_flow_ops_internal.h | 0 .../include/tensorflow/cc/ops/data_flow_ops.h | 25 +- .../cc/ops/data_flow_ops_internal.h | 0 .../include/tensorflow/cc/ops/image_ops.h | 2 +- .../tensorflow/cc/ops/image_ops_internal.h | 0 .../tensorflow/cc/ops/io_ops_internal.h | 0 .../tensorflow/cc/ops/linalg_ops_internal.h | 0 
.../tensorflow/cc/ops/logging_ops_internal.h | 0 .../include/tensorflow/cc/ops/math_ops.h | 283 +++--- .../tensorflow/cc/ops/math_ops_internal.h | 0 .../include/tensorflow/cc/ops/nn_ops.h | 147 ++- .../tensorflow/cc/ops/nn_ops_internal.h | 0 .../tensorflow/cc/ops/no_op_internal.h | 0 .../tensorflow/cc/ops/parsing_ops_internal.h | 0 .../tensorflow/cc/ops/random_ops_internal.h | 0 .../cc/ops/remote_fused_graph_ops.h | 48 - .../cc/ops/remote_fused_graph_ops_internal.h | 28 - .../tensorflow/cc/ops/sparse_ops_internal.h | 0 .../tensorflow/cc/ops/state_ops_internal.h | 0 .../include/tensorflow/cc/ops/string_ops.h | 2 +- .../tensorflow/cc/ops/string_ops_internal.h | 0 .../tensorflow/cc/ops/training_ops_internal.h | 0 .../tensorflow/cc/ops/user_ops_internal.h | 0 .../include/tensorflow/cc/saved_model/BUILD | 1 + .../tensorflow/cc/saved_model/constants.h | 3 + .../variables/variables.data-00000-of-00001 | Bin .../00000123/variables/variables.index | Bin .../variables/variables.data-00000-of-00001 | Bin 0 -> 12 bytes .../00000123/variables/variables.index | Bin 0 -> 151 bytes .../00000123/saved_model.pbtxt | 40 +- .../variables/variables.data-00000-of-00001 | Bin .../00000123/variables/variables.index | Bin .../tensorflow/cc/training/coordinator.h | 16 +- .../tensorflow/cc/training/queue_runner.h | 21 +- libs/tensorflow/include/tensorflow/core/BUILD | 192 +++- .../core/common_runtime/direct_session.h | 3 + .../core/common_runtime/gpu/gpu_device.h | 2 + .../core/common_runtime/mkl_cpu_allocator.h | 120 +++ .../common_runtime/optimization_registry.h | 7 + .../core/common_runtime/session_factory.h | 23 +- .../core/common_runtime/shape_refiner.h | 5 +- .../include/tensorflow/core/debug/BUILD | 29 +- .../tensorflow/core/debug/debug_graph_utils.h | 21 + .../tensorflow/core/debug/debug_io_utils.h | 12 +- .../core/debug/debug_service.grpc.pb.h | 142 --- .../core/distributed_runtime/graph_mgr.h | 4 + .../core/distributed_runtime/local_master.h | 1 + 
.../core/distributed_runtime/master.h | 1 + .../core/distributed_runtime/master_env.h | 9 +- .../core/distributed_runtime/master_session.h | 10 +- .../core/distributed_runtime/rpc/BUILD | 1 + .../distributed_runtime/rpc/grpc_channel.h | 2 +- .../core/distributed_runtime/worker.h | 11 +- .../core/distributed_runtime/worker_cache.h | 2 +- .../core/example/example.pb_text-impl.h | 0 .../tensorflow/core/example/example.pb_text.h | 0 .../example/example_parser_configuration.pb.h | 0 .../tensorflow/core/example/feature.pb.h | 18 + .../core/example/feature.pb_text-impl.h | 0 .../tensorflow/core/example/feature.pb_text.h | 0 .../allocation_description.pb_text-impl.h | 0 .../allocation_description.pb_text.h | 0 .../tensorflow/core/framework/allocator.h | 5 + .../core/framework/allocator_registry.h | 77 ++ .../tensorflow/core/framework/attr_value.pb.h | 18 + .../core/framework/attr_value.pb_text-impl.h | 0 .../core/framework/attr_value.pb_text.h | 0 .../tensorflow/core/framework/cost_graph.pb.h | 164 ++-- .../core/framework/cost_graph.pb_text-impl.h | 0 .../core/framework/cost_graph.pb_text.h | 0 .../device_attributes.pb_text-impl.h | 0 .../framework/device_attributes.pb_text.h | 0 .../tensorflow/core/framework/device_base.h | 1 + .../tensorflow/core/framework/function.h | 12 + .../core/framework/function.pb_text-impl.h | 0 .../core/framework/function.pb_text.h | 0 .../core/framework/graph.pb_text-impl.h | 0 .../tensorflow/core/framework/graph.pb_text.h | 0 .../core/framework/graph_transfer_info.pb.h | 64 ++ .../graph_transfer_info.pb_text-impl.h | 2 + .../framework/graph_transfer_info.pb_text.h | 0 .../tensorflow/core/framework/kernel_def.pb.h | 18 + .../core/framework/kernel_def.pb_text-impl.h | 0 .../core/framework/kernel_def.pb_text.h | 0 .../tensorflow/core/framework/log_memory.pb.h | 0 .../core/framework/log_memory.pb_text-impl.h | 0 .../core/framework/log_memory.pb_text.h | 0 .../tensorflow/core/framework/node_def.pb.h | 18 + .../core/framework/node_def.pb_text-impl.h 
| 0 .../core/framework/node_def.pb_text.h | 0 .../tensorflow/core/framework/numeric_op.h | 11 +- .../core/framework/op_def.pb_text-impl.h | 0 .../core/framework/op_def.pb_text.h | 0 .../tensorflow/core/framework/op_kernel.h | 198 +++- .../core/framework/partial_tensor_shape.h | 2 +- .../core/{kernels => framework}/reader_base.h | 18 +- .../{kernels => framework}/reader_base.pb.h | 14 +- .../remote_fused_graph_execute_info.pb.h | 833 +++++++++++++++++ ...te_fused_graph_execute_info.pb_text-impl.h | 44 + .../remote_fused_graph_execute_info.pb_text.h | 34 + .../core/framework/resource_handle.pb.h | 0 .../framework/resource_handle.pb_text-impl.h | 0 .../core/framework/resource_handle.pb_text.h | 0 .../tensorflow/core/framework/resource_mgr.h | 3 + .../tensorflow/core/framework/session_state.h | 2 + .../core/framework/shape_inference.h | 32 +- .../core/framework/shape_inference_testutil.h | 2 + .../tensorflow/core/framework/step_stats.pb.h | 340 +++++++ .../core/framework/step_stats.pb_text-impl.h | 7 + .../core/framework/step_stats.pb_text.h | 10 + .../core/framework/summary.pb_text-impl.h | 0 .../core/framework/summary.pb_text.h | 0 .../tensorflow/core/framework/tensor.h | 7 + .../tensorflow/core/framework/tensor.pb.h | 18 + .../core/framework/tensor.pb_text-impl.h | 0 .../core/framework/tensor.pb_text.h | 0 .../tensor_description.pb_text-impl.h | 0 .../framework/tensor_description.pb_text.h | 0 .../framework/tensor_shape.pb_text-impl.h | 0 .../core/framework/tensor_shape.pb_text.h | 0 .../framework/tensor_slice.pb_text-impl.h | 0 .../core/framework/tensor_slice.pb_text.h | 0 .../tensorflow/core/framework/tensor_util.h | 7 +- .../tensorflow/core/framework/type_index.h | 4 +- .../include/tensorflow/core/framework/types.h | 6 +- .../core/framework/types.pb_text-impl.h | 0 .../tensorflow/core/framework/types.pb_text.h | 0 .../core/framework/variable.pb_text-impl.h | 32 + .../core/framework/variable.pb_text.h | 34 + .../core/framework/versions.pb_text-impl.h | 0 
.../core/framework/versions.pb_text.h | 0 .../include/tensorflow/core/graph/costmodel.h | 53 +- .../include/tensorflow/core/graph/graph.h | 22 +- .../tensorflow/core/graph/graph_constructor.h | 2 + .../tensorflow/core/graph/mkl_layout_pass.h | 36 + .../core/graph/mkl_optimizer_merge.h | 6 - .../core/graph/mkl_tfconversion_pass.h | 36 + .../include/tensorflow/core/grappler/BUILD | 107 +++ .../tensorflow/core/grappler/clusters/BUILD | 76 ++ .../core/grappler/clusters/cluster.h | 97 ++ .../core/grappler/clusters/single_machine.h | 67 ++ .../tensorflow/core/grappler/costs/BUILD | 118 +++ .../core/grappler/costs/cost_estimator.h | 149 +++ .../core/grappler/costs/graph_memory.h | 61 ++ .../core/grappler/costs/graph_properties.h | 57 ++ .../tensorflow/core/grappler/costs/utils.h | 53 ++ .../tensorflow/core/grappler/devices.h | 38 + .../tensorflow/core/grappler/grappler_item.h | 62 ++ .../core/grappler/grappler_item_builder.h | 47 + .../tensorflow/core/grappler/inputs/BUILD | 83 ++ .../core/grappler/inputs/input_yielder.h | 35 + .../inputs/trivial_test_graph_input_yielder.h | 47 + .../tensorflow/core/grappler/inputs/utils.h | 35 + .../tensorflow/core/grappler/optimizers/BUILD | 70 ++ .../grappler/optimizers/graph_optimizer.h | 53 ++ .../grappler/optimizers/layout_optimizer.h | 42 + .../core/grappler/optimizers/meta_optimizer.h | 34 + .../include/tensorflow/core/grappler/utils.h | 50 + .../include/tensorflow/core/kernels/BUILD | 236 ++++- .../tensorflow/core/kernels/adjust_hue_op.h | 42 + .../tensorflow/core/kernels/assign_op.h | 90 +- .../core/kernels/batch_matmul_op_impl.h | 79 +- .../tensorflow/core/kernels/cloud/BUILD | 98 -- .../kernels/cloud/bigquery_table_accessor.h | 207 ----- .../cloud/bigquery_table_accessor_test_data.h | 325 ------- .../core/kernels/conditional_accumulator.h | 4 +- .../tensorflow/core/kernels/conv_grad_ops.h | 26 +- .../tensorflow/core/kernels/conv_ops_gpu.h | 65 +- .../tensorflow/core/kernels/cwise_ops.h | 7 + .../core/kernels/cwise_ops_common.h 
| 65 +- .../core/kernels/cwise_ops_gpu_common.cu.h | 17 + .../tensorflow/core/kernels/debug_ops.h | 53 +- .../core/kernels/depthwise_conv_op.h | 1 + .../tensorflow/core/kernels/eigen_pooling.h | 16 +- .../tensorflow/core/kernels/hexagon/BUILD | 33 +- .../kernels/hexagon/graph_transfer_utils.h | 9 +- .../core/kernels/hexagon/graph_transferer.h | 95 +- .../kernels/hexagon/hexagon_control_wrapper.h | 23 +- ...pper.h => i_remote_fused_graph_executor.h} | 50 +- .../core/kernels/linalg_ops_common.h | 6 +- .../tensorflow/core/kernels/lookup_table_op.h | 4 + .../tensorflow/core/kernels/queue_base.h | 2 +- .../tensorflow/core/kernels/record_yielder.h | 2 +- .../core/kernels/reduction_ops_common.h | 26 +- ...remote_fused_graph_execute_op_test_utils.h | 42 + .../remote_fused_graph_execute_utils.h | 125 +++ .../tensorflow/core/kernels/softmax_op.h | 4 +- .../kernels/sparse_conditional_accumulator.h | 4 +- .../core/kernels/sparse_matmul_op.h | 16 +- .../core/kernels/strided_slice_op_impl.h | 2 + .../core/kernels/transpose_functor.h | 54 ++ .../tensorflow/core/kernels/transpose_op.h | 11 + .../tensorflow/core/kernels/typed_queue.h | 52 ++ .../tensorflow/core/kernels/variable_ops.h | 10 + .../core/lib/core/error_codes.pb_text-impl.h | 0 .../core/lib/core/error_codes.pb_text.h | 0 .../include/tensorflow/core/lib/core/status.h | 21 +- .../tensorflow/core/lib/core/threadpool.h | 28 +- .../tensorflow/core/lib/gtl/optional.h | 876 ++++++++++++++++++ .../include/tensorflow/core/ops/compat/BUILD | 1 + .../core/ops/compat/ops_history.v1.pbtxt | 475 ++++++++++ .../include/tensorflow/core/ops/ops.pbtxt | 349 ++++++- .../tensorflow/core/platform/cloud/BUILD | 14 + .../core/platform/cloud/gcs_file_system.h | 7 +- .../core/platform/cloud/retrying_utils.h | 11 + .../include/tensorflow/core/platform/env.h | 11 +- .../tensorflow/core/platform/file_system.h | 6 + .../include/tensorflow/core/platform/macros.h | 11 + .../tensorflow/core/platform/prefetch.h | 4 +- 
.../android_armv7a_cpu_utils_helper.h | 6 + .../profile_utils/clock_cycle_profiler.h | 2 + .../core/platform/profile_utils/cpu_utils.h | 4 - .../tensorflow/core/platform/protobuf.h | 2 +- .../tensorflow/core/platform/tracing.h | 6 + .../core/platform/windows/cpu_info.h | 3 + .../core/platform/windows/intrinsics_port.h | 4 + .../tensorflow/core/protobuf/config.pb.h | 93 ++ .../core/protobuf/config.pb_text-impl.h | 2 + .../tensorflow/core/protobuf/config.pb_text.h | 0 .../core/protobuf/control_flow.pb.h | 36 + .../tensorflow/core/protobuf/debug.pb.h | 57 ++ .../core/protobuf/debug.pb_text-impl.h | 0 .../tensorflow/core/protobuf/debug.pb_text.h | 0 .../tensorflow/core/protobuf/meta_graph.pb.h | 54 ++ .../core/protobuf/named_tensor.pb.h | 0 .../core/protobuf/queue_runner.pb.h | 18 + .../core/protobuf/queue_runner.pb_text-impl.h | 27 + .../core/protobuf/queue_runner.pb_text.h | 24 + .../rewriter_config.pb.h} | 107 ++- .../protobuf/rewriter_config.pb_text-impl.h | 25 + .../core/protobuf/rewriter_config.pb_text.h | 24 + .../tensorflow/core/protobuf/saved_model.pb.h | 0 .../core/protobuf/saver.pb_text-impl.h | 0 .../tensorflow/core/protobuf/saver.pb_text.h | 0 .../core/protobuf/tensor_bundle.pb.h | 0 .../protobuf/tensor_bundle.pb_text-impl.h | 0 .../core/protobuf/tensor_bundle.pb_text.h | 0 .../core/protobuf/tensorflow_server.pb.h | 0 .../include/tensorflow/core/public/session.h | 18 +- .../include/tensorflow/core/public/version.h | 7 +- .../core/util/ctc/ctc_beam_search.h | 48 +- .../tensorflow/core/util/ctc/ctc_decoder.h | 25 +- .../core/util/memmapped_file_system.pb.h | 0 .../util/memmapped_file_system.pb_text-impl.h | 0 .../core/util/memmapped_file_system.pb_text.h | 0 .../include/tensorflow/core/util/mkl_util.h | 296 ++++++ .../core/util/presized_cuckoo_map.h | 4 + .../include/tensorflow/core/util/reporter.h | 3 +- .../util/saved_tensor_slice.pb_text-impl.h | 0 .../core/util/saved_tensor_slice.pb_text.h | 0 .../core/util/tensor_bundle/tensor_bundle.h | 6 + 
.../tensorflow/core/util/tensor_format.h | 39 +- .../tensorflow/core/util/test_log.pb.h | 143 +++ .../include/third_party/eigen3/BUILD | 1 + .../eigen3/unsupported/Eigen/CXX11/FixedPoint | 2 +- .../CXX11/src/FixedPoint/PacketMathAVX2.h | 98 ++ .../CXX11/src/FixedPoint/PacketMathAVX512.h | 12 +- .../CXX11/src/NeuralNetworks/Activations.h | 10 +- .../Eigen/CXX11/src/Tensor/README.md | 6 +- .../CXX11/src/Tensor/TensorContractionCuda.h | 101 +- .../Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 5 +- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 23 +- .../CXX11/src/Tensor/TensorReductionCuda.h | 1 - .../Eigen/CXX11/src/Tensor/TensorStorage.h | 10 +- .../Eigen/CXX11/src/Tensor/TensorUInt128.h | 1 + .../src/ThreadPool/NonBlockingThreadPool.h | 104 ++- .../Eigen/CXX11/src/util/EmulateArray.h | 1 + .../Eigen/src/AutoDiff/AutoDiffScalar.h | 4 +- .../src/MatrixFunctions/MatrixExponential.h | 43 +- 340 files changed, 8613 insertions(+), 2386 deletions(-) mode change 100644 => 100755 libs/tensorflow/include/Eigen/PardisoSupport mode change 100644 => 100755 libs/tensorflow/include/Eigen/src/Core/Assign_MKL.h mode change 100644 => 100755 libs/tensorflow/include/Eigen/src/Core/arch/AltiVec/PacketMath.h mode change 100644 => 100755 libs/tensorflow/include/Eigen/src/Core/arch/SSE/PacketMath.h mode change 100644 => 100755 libs/tensorflow/include/Eigen/src/Core/arch/ZVector/PacketMath.h mode change 100644 => 100755 libs/tensorflow/include/Eigen/src/Core/util/BlasUtil.h mode change 100644 => 100755 libs/tensorflow/include/Eigen/src/Core/util/DisableStupidWarnings.h mode change 100644 => 100755 libs/tensorflow/include/Eigen/src/Core/util/MKL_support.h mode change 100644 => 100755 libs/tensorflow/include/Eigen/src/Core/util/Meta.h mode change 100644 => 100755 libs/tensorflow/include/Eigen/src/Geometry/Scaling.h mode change 100644 => 100755 libs/tensorflow/include/Eigen/src/misc/lapacke.h create mode 100644 libs/tensorflow/include/tensorflow/cc/gradients/README.md mode change 100644 => 
100755 libs/tensorflow/include/tensorflow/cc/ops/array_ops_internal.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/cc/ops/candidate_sampling_ops_internal.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/cc/ops/control_flow_ops_internal.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/cc/ops/data_flow_ops_internal.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/cc/ops/image_ops_internal.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/cc/ops/io_ops_internal.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/cc/ops/linalg_ops_internal.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/cc/ops/logging_ops_internal.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/cc/ops/math_ops_internal.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/cc/ops/nn_ops_internal.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/cc/ops/no_op_internal.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/cc/ops/parsing_ops_internal.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/cc/ops/random_ops_internal.h delete mode 100644 libs/tensorflow/include/tensorflow/cc/ops/remote_fused_graph_ops.h delete mode 100644 libs/tensorflow/include/tensorflow/cc/ops/remote_fused_graph_ops_internal.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/cc/ops/sparse_ops_internal.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/cc/ops/state_ops_internal.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/cc/ops/string_ops_internal.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/cc/ops/training_ops_internal.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/cc/ops/user_ops_internal.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/cc/saved_model/testdata/half_plus_two/00000123/variables/variables.data-00000-of-00001 mode 
change 100644 => 100755 libs/tensorflow/include/tensorflow/cc/saved_model/testdata/half_plus_two/00000123/variables/variables.index create mode 100644 libs/tensorflow/include/tensorflow/cc/saved_model/testdata/half_plus_two_main_op/00000123/variables/variables.data-00000-of-00001 create mode 100644 libs/tensorflow/include/tensorflow/cc/saved_model/testdata/half_plus_two_main_op/00000123/variables/variables.index mode change 100644 => 100755 libs/tensorflow/include/tensorflow/cc/saved_model/testdata/half_plus_two_pbtxt/00000123/saved_model.pbtxt mode change 100644 => 100755 libs/tensorflow/include/tensorflow/cc/saved_model/testdata/half_plus_two_pbtxt/00000123/variables/variables.data-00000-of-00001 mode change 100644 => 100755 libs/tensorflow/include/tensorflow/cc/saved_model/testdata/half_plus_two_pbtxt/00000123/variables/variables.index create mode 100644 libs/tensorflow/include/tensorflow/core/common_runtime/mkl_cpu_allocator.h delete mode 100644 libs/tensorflow/include/tensorflow/core/debug/debug_service.grpc.pb.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/example/example.pb_text-impl.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/example/example.pb_text.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/example/example_parser_configuration.pb.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/example/feature.pb_text-impl.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/example/feature.pb_text.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/framework/allocation_description.pb_text-impl.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/framework/allocation_description.pb_text.h create mode 100644 libs/tensorflow/include/tensorflow/core/framework/allocator_registry.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/framework/attr_value.pb_text-impl.h mode change 100644 => 100755 
libs/tensorflow/include/tensorflow/core/framework/attr_value.pb_text.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/framework/cost_graph.pb.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/framework/cost_graph.pb_text-impl.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/framework/cost_graph.pb_text.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/framework/device_attributes.pb_text-impl.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/framework/device_attributes.pb_text.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/framework/function.pb_text-impl.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/framework/function.pb_text.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/framework/graph.pb_text-impl.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/framework/graph.pb_text.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/framework/graph_transfer_info.pb.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/framework/graph_transfer_info.pb_text-impl.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/framework/graph_transfer_info.pb_text.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/framework/kernel_def.pb_text-impl.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/framework/kernel_def.pb_text.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/framework/log_memory.pb.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/framework/log_memory.pb_text-impl.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/framework/log_memory.pb_text.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/framework/node_def.pb.h mode change 100644 => 100755 
libs/tensorflow/include/tensorflow/core/framework/node_def.pb_text-impl.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/framework/node_def.pb_text.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/framework/op_def.pb_text-impl.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/framework/op_def.pb_text.h rename libs/tensorflow/include/tensorflow/core/{kernels => framework}/reader_base.h (92%) rename libs/tensorflow/include/tensorflow/core/{kernels => framework}/reader_base.pb.h (95%) create mode 100755 libs/tensorflow/include/tensorflow/core/framework/remote_fused_graph_execute_info.pb.h create mode 100755 libs/tensorflow/include/tensorflow/core/framework/remote_fused_graph_execute_info.pb_text-impl.h create mode 100755 libs/tensorflow/include/tensorflow/core/framework/remote_fused_graph_execute_info.pb_text.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/framework/resource_handle.pb.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/framework/resource_handle.pb_text-impl.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/framework/resource_handle.pb_text.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/framework/step_stats.pb_text-impl.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/framework/step_stats.pb_text.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/framework/summary.pb_text-impl.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/framework/summary.pb_text.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/framework/tensor.pb_text-impl.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/framework/tensor.pb_text.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/framework/tensor_description.pb_text-impl.h mode change 100644 => 100755 
libs/tensorflow/include/tensorflow/core/framework/tensor_description.pb_text.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/framework/tensor_shape.pb_text-impl.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/framework/tensor_shape.pb_text.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/framework/tensor_slice.pb_text-impl.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/framework/tensor_slice.pb_text.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/framework/types.pb_text-impl.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/framework/types.pb_text.h create mode 100755 libs/tensorflow/include/tensorflow/core/framework/variable.pb_text-impl.h create mode 100755 libs/tensorflow/include/tensorflow/core/framework/variable.pb_text.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/framework/versions.pb_text-impl.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/framework/versions.pb_text.h create mode 100644 libs/tensorflow/include/tensorflow/core/graph/mkl_layout_pass.h create mode 100644 libs/tensorflow/include/tensorflow/core/graph/mkl_tfconversion_pass.h create mode 100644 libs/tensorflow/include/tensorflow/core/grappler/BUILD create mode 100644 libs/tensorflow/include/tensorflow/core/grappler/clusters/BUILD create mode 100644 libs/tensorflow/include/tensorflow/core/grappler/clusters/cluster.h create mode 100644 libs/tensorflow/include/tensorflow/core/grappler/clusters/single_machine.h create mode 100644 libs/tensorflow/include/tensorflow/core/grappler/costs/BUILD create mode 100644 libs/tensorflow/include/tensorflow/core/grappler/costs/cost_estimator.h create mode 100644 libs/tensorflow/include/tensorflow/core/grappler/costs/graph_memory.h create mode 100644 libs/tensorflow/include/tensorflow/core/grappler/costs/graph_properties.h create mode 100644 
libs/tensorflow/include/tensorflow/core/grappler/costs/utils.h create mode 100644 libs/tensorflow/include/tensorflow/core/grappler/devices.h create mode 100644 libs/tensorflow/include/tensorflow/core/grappler/grappler_item.h create mode 100644 libs/tensorflow/include/tensorflow/core/grappler/grappler_item_builder.h create mode 100644 libs/tensorflow/include/tensorflow/core/grappler/inputs/BUILD create mode 100644 libs/tensorflow/include/tensorflow/core/grappler/inputs/input_yielder.h create mode 100644 libs/tensorflow/include/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h create mode 100644 libs/tensorflow/include/tensorflow/core/grappler/inputs/utils.h create mode 100644 libs/tensorflow/include/tensorflow/core/grappler/optimizers/BUILD create mode 100644 libs/tensorflow/include/tensorflow/core/grappler/optimizers/graph_optimizer.h create mode 100644 libs/tensorflow/include/tensorflow/core/grappler/optimizers/layout_optimizer.h create mode 100644 libs/tensorflow/include/tensorflow/core/grappler/optimizers/meta_optimizer.h create mode 100644 libs/tensorflow/include/tensorflow/core/grappler/utils.h create mode 100644 libs/tensorflow/include/tensorflow/core/kernels/adjust_hue_op.h delete mode 100644 libs/tensorflow/include/tensorflow/core/kernels/cloud/BUILD delete mode 100644 libs/tensorflow/include/tensorflow/core/kernels/cloud/bigquery_table_accessor.h delete mode 100644 libs/tensorflow/include/tensorflow/core/kernels/cloud/bigquery_table_accessor_test_data.h rename libs/tensorflow/include/tensorflow/core/kernels/{hexagon/i_soc_control_wrapper.h => i_remote_fused_graph_executor.h} (50%) create mode 100644 libs/tensorflow/include/tensorflow/core/kernels/remote_fused_graph_execute_op_test_utils.h create mode 100644 libs/tensorflow/include/tensorflow/core/kernels/remote_fused_graph_execute_utils.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/lib/core/error_codes.pb_text-impl.h mode change 100644 => 100755 
libs/tensorflow/include/tensorflow/core/lib/core/error_codes.pb_text.h create mode 100644 libs/tensorflow/include/tensorflow/core/lib/gtl/optional.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/protobuf/config.pb.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/protobuf/config.pb_text-impl.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/protobuf/config.pb_text.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/protobuf/control_flow.pb.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/protobuf/debug.pb.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/protobuf/debug.pb_text-impl.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/protobuf/debug.pb_text.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/protobuf/named_tensor.pb.h create mode 100755 libs/tensorflow/include/tensorflow/core/protobuf/queue_runner.pb_text-impl.h create mode 100755 libs/tensorflow/include/tensorflow/core/protobuf/queue_runner.pb_text.h rename libs/tensorflow/include/tensorflow/core/{debug/debug_service.pb.h => protobuf/rewriter_config.pb.h} (53%) mode change 100644 => 100755 create mode 100755 libs/tensorflow/include/tensorflow/core/protobuf/rewriter_config.pb_text-impl.h create mode 100755 libs/tensorflow/include/tensorflow/core/protobuf/rewriter_config.pb_text.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/protobuf/saved_model.pb.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/protobuf/saver.pb_text-impl.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/protobuf/saver.pb_text.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/protobuf/tensor_bundle.pb.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/protobuf/tensor_bundle.pb_text-impl.h mode change 100644 => 100755 
libs/tensorflow/include/tensorflow/core/protobuf/tensor_bundle.pb_text.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/protobuf/tensorflow_server.pb.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/util/memmapped_file_system.pb.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/util/memmapped_file_system.pb_text-impl.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/util/memmapped_file_system.pb_text.h create mode 100644 libs/tensorflow/include/tensorflow/core/util/mkl_util.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/util/saved_tensor_slice.pb_text-impl.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/util/saved_tensor_slice.pb_text.h mode change 100644 => 100755 libs/tensorflow/include/tensorflow/core/util/test_log.pb.h mode change 100644 => 100755 libs/tensorflow/include/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h diff --git a/libs/tensorflow/include/Eigen/PardisoSupport b/libs/tensorflow/include/Eigen/PardisoSupport old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/Eigen/src/Cholesky/LDLT.h b/libs/tensorflow/include/Eigen/src/Cholesky/LDLT.h index fcee7b2..9b4fdb4 100644 --- a/libs/tensorflow/include/Eigen/src/Cholesky/LDLT.h +++ b/libs/tensorflow/include/Eigen/src/Cholesky/LDLT.h @@ -258,7 +258,6 @@ template class LDLT #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; #endif diff --git a/libs/tensorflow/include/Eigen/src/Cholesky/LLT.h b/libs/tensorflow/include/Eigen/src/Cholesky/LLT.h index 87ca8d4..e6c02d8 100644 --- a/libs/tensorflow/include/Eigen/src/Cholesky/LLT.h +++ b/libs/tensorflow/include/Eigen/src/Cholesky/LLT.h @@ -200,7 +200,6 @@ template class LLT #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; #endif diff --git 
a/libs/tensorflow/include/Eigen/src/Core/ArrayBase.h b/libs/tensorflow/include/Eigen/src/Core/ArrayBase.h index af5fb25..9da960f 100644 --- a/libs/tensorflow/include/Eigen/src/Core/ArrayBase.h +++ b/libs/tensorflow/include/Eigen/src/Core/ArrayBase.h @@ -175,7 +175,7 @@ template class ArrayBase */ template template -EIGEN_STRONG_INLINE Derived & +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & ArrayBase::operator-=(const ArrayBase &other) { call_assignment(derived(), other.derived(), internal::sub_assign_op()); @@ -188,7 +188,7 @@ ArrayBase::operator-=(const ArrayBase &other) */ template template -EIGEN_STRONG_INLINE Derived & +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & ArrayBase::operator+=(const ArrayBase& other) { call_assignment(derived(), other.derived(), internal::add_assign_op()); @@ -201,7 +201,7 @@ ArrayBase::operator+=(const ArrayBase& other) */ template template -EIGEN_STRONG_INLINE Derived & +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & ArrayBase::operator*=(const ArrayBase& other) { call_assignment(derived(), other.derived(), internal::mul_assign_op()); @@ -214,7 +214,7 @@ ArrayBase::operator*=(const ArrayBase& other) */ template template -EIGEN_STRONG_INLINE Derived & +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & ArrayBase::operator/=(const ArrayBase& other) { call_assignment(derived(), other.derived(), internal::div_assign_op()); diff --git a/libs/tensorflow/include/Eigen/src/Core/Assign.h b/libs/tensorflow/include/Eigen/src/Core/Assign.h index 53806ba..655412e 100644 --- a/libs/tensorflow/include/Eigen/src/Core/Assign.h +++ b/libs/tensorflow/include/Eigen/src/Core/Assign.h @@ -16,7 +16,7 @@ namespace Eigen { template template -EIGEN_STRONG_INLINE Derived& DenseBase +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase ::lazyAssign(const DenseBase& other) { enum{ diff --git a/libs/tensorflow/include/Eigen/src/Core/Assign_MKL.h b/libs/tensorflow/include/Eigen/src/Core/Assign_MKL.h old mode 100644 new mode 100755 diff --git 
a/libs/tensorflow/include/Eigen/src/Core/BooleanRedux.h b/libs/tensorflow/include/Eigen/src/Core/BooleanRedux.h index ed607d5..ccf5190 100644 --- a/libs/tensorflow/include/Eigen/src/Core/BooleanRedux.h +++ b/libs/tensorflow/include/Eigen/src/Core/BooleanRedux.h @@ -76,7 +76,7 @@ struct any_unroller * \sa any(), Cwise::operator<() */ template -inline bool DenseBase::all() const +EIGEN_DEVICE_FUNC inline bool DenseBase::all() const { typedef internal::evaluator Evaluator; enum { @@ -100,7 +100,7 @@ inline bool DenseBase::all() const * \sa all() */ template -inline bool DenseBase::any() const +EIGEN_DEVICE_FUNC inline bool DenseBase::any() const { typedef internal::evaluator Evaluator; enum { @@ -124,7 +124,7 @@ inline bool DenseBase::any() const * \sa all(), any() */ template -inline Eigen::Index DenseBase::count() const +EIGEN_DEVICE_FUNC inline Eigen::Index DenseBase::count() const { return derived().template cast().template cast().sum(); } diff --git a/libs/tensorflow/include/Eigen/src/Core/CommaInitializer.h b/libs/tensorflow/include/Eigen/src/Core/CommaInitializer.h index d218e98..35fdbb8 100644 --- a/libs/tensorflow/include/Eigen/src/Core/CommaInitializer.h +++ b/libs/tensorflow/include/Eigen/src/Core/CommaInitializer.h @@ -141,7 +141,7 @@ struct CommaInitializer * \sa CommaInitializer::finished(), class CommaInitializer */ template -inline CommaInitializer DenseBase::operator<< (const Scalar& s) +EIGEN_DEVICE_FUNC inline CommaInitializer DenseBase::operator<< (const Scalar& s) { return CommaInitializer(*static_cast(this), s); } @@ -149,7 +149,7 @@ inline CommaInitializer DenseBase::operator<< (const Scalar& s /** \sa operator<<(const Scalar&) */ template template -inline CommaInitializer +EIGEN_DEVICE_FUNC inline CommaInitializer DenseBase::operator<<(const DenseBase& other) { return CommaInitializer(*static_cast(this), other); diff --git a/libs/tensorflow/include/Eigen/src/Core/CoreEvaluators.h b/libs/tensorflow/include/Eigen/src/Core/CoreEvaluators.h index 
412f5a6..15b361b 100644 --- a/libs/tensorflow/include/Eigen/src/Core/CoreEvaluators.h +++ b/libs/tensorflow/include/Eigen/src/Core/CoreEvaluators.h @@ -134,19 +134,19 @@ struct evaluator_base // this helper permits to completely eliminate m_outerStride if it is known at compiletime. template class plainobjectbase_evaluator_data { public: - plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr) + EIGEN_DEVICE_FUNC plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr) { EIGEN_ONLY_USED_FOR_DEBUG(outerStride); eigen_internal_assert(outerStride==OuterStride); } - Index outerStride() const { return OuterStride; } + EIGEN_DEVICE_FUNC Index outerStride() const { return OuterStride; } const Scalar *data; }; template class plainobjectbase_evaluator_data { public: - plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr), m_outerStride(outerStride) {} - Index outerStride() const { return m_outerStride; } + EIGEN_DEVICE_FUNC plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr), m_outerStride(outerStride) {} + EIGEN_DEVICE_FUNC Index outerStride() const { return m_outerStride; } const Scalar *data; protected: Index m_outerStride; diff --git a/libs/tensorflow/include/Eigen/src/Core/CwiseBinaryOp.h b/libs/tensorflow/include/Eigen/src/Core/CwiseBinaryOp.h index a36765e..bf2632d 100644 --- a/libs/tensorflow/include/Eigen/src/Core/CwiseBinaryOp.h +++ b/libs/tensorflow/include/Eigen/src/Core/CwiseBinaryOp.h @@ -158,7 +158,7 @@ class CwiseBinaryOpImpl */ template template -EIGEN_STRONG_INLINE Derived & +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & MatrixBase::operator-=(const MatrixBase &other) { call_assignment(derived(), other.derived(), internal::sub_assign_op()); @@ -171,7 +171,7 @@ MatrixBase::operator-=(const MatrixBase &other) */ template template -EIGEN_STRONG_INLINE Derived & +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & MatrixBase::operator+=(const MatrixBase& 
other) { call_assignment(derived(), other.derived(), internal::add_assign_op()); @@ -181,4 +181,3 @@ MatrixBase::operator+=(const MatrixBase& other) } // end namespace Eigen #endif // EIGEN_CWISE_BINARY_OP_H - diff --git a/libs/tensorflow/include/Eigen/src/Core/CwiseNullaryOp.h b/libs/tensorflow/include/Eigen/src/Core/CwiseNullaryOp.h index dd498f7..144608e 100644 --- a/libs/tensorflow/include/Eigen/src/Core/CwiseNullaryOp.h +++ b/libs/tensorflow/include/Eigen/src/Core/CwiseNullaryOp.h @@ -105,7 +105,7 @@ class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp template -EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> DenseBase::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func) { return CwiseNullaryOp(rows, cols, func); @@ -131,7 +131,7 @@ DenseBase::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& f */ template template -EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> DenseBase::NullaryExpr(Index size, const CustomNullaryOp& func) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) @@ -150,7 +150,7 @@ DenseBase::NullaryExpr(Index size, const CustomNullaryOp& func) */ template template -EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> DenseBase::NullaryExpr(const CustomNullaryOp& func) { return CwiseNullaryOp(RowsAtCompileTime, ColsAtCompileTime, func); @@ -170,7 +170,7 @@ DenseBase::NullaryExpr(const CustomNullaryOp& func) * \sa class CwiseNullaryOp */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Constant(Index rows, Index cols, const Scalar& value) { return DenseBase::NullaryExpr(rows, cols, internal::scalar_constant_op(value)); @@ -192,7 +192,7 @@ DenseBase::Constant(Index 
rows, Index cols, const Scalar& value) * \sa class CwiseNullaryOp */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Constant(Index size, const Scalar& value) { return DenseBase::NullaryExpr(size, internal::scalar_constant_op(value)); @@ -208,7 +208,7 @@ DenseBase::Constant(Index size, const Scalar& value) * \sa class CwiseNullaryOp */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Constant(const Scalar& value) { EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived) @@ -220,7 +220,7 @@ DenseBase::Constant(const Scalar& value) * \sa LinSpaced(Index,Scalar,Scalar), setLinSpaced(Index,const Scalar&,const Scalar&) */ template -EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType DenseBase::LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) @@ -232,7 +232,7 @@ DenseBase::LinSpaced(Sequential_t, Index size, const Scalar& low, const * \sa LinSpaced(Scalar,Scalar) */ template -EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType DenseBase::LinSpaced(Sequential_t, const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) @@ -264,7 +264,7 @@ DenseBase::LinSpaced(Sequential_t, const Scalar& low, const Scalar& hig * \sa setLinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp */ template -EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType DenseBase::LinSpaced(Index size, const Scalar& low, 
const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) @@ -276,7 +276,7 @@ DenseBase::LinSpaced(Index size, const Scalar& low, const Scalar& high) * Special version for fixed size types which does not require the size parameter. */ template -EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType DenseBase::LinSpaced(const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) @@ -286,7 +286,7 @@ DenseBase::LinSpaced(const Scalar& low, const Scalar& high) /** \returns true if all coefficients in this matrix are approximately equal to \a val, to within precision \a prec */ template -bool DenseBase::isApproxToConstant +EIGEN_DEVICE_FUNC bool DenseBase::isApproxToConstant (const Scalar& val, const RealScalar& prec) const { typename internal::nested_eval::type self(derived()); @@ -301,7 +301,7 @@ bool DenseBase::isApproxToConstant * * \returns true if all coefficients in this matrix are approximately equal to \a value, to within precision \a prec */ template -bool DenseBase::isConstant +EIGEN_DEVICE_FUNC bool DenseBase::isConstant (const Scalar& val, const RealScalar& prec) const { return isApproxToConstant(val, prec); @@ -312,7 +312,7 @@ bool DenseBase::isConstant * \sa setConstant(), Constant(), class CwiseNullaryOp */ template -EIGEN_STRONG_INLINE void DenseBase::fill(const Scalar& val) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void DenseBase::fill(const Scalar& val) { setConstant(val); } @@ -322,7 +322,7 @@ EIGEN_STRONG_INLINE void DenseBase::fill(const Scalar& val) * \sa fill(), setConstant(Index,const Scalar&), setConstant(Index,Index,const Scalar&), setZero(), setOnes(), Constant(), class CwiseNullaryOp, setZero(), setOnes() */ template -EIGEN_STRONG_INLINE Derived& DenseBase::setConstant(const Scalar& val) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::setConstant(const Scalar& val) { return derived() = 
Constant(rows(), cols(), val); } @@ -337,7 +337,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase::setConstant(const Scalar& val) * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&) */ template -EIGEN_STRONG_INLINE Derived& +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase::setConstant(Index size, const Scalar& val) { resize(size); @@ -356,7 +356,7 @@ PlainObjectBase::setConstant(Index size, const Scalar& val) * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&) */ template -EIGEN_STRONG_INLINE Derived& +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase::setConstant(Index rows, Index cols, const Scalar& val) { resize(rows, cols); @@ -380,7 +380,7 @@ PlainObjectBase::setConstant(Index rows, Index cols, const Scalar& val) * \sa LinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp */ template -EIGEN_STRONG_INLINE Derived& DenseBase::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op(low,high,newSize)); @@ -400,7 +400,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase::setLinSpaced(Index newSize, con * \sa LinSpaced(Index,const Scalar&,const Scalar&), setLinSpaced(Index, const Scalar&, const Scalar&), CwiseNullaryOp */ template -EIGEN_STRONG_INLINE Derived& DenseBase::setLinSpaced(const Scalar& low, const Scalar& high) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::setLinSpaced(const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return setLinSpaced(size(), low, high); @@ -423,7 +423,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase::setLinSpaced(const Scalar& low, * \sa Zero(), 
Zero(Index) */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Zero(Index rows, Index cols) { return Constant(rows, cols, Scalar(0)); @@ -446,7 +446,7 @@ DenseBase::Zero(Index rows, Index cols) * \sa Zero(), Zero(Index,Index) */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Zero(Index size) { return Constant(size, Scalar(0)); @@ -463,7 +463,7 @@ DenseBase::Zero(Index size) * \sa Zero(Index), Zero(Index,Index) */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Zero() { return Constant(Scalar(0)); @@ -478,7 +478,7 @@ DenseBase::Zero() * \sa class CwiseNullaryOp, Zero() */ template -bool DenseBase::isZero(const RealScalar& prec) const +EIGEN_DEVICE_FUNC bool DenseBase::isZero(const RealScalar& prec) const { typename internal::nested_eval::type self(derived()); for(Index j = 0; j < cols(); ++j) @@ -496,7 +496,7 @@ bool DenseBase::isZero(const RealScalar& prec) const * \sa class CwiseNullaryOp, Zero() */ template -EIGEN_STRONG_INLINE Derived& DenseBase::setZero() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::setZero() { return setConstant(Scalar(0)); } @@ -511,7 +511,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase::setZero() * \sa DenseBase::setZero(), setZero(Index,Index), class CwiseNullaryOp, DenseBase::Zero() */ template -EIGEN_STRONG_INLINE Derived& +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase::setZero(Index newSize) { resize(newSize); @@ -529,7 +529,7 @@ PlainObjectBase::setZero(Index newSize) * \sa DenseBase::setZero(), setZero(Index), class CwiseNullaryOp, DenseBase::Zero() */ template -EIGEN_STRONG_INLINE Derived& +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& 
PlainObjectBase::setZero(Index rows, Index cols) { resize(rows, cols); @@ -553,7 +553,7 @@ PlainObjectBase::setZero(Index rows, Index cols) * \sa Ones(), Ones(Index), isOnes(), class Ones */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Ones(Index rows, Index cols) { return Constant(rows, cols, Scalar(1)); @@ -576,7 +576,7 @@ DenseBase::Ones(Index rows, Index cols) * \sa Ones(), Ones(Index,Index), isOnes(), class Ones */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Ones(Index newSize) { return Constant(newSize, Scalar(1)); @@ -593,7 +593,7 @@ DenseBase::Ones(Index newSize) * \sa Ones(Index), Ones(Index,Index), isOnes(), class Ones */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Ones() { return Constant(Scalar(1)); @@ -608,7 +608,7 @@ DenseBase::Ones() * \sa class CwiseNullaryOp, Ones() */ template -bool DenseBase::isOnes +EIGEN_DEVICE_FUNC bool DenseBase::isOnes (const RealScalar& prec) const { return isApproxToConstant(Scalar(1), prec); @@ -622,7 +622,7 @@ bool DenseBase::isOnes * \sa class CwiseNullaryOp, Ones() */ template -EIGEN_STRONG_INLINE Derived& DenseBase::setOnes() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::setOnes() { return setConstant(Scalar(1)); } @@ -637,7 +637,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase::setOnes() * \sa MatrixBase::setOnes(), setOnes(Index,Index), class CwiseNullaryOp, MatrixBase::Ones() */ template -EIGEN_STRONG_INLINE Derived& +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase::setOnes(Index newSize) { resize(newSize); @@ -655,7 +655,7 @@ PlainObjectBase::setOnes(Index newSize) * \sa MatrixBase::setOnes(), setOnes(Index), class 
CwiseNullaryOp, MatrixBase::Ones() */ template -EIGEN_STRONG_INLINE Derived& +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase::setOnes(Index rows, Index cols) { resize(rows, cols); @@ -679,7 +679,7 @@ PlainObjectBase::setOnes(Index rows, Index cols) * \sa Identity(), setIdentity(), isIdentity() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::IdentityReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::IdentityReturnType MatrixBase::Identity(Index rows, Index cols) { return DenseBase::NullaryExpr(rows, cols, internal::scalar_identity_op()); @@ -696,7 +696,7 @@ MatrixBase::Identity(Index rows, Index cols) * \sa Identity(Index,Index), setIdentity(), isIdentity() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::IdentityReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::IdentityReturnType MatrixBase::Identity() { EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived) @@ -771,7 +771,7 @@ struct setIdentity_impl * \sa class CwiseNullaryOp, Identity(), Identity(Index,Index), isIdentity() */ template -EIGEN_STRONG_INLINE Derived& MatrixBase::setIdentity() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase::setIdentity() { return internal::setIdentity_impl::run(derived()); } @@ -787,7 +787,7 @@ EIGEN_STRONG_INLINE Derived& MatrixBase::setIdentity() * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Identity() */ template -EIGEN_STRONG_INLINE Derived& MatrixBase::setIdentity(Index rows, Index cols) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase::setIdentity(Index rows, Index cols) { derived().resize(rows, cols); return setIdentity(); @@ -800,7 +800,7 @@ EIGEN_STRONG_INLINE Derived& MatrixBase::setIdentity(Index rows, Index * \sa MatrixBase::Unit(Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::Unit(Index newSize, Index i) +EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::Unit(Index newSize, Index i) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return BasisReturnType(SquareMatrixType::Identity(newSize,newSize), i); @@ -815,7 +815,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBa * \sa MatrixBase::Unit(Index,Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::Unit(Index i) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::Unit(Index i) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return BasisReturnType(SquareMatrixType::Identity(),i); @@ -828,7 +828,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBa * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitX() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitX() { return Derived::Unit(0); } /** \returns an expression of the Y axis unit vector (0,1{,0}^*) @@ -838,7 +838,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBa * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitY() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitY() { return Derived::Unit(1); } /** \returns an expression of the Z axis unit vector (0,0,1{,0}^*) @@ -848,7 +848,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBa * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() */ template 
-EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitZ() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitZ() { return Derived::Unit(2); } /** \returns an expression of the W axis unit vector (0,0,0,1) @@ -858,7 +858,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBa * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitW() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitW() { return Derived::Unit(3); } } // end namespace Eigen diff --git a/libs/tensorflow/include/Eigen/src/Core/DenseBase.h b/libs/tensorflow/include/Eigen/src/Core/DenseBase.h index fc80757..fd933ee 100644 --- a/libs/tensorflow/include/Eigen/src/Core/DenseBase.h +++ b/libs/tensorflow/include/Eigen/src/Core/DenseBase.h @@ -296,7 +296,7 @@ template class DenseBase EIGEN_DEVICE_FUNC Derived& operator=(const ReturnByValue& func); - /** \ínternal + /** \internal * Copies \a other into *this without evaluating other. \returns a reference to *this. 
* \deprecated */ template @@ -484,9 +484,9 @@ template class DenseBase return derived().coeff(0,0); } - bool all() const; - bool any() const; - Index count() const; + EIGEN_DEVICE_FUNC bool all() const; + EIGEN_DEVICE_FUNC bool any() const; + EIGEN_DEVICE_FUNC Index count() const; typedef VectorwiseOp RowwiseReturnType; typedef const VectorwiseOp ConstRowwiseReturnType; diff --git a/libs/tensorflow/include/Eigen/src/Core/DenseStorage.h b/libs/tensorflow/include/Eigen/src/Core/DenseStorage.h index 82201d9..7958fee 100644 --- a/libs/tensorflow/include/Eigen/src/Core/DenseStorage.h +++ b/libs/tensorflow/include/Eigen/src/Core/DenseStorage.h @@ -13,9 +13,9 @@ #define EIGEN_MATRIXSTORAGE_H #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN - #define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN EIGEN_DENSE_STORAGE_CTOR_PLUGIN; + #define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(X) X; EIGEN_DENSE_STORAGE_CTOR_PLUGIN; #else - #define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN + #define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(X) #endif namespace Eigen { @@ -184,12 +184,16 @@ template class DenseSt { internal::plain_array m_data; public: - EIGEN_DEVICE_FUNC DenseStorage() {} + EIGEN_DEVICE_FUNC DenseStorage() { + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size) + } EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(internal::constructor_without_unaligned_array_assert()) {} EIGEN_DEVICE_FUNC - DenseStorage(const DenseStorage& other) : m_data(other.m_data) {} + DenseStorage(const DenseStorage& other) : m_data(other.m_data) { + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size) + } EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) { @@ -197,7 +201,7 @@ template class DenseSt return *this; } EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) { - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) eigen_internal_assert(size==rows*cols && rows==_Rows && 
cols==_Cols); EIGEN_UNUSED_VARIABLE(size); EIGEN_UNUSED_VARIABLE(rows); @@ -343,7 +347,7 @@ template class DenseStorage(size)), m_rows(rows), m_cols(cols) { - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) eigen_internal_assert(size==rows*cols && rows>=0 && cols >=0); } EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) @@ -351,6 +355,7 @@ template class DenseStorage class DenseStorage(size); else m_data = 0; - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) } m_rows = rows; m_cols = cols; @@ -422,7 +427,7 @@ template class DenseStorage(size)), m_cols(cols) { - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) eigen_internal_assert(size==rows*cols && rows==_Rows && cols >=0); EIGEN_UNUSED_VARIABLE(rows); } @@ -430,6 +435,7 @@ template class DenseStorage(_Rows*other.m_cols)) , m_cols(other.m_cols) { + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_cols*_Rows) internal::smart_copy(other.m_data, other.m_data+_Rows*m_cols, m_data); } EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) @@ -477,7 +483,7 @@ template class DenseStorage(size); else m_data = 0; - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) } m_cols = cols; } @@ -495,7 +501,7 @@ template class DenseStorage(size)), m_rows(rows) { - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) eigen_internal_assert(size==rows*cols && rows>=0 && cols == _Cols); EIGEN_UNUSED_VARIABLE(cols); } @@ -503,6 +509,7 @@ template class DenseStorage(other.m_rows*_Cols)) , m_rows(other.m_rows) { + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_rows*_Cols) internal::smart_copy(other.m_data, other.m_data+other.m_rows*_Cols, m_data); } EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) @@ -550,7 +557,7 @@ template class DenseStorage(size); else m_data = 0; - 
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) } m_rows = rows; } diff --git a/libs/tensorflow/include/Eigen/src/Core/Diagonal.h b/libs/tensorflow/include/Eigen/src/Core/Diagonal.h index 49e7112..c62f5ff 100644 --- a/libs/tensorflow/include/Eigen/src/Core/Diagonal.h +++ b/libs/tensorflow/include/Eigen/src/Core/Diagonal.h @@ -184,7 +184,7 @@ template class Diagonal * * \sa class Diagonal */ template -inline typename MatrixBase::DiagonalReturnType +EIGEN_DEVICE_FUNC inline typename MatrixBase::DiagonalReturnType MatrixBase::diagonal() { return DiagonalReturnType(derived()); @@ -192,7 +192,7 @@ MatrixBase::diagonal() /** This is the const version of diagonal(). */ template -inline typename MatrixBase::ConstDiagonalReturnType +EIGEN_DEVICE_FUNC inline typename MatrixBase::ConstDiagonalReturnType MatrixBase::diagonal() const { return ConstDiagonalReturnType(derived()); @@ -210,7 +210,7 @@ MatrixBase::diagonal() const * * \sa MatrixBase::diagonal(), class Diagonal */ template -inline typename MatrixBase::DiagonalDynamicIndexReturnType +EIGEN_DEVICE_FUNC inline typename MatrixBase::DiagonalDynamicIndexReturnType MatrixBase::diagonal(Index index) { return DiagonalDynamicIndexReturnType(derived(), index); @@ -218,7 +218,7 @@ MatrixBase::diagonal(Index index) /** This is the const version of diagonal(Index). */ template -inline typename MatrixBase::ConstDiagonalDynamicIndexReturnType +EIGEN_DEVICE_FUNC inline typename MatrixBase::ConstDiagonalDynamicIndexReturnType MatrixBase::diagonal(Index index) const { return ConstDiagonalDynamicIndexReturnType(derived(), index); @@ -237,6 +237,7 @@ MatrixBase::diagonal(Index index) const * \sa MatrixBase::diagonal(), class Diagonal */ template template +EIGEN_DEVICE_FUNC inline typename MatrixBase::template DiagonalIndexReturnType::Type MatrixBase::diagonal() { @@ -246,6 +247,7 @@ MatrixBase::diagonal() /** This is the const version of diagonal(). 
*/ template template +EIGEN_DEVICE_FUNC inline typename MatrixBase::template ConstDiagonalIndexReturnType::Type MatrixBase::diagonal() const { diff --git a/libs/tensorflow/include/Eigen/src/Core/DiagonalMatrix.h b/libs/tensorflow/include/Eigen/src/Core/DiagonalMatrix.h index ecfdce8..4e8297e 100644 --- a/libs/tensorflow/include/Eigen/src/Core/DiagonalMatrix.h +++ b/libs/tensorflow/include/Eigen/src/Core/DiagonalMatrix.h @@ -44,7 +44,7 @@ class DiagonalBase : public EigenBase EIGEN_DEVICE_FUNC DenseMatrixType toDenseMatrix() const { return derived(); } - + EIGEN_DEVICE_FUNC inline const DiagonalVectorType& diagonal() const { return derived().diagonal(); } EIGEN_DEVICE_FUNC @@ -273,7 +273,7 @@ class DiagonalWrapper * \sa class DiagonalWrapper, class DiagonalMatrix, diagonal(), isDiagonal() **/ template -inline const DiagonalWrapper +EIGEN_DEVICE_FUNC inline const DiagonalWrapper MatrixBase::asDiagonal() const { return DiagonalWrapper(derived()); diff --git a/libs/tensorflow/include/Eigen/src/Core/DiagonalProduct.h b/libs/tensorflow/include/Eigen/src/Core/DiagonalProduct.h index d372b93..7911d1c 100644 --- a/libs/tensorflow/include/Eigen/src/Core/DiagonalProduct.h +++ b/libs/tensorflow/include/Eigen/src/Core/DiagonalProduct.h @@ -17,7 +17,7 @@ namespace Eigen { */ template template -inline const Product +EIGEN_DEVICE_FUNC inline const Product MatrixBase::operator*(const DiagonalBase &a_diagonal) const { return Product(derived(),a_diagonal.derived()); diff --git a/libs/tensorflow/include/Eigen/src/Core/Dot.h b/libs/tensorflow/include/Eigen/src/Core/Dot.h index 06ef18b..bb8e3fe 100644 --- a/libs/tensorflow/include/Eigen/src/Core/Dot.h +++ b/libs/tensorflow/include/Eigen/src/Core/Dot.h @@ -90,7 +90,7 @@ MatrixBase::dot(const MatrixBase& other) const * \sa dot(), norm(), lpNorm() */ template -EIGEN_STRONG_INLINE typename NumTraits::Scalar>::Real MatrixBase::squaredNorm() const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits::Scalar>::Real 
MatrixBase::squaredNorm() const { return numext::real((*this).cwiseAbs2().sum()); } @@ -102,7 +102,7 @@ EIGEN_STRONG_INLINE typename NumTraits::Scala * \sa lpNorm(), dot(), squaredNorm() */ template -inline typename NumTraits::Scalar>::Real MatrixBase::norm() const +EIGEN_DEVICE_FUNC inline typename NumTraits::Scalar>::Real MatrixBase::norm() const { return numext::sqrt(squaredNorm()); } @@ -117,7 +117,7 @@ inline typename NumTraits::Scalar>::Real Matr * \sa norm(), normalize() */ template -inline const typename MatrixBase::PlainObject +EIGEN_DEVICE_FUNC inline const typename MatrixBase::PlainObject MatrixBase::normalized() const { typedef typename internal::nested_eval::type _Nested; @@ -139,7 +139,7 @@ MatrixBase::normalized() const * \sa norm(), normalized() */ template -inline void MatrixBase::normalize() +EIGEN_DEVICE_FUNC inline void MatrixBase::normalize() { RealScalar z = squaredNorm(); // NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU @@ -160,7 +160,7 @@ inline void MatrixBase::normalize() * \sa stableNorm(), stableNormalize(), normalized() */ template -inline const typename MatrixBase::PlainObject +EIGEN_DEVICE_FUNC inline const typename MatrixBase::PlainObject MatrixBase::stableNormalized() const { typedef typename internal::nested_eval::type _Nested; @@ -185,7 +185,7 @@ MatrixBase::stableNormalized() const * \sa stableNorm(), stableNormalized(), normalize() */ template -inline void MatrixBase::stableNormalize() +EIGEN_DEVICE_FUNC inline void MatrixBase::stableNormalize() { RealScalar w = cwiseAbs().maxCoeff(); RealScalar z = (derived()/w).squaredNorm(); @@ -257,9 +257,9 @@ struct lpNorm_selector template template #ifndef EIGEN_PARSED_BY_DOXYGEN -inline typename NumTraits::Scalar>::Real +EIGEN_DEVICE_FUNC inline typename NumTraits::Scalar>::Real #else -MatrixBase::RealScalar +EIGEN_DEVICE_FUNC MatrixBase::RealScalar #endif MatrixBase::lpNorm() const { diff --git 
a/libs/tensorflow/include/Eigen/src/Core/EigenBase.h b/libs/tensorflow/include/Eigen/src/Core/EigenBase.h index f76995a..ccc122c 100644 --- a/libs/tensorflow/include/Eigen/src/Core/EigenBase.h +++ b/libs/tensorflow/include/Eigen/src/Core/EigenBase.h @@ -128,6 +128,7 @@ template struct EigenBase */ template template +EIGEN_DEVICE_FUNC Derived& DenseBase::operator=(const EigenBase &other) { call_assignment(derived(), other.derived()); @@ -136,6 +137,7 @@ Derived& DenseBase::operator=(const EigenBase &other) template template +EIGEN_DEVICE_FUNC Derived& DenseBase::operator+=(const EigenBase &other) { call_assignment(derived(), other.derived(), internal::add_assign_op()); @@ -144,6 +146,7 @@ Derived& DenseBase::operator+=(const EigenBase &other) template template +EIGEN_DEVICE_FUNC Derived& DenseBase::operator-=(const EigenBase &other) { call_assignment(derived(), other.derived(), internal::sub_assign_op()); diff --git a/libs/tensorflow/include/Eigen/src/Core/Fuzzy.h b/libs/tensorflow/include/Eigen/src/Core/Fuzzy.h index 3e403a0..43aa49b 100644 --- a/libs/tensorflow/include/Eigen/src/Core/Fuzzy.h +++ b/libs/tensorflow/include/Eigen/src/Core/Fuzzy.h @@ -100,7 +100,7 @@ struct isMuchSmallerThan_scalar_selector */ template template -bool DenseBase::isApprox( +EIGEN_DEVICE_FUNC bool DenseBase::isApprox( const DenseBase& other, const RealScalar& prec ) const @@ -122,7 +122,7 @@ bool DenseBase::isApprox( * \sa isApprox(), isMuchSmallerThan(const DenseBase&, RealScalar) const */ template -bool DenseBase::isMuchSmallerThan( +EIGEN_DEVICE_FUNC bool DenseBase::isMuchSmallerThan( const typename NumTraits::Real& other, const RealScalar& prec ) const @@ -142,7 +142,7 @@ bool DenseBase::isMuchSmallerThan( */ template template -bool DenseBase::isMuchSmallerThan( +EIGEN_DEVICE_FUNC bool DenseBase::isMuchSmallerThan( const DenseBase& other, const RealScalar& prec ) const diff --git a/libs/tensorflow/include/Eigen/src/Core/GeneralProduct.h 
b/libs/tensorflow/include/Eigen/src/Core/GeneralProduct.h index 0f16cd8..b206b0a 100644 --- a/libs/tensorflow/include/Eigen/src/Core/GeneralProduct.h +++ b/libs/tensorflow/include/Eigen/src/Core/GeneralProduct.h @@ -428,7 +428,7 @@ MatrixBase::operator*(const MatrixBase &other) const template template const Product -MatrixBase::lazyProduct(const MatrixBase &other) const +EIGEN_DEVICE_FUNC MatrixBase::lazyProduct(const MatrixBase &other) const { enum { ProductIsValid = Derived::ColsAtCompileTime==Dynamic diff --git a/libs/tensorflow/include/Eigen/src/Core/GenericPacketMath.h b/libs/tensorflow/include/Eigen/src/Core/GenericPacketMath.h index ac5552d..d19d5bb 100644 --- a/libs/tensorflow/include/Eigen/src/Core/GenericPacketMath.h +++ b/libs/tensorflow/include/Eigen/src/Core/GenericPacketMath.h @@ -231,7 +231,7 @@ pload1(const typename unpacket_traits::type *a) { return pset1( * duplicated to form: {from[0],from[0],from[1],from[1],from[2],from[2],from[3],from[3]} * Currently, this function is only used for scalar * complex products. */ -template EIGEN_DEVICE_FUNC inline Packet +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet ploaddup(const typename unpacket_traits::type* from) { return *from; } /** \internal \returns a packet with elements of \a *from quadrupled. @@ -279,7 +279,7 @@ inline void pbroadcast2(const typename unpacket_traits::type *a, } /** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */ -template inline Packet +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet plset(const typename unpacket_traits::type& a) { return a; } /** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */ @@ -487,7 +487,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& fro * by the current computation. 
*/ template -inline Packet ploadt_ro(const typename unpacket_traits::type* from) +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt_ro(const typename unpacket_traits::type* from) { return ploadt(from); } diff --git a/libs/tensorflow/include/Eigen/src/Core/MatrixBase.h b/libs/tensorflow/include/Eigen/src/Core/MatrixBase.h index 675c94e..200e577 100644 --- a/libs/tensorflow/include/Eigen/src/Core/MatrixBase.h +++ b/libs/tensorflow/include/Eigen/src/Core/MatrixBase.h @@ -294,7 +294,7 @@ template class MatrixBase * fuzzy comparison such as isApprox() * \sa isApprox(), operator!= */ template - inline bool operator==(const MatrixBase& other) const + EIGEN_DEVICE_FUNC inline bool operator==(const MatrixBase& other) const { return cwiseEqual(other).all(); } /** \returns true if at least one pair of coefficients of \c *this and \a other are not exactly equal to each other. @@ -302,7 +302,7 @@ template class MatrixBase * fuzzy comparison such as isApprox() * \sa isApprox(), operator== */ template - inline bool operator!=(const MatrixBase& other) const + EIGEN_DEVICE_FUNC inline bool operator!=(const MatrixBase& other) const { return cwiseNotEqual(other).any(); } NoAlias noalias(); diff --git a/libs/tensorflow/include/Eigen/src/Core/NestByValue.h b/libs/tensorflow/include/Eigen/src/Core/NestByValue.h index 13adf07..01cf192 100644 --- a/libs/tensorflow/include/Eigen/src/Core/NestByValue.h +++ b/libs/tensorflow/include/Eigen/src/Core/NestByValue.h @@ -67,25 +67,25 @@ template class NestByValue } template - inline const PacketScalar packet(Index row, Index col) const + EIGEN_DEVICE_FUNC inline const PacketScalar packet(Index row, Index col) const { return m_expression.template packet(row, col); } template - inline void writePacket(Index row, Index col, const PacketScalar& x) + EIGEN_DEVICE_FUNC inline void writePacket(Index row, Index col, const PacketScalar& x) { m_expression.const_cast_derived().template writePacket(row, col, x); } template - inline const PacketScalar 
packet(Index index) const + EIGEN_DEVICE_FUNC inline const PacketScalar packet(Index index) const { return m_expression.template packet(index); } template - inline void writePacket(Index index, const PacketScalar& x) + EIGEN_DEVICE_FUNC inline void writePacket(Index index, const PacketScalar& x) { m_expression.const_cast_derived().template writePacket(index, x); } @@ -99,7 +99,7 @@ template class NestByValue /** \returns an expression of the temporary version of *this. */ template -inline const NestByValue +EIGEN_DEVICE_FUNC inline const NestByValue DenseBase::nestByValue() const { return NestByValue(derived()); diff --git a/libs/tensorflow/include/Eigen/src/Core/PlainObjectBase.h b/libs/tensorflow/include/Eigen/src/Core/PlainObjectBase.h index 639fb92..77f4f60 100644 --- a/libs/tensorflow/include/Eigen/src/Core/PlainObjectBase.h +++ b/libs/tensorflow/include/Eigen/src/Core/PlainObjectBase.h @@ -812,6 +812,13 @@ class PlainObjectBase : public internal::dense_xpr_base::type this->_set_noalias(other); } + // Initialize an arbitrary matrix from an object convertible to the Derived type. 
+ template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE void _init1(const Derived& other){ + this->_set_noalias(other); + } + // Initialize an arbitrary matrix from a generic Eigen expression template EIGEN_DEVICE_FUNC @@ -834,7 +841,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type this->derived() = r; } - // For fixed -size arrays: + // For fixed-size Array template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init1(const Scalar& val0, @@ -846,6 +853,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type Base::setConstant(val0); } + // For fixed-size Array template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init1(const Index& val0, diff --git a/libs/tensorflow/include/Eigen/src/Core/Random.h b/libs/tensorflow/include/Eigen/src/Core/Random.h index 6faf789..486e9ed 100644 --- a/libs/tensorflow/include/Eigen/src/Core/Random.h +++ b/libs/tensorflow/include/Eigen/src/Core/Random.h @@ -128,7 +128,7 @@ DenseBase::Random() * \sa class CwiseNullaryOp, setRandom(Index), setRandom(Index,Index) */ template -inline Derived& DenseBase::setRandom() +EIGEN_DEVICE_FUNC inline Derived& DenseBase::setRandom() { return *this = Random(rows(), cols()); } diff --git a/libs/tensorflow/include/Eigen/src/Core/Redux.h b/libs/tensorflow/include/Eigen/src/Core/Redux.h index b6e8f88..2b5b73b 100644 --- a/libs/tensorflow/include/Eigen/src/Core/Redux.h +++ b/libs/tensorflow/include/Eigen/src/Core/Redux.h @@ -407,7 +407,7 @@ class redux_evaluator */ template template -typename internal::traits::Scalar +EIGEN_DEVICE_FUNC typename internal::traits::Scalar DenseBase::redux(const Func& func) const { eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); @@ -422,7 +422,7 @@ DenseBase::redux(const Func& func) const * \warning the result is undefined if \c *this contains NaN. 
*/ template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::minCoeff() const { return derived().redux(Eigen::internal::scalar_min_op()); @@ -432,7 +432,7 @@ DenseBase::minCoeff() const * \warning the result is undefined if \c *this contains NaN. */ template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::maxCoeff() const { return derived().redux(Eigen::internal::scalar_max_op()); @@ -445,7 +445,7 @@ DenseBase::maxCoeff() const * \sa trace(), prod(), mean() */ template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::sum() const { if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0)) @@ -458,7 +458,7 @@ DenseBase::sum() const * \sa trace(), prod(), sum() */ template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::mean() const { #ifdef __INTEL_COMPILER @@ -479,7 +479,7 @@ DenseBase::mean() const * \sa sum(), mean(), trace() */ template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::prod() const { if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0)) @@ -494,7 +494,7 @@ DenseBase::prod() const * \sa diagonal(), sum() */ template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar MatrixBase::trace() const { return derived().diagonal().sum(); diff --git a/libs/tensorflow/include/Eigen/src/Core/Replicate.h b/libs/tensorflow/include/Eigen/src/Core/Replicate.h index 9960ef8..0b2d6d7 100644 --- a/libs/tensorflow/include/Eigen/src/Core/Replicate.h +++ b/libs/tensorflow/include/Eigen/src/Core/Replicate.h @@ -115,7 +115,7 
@@ template class Replicate */ template template -const Replicate +EIGEN_DEVICE_FUNC const Replicate DenseBase::replicate() const { return Replicate(derived()); @@ -130,7 +130,7 @@ DenseBase::replicate() const * \sa VectorwiseOp::replicate(), DenseBase::replicate(), class Replicate */ template -const typename VectorwiseOp::ReplicateReturnType +EIGEN_DEVICE_FUNC const typename VectorwiseOp::ReplicateReturnType VectorwiseOp::replicate(Index factor) const { return typename VectorwiseOp::ReplicateReturnType diff --git a/libs/tensorflow/include/Eigen/src/Core/ReturnByValue.h b/libs/tensorflow/include/Eigen/src/Core/ReturnByValue.h index c44b767..11dc86d 100644 --- a/libs/tensorflow/include/Eigen/src/Core/ReturnByValue.h +++ b/libs/tensorflow/include/Eigen/src/Core/ReturnByValue.h @@ -79,7 +79,7 @@ template class ReturnByValue template template -Derived& DenseBase::operator=(const ReturnByValue& other) +EIGEN_DEVICE_FUNC Derived& DenseBase::operator=(const ReturnByValue& other) { other.evalTo(derived()); return derived(); diff --git a/libs/tensorflow/include/Eigen/src/Core/Reverse.h b/libs/tensorflow/include/Eigen/src/Core/Reverse.h index 0640cda..8b6b3ab 100644 --- a/libs/tensorflow/include/Eigen/src/Core/Reverse.h +++ b/libs/tensorflow/include/Eigen/src/Core/Reverse.h @@ -114,7 +114,7 @@ template class Reverse * */ template -inline typename DenseBase::ReverseReturnType +EIGEN_DEVICE_FUNC inline typename DenseBase::ReverseReturnType DenseBase::reverse() { return ReverseReturnType(derived()); @@ -136,7 +136,7 @@ DenseBase::reverse() * * \sa VectorwiseOp::reverseInPlace(), reverse() */ template -inline void DenseBase::reverseInPlace() +EIGEN_DEVICE_FUNC inline void DenseBase::reverseInPlace() { if(cols()>rows()) { @@ -201,7 +201,7 @@ struct vectorwise_reverse_inplace_impl * * \sa DenseBase::reverseInPlace(), reverse() */ template -void VectorwiseOp::reverseInPlace() +EIGEN_DEVICE_FUNC void VectorwiseOp::reverseInPlace() { 
internal::vectorwise_reverse_inplace_impl::run(_expression().const_cast_derived()); } diff --git a/libs/tensorflow/include/Eigen/src/Core/SelfAdjointView.h b/libs/tensorflow/include/Eigen/src/Core/SelfAdjointView.h index 504c98f..7e71fe3 100644 --- a/libs/tensorflow/include/Eigen/src/Core/SelfAdjointView.h +++ b/libs/tensorflow/include/Eigen/src/Core/SelfAdjointView.h @@ -322,7 +322,7 @@ class triangular_dense_assignment_kernel template -typename MatrixBase::template ConstSelfAdjointViewReturnType::Type +EIGEN_DEVICE_FUNC typename MatrixBase::template ConstSelfAdjointViewReturnType::Type MatrixBase::selfadjointView() const { return typename ConstSelfAdjointViewReturnType::Type(derived()); @@ -339,7 +339,7 @@ MatrixBase::selfadjointView() const */ template template -typename MatrixBase::template SelfAdjointViewReturnType::Type +EIGEN_DEVICE_FUNC typename MatrixBase::template SelfAdjointViewReturnType::Type MatrixBase::selfadjointView() { return typename SelfAdjointViewReturnType::Type(derived()); diff --git a/libs/tensorflow/include/Eigen/src/Core/SelfCwiseBinaryOp.h b/libs/tensorflow/include/Eigen/src/Core/SelfCwiseBinaryOp.h index 719ed72..50099df 100644 --- a/libs/tensorflow/include/Eigen/src/Core/SelfCwiseBinaryOp.h +++ b/libs/tensorflow/include/Eigen/src/Core/SelfCwiseBinaryOp.h @@ -15,7 +15,7 @@ namespace Eigen { // TODO generalize the scalar type of 'other' template -EIGEN_STRONG_INLINE Derived& DenseBase::operator*=(const Scalar& other) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::operator*=(const Scalar& other) { typedef typename Derived::PlainObject PlainObject; internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::mul_assign_op()); @@ -23,7 +23,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase::operator*=(const Scalar& other) } template -EIGEN_STRONG_INLINE Derived& ArrayBase::operator+=(const Scalar& other) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase::operator+=(const Scalar& other) { 
typedef typename Derived::PlainObject PlainObject; internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::add_assign_op()); @@ -31,7 +31,7 @@ EIGEN_STRONG_INLINE Derived& ArrayBase::operator+=(const Scalar& other) } template -EIGEN_STRONG_INLINE Derived& ArrayBase::operator-=(const Scalar& other) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase::operator-=(const Scalar& other) { typedef typename Derived::PlainObject PlainObject; internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::sub_assign_op()); @@ -39,7 +39,7 @@ EIGEN_STRONG_INLINE Derived& ArrayBase::operator-=(const Scalar& other) } template -EIGEN_STRONG_INLINE Derived& DenseBase::operator/=(const Scalar& other) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::operator/=(const Scalar& other) { typedef typename Derived::PlainObject PlainObject; internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::div_assign_op()); diff --git a/libs/tensorflow/include/Eigen/src/Core/SolveTriangular.h b/libs/tensorflow/include/Eigen/src/Core/SolveTriangular.h index 049890b..a0011d4 100644 --- a/libs/tensorflow/include/Eigen/src/Core/SolveTriangular.h +++ b/libs/tensorflow/include/Eigen/src/Core/SolveTriangular.h @@ -164,7 +164,7 @@ struct triangular_solver_selector { #ifndef EIGEN_PARSED_BY_DOXYGEN template template -void TriangularViewImpl::solveInPlace(const MatrixBase& _other) const +EIGEN_DEVICE_FUNC void TriangularViewImpl::solveInPlace(const MatrixBase& _other) const { OtherDerived& other = _other.const_cast_derived(); eigen_assert( derived().cols() == derived().rows() && ((Side==OnTheLeft && derived().cols() == other.rows()) || (Side==OnTheRight && derived().cols() == other.cols())) ); diff --git a/libs/tensorflow/include/Eigen/src/Core/StableNorm.h b/libs/tensorflow/include/Eigen/src/Core/StableNorm.h index d2fe1e1..be04ed4 100644 --- 
a/libs/tensorflow/include/Eigen/src/Core/StableNorm.h +++ b/libs/tensorflow/include/Eigen/src/Core/StableNorm.h @@ -170,7 +170,8 @@ MatrixBase::stableNorm() const enum { CanAlign = ( (int(DerivedCopyClean::Flags)&DirectAccessBit) || (int(internal::evaluator::Alignment)>0) // FIXME Alignment)>0 might not be enough - ) && (blockSize*sizeof(Scalar)*20) // if we cannot allocate on the stack, then let's not bother about this optimization }; typedef typename internal::conditional, internal::evaluator::Alignment>, typename DerivedCopyClean::ConstSegmentReturnType>::type SegmentWrapper; diff --git a/libs/tensorflow/include/Eigen/src/Core/Transpose.h b/libs/tensorflow/include/Eigen/src/Core/Transpose.h index 79b767b..ba7d6e6 100644 --- a/libs/tensorflow/include/Eigen/src/Core/Transpose.h +++ b/libs/tensorflow/include/Eigen/src/Core/Transpose.h @@ -168,7 +168,7 @@ template class TransposeImpl * * \sa transposeInPlace(), adjoint() */ template -inline Transpose +EIGEN_DEVICE_FUNC inline Transpose DenseBase::transpose() { return TransposeReturnType(derived()); @@ -180,7 +180,7 @@ DenseBase::transpose() * * \sa transposeInPlace(), adjoint() */ template -inline typename DenseBase::ConstTransposeReturnType +EIGEN_DEVICE_FUNC inline typename DenseBase::ConstTransposeReturnType DenseBase::transpose() const { return ConstTransposeReturnType(derived()); @@ -206,7 +206,7 @@ DenseBase::transpose() const * * \sa adjointInPlace(), transpose(), conjugate(), class Transpose, class internal::scalar_conjugate_op */ template -inline const typename MatrixBase::AdjointReturnType +EIGEN_DEVICE_FUNC inline const typename MatrixBase::AdjointReturnType MatrixBase::adjoint() const { return AdjointReturnType(this->transpose()); @@ -281,7 +281,7 @@ struct inplace_transpose_selector { // non squ * * \sa transpose(), adjoint(), adjointInPlace() */ template -inline void DenseBase::transposeInPlace() +EIGEN_DEVICE_FUNC inline void DenseBase::transposeInPlace() { eigen_assert((rows() == cols() || 
(RowsAtCompileTime == Dynamic && ColsAtCompileTime == Dynamic)) && "transposeInPlace() called on a non-square non-resizable matrix"); @@ -312,7 +312,7 @@ inline void DenseBase::transposeInPlace() * * \sa transpose(), adjoint(), transposeInPlace() */ template -inline void MatrixBase::adjointInPlace() +EIGEN_DEVICE_FUNC inline void MatrixBase::adjointInPlace() { derived() = adjoint().eval(); } diff --git a/libs/tensorflow/include/Eigen/src/Core/TriangularMatrix.h b/libs/tensorflow/include/Eigen/src/Core/TriangularMatrix.h index 667ef09..ed80da3 100644 --- a/libs/tensorflow/include/Eigen/src/Core/TriangularMatrix.h +++ b/libs/tensorflow/include/Eigen/src/Core/TriangularMatrix.h @@ -488,7 +488,6 @@ template class TriangularViewImpl<_Mat * \sa TriangularView::solveInPlace() */ template - EIGEN_DEVICE_FUNC inline const internal::triangular_solve_retval solve(const MatrixBase& other) const; @@ -554,7 +553,7 @@ template class TriangularViewImpl<_Mat // FIXME should we keep that possibility template template -inline TriangularView& +EIGEN_DEVICE_FUNC inline TriangularView& TriangularViewImpl::operator=(const MatrixBase& other) { internal::call_assignment_no_alias(derived(), other.derived(), internal::assign_op()); @@ -564,7 +563,7 @@ TriangularViewImpl::operator=(const MatrixBase template -void TriangularViewImpl::lazyAssign(const MatrixBase& other) +EIGEN_DEVICE_FUNC void TriangularViewImpl::lazyAssign(const MatrixBase& other) { internal::call_assignment_no_alias(derived(), other.template triangularView()); } @@ -573,7 +572,7 @@ void TriangularViewImpl::lazyAssign(const MatrixBase template -inline TriangularView& +EIGEN_DEVICE_FUNC inline TriangularView& TriangularViewImpl::operator=(const TriangularBase& other) { eigen_assert(Mode == int(OtherDerived::Mode)); @@ -583,7 +582,7 @@ TriangularViewImpl::operator=(const TriangularBase template -void TriangularViewImpl::lazyAssign(const TriangularBase& other) +EIGEN_DEVICE_FUNC void TriangularViewImpl::lazyAssign(const 
TriangularBase& other) { eigen_assert(Mode == int(OtherDerived::Mode)); internal::call_assignment_no_alias(derived(), other.derived()); @@ -598,7 +597,7 @@ void TriangularViewImpl::lazyAssign(const TriangularBas * If the matrix is triangular, the opposite part is set to zero. */ template template -void TriangularBase::evalTo(MatrixBase &other) const +EIGEN_DEVICE_FUNC void TriangularBase::evalTo(MatrixBase &other) const { evalToLazy(other.derived()); } @@ -624,6 +623,7 @@ void TriangularBase::evalTo(MatrixBase &other) const */ template template +EIGEN_DEVICE_FUNC typename MatrixBase::template TriangularViewReturnType::Type MatrixBase::triangularView() { @@ -633,6 +633,7 @@ MatrixBase::triangularView() /** This is the const version of MatrixBase::triangularView() */ template template +EIGEN_DEVICE_FUNC typename MatrixBase::template ConstTriangularViewReturnType::Type MatrixBase::triangularView() const { @@ -930,7 +931,7 @@ struct triangular_assignment_loop * If the matrix is triangular, the opposite part is set to zero. 
*/ template template -void TriangularBase::evalToLazy(MatrixBase &other) const +EIGEN_DEVICE_FUNC void TriangularBase::evalToLazy(MatrixBase &other) const { other.derived().resize(this->rows(), this->cols()); internal::call_triangular_assignment_loop(other.derived(), derived().nestedExpression()); diff --git a/libs/tensorflow/include/Eigen/src/Core/VectorwiseOp.h b/libs/tensorflow/include/Eigen/src/Core/VectorwiseOp.h index 4fe267e..893bc79 100644 --- a/libs/tensorflow/include/Eigen/src/Core/VectorwiseOp.h +++ b/libs/tensorflow/include/Eigen/src/Core/VectorwiseOp.h @@ -670,7 +670,7 @@ template class VectorwiseOp * \sa rowwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting */ template -inline typename DenseBase::ColwiseReturnType +EIGEN_DEVICE_FUNC inline typename DenseBase::ColwiseReturnType DenseBase::colwise() { return ColwiseReturnType(derived()); @@ -684,7 +684,7 @@ DenseBase::colwise() * \sa colwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting */ template -inline typename DenseBase::RowwiseReturnType +EIGEN_DEVICE_FUNC inline typename DenseBase::RowwiseReturnType DenseBase::rowwise() { return RowwiseReturnType(derived()); diff --git a/libs/tensorflow/include/Eigen/src/Core/arch/AltiVec/PacketMath.h b/libs/tensorflow/include/Eigen/src/Core/arch/AltiVec/PacketMath.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/Eigen/src/Core/arch/CUDA/Half.h b/libs/tensorflow/include/Eigen/src/Core/arch/CUDA/Half.h index db98787..67518da 100644 --- a/libs/tensorflow/include/Eigen/src/Core/arch/CUDA/Half.h +++ b/libs/tensorflow/include/Eigen/src/Core/arch/CUDA/Half.h @@ -13,7 +13,7 @@ // Redistribution and use in source and binary forms, with or without // modification, are permitted. 
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, diff --git a/libs/tensorflow/include/Eigen/src/Core/arch/CUDA/PacketMath.h b/libs/tensorflow/include/Eigen/src/Core/arch/CUDA/PacketMath.h index ad66399..8c46af0 100644 --- a/libs/tensorflow/include/Eigen/src/Core/arch/CUDA/PacketMath.h +++ b/libs/tensorflow/include/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -167,10 +167,10 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu(const d return make_double2(from[0], from[1]); } -template<> EIGEN_STRONG_INLINE float4 ploaddup(const float* from) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploaddup(const float* from) { return make_float4(from[0], from[0], from[1], from[1]); } -template<> EIGEN_STRONG_INLINE double2 ploaddup(const double* from) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploaddup(const double* from) { return make_double2(from[0], from[0]); } @@ -291,7 +291,7 @@ template<> EIGEN_DEVICE_FUNC inline double2 pabs(const double2& a) { EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - double tmp = kernel.packet[0].y; + float tmp = kernel.packet[0].y; kernel.packet[0].y = kernel.packet[1].x; kernel.packet[1].x = tmp; diff --git a/libs/tensorflow/include/Eigen/src/Core/arch/NEON/PacketMath.h b/libs/tensorflow/include/Eigen/src/Core/arch/NEON/PacketMath.h index d392bf3..84a56bd 100644 --- a/libs/tensorflow/include/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/libs/tensorflow/include/Eigen/src/Core/arch/NEON/PacketMath.h @@ -46,7 +46,7 @@ typedef uint32x4_t Packet4ui; const Packet4f p4f_##NAME = pset1(X) #define 
_EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ - const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1(X)) + const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1(X)) #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ const Packet4i p4i_##NAME = pset1(X) @@ -83,7 +83,7 @@ template<> struct packet_traits : default_packet_traits HasSqrt = 0 }; }; -template<> struct packet_traits : default_packet_traits +template<> struct packet_traits : default_packet_traits { typedef Packet4i type; typedef Packet4i half; // Packet2i intrinsics not implemented yet @@ -105,11 +105,11 @@ EIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); } #endif -template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; -template<> struct unpacket_traits { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; +template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; +template<> struct unpacket_traits { typedef int32_t type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { return vdupq_n_f32(from); } -template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { return vdupq_n_s32(from); } +template<> EIGEN_STRONG_INLINE Packet4i pset1(const int32_t& from) { return vdupq_n_s32(from); } template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { @@ -117,7 +117,7 @@ template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) Packet4f countdown = vld1q_f32(f); return vaddq_f32(pset1(a), countdown); } -template<> EIGEN_STRONG_INLINE Packet4i plset(const int& a) +template<> EIGEN_STRONG_INLINE Packet4i plset(const int32_t& a) { const int32_t i[] = {0, 1, 2, 3}; Packet4i countdown = vld1q_s32(i); @@ -240,20 +240,20 @@ template<> EIGEN_STRONG_INLINE 
Packet4f pandnot(const Packet4f& a, con } template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); } -template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); } +template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4i pload(const int32_t* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); } -template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); } -template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); } +template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int32_t* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); } -template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) +template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) { float32x2_t lo, hi; lo = vld1_dup_f32(from); hi = vld1_dup_f32(from+1); return vcombine_f32(lo, hi); } -template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int* from) +template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int32_t* from) { int32x2_t lo, hi; lo = vld1_dup_s32(from); @@ -261,11 +261,11 @@ template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int* from) return vcombine_s32(lo, hi); } -template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); } -template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); } +template<> EIGEN_STRONG_INLINE void pstore (float* to, const Packet4f& from) { 
EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); } +template<> EIGEN_STRONG_INLINE void pstore(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); } -template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); } -template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); } +template<> EIGEN_STRONG_INLINE void pstoreu (float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); } +template<> EIGEN_STRONG_INLINE void pstoreu(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); } template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) { @@ -276,7 +276,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const floa res = vsetq_lane_f32(from[3*stride], res, 3); return res; } -template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* from, Index stride) +template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int32_t* from, Index stride) { Packet4i res = pset1(0); res = vsetq_lane_s32(from[0*stride], res, 0); @@ -293,7 +293,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, co to[stride*2] = vgetq_lane_f32(from, 2); to[stride*3] = vgetq_lane_f32(from, 3); } -template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const Packet4i& from, Index stride) +template<> EIGEN_DEVICE_FUNC inline void pscatter(int32_t* to, const Packet4i& from, Index stride) { to[stride*0] = vgetq_lane_s32(from, 0); to[stride*1] = vgetq_lane_s32(from, 1); @@ -301,12 +301,12 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const to[stride*3] = vgetq_lane_s32(from, 3); } -template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { EIGEN_ARM_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { EIGEN_ARM_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch (const 
float* addr) { EIGEN_ARM_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch(const int32_t* addr) { EIGEN_ARM_PREFETCH(addr); } // FIXME only store the 2 first elements ? -template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; } -template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE int32_t pfirst(const Packet4i& a) { int32_t EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; } template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { float32x2_t a_lo, a_hi; @@ -361,7 +361,7 @@ template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) return sum; } -template<> EIGEN_STRONG_INLINE int predux(const Packet4i& a) +template<> EIGEN_STRONG_INLINE int32_t predux(const Packet4i& a) { int32x2_t a_lo, a_hi, sum; @@ -408,7 +408,7 @@ template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) return vget_lane_f32(prod, 0); } -template<> EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) +template<> EIGEN_STRONG_INLINE int32_t predux_mul(const Packet4i& a) { int32x2_t a_lo, a_hi, prod; @@ -436,7 +436,7 @@ template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) return vget_lane_f32(min, 0); } -template<> EIGEN_STRONG_INLINE int predux_min(const Packet4i& a) +template<> EIGEN_STRONG_INLINE int32_t predux_min(const Packet4i& a) { int32x2_t a_lo, a_hi, min; @@ -461,7 +461,7 @@ template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) return vget_lane_f32(max, 0); } -template<> EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) +template<> EIGEN_STRONG_INLINE int32_t predux_max(const Packet4i& a) { int32x2_t a_lo, a_hi, max; diff --git a/libs/tensorflow/include/Eigen/src/Core/arch/SSE/PacketMath.h 
b/libs/tensorflow/include/Eigen/src/Core/arch/SSE/PacketMath.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/Eigen/src/Core/arch/ZVector/PacketMath.h b/libs/tensorflow/include/Eigen/src/Core/arch/ZVector/PacketMath.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/libs/tensorflow/include/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index 5cd2794..ad38bcf 100644 --- a/libs/tensorflow/include/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/libs/tensorflow/include/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -148,7 +148,7 @@ struct tribb_kernel ResMapper res(_res, resStride); gebp_kernel gebp_kernel; - Matrix buffer; + Matrix buffer((internal::constructor_without_unaligned_array_assert())); // let's process the block per panel of actual_mc x BlockSize, // again, each is split into three parts, etc. @@ -292,12 +292,12 @@ struct general_product_to_triangular_selector template template -TriangularView& TriangularViewImpl::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta) +EIGEN_DEVICE_FUNC TriangularView& TriangularViewImpl::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta) { eigen_assert(derived().nestedExpression().rows() == prod.rows() && derived().cols() == prod.cols()); - + general_product_to_triangular_selector::InnerSize==1>::run(derived().nestedExpression().const_cast_derived(), prod, alpha, beta); - + return derived(); } diff --git a/libs/tensorflow/include/Eigen/src/Core/products/SelfadjointProduct.h b/libs/tensorflow/include/Eigen/src/Core/products/SelfadjointProduct.h index f038d68..39c5b59 100644 --- a/libs/tensorflow/include/Eigen/src/Core/products/SelfadjointProduct.h +++ b/libs/tensorflow/include/Eigen/src/Core/products/SelfadjointProduct.h @@ -120,7 +120,7 @@ struct selfadjoint_product_selector template template -SelfAdjointView& SelfAdjointView +EIGEN_DEVICE_FUNC 
SelfAdjointView& SelfAdjointView ::rankUpdate(const MatrixBase& u, const Scalar& alpha) { selfadjoint_product_selector::run(_expression().const_cast_derived(), u.derived(), alpha); diff --git a/libs/tensorflow/include/Eigen/src/Core/products/SelfadjointRank2Update.h b/libs/tensorflow/include/Eigen/src/Core/products/SelfadjointRank2Update.h index 2ae3641..d395888 100644 --- a/libs/tensorflow/include/Eigen/src/Core/products/SelfadjointRank2Update.h +++ b/libs/tensorflow/include/Eigen/src/Core/products/SelfadjointRank2Update.h @@ -57,7 +57,7 @@ template struct conj_expr_if template template -SelfAdjointView& SelfAdjointView +EIGEN_DEVICE_FUNC SelfAdjointView& SelfAdjointView ::rankUpdate(const MatrixBase& u, const MatrixBase& v, const Scalar& alpha) { typedef internal::blas_traits UBlasTraits; diff --git a/libs/tensorflow/include/Eigen/src/Core/products/TriangularMatrixMatrix.h b/libs/tensorflow/include/Eigen/src/Core/products/TriangularMatrixMatrix.h index 8a2f7cd..6ec5a8a 100644 --- a/libs/tensorflow/include/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/libs/tensorflow/include/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -137,7 +137,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix triangularBuffer; + Matrix triangularBuffer((internal::constructor_without_unaligned_array_assert())); triangularBuffer.setZero(); if((Mode&ZeroDiag)==ZeroDiag) triangularBuffer.diagonal().setZero(); @@ -284,7 +284,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix triangularBuffer; + Matrix triangularBuffer((internal::constructor_without_unaligned_array_assert())); triangularBuffer.setZero(); if((Mode&ZeroDiag)==ZeroDiag) triangularBuffer.diagonal().setZero(); diff --git a/libs/tensorflow/include/Eigen/src/Core/util/BlasUtil.h b/libs/tensorflow/include/Eigen/src/Core/util/BlasUtil.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/Eigen/src/Core/util/DisableStupidWarnings.h 
b/libs/tensorflow/include/Eigen/src/Core/util/DisableStupidWarnings.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/Eigen/src/Core/util/IntegralConstant.h b/libs/tensorflow/include/Eigen/src/Core/util/IntegralConstant.h index ae41015..78a4705 100644 --- a/libs/tensorflow/include/Eigen/src/Core/util/IntegralConstant.h +++ b/libs/tensorflow/include/Eigen/src/Core/util/IntegralConstant.h @@ -151,9 +151,9 @@ struct get_fixed_value,Default> { static const int value = N; }; -template Index get_runtime_value(const T &x) { return x; } +template EIGEN_DEVICE_FUNC Index get_runtime_value(const T &x) { return x; } #if !EIGEN_HAS_CXX14 -template Index get_runtime_value(FixedInt (*)()) { return N; } +template EIGEN_DEVICE_FUNC Index get_runtime_value(FixedInt (*)()) { return N; } #endif // Cleanup integer/FixedInt/VariableAndFixedInt/etc types: diff --git a/libs/tensorflow/include/Eigen/src/Core/util/MKL_support.h b/libs/tensorflow/include/Eigen/src/Core/util/MKL_support.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/Eigen/src/Core/util/Macros.h b/libs/tensorflow/include/Eigen/src/Core/util/Macros.h index 12531e3..14ec87d 100644 --- a/libs/tensorflow/include/Eigen/src/Core/util/Macros.h +++ b/libs/tensorflow/include/Eigen/src/Core/util/Macros.h @@ -542,8 +542,8 @@ // - static is not very good because it prevents definitions from different object files to be merged. // So static causes the resulting linked executable to be bloated with multiple copies of the same function. // - inline is not perfect either as it unwantedly hints the compiler toward inlining the function. 
-#define EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -#define EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS inline +#define EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_DEVICE_FUNC +#define EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_DEVICE_FUNC inline #ifdef NDEBUG # ifndef EIGEN_NO_DEBUG @@ -837,7 +837,7 @@ namespace Eigen { // just an empty macro ! #define EIGEN_EMPTY -#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || __CUDACC_VER__) // for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324) +#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || defined(__CUDACC_VER__)) // for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324) #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ using Base::operator =; #elif EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653) diff --git a/libs/tensorflow/include/Eigen/src/Core/util/Meta.h b/libs/tensorflow/include/Eigen/src/Core/util/Meta.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h b/libs/tensorflow/include/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h index 4fec8af..dbbd480 100644 --- a/libs/tensorflow/include/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +++ b/libs/tensorflow/include/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h @@ -85,7 +85,7 @@ MatrixBase::eigenvalues() const * \sa SelfAdjointEigenSolver::eigenvalues(), MatrixBase::eigenvalues() */ template -inline typename SelfAdjointView::EigenvaluesReturnType +EIGEN_DEVICE_FUNC inline typename SelfAdjointView::EigenvaluesReturnType SelfAdjointView::eigenvalues() const { typedef typename SelfAdjointView::PlainObject PlainObject; @@ -149,7 +149,7 @@ MatrixBase::operatorNorm() const * \sa eigenvalues(), MatrixBase::operatorNorm() */ template -inline typename SelfAdjointView::RealScalar 
+EIGEN_DEVICE_FUNC inline typename SelfAdjointView::RealScalar SelfAdjointView::operatorNorm() const { return eigenvalues().cwiseAbs().maxCoeff(); diff --git a/libs/tensorflow/include/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/libs/tensorflow/include/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h index a9f56c4..9ddd553 100644 --- a/libs/tensorflow/include/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +++ b/libs/tensorflow/include/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h @@ -414,7 +414,8 @@ ::compute(const EigenBase& a_matrix, int options) if(n==1) { - m_eivalues.coeffRef(0,0) = numext::real(matrix.diagonal()[0]); + m_eivec = matrix; + m_eivalues.coeffRef(0,0) = numext::real(m_eivec.coeff(0,0)); if(computeEigenvectors) m_eivec.setOnes(n,n); m_info = Success; diff --git a/libs/tensorflow/include/Eigen/src/Geometry/Scaling.h b/libs/tensorflow/include/Eigen/src/Geometry/Scaling.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/Eigen/src/Householder/BlockHouseholder.h b/libs/tensorflow/include/Eigen/src/Householder/BlockHouseholder.h index 39bf8c8..01a7ed1 100644 --- a/libs/tensorflow/include/Eigen/src/Householder/BlockHouseholder.h +++ b/libs/tensorflow/include/Eigen/src/Householder/BlockHouseholder.h @@ -87,7 +87,8 @@ void apply_block_householder_on_the_left(MatrixType& mat, const VectorsType& vec const TriangularView V(vectors); // A -= V T V^* A - Matrix tmp = V.adjoint() * mat; // FIXME add .noalias() once the triangular product can work inplace if(forward) tmp = T.template triangularView() * tmp; diff --git a/libs/tensorflow/include/Eigen/src/LU/FullPivLU.h b/libs/tensorflow/include/Eigen/src/LU/FullPivLU.h index 03b6af7..ec61086 100644 --- a/libs/tensorflow/include/Eigen/src/LU/FullPivLU.h +++ b/libs/tensorflow/include/Eigen/src/LU/FullPivLU.h @@ -411,11 +411,9 @@ template class FullPivLU #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; template - 
EIGEN_DEVICE_FUNC void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif diff --git a/libs/tensorflow/include/Eigen/src/QR/ColPivHouseholderQR.h b/libs/tensorflow/include/Eigen/src/QR/ColPivHouseholderQR.h index 0e47c83..d35395d 100644 --- a/libs/tensorflow/include/Eigen/src/QR/ColPivHouseholderQR.h +++ b/libs/tensorflow/include/Eigen/src/QR/ColPivHouseholderQR.h @@ -416,7 +416,6 @@ template class ColPivHouseholderQR #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; #endif diff --git a/libs/tensorflow/include/Eigen/src/QR/CompleteOrthogonalDecomposition.h b/libs/tensorflow/include/Eigen/src/QR/CompleteOrthogonalDecomposition.h index 34c637b..13b61fc 100644 --- a/libs/tensorflow/include/Eigen/src/QR/CompleteOrthogonalDecomposition.h +++ b/libs/tensorflow/include/Eigen/src/QR/CompleteOrthogonalDecomposition.h @@ -367,7 +367,7 @@ class CompleteOrthogonalDecomposition { #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType& rhs, DstType& dst) const; + void _solve_impl(const RhsType& rhs, DstType& dst) const; #endif protected: diff --git a/libs/tensorflow/include/Eigen/src/QR/FullPivHouseholderQR.h b/libs/tensorflow/include/Eigen/src/QR/FullPivHouseholderQR.h index e489bdd..c31e47c 100644 --- a/libs/tensorflow/include/Eigen/src/QR/FullPivHouseholderQR.h +++ b/libs/tensorflow/include/Eigen/src/QR/FullPivHouseholderQR.h @@ -392,22 +392,21 @@ template class FullPivHouseholderQR * diagonal coefficient of U. 
*/ RealScalar maxPivot() const { return m_maxpivot; } - + #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; #endif protected: - + static void check_template_parameters() { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } - + void computeInPlace(); - + MatrixType m_qr; HCoeffsType m_hCoeffs; IntDiagSizeVectorType m_rows_transpositions; diff --git a/libs/tensorflow/include/Eigen/src/QR/HouseholderQR.h b/libs/tensorflow/include/Eigen/src/QR/HouseholderQR.h index 3513d99..762b21c 100644 --- a/libs/tensorflow/include/Eigen/src/QR/HouseholderQR.h +++ b/libs/tensorflow/include/Eigen/src/QR/HouseholderQR.h @@ -204,28 +204,27 @@ template class HouseholderQR inline Index rows() const { return m_qr.rows(); } inline Index cols() const { return m_qr.cols(); } - + /** \returns a const reference to the vector of Householder coefficients used to represent the factor \c Q. * * For advanced uses only. */ const HCoeffsType& hCoeffs() const { return m_hCoeffs; } - + #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; #endif protected: - + static void check_template_parameters() { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } void computeInPlace(); - + MatrixType m_qr; HCoeffsType m_hCoeffs; RowVectorType m_temp; diff --git a/libs/tensorflow/include/Eigen/src/SVD/JacobiSVD.h b/libs/tensorflow/include/Eigen/src/SVD/JacobiSVD.h index 1337ae9..43488b1 100644 --- a/libs/tensorflow/include/Eigen/src/SVD/JacobiSVD.h +++ b/libs/tensorflow/include/Eigen/src/SVD/JacobiSVD.h @@ -112,9 +112,11 @@ class qr_preconditioner_impl + typedef Matrix TransposeTypeWithSameStorageOrder; void allocate(const JacobiSVD& svd) diff --git a/libs/tensorflow/include/Eigen/src/SVD/SVDBase.h b/libs/tensorflow/include/Eigen/src/SVD/SVDBase.h index cc90a3b..4294147 100644 --- a/libs/tensorflow/include/Eigen/src/SVD/SVDBase.h +++ b/libs/tensorflow/include/Eigen/src/SVD/SVDBase.h @@ -212,7 +212,6 
@@ class SVDBase #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; #endif diff --git a/libs/tensorflow/include/Eigen/src/misc/lapacke.h b/libs/tensorflow/include/Eigen/src/misc/lapacke.h old mode 100644 new mode 100755 index 8c7e79b..3d8e24f --- a/libs/tensorflow/include/Eigen/src/misc/lapacke.h +++ b/libs/tensorflow/include/Eigen/src/misc/lapacke.h @@ -43,10 +43,6 @@ #include "lapacke_config.h" #endif -#ifdef __cplusplus -extern "C" { -#endif /* __cplusplus */ - #include #ifndef lapack_int @@ -108,6 +104,11 @@ lapack_complex_double lapack_make_complex_double( double re, double im ); #endif + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + #ifndef LAPACKE_malloc #define LAPACKE_malloc( size ) malloc( size ) #endif diff --git a/libs/tensorflow/include/tensorflow/cc/BUILD b/libs/tensorflow/include/tensorflow/cc/BUILD index a488440..9a41d2b 100644 --- a/libs/tensorflow/include/tensorflow/cc/BUILD +++ b/libs/tensorflow/include/tensorflow/cc/BUILD @@ -229,6 +229,7 @@ cc_library( ":cc_ops_internal", ":grad_op_registry", ":gradients", + "//tensorflow/core:lib_proto_parsing", ], ) @@ -374,6 +375,16 @@ tf_gen_op_wrappers_cc( visibility = ["//tensorflow:internal"], ) +tf_gen_op_wrappers_cc( + name = "resource_variable_ops", + include_internal_ops = 1, + op_lib_names = [ + "resource_variable_ops", + ], + pkg = "//tensorflow/core", + visibility = ["//tensorflow:internal"], +) + tf_gen_op_wrappers_cc( name = "remote_fused_graph_ops", op_lib_names = [ diff --git a/libs/tensorflow/include/tensorflow/cc/client/client_session.h b/libs/tensorflow/include/tensorflow/cc/client/client_session.h index a6fe020..5fb4109 100644 --- a/libs/tensorflow/include/tensorflow/cc/client/client_session.h +++ b/libs/tensorflow/include/tensorflow/cc/client/client_session.h @@ -23,10 +23,6 @@ limitations under the License. 
#include "tensorflow/cc/framework/ops.h" #include "tensorflow/cc/framework/scope.h" -#include "tensorflow/core/platform/macros.h" -#include "tensorflow/core/platform/mutex.h" -#include "tensorflow/core/protobuf/config.pb.h" -#include "tensorflow/core/public/session.h" #include "tensorflow/core/public/session_options.h" namespace tensorflow { @@ -67,6 +63,8 @@ class ClientSession { /// Create a new session, configuring it with `session_options`. ClientSession(const Scope& scope, const SessionOptions& session_options); + ~ClientSession(); + /// Evaluate the tensors in `fetch_outputs`. The values are returned as /// `Tensor` objects in `outputs`. The number and order of `outputs` will /// match `fetch_outputs`. @@ -92,16 +90,10 @@ class ClientSession { // TODO(keveman): Add support for partial run. private: - SessionOptions MakeDefaultSessionOptions(const string& target) const; - Status MaybeExtendGraph() const; - - std::unique_ptr session_; - std::shared_ptr graph_; - - mutable mutex mu_; - mutable int last_num_graph_nodes_ GUARDED_BY(mu_) = 0; - - TF_DISALLOW_COPY_AND_ASSIGN(ClientSession); + class Impl; + std::unique_ptr impl_; + Impl* impl() { return impl_.get(); } + const Impl* impl() const { return impl_.get(); } }; /// @} diff --git a/libs/tensorflow/include/tensorflow/cc/framework/scope.h b/libs/tensorflow/include/tensorflow/cc/framework/scope.h index 2963442..ce70da7 100644 --- a/libs/tensorflow/include/tensorflow/cc/framework/scope.h +++ b/libs/tensorflow/include/tensorflow/cc/framework/scope.h @@ -23,12 +23,12 @@ limitations under the License. #include #include "tensorflow/cc/framework/ops.h" -#include "tensorflow/core/common_runtime/shape_refiner.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/gtl/array_slice.h" namespace tensorflow { +class Graph; class GraphDef; class NodeBuilder; struct CompositeOpScopes; @@ -94,6 +94,10 @@ struct CompositeOpScopes; /// op-constructor functions on the same `Scope` object. 
class Scope { public: + Scope(const Scope& other); + ~Scope(); + Scope& operator=(const Scope& other); + // The following functions are for users making graphs. They return brand new // scopes, or scopes derived from an existing scope object. @@ -164,20 +168,21 @@ class Scope { // START_SKIP_DOXYGEN /// Update the builder with properties accumulated in this scope. + // TODO(skyewm): NodeBuilder is not part of public API void UpdateBuilder(NodeBuilder* builder) const; // END_SKIP_DOXYGEN CompositeOpScopes GetCompositeOpScopes(const string& composite_op_name) const; - bool ok() const { return status_->ok(); } - - Graph* graph() const { return graph_.get(); } + bool ok() const; - ShapeRefiner* refiner() const { return refiner_.get(); } + // TODO(skyewm): Graph is not part of public API + Graph* graph() const; - std::shared_ptr graph_as_shared_ptr() const { return graph_; } + // TODO(skyewm): Graph is not part of public API + std::shared_ptr graph_as_shared_ptr() const; - Status status() const { return *status_; } + Status status() const; /// If status() is Status::OK(), convert the Graph object stored in this scope /// to a GraphDef proto and return Status::OK(). Otherwise, return the error @@ -196,74 +201,14 @@ class Scope { Status ToGraph(Graph* g) const; // END_SKIP_DOXYGEN - const std::vector& control_deps() const { return control_deps_; } + const std::vector& control_deps() const; private: - // Tag types to choose the constructor to dispatch. - struct Tags { - enum class ScopeName; - enum class OpName; - enum class ControlDeps; - enum class Device; - enum class SingleUseScope; - enum class ExitOnError; - enum class KernelLabel; - enum class Colocate; - }; - - // A NameMap is used to keep track of suffixes for names used in a scope. A - // name that has not been used so far in a scope will get no suffix. Later - // uses of the same name will get suffixes _1, _2, _3, etc. Multiple scopes - // can share the same NameMap. 
For instance, a new scope created using - // WithControlDependencies() should would share the same NameMap with the - // parent. - typedef std::unordered_map NameMap; - - Scope(Graph* graph, Status* status, NameMap* name_map, ShapeRefiner* refiner); - Scope(const Scope& other, Tags::ScopeName, const string& name, - bool copy_names); - Scope(const Scope& other, Tags::OpName, const string& name, - const string& op_name); - Scope(const Scope& other, Tags::ControlDeps, - std::vector control_deps, bool clear_control_deps); - Scope(const Scope& other, Tags::Device, const string& device); - Scope(const Scope& other, Tags::SingleUseScope, const string& op_name); - Scope(const Scope& other, Tags::ExitOnError); - Scope(const Scope& other, Tags::KernelLabel, const string& kernel_label); - Scope(const Scope& other, Tags::Colocate, const Operation& colocate_with_op, - bool clear_colocations); - - std::unordered_set GetColocationConstraints( - const Operation& colocate_with_op) const; - - // Helper functions to get a unique names. - string GetUniqueName(const string& prefix, bool check_single_use) const; - string GetNameForOp(const string& default_name) const; - - bool single_use_scope() const { return scope_used_ != nullptr; } - - // The graph, status, and name maps are shared by all child scopes - // created from a single 'root' scope. A root scope is created by calling the - // Scope::NewRootScope function, which creates a new graph, a new status and - // the name maps. - std::shared_ptr graph_ = nullptr; - std::shared_ptr status_ = nullptr; - std::shared_ptr name_map_ = nullptr; - std::shared_ptr refiner_ = nullptr; - - // If scope_used_ is not nullptr, op_name_ should be empty and - // GetUniqueNameForOp can only be called once on this scope. More calls to - // GetUniqueNameForOp will cause an error status to be set on this scope. 
- std::shared_ptr scope_used_ = nullptr; - - const std::vector control_deps_; - - const string name_ = ""; - const string op_name_ = ""; - const bool exit_on_error_ = false; - const string kernel_label_ = ""; - const string device_ = ""; - const std::unordered_set colocation_constraints_; + class Impl; + std::unique_ptr impl_; + Impl* impl() { return impl_.get(); } + const Impl* impl() const { return impl_.get(); } + explicit Scope(Impl*); }; /// A helper struct to hold the scopes that would be used by a function diff --git a/libs/tensorflow/include/tensorflow/cc/gradients/README.md b/libs/tensorflow/include/tensorflow/cc/gradients/README.md new file mode 100644 index 0000000..3253163 --- /dev/null +++ b/libs/tensorflow/include/tensorflow/cc/gradients/README.md @@ -0,0 +1,52 @@ +# C++ gradients + +Gradients are currently being ported from +[python](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/python/ops) +to C++ (in this directory). + +Contributions are welcome and much appreciated; please follow the instructions +below. + +1. Create the op gradient function in `foo_grad.cc` corresponding to the + `foo_grad.py` file where the op originated (i.e. `array_grad.py` op + gradients should be written in `array_grad.cc`). + +2. Write the op gradient with the following naming scheme: + + Status OpNameGrad(const Scope& scope, const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs) { + ... + return scope.status(); + } + REGISTER_GRADIENT_OP("OpName", OpNameGrad); + +3. Ops gradients are implemented by using the [C++ + API](https://www.tensorflow.org/api_docs/cc/). + +4. Tests should be included in `foo_grad_test.cc`. Please see + [`array_grad_test.cc`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/cc/gradients/array_grad_test.cc) + for an many examples. 
Tests are as simple as, creating a placeholder input + for the op's inputs and calling `RunTest` (`RunTest` uses a [gradient + checker](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/cc/framework/gradient_checker.cc) + to verify that the theoretical gradient matches the numeric gradient). For + example: + + TEST_F(ArrayGradTest, IdentityGrad) { + TensorShape shape({5, 2}); + auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape)); + auto y = Identity(scope_, x); + RunTest(x, shape, y, shape); + } + +NOTE: There are some ops that require features from the C++ API that are not yet +implemented. + +* Ops that require PartialTensorShape information cannot yet be implemented. + +* Ops that require SparseTensor or IndexSlices (currently only in python) + cannot yet be implemented. + +* Maybe more. + +For questions: Please create an issue assigned to suharshs. diff --git a/libs/tensorflow/include/tensorflow/cc/ops/array_ops.h b/libs/tensorflow/include/tensorflow/cc/ops/array_ops.h index c61b2c5..f1e56fe 100755 --- a/libs/tensorflow/include/tensorflow/cc/ops/array_ops.h +++ b/libs/tensorflow/include/tensorflow/cc/ops/array_ops.h @@ -354,288 +354,6 @@ class Concat { ::tensorflow::Output output; }; -/// Copy Op. -/// -/// Performs CPU-to-CPU or GPU-to-GPU deep-copying of tensor, depending on the -/// device on which the tensor is allocated. -/// -/// Unlike the CopyHost Op, this op does not have HostMemory constraint on its -/// input or output. -/// -/// Arguments: -/// * scope: A Scope object -/// * input: Input tensor. -/// -/// Optional attributes (see `Attrs`): -/// * tensor_name: The name of the input tensor. -/// -/// Returns: -/// * `Output`: Output tensor, deep-copied from input. -class Copy { - public: - /// Optional attribute setters for Copy - struct Attrs { - /// The name of the input tensor. 
- /// - /// Defaults to "" - Attrs TensorName(StringPiece x) { - Attrs ret = *this; - ret.tensor_name_ = x; - return ret; - } - - StringPiece tensor_name_ = ""; - }; - Copy(const ::tensorflow::Scope& scope, ::tensorflow::Input input); - Copy(const ::tensorflow::Scope& scope, ::tensorflow::Input input, const - Copy::Attrs& attrs); - operator ::tensorflow::Output() const { return output; } - operator ::tensorflow::Input() const { return output; } - ::tensorflow::Node* node() const { return output.node(); } - - static Attrs TensorName(StringPiece x) { - return Attrs().TensorName(x); - } - - ::tensorflow::Output output; -}; - -/// Copy Host Op. -/// -/// Performs CPU-to-CPU deep-copying of tensor. -/// -/// Unlike the Copy Op, this op has HostMemory constraint on its input or output. -/// -/// Arguments: -/// * scope: A Scope object -/// * input: Input tensor. -/// -/// Optional attributes (see `Attrs`): -/// * tensor_name: The name of the input tensor. -/// -/// Returns: -/// * `Output`: Output tensor, deep-copied from input. -class CopyHost { - public: - /// Optional attribute setters for CopyHost - struct Attrs { - /// The name of the input tensor. - /// - /// Defaults to "" - Attrs TensorName(StringPiece x) { - Attrs ret = *this; - ret.tensor_name_ = x; - return ret; - } - - StringPiece tensor_name_ = ""; - }; - CopyHost(const ::tensorflow::Scope& scope, ::tensorflow::Input input); - CopyHost(const ::tensorflow::Scope& scope, ::tensorflow::Input input, const - CopyHost::Attrs& attrs); - operator ::tensorflow::Output() const { return output; } - operator ::tensorflow::Input() const { return output; } - ::tensorflow::Node* node() const { return output.node(); } - - static Attrs TensorName(StringPiece x) { - return Attrs().TensorName(x); - } - - ::tensorflow::Output output; -}; - -/// Debug Identity Op. -/// -/// Provides an identity mapping of the non-Ref type input tensor for debugging. 
-/// -/// Arguments: -/// * scope: A Scope object -/// * input: Input tensor, non-Reference type. -/// -/// Optional attributes (see `Attrs`): -/// * tensor_name: Name of the input tensor. -/// * debug_urls: List of URLs to debug targets, e.g., -/// file:///foo/tfdbg_dump, grpc:://localhost:11011 -/// -/// Returns: -/// * `Output`: Output tensor that equals the input tensor. -class DebugIdentity { - public: - /// Optional attribute setters for DebugIdentity - struct Attrs { - /// Name of the input tensor. - /// - /// Defaults to "" - Attrs TensorName(StringPiece x) { - Attrs ret = *this; - ret.tensor_name_ = x; - return ret; - } - - /// List of URLs to debug targets, e.g., - /// file:///foo/tfdbg_dump, grpc:://localhost:11011 - /// - /// Defaults to [] - Attrs DebugUrls(const gtl::ArraySlice& x) { - Attrs ret = *this; - ret.debug_urls_ = x; - return ret; - } - - StringPiece tensor_name_ = ""; - gtl::ArraySlice debug_urls_ = {}; - }; - DebugIdentity(const ::tensorflow::Scope& scope, ::tensorflow::Input input); - DebugIdentity(const ::tensorflow::Scope& scope, ::tensorflow::Input input, - const DebugIdentity::Attrs& attrs); - operator ::tensorflow::Output() const { return output; } - operator ::tensorflow::Input() const { return output; } - ::tensorflow::Node* node() const { return output.node(); } - - static Attrs TensorName(StringPiece x) { - return Attrs().TensorName(x); - } - static Attrs DebugUrls(const gtl::ArraySlice& x) { - return Attrs().DebugUrls(x); - } - - ::tensorflow::Output output; -}; - -/// Debug NaN Value Counter Op -/// -/// Counts number of NaNs in the input tensor, for debugging. -/// -/// Arguments: -/// * scope: A Scope object -/// * input: Input tensor, non-Reference type. -/// -/// Optional attributes (see `Attrs`): -/// * tensor_name: Name of the input tensor. 
-/// * debug_urls: List of URLs to debug targets, e.g., -/// file:///foo/tfdbg_dump, grpc:://localhost:11011 -/// -/// Returns: -/// * `Output`: An integer output tensor that is the number of NaNs in the input. -class DebugNanCount { - public: - /// Optional attribute setters for DebugNanCount - struct Attrs { - /// Name of the input tensor. - /// - /// Defaults to "" - Attrs TensorName(StringPiece x) { - Attrs ret = *this; - ret.tensor_name_ = x; - return ret; - } - - /// List of URLs to debug targets, e.g., - /// file:///foo/tfdbg_dump, grpc:://localhost:11011 - /// - /// Defaults to [] - Attrs DebugUrls(const gtl::ArraySlice& x) { - Attrs ret = *this; - ret.debug_urls_ = x; - return ret; - } - - StringPiece tensor_name_ = ""; - gtl::ArraySlice debug_urls_ = {}; - }; - DebugNanCount(const ::tensorflow::Scope& scope, ::tensorflow::Input input); - DebugNanCount(const ::tensorflow::Scope& scope, ::tensorflow::Input input, - const DebugNanCount::Attrs& attrs); - operator ::tensorflow::Output() const { return output; } - operator ::tensorflow::Input() const { return output; } - ::tensorflow::Node* node() const { return output.node(); } - - static Attrs TensorName(StringPiece x) { - return Attrs().TensorName(x); - } - static Attrs DebugUrls(const gtl::ArraySlice& x) { - return Attrs().DebugUrls(x); - } - - ::tensorflow::Output output; -}; - -/// Debug Numeric Summary Op. -/// -/// Provide a basic summary of numeric value types, range and distribution. -/// -/// Arguments: -/// * scope: A Scope object -/// * input: Input tensor, non-Reference type, float or double. -/// -/// Optional attributes (see `Attrs`): -/// * tensor_name: Name of the input tensor. -/// * debug_urls: List of URLs to debug targets, e.g., -/// file:///foo/tfdbg_dump, grpc:://localhost:11011 -/// -/// Returns: -/// * `Output`: A double tensor of shape [12], the elements of which are: -/// [0]: is initialized (1.0) or not (0.0). 
-/// [1]: total number of elements -/// [2]: -inf count -/// [3]: negative element count (excluding -inf) -/// [4]: zero element count -/// [5]: positive element count (excluding +inf) -/// [6]: +inf element count -/// [7]: NaN element count -/// Output elements [1:8] are all zero, if the tensor is uninitialized. -/// [8]: minimum of all non-inf and non-NaN elements. -/// If uninitialized or no such element exists: +inf. -/// [9]: maximum of all non-inf and non-NaN elements. -/// If uninitialized or no such element exists: -inf. -/// [10]: mean of all non-inf and non-NaN elements. -/// If uninitialized or no such element exists: NaN. -/// [11]: variance of all non-inf and non-NaN elements. -/// If uninitialized or no such element exists: NaN. -class DebugNumericSummary { - public: - /// Optional attribute setters for DebugNumericSummary - struct Attrs { - /// Name of the input tensor. - /// - /// Defaults to "" - Attrs TensorName(StringPiece x) { - Attrs ret = *this; - ret.tensor_name_ = x; - return ret; - } - - /// List of URLs to debug targets, e.g., - /// file:///foo/tfdbg_dump, grpc:://localhost:11011 - /// - /// Defaults to [] - Attrs DebugUrls(const gtl::ArraySlice& x) { - Attrs ret = *this; - ret.debug_urls_ = x; - return ret; - } - - StringPiece tensor_name_ = ""; - gtl::ArraySlice debug_urls_ = {}; - }; - DebugNumericSummary(const ::tensorflow::Scope& scope, ::tensorflow::Input - input); - DebugNumericSummary(const ::tensorflow::Scope& scope, ::tensorflow::Input - input, const DebugNumericSummary::Attrs& attrs); - operator ::tensorflow::Output() const { return output; } - operator ::tensorflow::Input() const { return output; } - ::tensorflow::Node* node() const { return output.node(); } - - static Attrs TensorName(StringPiece x) { - return Attrs().TensorName(x); - } - static Attrs DebugUrls(const gtl::ArraySlice& x) { - return Attrs().DebugUrls(x); - } - - ::tensorflow::Output output; -}; - /// DepthToSpace for tensors of type T. 
/// /// Rearranges data from depth into blocks of spatial data. @@ -2280,6 +1998,87 @@ class PreventGradient { ::tensorflow::Output output; }; +/// Use QuantizeAndDequantizeV2 instead. +/// DEPRECATED at GraphDef version 22: +/// Replaced by QuantizeAndDequantizeV2. +/// +/// Arguments: +/// * scope: A Scope object +/// +/// Returns: +/// * `Output`: The output tensor. +class QuantizeAndDequantize { + public: + /// Optional attribute setters for QuantizeAndDequantize + struct Attrs { + /// Defaults to true + Attrs SignedInput(bool x) { + Attrs ret = *this; + ret.signed_input_ = x; + return ret; + } + + /// Defaults to 8 + Attrs NumBits(int64 x) { + Attrs ret = *this; + ret.num_bits_ = x; + return ret; + } + + /// Defaults to false + Attrs RangeGiven(bool x) { + Attrs ret = *this; + ret.range_given_ = x; + return ret; + } + + /// Defaults to 0 + Attrs InputMin(float x) { + Attrs ret = *this; + ret.input_min_ = x; + return ret; + } + + /// Defaults to 0 + Attrs InputMax(float x) { + Attrs ret = *this; + ret.input_max_ = x; + return ret; + } + + bool signed_input_ = true; + int64 num_bits_ = 8; + bool range_given_ = false; + float input_min_ = 0.0f; + float input_max_ = 0.0f; + }; + QuantizeAndDequantize(const ::tensorflow::Scope& scope, ::tensorflow::Input + input); + QuantizeAndDequantize(const ::tensorflow::Scope& scope, ::tensorflow::Input + input, const QuantizeAndDequantize::Attrs& attrs); + operator ::tensorflow::Output() const { return output; } + operator ::tensorflow::Input() const { return output; } + ::tensorflow::Node* node() const { return output.node(); } + + static Attrs SignedInput(bool x) { + return Attrs().SignedInput(x); + } + static Attrs NumBits(int64 x) { + return Attrs().NumBits(x); + } + static Attrs RangeGiven(bool x) { + return Attrs().RangeGiven(x); + } + static Attrs InputMin(float x) { + return Attrs().InputMin(x); + } + static Attrs InputMax(float x) { + return Attrs().InputMax(x); + } + + ::tensorflow::Output output; +}; + /// Quantizes 
then dequantizes a tensor. /// /// This op simulates the precision loss from the quantized forward pass by: diff --git a/libs/tensorflow/include/tensorflow/cc/ops/array_ops_internal.h b/libs/tensorflow/include/tensorflow/cc/ops/array_ops_internal.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/cc/ops/candidate_sampling_ops_internal.h b/libs/tensorflow/include/tensorflow/cc/ops/candidate_sampling_ops_internal.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/cc/ops/control_flow_ops_internal.h b/libs/tensorflow/include/tensorflow/cc/ops/control_flow_ops_internal.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/cc/ops/data_flow_ops.h b/libs/tensorflow/include/tensorflow/cc/ops/data_flow_ops.h index da128cd..d639643 100755 --- a/libs/tensorflow/include/tensorflow/cc/ops/data_flow_ops.h +++ b/libs/tensorflow/include/tensorflow/cc/ops/data_flow_ops.h @@ -710,14 +710,14 @@ class FIFOQueue { ::tensorflow::Output handle; }; -/// Store the input tensor in the state of the current session. +/// DEPRECATED at GraphDef version 23: +/// Use GetSessionHandleV2. /// /// Arguments: /// * scope: A Scope object -/// * value: The tensor to be stored. /// /// Returns: -/// * `Output`: The handle for the tensor stored in the session state. +/// * `Output`: The handle tensor. class GetSessionHandle { public: GetSessionHandle(const ::tensorflow::Scope& scope, ::tensorflow::Input value); @@ -728,6 +728,25 @@ class GetSessionHandle { ::tensorflow::Output handle; }; +/// Store the input tensor in the state of the current session. +/// +/// Arguments: +/// * scope: A Scope object +/// * value: The tensor to be stored. +/// +/// Returns: +/// * `Output`: The handle for the tensor stored in the session state, represented +/// as a ResourceHandle object. 
+class GetSessionHandleV2 { + public: + GetSessionHandleV2(const ::tensorflow::Scope& scope, ::tensorflow::Input value); + operator ::tensorflow::Output() const { return handle; } + operator ::tensorflow::Input() const { return handle; } + ::tensorflow::Node* node() const { return handle.node(); } + + ::tensorflow::Output handle; +}; + /// Get the value of the tensor specified by its handle. /// /// Arguments: diff --git a/libs/tensorflow/include/tensorflow/cc/ops/data_flow_ops_internal.h b/libs/tensorflow/include/tensorflow/cc/ops/data_flow_ops_internal.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/cc/ops/image_ops.h b/libs/tensorflow/include/tensorflow/cc/ops/image_ops.h index d235b26..faa1d03 100755 --- a/libs/tensorflow/include/tensorflow/cc/ops/image_ops.h +++ b/libs/tensorflow/include/tensorflow/cc/ops/image_ops.h @@ -823,7 +823,7 @@ class EncodePng { /// to extract. The glimpse height must be specified first, following /// by the glimpse width. /// * offsets: A 2-D integer tensor of shape `[batch_size, 2]` containing -/// the x, y locations of the center of each window. +/// the y, x locations of the center of each window. 
/// /// Optional attributes (see `Attrs`): /// * centered: indicates if the offset coordinates are centered relative to diff --git a/libs/tensorflow/include/tensorflow/cc/ops/image_ops_internal.h b/libs/tensorflow/include/tensorflow/cc/ops/image_ops_internal.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/cc/ops/io_ops_internal.h b/libs/tensorflow/include/tensorflow/cc/ops/io_ops_internal.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/cc/ops/linalg_ops_internal.h b/libs/tensorflow/include/tensorflow/cc/ops/linalg_ops_internal.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/cc/ops/logging_ops_internal.h b/libs/tensorflow/include/tensorflow/cc/ops/logging_ops_internal.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/cc/ops/math_ops.h b/libs/tensorflow/include/tensorflow/cc/ops/math_ops.h index bf5d857..3265b77 100755 --- a/libs/tensorflow/include/tensorflow/cc/ops/math_ops.h +++ b/libs/tensorflow/include/tensorflow/cc/ops/math_ops.h @@ -197,6 +197,41 @@ class Any { }; typedef Any ReduceAny; +/// Returns the truth value of abs(x-y) < tolerance element-wise. +/// +/// Arguments: +/// * scope: A Scope object +/// +/// Returns: +/// * `Output`: The z tensor. 
+class ApproximateEqual { + public: + /// Optional attribute setters for ApproximateEqual + struct Attrs { + /// Defaults to 1e-05 + Attrs Tolerance(float x) { + Attrs ret = *this; + ret.tolerance_ = x; + return ret; + } + + float tolerance_ = 1e-05f; + }; + ApproximateEqual(const ::tensorflow::Scope& scope, ::tensorflow::Input x, + ::tensorflow::Input y); + ApproximateEqual(const ::tensorflow::Scope& scope, ::tensorflow::Input x, + ::tensorflow::Input y, const ApproximateEqual::Attrs& attrs); + operator ::tensorflow::Output() const { return z; } + operator ::tensorflow::Input() const { return z; } + ::tensorflow::Node* node() const { return z.node(); } + + static Attrs Tolerance(float x) { + return Attrs().Tolerance(x); + } + + ::tensorflow::Output z; +}; + /// Returns the index with the largest value across dimensions of a tensor. /// /// Arguments: @@ -378,6 +413,38 @@ class Betainc { ::tensorflow::Output z; }; +/// Counts the number of occurrences of each value in an integer array. +/// +/// Outputs a vector with length `size` and the same dtype as `weights`. If +/// `weights` are empty, then index `i` stores the number of times the value `i` is +/// counted in `arr`. If `weights` are non-empty, then index `i` stores the sum of +/// the value in `weights` at each index where the corresponding value in `arr` is +/// `i`. +/// +/// Values in `arr` outside of the range [0, size) are ignored. +/// +/// Arguments: +/// * scope: A Scope object +/// * arr: int32 `Tensor`. +/// * size: non-negative int32 scalar `Tensor`. +/// * weights: is an int32, int64, float32, or float64 `Tensor` with the same +/// shape as `arr`, or a length-0 `Tensor`, in which case it acts as all weights +/// equal to 1. +/// +/// Returns: +/// * `Output`: 1D `Tensor` with length equal to `size`. The counts or summed weights for +/// each value in the range [0, size). 
+class Bincount { + public: + Bincount(const ::tensorflow::Scope& scope, ::tensorflow::Input arr, + ::tensorflow::Input size, ::tensorflow::Input weights); + operator ::tensorflow::Output() const { return bins; } + operator ::tensorflow::Input() const { return bins; } + ::tensorflow::Node* node() const { return bins.node(); } + + ::tensorflow::Output bins; +}; + /// Cast x of type SrcT to y of DstT. /// /// Arguments: @@ -844,77 +911,6 @@ class Expm1 { ::tensorflow::Output y; }; -/// Compute the 1-dimensional discrete Fourier Transform over the inner-most -/// -/// dimension of `input`. -/// -/// Arguments: -/// * scope: A Scope object -/// * input: A complex64 tensor. -/// -/// Returns: -/// * `Output`: A complex64 tensor of the same shape as `input`. The inner-most -/// dimension of `input` is replaced with its 1D Fourier Transform. -class FFT { - public: - FFT(const ::tensorflow::Scope& scope, ::tensorflow::Input input); - operator ::tensorflow::Output() const { return output; } - operator ::tensorflow::Input() const { return output; } - ::tensorflow::Node* node() const { return output.node(); } - - ::tensorflow::Output output; -}; - -/// Compute the 2-dimensional discrete Fourier Transform over the inner-most -/// -/// 2 dimensions of `input`. -/// -/// Arguments: -/// * scope: A Scope object -/// * input: A complex64 tensor. -/// -/// Returns: -/// * `Output`: A complex64 tensor of the same shape as `input`. The inner-most 2 -/// dimensions of `input` are replaced with their 2D Fourier Transform. 
-/// -/// @compatibility(numpy) -/// Equivalent to np.fft2 -/// @end_compatibility -class FFT2D { - public: - FFT2D(const ::tensorflow::Scope& scope, ::tensorflow::Input input); - operator ::tensorflow::Output() const { return output; } - operator ::tensorflow::Input() const { return output; } - ::tensorflow::Node* node() const { return output.node(); } - - ::tensorflow::Output output; -}; - -/// Compute the 3-dimensional discrete Fourier Transform over the inner-most 3 -/// -/// dimensions of `input`. -/// -/// Arguments: -/// * scope: A Scope object -/// * input: A complex64 tensor. -/// -/// Returns: -/// * `Output`: A complex64 tensor of the same shape as `input`. The inner-most 3 -/// dimensions of `input` are replaced with their 3D Fourier Transform. -/// -/// @compatibility(numpy) -/// Equivalent to np.fft3 -/// @end_compatibility -class FFT3D { - public: - FFT3D(const ::tensorflow::Scope& scope, ::tensorflow::Input input); - operator ::tensorflow::Output() const { return output; } - operator ::tensorflow::Input() const { return output; } - ::tensorflow::Node* node() const { return output.node(); } - - ::tensorflow::Output output; -}; - /// Returns element-wise largest integer not greater than x. /// /// Arguments: @@ -1019,77 +1015,6 @@ class GreaterEqual { ::tensorflow::Output z; }; -/// Compute the inverse 1-dimensional discrete Fourier Transform over the inner-most -/// -/// dimension of `input`. -/// -/// Arguments: -/// * scope: A Scope object -/// * input: A complex64 tensor. -/// -/// Returns: -/// * `Output`: A complex64 tensor of the same shape as `input`. The inner-most -/// dimension of `input` is replaced with its inverse 1D Fourier Transform. 
-class IFFT { - public: - IFFT(const ::tensorflow::Scope& scope, ::tensorflow::Input input); - operator ::tensorflow::Output() const { return output; } - operator ::tensorflow::Input() const { return output; } - ::tensorflow::Node* node() const { return output.node(); } - - ::tensorflow::Output output; -}; - -/// Compute the inverse 2-dimensional discrete Fourier Transform over the inner-most -/// -/// 2 dimensions of `input`. -/// -/// Arguments: -/// * scope: A Scope object -/// * input: A complex64 tensor. -/// -/// Returns: -/// * `Output`: A complex64 tensor of the same shape as `input`. The inner-most 2 -/// dimensions of `input` are replaced with their inverse 2D Fourier Transform. -/// -/// @compatibility(numpy) -/// Equivalent to np.ifft2 -/// @end_compatibility -class IFFT2D { - public: - IFFT2D(const ::tensorflow::Scope& scope, ::tensorflow::Input input); - operator ::tensorflow::Output() const { return output; } - operator ::tensorflow::Input() const { return output; } - ::tensorflow::Node* node() const { return output.node(); } - - ::tensorflow::Output output; -}; - -/// Compute the inverse 3-dimensional discrete Fourier Transform over the inner-most -/// -/// 3 dimensions of `input`. -/// -/// Arguments: -/// * scope: A Scope object -/// * input: A complex64 tensor. -/// -/// Returns: -/// * `Output`: A complex64 tensor of the same shape as `input`. The inner-most 3 -/// dimensions of `input` are replaced with their inverse 3D Fourier Transform. -/// -/// @compatibility(numpy) -/// Equivalent to np.fft3 -/// @end_compatibility -class IFFT3D { - public: - IFFT3D(const ::tensorflow::Scope& scope, ::tensorflow::Input input); - operator ::tensorflow::Output() const { return output; } - operator ::tensorflow::Input() const { return output; } - ::tensorflow::Node* node() const { return output.node(); } - - ::tensorflow::Output output; -}; - /// Compute the lower regularized incomplete Gamma function `Q(a, x)`. 
/// /// The lower regularized incomplete Gamma function is defined as: @@ -2337,15 +2262,17 @@ class Rsqrt { /// Computes the maximum along segments of a tensor. /// -/// Read [the section on Segmentation](../../api_docs/python/math_ops.md#segmentation) -/// for an explanation of segments. +/// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of +/// segments. /// /// Computes a tensor such that /// \\(output_i = \max_j(data_j)\\) where `max` is over `j` such /// that `segment_ids[j] == i`. /// +/// If the max is empty for a given segment ID `i`, `output[i] = 0`. +/// ///
-/// +/// ///
/// /// Arguments: @@ -2369,17 +2296,18 @@ class SegmentMax { /// Computes the mean along segments of a tensor. /// -/// Read [the section on -/// Segmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation -/// of segments. +/// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of +/// segments. /// /// Computes a tensor such that /// \\(output_i = \frac{\sum_j data_j}{N}\\) where `mean` is /// over `j` such that `segment_ids[j] == i` and `N` is the total number of /// values summed. /// +/// If the mean is empty for a given segment ID `i`, `output[i] = 0`. +/// ///
-/// +/// ///
/// /// Arguments: @@ -2403,16 +2331,17 @@ class SegmentMean { /// Computes the minimum along segments of a tensor. /// -/// Read [the section on -/// Segmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation -/// of segments. +/// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of +/// segments. /// /// Computes a tensor such that /// \\(output_i = \min_j(data_j)\\) where `min` is over `j` such /// that `segment_ids[j] == i`. /// +/// If the min is empty for a given segment ID `i`, `output[i] = 0`. +/// ///
-/// +/// ///
/// /// Arguments: @@ -2436,16 +2365,17 @@ class SegmentMin { /// Computes the product along segments of a tensor. /// -/// Read [the section on -/// Segmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation -/// of segments. +/// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of +/// segments. /// /// Computes a tensor such that /// \\(output_i = \prod_j data_j\\) where the product is over `j` such /// that `segment_ids[j] == i`. /// +/// If the product is empty for a given segment ID `i`, `output[i] = 1`. +/// ///
-/// +/// ///
/// /// Arguments: @@ -2469,15 +2399,17 @@ class SegmentProd { /// Computes the sum along segments of a tensor. /// -/// Read [the section on Segmentation](../../api_docs/python/math_ops.md#segmentation) -/// for an explanation of segments. +/// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of +/// segments. /// /// Computes a tensor such that /// \\(output_i = \sum_j data_j\\) where sum is over `j` such /// that `segment_ids[j] == i`. /// +/// If the sum is empty for a given segment ID `i`, `output[i] = 0`. +/// ///
-/// +/// ///
/// /// Arguments: @@ -2693,9 +2625,8 @@ class SparseMatMul { /// Computes the mean along sparse segments of a tensor. /// -/// Read [the section on -/// Segmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation -/// of segments. +/// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of +/// segments. /// /// Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first /// dimension, selecting a subset of dimension 0, specified by `indices`. @@ -2749,9 +2680,8 @@ class SparseSegmentMeanGrad { /// /// N is the size of the segment being reduced. /// -/// Read [the section on -/// Segmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation -/// of segments. +/// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of +/// segments. /// /// Arguments: /// * scope: A Scope object @@ -2801,9 +2731,8 @@ class SparseSegmentSqrtNGrad { /// Computes the sum along sparse segments of a tensor. /// -/// Read [the section on -/// Segmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation -/// of segments. +/// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of +/// segments. /// /// Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first /// dimension, selecting a subset of dimension 0, specified by `indices`. @@ -3071,11 +3000,10 @@ class TruncateMod { /// Computes the Max along segments of a tensor. /// -/// Read [the section on -/// Segmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation -/// of segments. +/// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of +/// segments. /// -/// This operator is similar to the [unsorted segment sum operator](../../api_docs/python/math_ops.md#UnsortedSegmentSum). +/// This operator is similar to the [unsorted segment sum operator](../../../api_docs/python/math_ops.md#UnsortedSegmentSum). 
/// Instead of computing the sum over segments, it computes the maximum /// such that: /// @@ -3086,7 +3014,7 @@ class TruncateMod { /// `output[i] = numeric_limits::min()`. /// ///
-/// +/// ///
/// /// Arguments: @@ -3111,9 +3039,8 @@ class UnsortedSegmentMax { /// Computes the sum along segments of a tensor. /// -/// Read [the section on -/// Segmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation -/// of segments. +/// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of +/// segments. /// /// Computes a tensor such that /// `(output[i] = sum_{j...} data[j...]` where the sum is over tuples `j...` such @@ -3126,7 +3053,7 @@ class UnsortedSegmentMax { /// `num_segments` should equal the number of distinct segment IDs. /// ///
-/// +/// ///
/// /// Arguments: diff --git a/libs/tensorflow/include/tensorflow/cc/ops/math_ops_internal.h b/libs/tensorflow/include/tensorflow/cc/ops/math_ops_internal.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/cc/ops/nn_ops.h b/libs/tensorflow/include/tensorflow/cc/ops/nn_ops.h index f24f2ca..ee7cd0a 100755 --- a/libs/tensorflow/include/tensorflow/cc/ops/nn_ops.h +++ b/libs/tensorflow/include/tensorflow/cc/ops/nn_ops.h @@ -266,19 +266,25 @@ class BiasAddGrad { /// /// Arguments: /// * scope: A Scope object -/// * strides: 1-D of length 4. The stride of the sliding window for each dimension -/// of `input`. Must be in the same order as the dimension specified with format. +/// * input: A 4-D tensor. The dimension order is interpreted according to the value +/// of `data_format`, see below for details. +/// * filter: A 4-D tensor of shape +/// `[filter_height, filter_width, in_channels, out_channels]` +/// * strides: 1-D tensor of length 4. The stride of the sliding window for each +/// dimension of `input`. The dimension order is determined by the value of +/// `data_format`, see below for details. /// * padding: The type of padding algorithm to use. /// /// Optional attributes (see `Attrs`): /// * data_format: Specify the data format of the input and output data. With the /// default format "NHWC", the data is stored in the order of: -/// [batch, in_height, in_width, in_channels]. +/// [batch, height, width, channels]. /// Alternatively, the format could be "NCHW", the data storage order of: -/// [batch, in_channels, in_height, in_width]. +/// [batch, channels, height, width]. /// /// Returns: -/// * `Output`: The output tensor. +/// * `Output`: A 4-D tensor. The dimension order is determined by the value of +/// `data_format`, see below for details. class Conv2D { public: /// Optional attribute setters for Conv2D @@ -292,9 +298,9 @@ class Conv2D { /// Specify the data format of the input and output data. 
With the /// default format "NHWC", the data is stored in the order of: - /// [batch, in_height, in_width, in_channels]. + /// [batch, height, width, channels]. /// Alternatively, the format could be "NCHW", the data storage order of: - /// [batch, in_channels, in_height, in_width]. + /// [batch, channels, height, width]. /// /// Defaults to "NHWC" Attrs DataFormat(StringPiece x) { @@ -593,17 +599,49 @@ class Conv3DBackpropInputV2 { /// of `input`. /// * padding: The type of padding algorithm to use. /// +/// Optional attributes (see `Attrs`): +/// * data_format: Specify the data format of the input and output data. With the +/// default format "NHWC", the data is stored in the order of: +/// [batch, height, width, channels]. +/// Alternatively, the format could be "NCHW", the data storage order of: +/// [batch, channels, height, width]. +/// /// Returns: /// * `Output`: The output tensor. class DepthwiseConv2dNative { public: + /// Optional attribute setters for DepthwiseConv2dNative + struct Attrs { + /// Specify the data format of the input and output data. With the + /// default format "NHWC", the data is stored in the order of: + /// [batch, height, width, channels]. + /// Alternatively, the format could be "NCHW", the data storage order of: + /// [batch, channels, height, width]. 
+ /// + /// Defaults to "NHWC" + Attrs DataFormat(StringPiece x) { + Attrs ret = *this; + ret.data_format_ = x; + return ret; + } + + StringPiece data_format_ = "NHWC"; + }; DepthwiseConv2dNative(const ::tensorflow::Scope& scope, ::tensorflow::Input input, ::tensorflow::Input filter, const gtl::ArraySlice& strides, StringPiece padding); + DepthwiseConv2dNative(const ::tensorflow::Scope& scope, ::tensorflow::Input + input, ::tensorflow::Input filter, const + gtl::ArraySlice& strides, StringPiece padding, const + DepthwiseConv2dNative::Attrs& attrs); operator ::tensorflow::Output() const { return output; } operator ::tensorflow::Input() const { return output; } ::tensorflow::Node* node() const { return output.node(); } + static Attrs DataFormat(StringPiece x) { + return Attrs().DataFormat(x); + } + ::tensorflow::Output output; }; @@ -611,32 +649,72 @@ class DepthwiseConv2dNative { /// /// Arguments: /// * scope: A Scope object -/// * input: 4-D with shape `[batch, in_height, in_width, in_channels]`. +/// * input: 4-D with shape based on `data_format`. For example, if +/// `data_format` is 'NHWC' then `input` is a 4-D `[batch, in_height, +/// in_width, in_channels]` tensor. /// * filter_sizes: An integer vector representing the tensor shape of `filter`, /// where `filter` is a 4-D /// `[filter_height, filter_width, in_channels, depthwise_multiplier]` tensor. -/// * out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`. +/// * out_backprop: 4-D with shape based on `data_format`. +/// For example, if `data_format` is 'NHWC' then +/// out_backprop shape is `[batch, out_height, out_width, out_channels]`. /// Gradients w.r.t. the output of the convolution. /// * strides: The stride of the sliding window for each dimension of the input /// of the convolution. /// * padding: The type of padding algorithm to use. /// +/// Optional attributes (see `Attrs`): +/// * data_format: Specify the data format of the input and output data. 
With the +/// default format "NHWC", the data is stored in the order of: +/// [batch, height, width, channels]. +/// Alternatively, the format could be "NCHW", the data storage order of: +/// [batch, channels, height, width]. +/// /// Returns: /// * `Output`: 4-D with shape /// `[filter_height, filter_width, in_channels, out_channels]`. Gradient w.r.t. /// the `filter` input of the convolution. class DepthwiseConv2dNativeBackpropFilter { public: + /// Optional attribute setters for DepthwiseConv2dNativeBackpropFilter + struct Attrs { + /// Specify the data format of the input and output data. With the + /// default format "NHWC", the data is stored in the order of: + /// [batch, height, width, channels]. + /// Alternatively, the format could be "NCHW", the data storage order of: + /// [batch, channels, height, width]. + /// + /// Defaults to "NHWC" + Attrs DataFormat(StringPiece x) { + Attrs ret = *this; + ret.data_format_ = x; + return ret; + } + + StringPiece data_format_ = "NHWC"; + }; DepthwiseConv2dNativeBackpropFilter(const ::tensorflow::Scope& scope, ::tensorflow::Input input, ::tensorflow::Input filter_sizes, ::tensorflow::Input out_backprop, const gtl::ArraySlice& strides, StringPiece padding); + DepthwiseConv2dNativeBackpropFilter(const ::tensorflow::Scope& scope, + ::tensorflow::Input input, + ::tensorflow::Input filter_sizes, + ::tensorflow::Input out_backprop, const + gtl::ArraySlice& strides, StringPiece + padding, const + DepthwiseConv2dNativeBackpropFilter::Attrs& + attrs); operator ::tensorflow::Output() const { return output; } operator ::tensorflow::Input() const { return output; } ::tensorflow::Node* node() const { return output.node(); } + static Attrs DataFormat(StringPiece x) { + return Attrs().DataFormat(x); + } + ::tensorflow::Output output; }; @@ -644,31 +722,72 @@ class DepthwiseConv2dNativeBackpropFilter { /// /// Arguments: /// * scope: A Scope object -/// * input_sizes: An integer vector representing the shape of `input`, -/// where 
`input` is a 4-D `[batch, height, width, channels]` tensor. +/// * input_sizes: An integer vector representing the shape of `input`, based +/// on `data_format`. For example, if `data_format` is 'NHWC' then +/// `input` is a 4-D `[batch, height, width, channels]` tensor. /// * filter: 4-D with shape /// `[filter_height, filter_width, in_channels, depthwise_multiplier]`. -/// * out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`. +/// * out_backprop: 4-D with shape based on `data_format`. +/// For example, if `data_format` is 'NHWC' then +/// out_backprop shape is `[batch, out_height, out_width, out_channels]`. /// Gradients w.r.t. the output of the convolution. /// * strides: The stride of the sliding window for each dimension of the input /// of the convolution. /// * padding: The type of padding algorithm to use. /// +/// Optional attributes (see `Attrs`): +/// * data_format: Specify the data format of the input and output data. With the +/// default format "NHWC", the data is stored in the order of: +/// [batch, height, width, channels]. +/// Alternatively, the format could be "NCHW", the data storage order of: +/// [batch, channels, height, width]. +/// /// Returns: -/// * `Output`: 4-D with shape `[batch, in_height, in_width, in_channels]`. Gradient -/// w.r.t. the input of the convolution. +/// * `Output`: 4-D with shape according to `data_format`. For example, if +/// `data_format` is 'NHWC', output shape is `[batch, in_height, +/// in_width, in_channels]`. Gradient w.r.t. the input of the +/// convolution. class DepthwiseConv2dNativeBackpropInput { public: + /// Optional attribute setters for DepthwiseConv2dNativeBackpropInput + struct Attrs { + /// Specify the data format of the input and output data. With the + /// default format "NHWC", the data is stored in the order of: + /// [batch, height, width, channels]. + /// Alternatively, the format could be "NCHW", the data storage order of: + /// [batch, channels, height, width]. 
+ /// + /// Defaults to "NHWC" + Attrs DataFormat(StringPiece x) { + Attrs ret = *this; + ret.data_format_ = x; + return ret; + } + + StringPiece data_format_ = "NHWC"; + }; DepthwiseConv2dNativeBackpropInput(const ::tensorflow::Scope& scope, ::tensorflow::Input input_sizes, ::tensorflow::Input filter, ::tensorflow::Input out_backprop, const gtl::ArraySlice& strides, StringPiece padding); + DepthwiseConv2dNativeBackpropInput(const ::tensorflow::Scope& scope, + ::tensorflow::Input input_sizes, + ::tensorflow::Input filter, + ::tensorflow::Input out_backprop, const + gtl::ArraySlice& strides, StringPiece + padding, const + DepthwiseConv2dNativeBackpropInput::Attrs& + attrs); operator ::tensorflow::Output() const { return output; } operator ::tensorflow::Input() const { return output; } ::tensorflow::Node* node() const { return output.node(); } + static Attrs DataFormat(StringPiece x) { + return Attrs().DataFormat(x); + } + ::tensorflow::Output output; }; diff --git a/libs/tensorflow/include/tensorflow/cc/ops/nn_ops_internal.h b/libs/tensorflow/include/tensorflow/cc/ops/nn_ops_internal.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/cc/ops/no_op_internal.h b/libs/tensorflow/include/tensorflow/cc/ops/no_op_internal.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/cc/ops/parsing_ops_internal.h b/libs/tensorflow/include/tensorflow/cc/ops/parsing_ops_internal.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/cc/ops/random_ops_internal.h b/libs/tensorflow/include/tensorflow/cc/ops/random_ops_internal.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/cc/ops/remote_fused_graph_ops.h b/libs/tensorflow/include/tensorflow/cc/ops/remote_fused_graph_ops.h deleted file mode 100644 index 8fb4459..0000000 --- a/libs/tensorflow/include/tensorflow/cc/ops/remote_fused_graph_ops.h +++ /dev/null @@ -1,48 +0,0 @@ -// This file is MACHINE GENERATED! 
Do not edit. - -#ifndef TENSORFLOW_CC_OPS_REMOTE_FUSED_GRAPH_OPS_H_ -#define TENSORFLOW_CC_OPS_REMOTE_FUSED_GRAPH_OPS_H_ - -// This file is MACHINE GENERATED! Do not edit. - -#include "tensorflow/cc/framework/ops.h" -#include "tensorflow/cc/framework/scope.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/tensor_shape.h" -#include "tensorflow/core/framework/types.h" -#include "tensorflow/core/lib/gtl/array_slice.h" - -namespace tensorflow { -namespace ops { - -/// @defgroup remote_fused_graph_ops Remote Fused Graph Ops -/// @{ - -/// Execute a sub graph on a remote processor transferred by GraphTransferer. -/// -/// The graph specifications are serialized by protobuf as graph_transfer_info. -/// The implementation / limitations may differ for each platform -/// and each available peripheral. -/// -/// Arguments: -/// * scope: A Scope object -/// -/// Returns: -/// * `OutputList`: The output tensor. -class RemoteFusedGraphExecute { - public: - RemoteFusedGraphExecute(const ::tensorflow::Scope& scope, - ::tensorflow::InputList values, int64 N, StringPiece - serialized_graph_transfer_info); - ::tensorflow::Output operator[](size_t index) const { return output[index]; } - - - ::tensorflow::OutputList output; -}; - -/// @} - -} // namespace ops -} // namespace tensorflow - -#endif // TENSORFLOW_CC_OPS_REMOTE_FUSED_GRAPH_OPS_H_ diff --git a/libs/tensorflow/include/tensorflow/cc/ops/remote_fused_graph_ops_internal.h b/libs/tensorflow/include/tensorflow/cc/ops/remote_fused_graph_ops_internal.h deleted file mode 100644 index 0b9d348..0000000 --- a/libs/tensorflow/include/tensorflow/cc/ops/remote_fused_graph_ops_internal.h +++ /dev/null @@ -1,28 +0,0 @@ -// This file is MACHINE GENERATED! Do not edit. - -#ifndef TENSORFLOW_CC_OPS_REMOTE_FUSED_GRAPH_OPS_INTERNAL_H_ -#define TENSORFLOW_CC_OPS_REMOTE_FUSED_GRAPH_OPS_INTERNAL_H_ - -// This file is MACHINE GENERATED! Do not edit. 
- -#include "tensorflow/cc/framework/ops.h" -#include "tensorflow/cc/framework/scope.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/tensor_shape.h" -#include "tensorflow/core/framework/types.h" -#include "tensorflow/core/lib/gtl/array_slice.h" - -namespace tensorflow { -namespace ops { -namespace internal { -// NOTE: This namespace has internal TensorFlow details that -// are not part of TensorFlow's public API. - -/// @defgroup remote_fused_graph_ops_internal Remote Fused Graph Ops Internal -/// @{ - -} // namespace internal -} // namespace ops -} // namespace tensorflow - -#endif // TENSORFLOW_CC_OPS_REMOTE_FUSED_GRAPH_OPS_INTERNAL_H_ diff --git a/libs/tensorflow/include/tensorflow/cc/ops/sparse_ops_internal.h b/libs/tensorflow/include/tensorflow/cc/ops/sparse_ops_internal.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/cc/ops/state_ops_internal.h b/libs/tensorflow/include/tensorflow/cc/ops/state_ops_internal.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/cc/ops/string_ops.h b/libs/tensorflow/include/tensorflow/cc/ops/string_ops.h index 3029009..e18991f 100755 --- a/libs/tensorflow/include/tensorflow/cc/ops/string_ops.h +++ b/libs/tensorflow/include/tensorflow/cc/ops/string_ops.h @@ -424,7 +424,7 @@ class StringToHashBucketFast { /// A strong hash is important when inputs may be malicious, e.g. URLs with /// additional components. Adversaries could try to make their inputs hash to the /// same bucket for a denial-of-service attack or to skew the results. A strong -/// hash prevents this by making it dificult, if not infeasible, to compute inputs +/// hash prevents this by making it difficult, if not infeasible, to compute inputs /// that hash to the same bucket. This comes at a cost of roughly 4x higher compute /// time than `tf.string_to_hash_bucket_fast`. 
/// diff --git a/libs/tensorflow/include/tensorflow/cc/ops/string_ops_internal.h b/libs/tensorflow/include/tensorflow/cc/ops/string_ops_internal.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/cc/ops/training_ops_internal.h b/libs/tensorflow/include/tensorflow/cc/ops/training_ops_internal.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/cc/ops/user_ops_internal.h b/libs/tensorflow/include/tensorflow/cc/ops/user_ops_internal.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/cc/saved_model/BUILD b/libs/tensorflow/include/tensorflow/cc/saved_model/BUILD index 36fec7a..b402570 100644 --- a/libs/tensorflow/include/tensorflow/cc/saved_model/BUILD +++ b/libs/tensorflow/include/tensorflow/cc/saved_model/BUILD @@ -66,6 +66,7 @@ filegroup( name = "saved_model_half_plus_two", srcs = glob([ "testdata/half_plus_two_pbtxt/**", + "testdata/half_plus_two_main_op/**", "testdata/half_plus_two/**", ]), ) diff --git a/libs/tensorflow/include/tensorflow/cc/saved_model/constants.h b/libs/tensorflow/include/tensorflow/cc/saved_model/constants.h index 7f2d560..94a3b3c 100644 --- a/libs/tensorflow/include/tensorflow/cc/saved_model/constants.h +++ b/libs/tensorflow/include/tensorflow/cc/saved_model/constants.h @@ -33,6 +33,9 @@ constexpr char kSavedModelFilenamePbTxt[] = "saved_model.pbtxt"; /// SavedModel legacy init op key. constexpr char kSavedModelLegacyInitOpKey[] = "legacy_init_op"; +/// SavedModel main op key. +constexpr char kSavedModelMainOpKey[] = "saved_model_main_op"; + /// Directory in which to save the SavedModel variables. 
constexpr char kSavedModelVariablesDirectory[] = "variables"; diff --git a/libs/tensorflow/include/tensorflow/cc/saved_model/testdata/half_plus_two/00000123/variables/variables.data-00000-of-00001 b/libs/tensorflow/include/tensorflow/cc/saved_model/testdata/half_plus_two/00000123/variables/variables.data-00000-of-00001 old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/cc/saved_model/testdata/half_plus_two/00000123/variables/variables.index b/libs/tensorflow/include/tensorflow/cc/saved_model/testdata/half_plus_two/00000123/variables/variables.index old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/cc/saved_model/testdata/half_plus_two_main_op/00000123/variables/variables.data-00000-of-00001 b/libs/tensorflow/include/tensorflow/cc/saved_model/testdata/half_plus_two_main_op/00000123/variables/variables.data-00000-of-00001 new file mode 100644 index 0000000000000000000000000000000000000000..15b75d6ef6bffc336d138d923badb3928b8c4c13 GIT binary patch literal 12 RcmZQzV6bOkU~m8;2LJ>^0RR91 literal 0 HcmV?d00001 diff --git a/libs/tensorflow/include/tensorflow/cc/saved_model/testdata/half_plus_two_main_op/00000123/variables/variables.index b/libs/tensorflow/include/tensorflow/cc/saved_model/testdata/half_plus_two_main_op/00000123/variables/variables.index new file mode 100644 index 0000000000000000000000000000000000000000..7ec9fb4fe2dd21d0a6c324aecd7658fc37cf2326 GIT binary patch literal 151 zcmZQzVB=tvV&Y(AVB}8ZU=(7|U@>L0P?u+5 #include +#include "tensorflow/core/framework/cost_graph.pb.h" #include "tensorflow/core/lib/core/error_codes.pb.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/protobuf/config.pb.h" namespace tensorflow { -/// The abstract interface for runners which must implement the Join function. 
+/// The abstract interface for runners which must implement the Join and the +/// IsRunning function. class RunnerInterface { public: virtual ~RunnerInterface() {} virtual Status Join() = 0; - + virtual Status ExportRunMetadata(RunMetadata* metadata) const { + return Status(error::INVALID_ARGUMENT, "No RunMetadata to export."); + } /// Returns true iff the runner is running, i.e. if it is trying to populate /// its queue. virtual bool IsRunning() const = 0; @@ -101,6 +106,9 @@ class Coordinator { /// RequestStop() is called. void WaitForStop(); + // Returns the cost graph from stored run metadata in registered runners. + Status ExportCostGraph(CostGraphDef* cost_graph) const; + private: std::unordered_set clean_stop_errors_; condition_variable wait_for_stop_; @@ -111,12 +119,10 @@ class Coordinator { mutex status_lock_; Status status_ GUARDED_BY(status_lock_); - mutex runners_lock_; + mutable mutex runners_lock_; std::vector> runners_ GUARDED_BY(runners_lock_); - std::atomic num_runners_to_cancel_; - TF_DISALLOW_COPY_AND_ASSIGN(Coordinator); }; diff --git a/libs/tensorflow/include/tensorflow/cc/training/queue_runner.h b/libs/tensorflow/include/tensorflow/cc/training/queue_runner.h index bfe6a30..c69f287 100644 --- a/libs/tensorflow/include/tensorflow/cc/training/queue_runner.h +++ b/libs/tensorflow/include/tensorflow/cc/training/queue_runner.h @@ -27,6 +27,7 @@ limitations under the License. #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/threadpool.h" #include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/protobuf/config.pb.h" #include "tensorflow/core/protobuf/queue_runner.pb.h" #include "tensorflow/core/public/session.h" @@ -58,9 +59,16 @@ class QueueRunner : public RunnerInterface { /// Starts the queue runner with the given session. Status Start(Session* sess); + /// Starts the queue runner with the given session and sets the run arguments + /// for sess->Run. It also collects and stores the run metedata. 
+ Status StartAndCollectRunMetadata(Session* sess, + const RunOptions* run_options = nullptr); + /// Starts the queue runner with the given session, and wait for up to the /// specified time (in milliseconds) for the queues to start to fill up. Status Start(Session* sess, int wait_for_ms); + Status StartAndCollectRunMetadata(Session* session, int wait_for_ms, + const RunOptions* run_options = nullptr); /// Requests to stop and runs the cancel op. It would be called in a separate /// thread when coordinator is set. If there is no coordinator it should be @@ -74,8 +82,11 @@ class QueueRunner : public RunnerInterface { /// Returns the latest status. Status GetStatus(); + // Returns the stored run metadata. + Status ExportRunMetadata(RunMetadata* metadata) const override; + private: - QueueRunner() : coord_(nullptr), stopped_(false) {} + QueueRunner() : coord_(nullptr), stopped_(false), rm_mu_(nullptr) {} // Initializes the instance with the QueueRunnerDef proto. Status Init(const QueueRunnerDef& queue_runner_def); @@ -94,6 +105,10 @@ class QueueRunner : public RunnerInterface { bool IsRunning() const override { return !stopped_; } + void SetRunArgumentsAndRunMetadata(const RunOptions* run_options); + + Status RealRun(Session* sess, const string& op); + string queue_name_; std::vector enqueue_op_names_; string close_op_name_; @@ -114,6 +129,10 @@ class QueueRunner : public RunnerInterface { mutex cb_mu_; std::vector> callbacks_; + + mutable std::unique_ptr rm_mu_; + std::unique_ptr run_metadata_ GUARDED_BY(rm_mu_); + RunOptions run_options_; }; } // namespace tensorflow diff --git a/libs/tensorflow/include/tensorflow/core/BUILD b/libs/tensorflow/include/tensorflow/core/BUILD index e558e6e..34e6a04 100644 --- a/libs/tensorflow/include/tensorflow/core/BUILD +++ b/libs/tensorflow/include/tensorflow/core/BUILD @@ -107,6 +107,7 @@ load( "tf_kernel_tests_linkstatic", "tf_additional_cloud_op_deps", "tf_additional_cloud_kernel_deps", + "tf_lib_proto_parsing_deps", ) load( 
"//tensorflow/core:platform/default/build_config_root.bzl", @@ -139,6 +140,7 @@ CORE_PROTO_SRCS = [ "framework/log_memory.proto", "framework/node_def.proto", "framework/op_def.proto", + "framework/remote_fused_graph_execute_info.proto", "framework/resource_handle.proto", "framework/step_stats.proto", "framework/summary.proto", @@ -147,10 +149,13 @@ CORE_PROTO_SRCS = [ "framework/tensor_shape.proto", "framework/tensor_slice.proto", "framework/types.proto", + "framework/variable.proto", "framework/versions.proto", "lib/core/error_codes.proto", "protobuf/config.proto", "protobuf/debug.proto", + "protobuf/queue_runner.proto", + "protobuf/rewriter_config.proto", "protobuf/tensor_bundle.proto", "protobuf/saver.proto", "util/memmapped_file_system.proto", @@ -164,28 +169,40 @@ CORE_PROTO_SRCS = [ # ones with individual proto_library targets. ADDITIONAL_CORE_PROTO_SRCS = [ "example/example_parser_configuration.proto", - "framework/variable.proto", "protobuf/control_flow.proto", "protobuf/meta_graph.proto", "protobuf/named_tensor.proto", - "protobuf/queue_runner.proto", "protobuf/saved_model.proto", "protobuf/tensorflow_server.proto", "util/event.proto", "util/test_log.proto", ] +tf_proto_library( + name = "reader_base_proto", + srcs = ["framework/reader_base.proto"], + cc_api_version = 2, + go_api_version = 2, + java_api_version = 2, + visibility = ["//visibility:public"], +) + tf_proto_library( name = "protos_all", srcs = CORE_PROTO_SRCS + ADDITIONAL_CORE_PROTO_SRCS, cc_api_version = 2, go_api_version = 2, + j2objc_api_version = 1, java_api_version = 2, js_api_version = 2, js_codegen = "jspb", visibility = ["//visibility:public"], ) +exports_files([ + "framework/types.proto", +]) + tf_proto_library( name = "protos_test", srcs = ["util/example_proto_fast_parsing_test.proto"], @@ -213,10 +230,7 @@ cc_library( "platform/types.h", ] + glob(tf_additional_proto_hdrs()) + glob(tf_env_time_hdrs()), copts = tf_copts(), - deps = [ - ":protos_all_cc", - 
"//tensorflow/core/platform/default/build_config:proto_parsing", - ], + deps = tf_lib_proto_parsing_deps(), ) cc_library( @@ -238,6 +252,7 @@ cc_library( "lib/gtl/flatmap.h", "lib/gtl/flatset.h", "lib/gtl/inlined_vector.h", + "lib/gtl/optional.h", "lib/gtl/priority_queue_util.h", "lib/hash/crc32c.h", "lib/histogram/histogram.h", @@ -324,6 +339,7 @@ tf_cuda_library( hdrs = [ "example/feature_util.h", "framework/allocator.h", + "framework/allocator_registry.h", "framework/attr_value_util.h", "framework/bfloat16.h", "framework/cancellation.h", @@ -393,11 +409,25 @@ tf_cuda_library( "util/memmapped_file_system.h", "util/memmapped_file_system_writer.h", ], - }), + }) + if_mkl([ + "util/mkl_util.h", + ]), visibility = ["//visibility:public"], deps = [":framework_internal"], ) +cc_library( + name = "reader_base", + srcs = ["framework/reader_base.cc"], + hdrs = ["framework/reader_base.h"], + visibility = ["//visibility:public"], + deps = [ + ":framework", + ":lib", + ":reader_base_proto_cc", + ], +) + tf_proto_library_cc( name = "op_gen_overrides_proto", srcs = ["framework/op_gen_overrides.proto"], @@ -417,6 +447,12 @@ cc_library( ], ) +cc_library( + name = "session_options", + hdrs = ["public/session_options.h"], + visibility = ["//visibility:public"], +) + cc_library( name = "framework_lite", srcs = tf_additional_minimal_lib_srcs(), @@ -426,7 +462,9 @@ cc_library( "framework/type_traits.h", "platform/default/dynamic_annotations.h", "platform/default/integral_types.h", + "platform/default/logging.h", "platform/default/mutex.h", + "platform/default/protobuf.h", "platform/default/thread_annotations.h", "platform/dynamic_annotations.h", "platform/macros.h", @@ -470,12 +508,26 @@ tf_gen_op_libs( "script_ops", "sendrecv_ops", "sparse_ops", + "spectral_ops", "state_ops", "string_ops", "training_ops", ], ) +cc_library( + name = "debug_ops_op_lib", + srcs = ["ops/debug_ops.cc"], + copts = tf_copts(), + linkstatic = 1, + visibility = ["//tensorflow:internal"], + deps = [ + 
":framework", + "//tensorflow/core/kernels:debug_ops", + ], + alwayslink = 1, +) + # And one for all user ops cc_library( name = "user_ops_op_lib", @@ -496,16 +548,6 @@ cc_library( alwayslink = 1, ) -cc_library( - name = "cloud_ops_op_lib", - srcs = ["ops/cloud_ops.cc"], - copts = tf_copts(), - linkstatic = 1, - visibility = ["//visibility:public"], - deps = [":framework"], - alwayslink = 1, -) - cc_library( name = "ops", visibility = ["//visibility:public"], @@ -531,6 +573,7 @@ cc_library( ":sendrecv_ops_op_lib", ":set_ops_op_lib", ":sparse_ops_op_lib", + ":spectral_ops_op_lib", ":state_ops_op_lib", ":string_ops_op_lib", ":training_ops_op_lib", @@ -633,11 +676,11 @@ cc_library( visibility = ["//visibility:public"], deps = [ "//tensorflow/core/kernels:array", + "//tensorflow/core/kernels:bincount_op", "//tensorflow/core/kernels:candidate_sampler_ops", "//tensorflow/core/kernels:control_flow_ops", "//tensorflow/core/kernels:ctc_ops", "//tensorflow/core/kernels:data_flow", - "//tensorflow/core/kernels:debug_ops", "//tensorflow/core/kernels:fake_quant_ops", "//tensorflow/core/kernels:function_ops", "//tensorflow/core/kernels:image", @@ -666,6 +709,10 @@ cc_library( "//tensorflow/core/kernels:array_not_windows", "//tensorflow/core/kernels:math_not_windows", "//tensorflow/core/kernels:quantized_ops", + ]) + if_mkl([ + "//tensorflow/core/kernels:mkl_conv_op", + "//tensorflow/core/kernels:mkl_matmul_op", + "//tensorflow/core/kernels:mkl_tfconv_op", ]), ) @@ -773,6 +820,7 @@ filegroup( "//tensorflow/core/platform/default/build_config:android_srcs", "//tensorflow/core/util/ctc:android_srcs", "//tensorflow/core/util/tensor_bundle:android_srcs", + "//tensorflow/core/grappler:android_srcs", "common_runtime/gpu/gpu_tracer.cc", "common_runtime/gpu/gpu_tracer.h", ] + glob( @@ -799,6 +847,7 @@ filegroup( "**/*main.cc", "debug/**/*", "framework/op_gen_*", + "framework/reader_base.*", "graph/dot.*", "lib/jpeg/**/*", "lib/png/**/*", @@ -1107,6 +1156,8 @@ cc_library( "lib/**/*.cc", 
"platform/*.h", "platform/*.cc", + "platform/profile_utils/**/*.h", + "platform/profile_utils/**/*.cc", ], exclude = [ "**/*test*", @@ -1306,6 +1357,7 @@ tf_cuda_library( "util/reporter.cc", "framework/fake_input.*", "framework/op_gen_lib.*", + "framework/reader_base.*", "util/memmapped_file_system.*", "util/memmapped_file_system_writer.*", "util/version_info.cc", @@ -1346,7 +1398,7 @@ tf_cuda_library( ":version_lib", "//tensorflow/core/kernels:bounds_check", "//third_party/eigen3", - ], + ] + if_mkl(["//third_party/mkl:intel_binary_blob"]), alwayslink = 1, ) @@ -1355,6 +1407,7 @@ cc_header_only_library( visibility = ["//visibility:public"], deps = [ ":framework", + ":reader_base", ], ) @@ -1434,18 +1487,21 @@ tf_cuda_library( ), copts = tf_copts(), deps = [ - ":framework", - ":framework_internal", - ":function_ops_op_lib", - ":functional_grad", - ":functional_ops_op_lib", - ":lib", - ":lib_internal", - ":proto_text", - ":protos_all_cc", - "//third_party/eigen3", - "//tensorflow/core/kernels:required", - ] + tf_additional_core_deps(), + ":framework", + ":framework_internal", + ":function_ops_op_lib", + ":functional_grad", + ":functional_ops_op_lib", + ":lib", + ":lib_internal", + ":proto_text", + ":protos_all_cc", + "//tensorflow/core/grappler:grappler_item", + "//tensorflow/core/grappler/optimizers:meta_optimizer", + "//third_party/eigen3", + "//tensorflow/core/kernels:required", + ] + if_mkl(["//third_party/mkl:intel_binary_blob"]) + + tf_additional_core_deps(), alwayslink = 1, ) @@ -1674,6 +1730,7 @@ tf_cc_tests( "lib/gtl/iterator_range_test.cc", "lib/gtl/manual_constructor_test.cc", "lib/gtl/map_util_test.cc", + "lib/gtl/optional_test.cc", "lib/gtl/top_n_test.cc", "lib/hash/crc32c_test.cc", "lib/hash/hash_test.cc", @@ -1809,12 +1866,35 @@ cc_test( ], ) +tf_cc_test( + name = "quantize_training_test", + srcs = ["graph/quantize_training_test.cc"], + deps = [ + ":all_kernels", + ":core", + ":core_cpu", + ":core_cpu_internal", + ":direct_session_internal", + 
":framework", + ":framework_internal", + ":lib", + ":lib_internal", + ":ops", + ":protos_all_cc", + ":protos_test_cc", + ":test", + ":test_main", + ":testlib", + ], +) + tf_cc_tests( name = "higher_level_tests", size = "small", srcs = [ "common_runtime/device_set_test.cc", "common_runtime/optimization_registry_test.cc", + "common_runtime/resource_variable_read_optimizer_test.cc", "common_runtime/pending_counts_test.cc", "common_runtime/session_test.cc", "common_runtime/simple_placer_test.cc", @@ -1851,13 +1931,11 @@ tf_cc_tests( "graph/algorithm_test.cc", "graph/edgeset_test.cc", "graph/equal_graph_def_test.cc", - "graph/graph_constructor_test.cc", "graph/graph_def_builder_test.cc", "graph/graph_partition_test.cc", "graph/graph_test.cc", "graph/node_builder_test.cc", "graph/optimizer_cse_test.cc", - "graph/quantize_training_test.cc", "graph/subgraph_test.cc", "graph/tensor_id_test.cc", "graph/validate_test.cc", @@ -1909,11 +1987,51 @@ tf_cc_tests( ], ) +tf_cc_tests( + name = "higher_level_tests_needing_kernels", + size = "small", + srcs = [ + "graph/graph_constructor_test.cc", + ], + linkopts = select({ + "//tensorflow:darwin": ["-headerpad_max_install_names"], + "//conditions:default": [], + }), + linkstatic = tf_kernel_tests_linkstatic(), + deps = [ + ":all_kernels", + ":core", + ":core_cpu", + ":core_cpu_internal", + ":direct_session_internal", + ":framework", + ":framework_internal", + ":lib", + ":lib_internal", + ":ops", + ":protos_all_cc", + ":protos_test_cc", + ":test", + ":test_main", + ":testlib", + "//tensorflow/cc:cc_ops", + "//tensorflow/cc:cc_ops_internal", + "//tensorflow/cc:scope", + "//tensorflow/cc:sendrecv_ops", + "//tensorflow/core/kernels:ops_util", + "//third_party/eigen3", + ], +) + if_mkl( tf_cc_test_mkl( name = "mkl_related_tests", size = "small", - srcs = ["graph/mkl_optimizer_merge_test.cc"], + srcs = [ + "graph/mkl_layout_pass_test.cc", + "graph/mkl_optimizer_merge_test.cc", + "graph/mkl_tfconversion_pass_test.cc", + ], linkstatic = 
tf_kernel_tests_linkstatic(), deps = [ ":core", @@ -1932,6 +2050,9 @@ if_mkl( "//tensorflow/cc:cc_ops", "//tensorflow/cc:scope", "//tensorflow/cc:sendrecv_ops", + "//tensorflow/core/kernels:mkl_conv_op", + "//tensorflow/core/kernels:mkl_matmul_op", + "//tensorflow/core/kernels:mkl_tfconv_op", "//tensorflow/core/kernels:ops_util", "//third_party/eigen3", ], @@ -2147,6 +2268,7 @@ tf_cc_test( srcs = ["common_runtime/direct_session_with_tracking_alloc_test.cc"], args = ["--heap_check=local"], # The GPU tracer leaks memory linkstatic = tf_kernel_tests_linkstatic(), + tags = ["no_gpu"], deps = [ ":core", ":core_cpu", @@ -2362,6 +2484,7 @@ tf_cc_test( size = "small", srcs = ["ops/math_grad_test.cc"], linkstatic = tf_kernel_tests_linkstatic(), + tags = ["no_gpu"], deps = [ ":core", ":core_cpu", @@ -2426,6 +2549,7 @@ tf_cc_tests( "ops/random_ops_test.cc", "ops/set_ops_test.cc", "ops/sparse_ops_test.cc", + "ops/spectral_ops_test.cc", "ops/state_ops_test.cc", "ops/string_ops_test.cc", "ops/training_ops_test.cc", diff --git a/libs/tensorflow/include/tensorflow/core/common_runtime/direct_session.h b/libs/tensorflow/include/tensorflow/core/common_runtime/direct_session.h index 3e3a5ea..1495648 100644 --- a/libs/tensorflow/include/tensorflow/core/common_runtime/direct_session.h +++ b/libs/tensorflow/include/tensorflow/core/common_runtime/direct_session.h @@ -192,6 +192,9 @@ class DirectSession : public Session { ::tensorflow::Status ExtendLocked(const GraphDef& graph) EXCLUSIVE_LOCKS_REQUIRED(graph_def_lock_); + ::tensorflow::Status ResourceHandleToInputTensor( + const Tensor& resource_tensor, Tensor* retrieved_tensor); + // Feeds more inputs to the executors, triggering further execution. 
::tensorflow::Status SendInputs( const std::vector>& inputs, diff --git a/libs/tensorflow/include/tensorflow/core/common_runtime/gpu/gpu_device.h b/libs/tensorflow/include/tensorflow/core/common_runtime/gpu/gpu_device.h index f2d76b8..370b3cc 100644 --- a/libs/tensorflow/include/tensorflow/core/common_runtime/gpu/gpu_device.h +++ b/libs/tensorflow/include/tensorflow/core/common_runtime/gpu/gpu_device.h @@ -113,6 +113,8 @@ class BaseGPUDevice : public LocalDevice { void ReinitializeDevice(OpKernelContext* context, PerOpGpuDevice* device, int stream_id, Allocator* allocator); + + void ComputeHelper(OpKernel* op_kernel, OpKernelContext* context); }; class BaseGPUDeviceFactory : public DeviceFactory { diff --git a/libs/tensorflow/include/tensorflow/core/common_runtime/mkl_cpu_allocator.h b/libs/tensorflow/include/tensorflow/core/common_runtime/mkl_cpu_allocator.h new file mode 100644 index 0000000..41bf23b --- /dev/null +++ b/libs/tensorflow/include/tensorflow/core/common_runtime/mkl_cpu_allocator.h @@ -0,0 +1,120 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +// A simple CPU allocator that intercepts malloc/free calls from MKL library +// and redirects them to Tensorflow allocator + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_MKL_CPU_ALLOCATOR_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_MKL_CPU_ALLOCATOR_H_ + +#ifdef INTEL_MKL + +#include +#include "tensorflow/core/common_runtime/bfc_allocator.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/platform/mem.h" + +#include "third_party/mkl/include/i_malloc.h" + +namespace tensorflow { + +class MklSubAllocator : public SubAllocator { + public: + ~MklSubAllocator() override {} + + void* Alloc(size_t alignment, size_t num_bytes) override { + return port::AlignedMalloc(num_bytes, alignment); + } + void Free(void* ptr, size_t num_bytes) override { port::AlignedFree(ptr); } +}; + +/// CPU allocator for MKL that wraps BFC allocator and intercepts +/// and redirects memory allocation calls from MKL. 
+class MklCPUAllocator : public Allocator { + public: + // Constructor and other standard functions + + MklCPUAllocator() { + VLOG(2) << "MklCPUAllocator: In MklCPUAllocator"; + allocator_ = + new BFCAllocator(new MklSubAllocator, kMaxMemSize, kAllowGrowth, kName); + + // For redirecting all allocations from MKL to this allocator + // From: http://software.intel.com/en-us/node/528565 + i_malloc = MallocHook; + i_calloc = CallocHook; + i_realloc = ReallocHook; + i_free = FreeHook; + } + + ~MklCPUAllocator() override { delete allocator_; } + + inline string Name() override { return kName; } + + inline void* AllocateRaw(size_t alignment, size_t num_bytes) override { + return allocator_->AllocateRaw(alignment, num_bytes); + } + + inline void DeallocateRaw(void* ptr) override { + allocator_->DeallocateRaw(ptr); + } + + private: + // Hooks provided by this allocator for memory allocation routines from MKL + + static inline void* MallocHook(size_t size) { + VLOG(2) << "MklCPUAllocator: In MallocHook"; + return cpu_allocator()->AllocateRaw(kAlignment, size); + } + + static inline void FreeHook(void* ptr) { + VLOG(2) << "MklCPUAllocator: In FreeHook"; + cpu_allocator()->DeallocateRaw(ptr); + } + + static inline void* CallocHook(size_t num, size_t size) { + Status s = Status(error::Code::UNIMPLEMENTED, + "Unimplemented case for hooking MKL function."); + TF_CHECK_OK(s); // way to assert with an error message + } + + static inline void* ReallocHook(void* ptr, size_t size) { + Status s = Status(error::Code::UNIMPLEMENTED, + "Unimplemented case for hooking MKL function."); + TF_CHECK_OK(s); // way to assert with an error message + } + + // TODO(jbobba): We should ideally move this into CPUOptions in config.proto. 
+ /// Memory limit - 64GB + static const size_t kMaxMemSize = + static_cast(64) * 1024 * 1024 * 1024; + + /// Do we allow growth in BFC Allocator + static const bool kAllowGrowth = true; + + /// Name + static constexpr const char* kName = "mklcpu"; + + /// The alignment that we need for the allocations + static const size_t kAlignment = 64; + + Allocator* allocator_; // owned by this class +}; + +} // namespace tensorflow + +#endif // INTEL_MKL + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_MKL_CPU_ALLOCATOR_H_ diff --git a/libs/tensorflow/include/tensorflow/core/common_runtime/optimization_registry.h b/libs/tensorflow/include/tensorflow/core/common_runtime/optimization_registry.h index adfa17a..a469c8a 100644 --- a/libs/tensorflow/include/tensorflow/core/common_runtime/optimization_registry.h +++ b/libs/tensorflow/include/tensorflow/core/common_runtime/optimization_registry.h @@ -22,6 +22,7 @@ limitations under the License. #include #include +#include "tensorflow/core/common_runtime/device_set.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/graph/costmodel.h" #include "tensorflow/core/graph/graph.h" @@ -39,6 +40,12 @@ struct GraphOptimizationPassOptions { const CostModel* cost_model = nullptr; FunctionLibraryDefinition* flib_def = nullptr; // Not owned. + // The DeviceSet contains all the devices known to the system and is + // filled in for optimizations run by the session master, i.e., + // PRE_PLACEMENT, POST_PLACEMENT, and POST_REWRITE_FOR_EXEC. It is + // nullptr for POST_PARTITIONING optimizations which are run at the + // workers. + const DeviceSet* device_set = nullptr; // Not owned. // The graph to optimize, for optimization passes that run before // partitioning. Null for post-partitioning passes. 
diff --git a/libs/tensorflow/include/tensorflow/core/common_runtime/session_factory.h b/libs/tensorflow/include/tensorflow/core/common_runtime/session_factory.h index 66f4d31..2a1632e 100644 --- a/libs/tensorflow/include/tensorflow/core/common_runtime/session_factory.h +++ b/libs/tensorflow/include/tensorflow/core/common_runtime/session_factory.h @@ -33,7 +33,28 @@ class SessionFactory { virtual Session* NewSession(const SessionOptions& options) = 0; virtual bool AcceptsOptions(const SessionOptions& options) = 0; - // Sessions that support resource containers should override this functions. + // Abort and close all existing sessions, disconnecting their resources from + // future sessions. + // + // Reset() allows misbehaving or slow sessions to be aborted and closed, and + // causes their resources eventually to be released. Reset() does not wait + // for the computations in old sessions to cease; it merely starts the + // process of tearing them down. However, if a new session is started after + // a Reset(), the new session is isolated from changes that old sessions + // (started prior to the Reset()) may continue to make to resources, provided + // all those resources are in containers listed in "containers". + // + // Old sessions may continue to have side-effects on resources not in + // containers listed in "containers", and thus may affect future + // sessions' results in ways that are hard to predict. Thus, if well-defined + // behaviour is desired, is it recommended that all containers be listed in + // "containers". + // + // If the "containers" vector is empty, the default container is assumed. + // If the "containers" vector is non-empty, the default container should be + // listed explicitly. + // + // Sessions that support resource containers should override this function. 
virtual Status Reset(const SessionOptions& options, const std::vector& containers) { return errors::Unimplemented("Reset()"); diff --git a/libs/tensorflow/include/tensorflow/core/common_runtime/shape_refiner.h b/libs/tensorflow/include/tensorflow/core/common_runtime/shape_refiner.h index 4346672..b8d69fc 100644 --- a/libs/tensorflow/include/tensorflow/core/common_runtime/shape_refiner.h +++ b/libs/tensorflow/include/tensorflow/core/common_runtime/shape_refiner.h @@ -31,7 +31,7 @@ namespace tensorflow { // construction time. class ShapeRefiner { public: - explicit ShapeRefiner(const OpRegistryInterface* ops); + ShapeRefiner(int graph_def_version, const OpRegistryInterface* ops); // Performs validation of 'node' and runs 'node's shape function, // storing its shape outputs. @@ -98,7 +98,8 @@ class ShapeRefiner { const Node* node, int dst_idx, shape_inference::ShapeHandle* result); - const OpRegistryInterface* ops_registry_ = nullptr; + const int graph_def_version_; + const OpRegistryInterface* const ops_registry_; // Stores a map from a node to its InferenceContext. // diff --git a/libs/tensorflow/include/tensorflow/core/debug/BUILD b/libs/tensorflow/include/tensorflow/core/debug/BUILD index 5a7e7bb..2035922 100644 --- a/libs/tensorflow/include/tensorflow/core/debug/BUILD +++ b/libs/tensorflow/include/tensorflow/core/debug/BUILD @@ -17,11 +17,11 @@ licenses(["notice"]) # Apache 2.0 load( "//tensorflow:tensorflow.bzl", + "check_deps", "tf_copts", "tf_cc_test", "tf_cuda_library", ) -load("//tensorflow:tensorflow.bzl", "tf_cc_test_gpu") # For platform specific build config load( @@ -34,6 +34,13 @@ load( "tf_cuda_tests_tags", ) +# Check that tensorflow/core:tensorflow does not depend on grpc. 
+check_deps( + name = "core_tensorflow_check_deps", + disallowed_deps = ["@grpc//:grpc++_unsecure"], + deps = ["//tensorflow/core:tensorflow"], +) + tf_proto_library_cc( name = "debug_service_proto", srcs = ["debug_service.proto"], @@ -52,6 +59,7 @@ cc_library( deps = [ ":debug_graph_utils", "//tensorflow/core:core_cpu_internal", + "//tensorflow/core:debug_ops_op_lib", ], alwayslink = 1, ) @@ -146,13 +154,14 @@ cc_binary( ], ) -tf_cc_test_gpu( +# TODO(cais): Fix flakiness on GPU and change this back to a tf_cc_test_gpu. +# See b/34081273. +tf_cc_test( name = "debug_gateway_test", size = "small", srcs = ["debug_gateway_test.cc"], args = ["--heap_check=local"], linkstatic = tf_kernel_tests_linkstatic(), - tags = tf_cuda_tests_tags() + ["nomac"], deps = [ ":debug", ":debug_gateway_internal", @@ -199,24 +208,14 @@ tf_cc_test( ) tf_cc_test( - name = "debug_grpc_io_utils_test", + name = "debug_graph_utils_test", size = "small", - srcs = ["debug_grpc_io_utils_test.cc"], - data = [ - ":debug_test_server_main", - ], + srcs = ["debug_graph_utils_test.cc"], linkstatic = tf_kernel_tests_linkstatic(), deps = [ ":debug_graph_utils", - ":debug_grpc_testlib", - ":debug_io_utils", - "//tensorflow/core:core_cpu", - "//tensorflow/core:core_cpu_internal", - "//tensorflow/core:framework", - "//tensorflow/core:framework_internal", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", - "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core:testlib", diff --git a/libs/tensorflow/include/tensorflow/core/debug/debug_graph_utils.h b/libs/tensorflow/include/tensorflow/core/debug/debug_graph_utils.h index 6edd26c..23bf0af 100644 --- a/libs/tensorflow/include/tensorflow/core/debug/debug_graph_utils.h +++ b/libs/tensorflow/include/tensorflow/core/debug/debug_graph_utils.h @@ -105,6 +105,12 @@ class DebugNodeInserter { const protobuf::RepeatedPtrField& watches, Graph* graph, Device* device); + // Set the parallel_iterations attribute 
of TensorFlow while loops + // (specifically the nodes for which IsEnter() returns true) to 1 to prevent + // any node from being executed multiple times concurrently and + // generating temporally-overlapping debug Tensor dumps. + static void DeparallelizeWhileLoops(Graph* graph, Device* device); + // Get canonical name of a copy node. static const string GetCopyNodeName(const string& node_name, const int output_slot); @@ -121,6 +127,19 @@ class DebugNodeInserter { const int src_output, const DataType src_dt, const string& tensor_name, Node** copy_node); + // Parse the debug_op_name string to extract proper op name and attributes. + // debug_op_name can be the proper op name only, e.g., "DebugNumericSummary". + // It can also contain customizable keys and values. Each key-value pair is + // connected with an equal sign ("="). Multiple key-value pairs are separated + // with semicolons (";"), which optional whitespace in between, e.g., + // "DebugNumericSummary(mute_if_healthy=true, lower_bound=-100.0)". + static Status ParseDebugOpName( + const string& debug_op_name, string* debug_op_name_proper, + std::unordered_map* attributes); + + static Status SetDebugNodeAttributes( + Node* debug_node, const std::unordered_map& attributes); + static Status CreateDebugNode(Graph* graph, const DeviceType device_type, const string& src_copy_node_name, const DataType src_dt, @@ -128,6 +147,8 @@ class DebugNodeInserter { const std::vector& debug_urls, const int debug_op_num, const string& debug_op_name, Node** debug_node); + + friend class DebugGraphUtilsTest; }; } // namespace tensorflow diff --git a/libs/tensorflow/include/tensorflow/core/debug/debug_io_utils.h b/libs/tensorflow/include/tensorflow/core/debug/debug_io_utils.h index a12bea6..cd4462b 100644 --- a/libs/tensorflow/include/tensorflow/core/debug/debug_io_utils.h +++ b/libs/tensorflow/include/tensorflow/core/debug/debug_io_utils.h @@ -19,12 +19,12 @@ limitations under the License. 
#include #include -#include "tensorflow/core/debug/debug_service.grpc.pb.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/graph/graph.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/env.h" +#include "tensorflow/core/util/event.pb.h" namespace tensorflow { @@ -131,6 +131,15 @@ class DebugFileIO { static Status RecursiveCreateDir(Env* env, const string& dir); }; +} // namespace tensorflow + +// TODO(cais): Support grpc:// debug URLs in open source once Python grpc +// genrule becomes available. See b/23796275. +#if defined(PLATFORM_GOOGLE) +#include "tensorflow/core/debug/debug_service.grpc.pb.h" + +namespace tensorflow { + class DebugGrpcChannel { public: // Constructor of DebugGrpcChannel. @@ -198,5 +207,6 @@ class DebugGrpcIO { }; } // namespace tensorflow +#endif // #if defined(PLATFORM_GOOGLE) #endif // TENSORFLOW_DEBUG_IO_UTILS_H_ diff --git a/libs/tensorflow/include/tensorflow/core/debug/debug_service.grpc.pb.h b/libs/tensorflow/include/tensorflow/core/debug/debug_service.grpc.pb.h deleted file mode 100644 index b64c40d..0000000 --- a/libs/tensorflow/include/tensorflow/core/debug/debug_service.grpc.pb.h +++ /dev/null @@ -1,142 +0,0 @@ -// Generated by the gRPC protobuf plugin. -// If you make any local change, they will be lost. -// source: tensorflow/core/debug/debug_service.proto -// Original file comments: -// Copyright 2016 The TensorFlow Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// ============================================================================== -// -#ifndef GRPC_tensorflow_2fcore_2fdebug_2fdebug_5fservice_2eproto__INCLUDED -#define GRPC_tensorflow_2fcore_2fdebug_2fdebug_5fservice_2eproto__INCLUDED - -#include "tensorflow/core/debug/debug_service.pb.h" - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace grpc { -class CompletionQueue; -class Channel; -class RpcService; -class ServerCompletionQueue; -class ServerContext; -} // namespace grpc - -namespace tensorflow { - -// EventListener: Receives Event protos, e.g., from debugged TensorFlow -// runtime(s). -class EventListener GRPC_FINAL { - public: - class StubInterface { - public: - virtual ~StubInterface() {} - // Client(s) can use this RPC method to send the EventListener Event protos. - // The Event protos can hold information such as: - // 1) intermediate tensors from a debugged graph being executed, which can - // be sent from DebugIdentity ops configured with grpc URLs. - // 2) GraphDefs of partition graphs, which can be sent from special debug - // ops that get executed immediately after the beginning of the graph - // execution. 
- std::unique_ptr< ::grpc::ClientReaderWriterInterface< ::tensorflow::Event, ::tensorflow::EventReply>> SendEvents(::grpc::ClientContext* context) { - return std::unique_ptr< ::grpc::ClientReaderWriterInterface< ::tensorflow::Event, ::tensorflow::EventReply>>(SendEventsRaw(context)); - } - std::unique_ptr< ::grpc::ClientAsyncReaderWriterInterface< ::tensorflow::Event, ::tensorflow::EventReply>> AsyncSendEvents(::grpc::ClientContext* context, ::grpc::CompletionQueue* cq, void* tag) { - return std::unique_ptr< ::grpc::ClientAsyncReaderWriterInterface< ::tensorflow::Event, ::tensorflow::EventReply>>(AsyncSendEventsRaw(context, cq, tag)); - } - private: - virtual ::grpc::ClientReaderWriterInterface< ::tensorflow::Event, ::tensorflow::EventReply>* SendEventsRaw(::grpc::ClientContext* context) = 0; - virtual ::grpc::ClientAsyncReaderWriterInterface< ::tensorflow::Event, ::tensorflow::EventReply>* AsyncSendEventsRaw(::grpc::ClientContext* context, ::grpc::CompletionQueue* cq, void* tag) = 0; - }; - class Stub GRPC_FINAL : public StubInterface { - public: - Stub(const std::shared_ptr< ::grpc::ChannelInterface>& channel); - std::unique_ptr< ::grpc::ClientReaderWriter< ::tensorflow::Event, ::tensorflow::EventReply>> SendEvents(::grpc::ClientContext* context) { - return std::unique_ptr< ::grpc::ClientReaderWriter< ::tensorflow::Event, ::tensorflow::EventReply>>(SendEventsRaw(context)); - } - std::unique_ptr< ::grpc::ClientAsyncReaderWriter< ::tensorflow::Event, ::tensorflow::EventReply>> AsyncSendEvents(::grpc::ClientContext* context, ::grpc::CompletionQueue* cq, void* tag) { - return std::unique_ptr< ::grpc::ClientAsyncReaderWriter< ::tensorflow::Event, ::tensorflow::EventReply>>(AsyncSendEventsRaw(context, cq, tag)); - } - - private: - std::shared_ptr< ::grpc::ChannelInterface> channel_; - ::grpc::ClientReaderWriter< ::tensorflow::Event, ::tensorflow::EventReply>* SendEventsRaw(::grpc::ClientContext* context) GRPC_OVERRIDE; - ::grpc::ClientAsyncReaderWriter< 
::tensorflow::Event, ::tensorflow::EventReply>* AsyncSendEventsRaw(::grpc::ClientContext* context, ::grpc::CompletionQueue* cq, void* tag) GRPC_OVERRIDE; - const ::grpc::RpcMethod rpcmethod_SendEvents_; - }; - static std::unique_ptr NewStub(const std::shared_ptr< ::grpc::ChannelInterface>& channel, const ::grpc::StubOptions& options = ::grpc::StubOptions()); - - class Service : public ::grpc::Service { - public: - Service(); - virtual ~Service(); - // Client(s) can use this RPC method to send the EventListener Event protos. - // The Event protos can hold information such as: - // 1) intermediate tensors from a debugged graph being executed, which can - // be sent from DebugIdentity ops configured with grpc URLs. - // 2) GraphDefs of partition graphs, which can be sent from special debug - // ops that get executed immediately after the beginning of the graph - // execution. - virtual ::grpc::Status SendEvents(::grpc::ServerContext* context, ::grpc::ServerReaderWriter< ::tensorflow::EventReply, ::tensorflow::Event>* stream); - }; - template - class WithAsyncMethod_SendEvents : public BaseClass { - private: - void BaseClassMustBeDerivedFromService(const Service *service) {} - public: - WithAsyncMethod_SendEvents() { - ::grpc::Service::MarkMethodAsync(0); - } - ~WithAsyncMethod_SendEvents() GRPC_OVERRIDE { - BaseClassMustBeDerivedFromService(this); - } - // disable synchronous version of this method - ::grpc::Status SendEvents(::grpc::ServerContext* context, ::grpc::ServerReaderWriter< ::tensorflow::EventReply, ::tensorflow::Event>* stream) GRPC_FINAL GRPC_OVERRIDE { - abort(); - return ::grpc::Status(::grpc::StatusCode::UNIMPLEMENTED, ""); - } - void RequestSendEvents(::grpc::ServerContext* context, ::grpc::ServerAsyncReaderWriter< ::tensorflow::EventReply, ::tensorflow::Event>* stream, ::grpc::CompletionQueue* new_call_cq, ::grpc::ServerCompletionQueue* notification_cq, void *tag) { - ::grpc::Service::RequestAsyncBidiStreaming(0, context, stream, new_call_cq, 
notification_cq, tag); - } - }; - typedef WithAsyncMethod_SendEvents AsyncService; - template - class WithGenericMethod_SendEvents : public BaseClass { - private: - void BaseClassMustBeDerivedFromService(const Service *service) {} - public: - WithGenericMethod_SendEvents() { - ::grpc::Service::MarkMethodGeneric(0); - } - ~WithGenericMethod_SendEvents() GRPC_OVERRIDE { - BaseClassMustBeDerivedFromService(this); - } - // disable synchronous version of this method - ::grpc::Status SendEvents(::grpc::ServerContext* context, ::grpc::ServerReaderWriter< ::tensorflow::EventReply, ::tensorflow::Event>* stream) GRPC_FINAL GRPC_OVERRIDE { - abort(); - return ::grpc::Status(::grpc::StatusCode::UNIMPLEMENTED, ""); - } - }; -}; - -} // namespace tensorflow - - -#endif // GRPC_tensorflow_2fcore_2fdebug_2fdebug_5fservice_2eproto__INCLUDED diff --git a/libs/tensorflow/include/tensorflow/core/distributed_runtime/graph_mgr.h b/libs/tensorflow/include/tensorflow/core/distributed_runtime/graph_mgr.h index e9b8d41..18013aa 100644 --- a/libs/tensorflow/include/tensorflow/core/distributed_runtime/graph_mgr.h +++ b/libs/tensorflow/include/tensorflow/core/distributed_runtime/graph_mgr.h @@ -81,6 +81,8 @@ class GraphMgr { Status SendInputs(const int64 step_id, const NamedTensors& in); Status RecvOutputs(const int64 step_id, NamedTensors* out); + void RecvOutputsAsync(const int64 step_id, NamedTensors* out, + StatusCallback done); // Deregisters a graph. 
Status Deregister(const string& handle); @@ -156,6 +158,8 @@ class GraphMgr { Status SendInputsToRendezvous(Rendezvous* rendezvous, const NamedTensors& in); Status RecvOutputsFromRendezvous(Rendezvous* rendezvous, NamedTensors* out); + void RecvOutputsFromRendezvousAsync(Rendezvous* rendezvous, NamedTensors* out, + const StatusCallback& done); Status InitItem(const string& session, const GraphDef& gdef, const GraphOptions& graph_options, Item* item); diff --git a/libs/tensorflow/include/tensorflow/core/distributed_runtime/local_master.h b/libs/tensorflow/include/tensorflow/core/distributed_runtime/local_master.h index fe9cd93..33b81c3 100644 --- a/libs/tensorflow/include/tensorflow/core/distributed_runtime/local_master.h +++ b/libs/tensorflow/include/tensorflow/core/distributed_runtime/local_master.h @@ -67,6 +67,7 @@ class LocalMaster : public MasterInterface { const ListDevicesRequest* request, ListDevicesResponse* response) override; + // See tensorflow::Reset() and the comment on ResetRequest. Status Reset(CallOptions* call_options, const ResetRequest* request, ResetResponse* response) override; diff --git a/libs/tensorflow/include/tensorflow/core/distributed_runtime/master.h b/libs/tensorflow/include/tensorflow/core/distributed_runtime/master.h index 2bfebc1..ce05a65 100644 --- a/libs/tensorflow/include/tensorflow/core/distributed_runtime/master.h +++ b/libs/tensorflow/include/tensorflow/core/distributed_runtime/master.h @@ -58,6 +58,7 @@ class Master { void ListDevices(const ListDevicesRequest* req, ListDevicesResponse* resp, MyClosure done); + // See tensorflow::Reset() and the comment on ResetRequest. 
void Reset(const ResetRequest* req, ResetResponse* resp, MyClosure done); private: diff --git a/libs/tensorflow/include/tensorflow/core/distributed_runtime/master_env.h b/libs/tensorflow/include/tensorflow/core/distributed_runtime/master_env.h index 7f9c6ec..01218fb 100644 --- a/libs/tensorflow/include/tensorflow/core/distributed_runtime/master_env.h +++ b/libs/tensorflow/include/tensorflow/core/distributed_runtime/master_env.h @@ -54,11 +54,10 @@ struct MasterEnv { // // The caller of the function takes ownership of the returned // `MasterSession`, which may not be null. Ownership of the - // `MasterEnv*` is retained by the caller. The callee takes - // ownership of the `std::vector*` argument, but does not - // take ownership of the `Device*` objects in the vector. - std::function*)> + // `MasterEnv*` is retained by the caller. + std::function>>)> master_session_factory; }; diff --git a/libs/tensorflow/include/tensorflow/core/distributed_runtime/master_session.h b/libs/tensorflow/include/tensorflow/core/distributed_runtime/master_session.h index 2f29fbf..39206c2 100644 --- a/libs/tensorflow/include/tensorflow/core/distributed_runtime/master_session.h +++ b/libs/tensorflow/include/tensorflow/core/distributed_runtime/master_session.h @@ -45,9 +45,10 @@ class MasterSession : public core::RefCounted { // operations on these devices. // // The caller takes ownership of all remote devices. - MasterSession(const SessionOptions& options, const MasterEnv* env, - std::vector* remote_devs, - StatsPublisherFactory stats_publisher_factory); + MasterSession( + const SessionOptions& options, const MasterEnv* env, + std::unique_ptr>> remote_devs, + StatsPublisherFactory stats_publisher_factory); // Initialize the MasterSession for "def". Must be called before Extend(), // Run(), or Close(). @@ -103,8 +104,7 @@ class MasterSession : public core::RefCounted { // The opaque session handle. const string handle_; - // Owned. 
- std::vector remote_devs_; + std::unique_ptr>> remote_devs_; // The device set used by this session. DeviceSet devices_; diff --git a/libs/tensorflow/include/tensorflow/core/distributed_runtime/rpc/BUILD b/libs/tensorflow/include/tensorflow/core/distributed_runtime/rpc/BUILD index c24b2e6..13e357c 100644 --- a/libs/tensorflow/include/tensorflow/core/distributed_runtime/rpc/BUILD +++ b/libs/tensorflow/include/tensorflow/core/distributed_runtime/rpc/BUILD @@ -302,6 +302,7 @@ cc_binary( "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core/distributed_runtime:server_lib", + "//tensorflow/core/kernels:data_flow", "@grpc//:grpc++_unsecure", ], ) diff --git a/libs/tensorflow/include/tensorflow/core/distributed_runtime/rpc/grpc_channel.h b/libs/tensorflow/include/tensorflow/core/distributed_runtime/rpc/grpc_channel.h index 0c07794..8d97523 100644 --- a/libs/tensorflow/include/tensorflow/core/distributed_runtime/rpc/grpc_channel.h +++ b/libs/tensorflow/include/tensorflow/core/distributed_runtime/rpc/grpc_channel.h @@ -65,7 +65,7 @@ class GrpcChannelCache { // was created to handle. Worker names are in the format // /job:/task: // e.g. /job:mnist/task:2 - virtual void ListWorkers(std::vector* workers) = 0; + virtual void ListWorkers(std::vector* workers) const = 0; // If found, returns a gRPC channel that is connected to the remote // worker named by 'target'. 
'target' is of the following diff --git a/libs/tensorflow/include/tensorflow/core/distributed_runtime/worker.h b/libs/tensorflow/include/tensorflow/core/distributed_runtime/worker.h index b52a809..6d1c8e3 100644 --- a/libs/tensorflow/include/tensorflow/core/distributed_runtime/worker.h +++ b/libs/tensorflow/include/tensorflow/core/distributed_runtime/worker.h @@ -92,7 +92,10 @@ class Worker : public WorkerInterface { struct PartialRunState { CancellationManager* cancellation_manager; - Notification executor_done; + + bool executor_done = false; + StatusCallback final_callback = nullptr; + Status final_status; explicit PartialRunState(CancellationManager* cm) : cancellation_manager(cm) {} @@ -115,6 +118,12 @@ class Worker : public WorkerInterface { void RemovePartialRun(const string& graph_handle, int step_id); + void MaybeCallFinalCallback(const string& graph_handle, int step_id, + const Status& executor_status); + + void SetOrCallFinalCallback(const string& graph_handle, int step_id, + StatusCallback done, const Status& s); + Status PrepareRunGraph(RunGraphRequestWrapper* req, GraphMgr::NamedTensors* in, GraphMgr::NamedTensors* out); diff --git a/libs/tensorflow/include/tensorflow/core/distributed_runtime/worker_cache.h b/libs/tensorflow/include/tensorflow/core/distributed_runtime/worker_cache.h index 225f483..8521f89 100644 --- a/libs/tensorflow/include/tensorflow/core/distributed_runtime/worker_cache.h +++ b/libs/tensorflow/include/tensorflow/core/distributed_runtime/worker_cache.h @@ -35,7 +35,7 @@ class WorkerCacheInterface { // Updates *workers with strings naming the remote worker tasks to // which open channels have been established. 
- virtual void ListWorkers(std::vector* workers) = 0; + virtual void ListWorkers(std::vector* workers) const = 0; // If "target" names a remote task for which an RPC channel exists // or can be constructed, returns a pointer to a WorkerInterface object diff --git a/libs/tensorflow/include/tensorflow/core/example/example.pb_text-impl.h b/libs/tensorflow/include/tensorflow/core/example/example.pb_text-impl.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/example/example.pb_text.h b/libs/tensorflow/include/tensorflow/core/example/example.pb_text.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/example/example_parser_configuration.pb.h b/libs/tensorflow/include/tensorflow/core/example/example_parser_configuration.pb.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/example/feature.pb.h b/libs/tensorflow/include/tensorflow/core/example/feature.pb.h index 0f2238d..bc5bbcb 100755 --- a/libs/tensorflow/include/tensorflow/core/example/feature.pb.h +++ b/libs/tensorflow/include/tensorflow/core/example/feature.pb.h @@ -156,10 +156,16 @@ class BytesList : public ::google::protobuf::Message /* @@protoc_insertion_point const ::std::string& value(int index) const; ::std::string* mutable_value(int index); void set_value(int index, const ::std::string& value); + #if LANG_CXX11 + void set_value(int index, ::std::string&& value); + #endif void set_value(int index, const char* value); void set_value(int index, const void* value, size_t size); ::std::string* add_value(); void add_value(const ::std::string& value); + #if LANG_CXX11 + void add_value(::std::string&& value); + #endif void add_value(const char* value); void add_value(const void* value, size_t size); const ::google::protobuf::RepeatedPtrField< ::std::string>& value() const; @@ -915,6 +921,12 @@ inline void BytesList::set_value(int index, const ::std::string& value) { // 
@@protoc_insertion_point(field_set:tensorflow.BytesList.value) value_.Mutable(index)->assign(value); } +#if LANG_CXX11 +inline void BytesList::set_value(int index, ::std::string&& value) { + // @@protoc_insertion_point(field_set:tensorflow.BytesList.value) + value_.Mutable(index)->assign(std::move(value)); +} +#endif inline void BytesList::set_value(int index, const char* value) { value_.Mutable(index)->assign(value); // @@protoc_insertion_point(field_set_char:tensorflow.BytesList.value) @@ -932,6 +944,12 @@ inline void BytesList::add_value(const ::std::string& value) { value_.Add()->assign(value); // @@protoc_insertion_point(field_add:tensorflow.BytesList.value) } +#if LANG_CXX11 +inline void BytesList::add_value(::std::string&& value) { + value_.Add()->assign(std::move(value)); + // @@protoc_insertion_point(field_add:tensorflow.BytesList.value) +} +#endif inline void BytesList::add_value(const char* value) { value_.Add()->assign(value); // @@protoc_insertion_point(field_add_char:tensorflow.BytesList.value) diff --git a/libs/tensorflow/include/tensorflow/core/example/feature.pb_text-impl.h b/libs/tensorflow/include/tensorflow/core/example/feature.pb_text-impl.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/example/feature.pb_text.h b/libs/tensorflow/include/tensorflow/core/example/feature.pb_text.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/framework/allocation_description.pb_text-impl.h b/libs/tensorflow/include/tensorflow/core/framework/allocation_description.pb_text-impl.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/framework/allocation_description.pb_text.h b/libs/tensorflow/include/tensorflow/core/framework/allocation_description.pb_text.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/framework/allocator.h b/libs/tensorflow/include/tensorflow/core/framework/allocator.h index 06859c5..4a7fdab 100644 
--- a/libs/tensorflow/include/tensorflow/core/framework/allocator.h +++ b/libs/tensorflow/include/tensorflow/core/framework/allocator.h @@ -286,6 +286,11 @@ struct AllocatorAttributes { void set_track_sizes(bool v) { value |= (static_cast(v) << 3); } bool track_sizes() const { return value & (0x1 << 3); } void Merge(AllocatorAttributes other) { value |= other.value; } + // Returns true if the fields set in *this is a subset of or equal to + // those set in other. + bool IsEqualOrLessRestrictiveThan(const AllocatorAttributes& other) const { + return (value | other.value) == other.value; + } // NOTE: The upper 8 bits of the value are reserved for // device-specific uses. Implementors of a device can interpret these diff --git a/libs/tensorflow/include/tensorflow/core/framework/allocator_registry.h b/libs/tensorflow/include/tensorflow/core/framework/allocator_registry.h new file mode 100644 index 0000000..c419366 --- /dev/null +++ b/libs/tensorflow/include/tensorflow/core/framework/allocator_registry.h @@ -0,0 +1,77 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +// Classes to maintain a static registry of memory allocators +#ifndef TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_REGISTRY_H_ +#define TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_REGISTRY_H_ + +#include +#include + +#include "tensorflow/core/framework/allocator.h" + +namespace tensorflow { + +// A global AllocatorRegistry is used to hold allocators for CPU backends +class AllocatorRegistry { + public: + // Add an allocator to the registry. + void Register(const string& name, int priority, Allocator* allocator); + + // Return allocator with highest priority + // If multiple allocators have the same high priority, return one of them + Allocator* GetAllocator(); + + // Returns the global registry of allocators. + static AllocatorRegistry* Global(); + + private: + typedef struct { + string name; + int priority; + Allocator* allocator; // not owned + } AllocatorRegistryEntry; + + bool CheckForDuplicates(const string& name, int priority); + + std::vector allocators_; + Allocator* m_curr_allocator_; // not owned +}; + +namespace allocator_registration { + +class AllocatorRegistration { + public: + AllocatorRegistration(const string& name, int priority, + Allocator* allocator) { + AllocatorRegistry::Global()->Register(name, priority, allocator); + } +}; + +} // namespace allocator_registration + +#define REGISTER_MEM_ALLOCATOR(name, priority, allocator) \ + REGISTER_MEM_ALLOCATOR_UNIQ_HELPER(__COUNTER__, name, priority, allocator) + +#define REGISTER_MEM_ALLOCATOR_UNIQ_HELPER(ctr, name, priority, allocator) \ + REGISTER_MEM_ALLOCATOR_UNIQ(ctr, name, priority, allocator) + +#define REGISTER_MEM_ALLOCATOR_UNIQ(ctr, name, priority, allocator) \ + static allocator_registration::AllocatorRegistration \ + register_allocator_##ctr(name, priority, new allocator) + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_REGISTRY_H_ diff --git 
a/libs/tensorflow/include/tensorflow/core/framework/attr_value.pb.h b/libs/tensorflow/include/tensorflow/core/framework/attr_value.pb.h index 0af9a25..b7f9f92 100755 --- a/libs/tensorflow/include/tensorflow/core/framework/attr_value.pb.h +++ b/libs/tensorflow/include/tensorflow/core/framework/attr_value.pb.h @@ -156,10 +156,16 @@ class AttrValue_ListValue : public ::google::protobuf::Message /* @@protoc_inser const ::std::string& s(int index) const; ::std::string* mutable_s(int index); void set_s(int index, const ::std::string& value); + #if LANG_CXX11 + void set_s(int index, ::std::string&& value); + #endif void set_s(int index, const char* value); void set_s(int index, const void* value, size_t size); ::std::string* add_s(); void add_s(const ::std::string& value); + #if LANG_CXX11 + void add_s(::std::string&& value); + #endif void add_s(const char* value); void add_s(const void* value, size_t size); const ::google::protobuf::RepeatedPtrField< ::std::string>& s() const; @@ -705,6 +711,12 @@ inline void AttrValue_ListValue::set_s(int index, const ::std::string& value) { // @@protoc_insertion_point(field_set:tensorflow.AttrValue.ListValue.s) s_.Mutable(index)->assign(value); } +#if LANG_CXX11 +inline void AttrValue_ListValue::set_s(int index, ::std::string&& value) { + // @@protoc_insertion_point(field_set:tensorflow.AttrValue.ListValue.s) + s_.Mutable(index)->assign(std::move(value)); +} +#endif inline void AttrValue_ListValue::set_s(int index, const char* value) { s_.Mutable(index)->assign(value); // @@protoc_insertion_point(field_set_char:tensorflow.AttrValue.ListValue.s) @@ -722,6 +734,12 @@ inline void AttrValue_ListValue::add_s(const ::std::string& value) { s_.Add()->assign(value); // @@protoc_insertion_point(field_add:tensorflow.AttrValue.ListValue.s) } +#if LANG_CXX11 +inline void AttrValue_ListValue::add_s(::std::string&& value) { + s_.Add()->assign(std::move(value)); + // @@protoc_insertion_point(field_add:tensorflow.AttrValue.ListValue.s) +} +#endif 
inline void AttrValue_ListValue::add_s(const char* value) { s_.Add()->assign(value); // @@protoc_insertion_point(field_add_char:tensorflow.AttrValue.ListValue.s) diff --git a/libs/tensorflow/include/tensorflow/core/framework/attr_value.pb_text-impl.h b/libs/tensorflow/include/tensorflow/core/framework/attr_value.pb_text-impl.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/framework/attr_value.pb_text.h b/libs/tensorflow/include/tensorflow/core/framework/attr_value.pb_text.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/framework/cost_graph.pb.h b/libs/tensorflow/include/tensorflow/core/framework/cost_graph.pb.h old mode 100644 new mode 100755 index fee5548..c58d357 --- a/libs/tensorflow/include/tensorflow/core/framework/cost_graph.pb.h +++ b/libs/tensorflow/include/tensorflow/core/framework/cost_graph.pb.h @@ -455,11 +455,17 @@ class CostGraphDef_Node : public ::google::protobuf::Message /* @@protoc_inserti ::google::protobuf::int64 temporary_memory_size() const; void set_temporary_memory_size(::google::protobuf::int64 value); - // int64 host_peak_memory_size = 10; - void clear_host_peak_memory_size(); - static const int kHostPeakMemorySizeFieldNumber = 10; - ::google::protobuf::int64 host_peak_memory_size() const; - void set_host_peak_memory_size(::google::protobuf::int64 value); + // int64 host_temp_memory_size = 10; + void clear_host_temp_memory_size(); + static const int kHostTempMemorySizeFieldNumber = 10; + ::google::protobuf::int64 host_temp_memory_size() const; + void set_host_temp_memory_size(::google::protobuf::int64 value); + + // int64 device_temp_memory_size = 11; + void clear_device_temp_memory_size(); + static const int kDeviceTempMemorySizeFieldNumber = 11; + ::google::protobuf::int64 device_temp_memory_size() const; + void set_device_temp_memory_size(::google::protobuf::int64 value); // int32 id = 3; void clear_id(); @@ -473,23 +479,17 @@ class CostGraphDef_Node : public 
::google::protobuf::Message /* @@protoc_inserti bool is_final() const; void set_is_final(bool value); - // int64 device_peak_memory_size = 11; - void clear_device_peak_memory_size(); - static const int kDevicePeakMemorySizeFieldNumber = 11; - ::google::protobuf::int64 device_peak_memory_size() const; - void set_device_peak_memory_size(::google::protobuf::int64 value); + // int64 host_persistent_memory_size = 12; + void clear_host_persistent_memory_size(); + static const int kHostPersistentMemorySizeFieldNumber = 12; + ::google::protobuf::int64 host_persistent_memory_size() const; + void set_host_persistent_memory_size(::google::protobuf::int64 value); - // int64 persisted_memory_size = 12; - void clear_persisted_memory_size(); - static const int kPersistedMemorySizeFieldNumber = 12; - ::google::protobuf::int64 persisted_memory_size() const; - void set_persisted_memory_size(::google::protobuf::int64 value); - - // int64 auxiliary_memory_size = 13; - void clear_auxiliary_memory_size(); - static const int kAuxiliaryMemorySizeFieldNumber = 13; - ::google::protobuf::int64 auxiliary_memory_size() const; - void set_auxiliary_memory_size(::google::protobuf::int64 value); + // int64 device_persistent_memory_size = 16; + void clear_device_persistent_memory_size(); + static const int kDevicePersistentMemorySizeFieldNumber = 16; + ::google::protobuf::int64 device_persistent_memory_size() const; + void set_device_persistent_memory_size(::google::protobuf::int64 value); // int64 compute_cost = 9; void clear_compute_cost(); @@ -497,6 +497,18 @@ class CostGraphDef_Node : public ::google::protobuf::Message /* @@protoc_inserti ::google::protobuf::int64 compute_cost() const; void set_compute_cost(::google::protobuf::int64 value); + // int64 compute_time = 14; + void clear_compute_time(); + static const int kComputeTimeFieldNumber = 14; + ::google::protobuf::int64 compute_time() const; + void set_compute_time(::google::protobuf::int64 value); + + // int64 memory_time = 15; + void 
clear_memory_time(); + static const int kMemoryTimeFieldNumber = 15; + ::google::protobuf::int64 memory_time() const; + void set_memory_time(::google::protobuf::int64 value); + // @@protoc_insertion_point(class_scope:tensorflow.CostGraphDef.Node) private: @@ -511,13 +523,15 @@ class CostGraphDef_Node : public ::google::protobuf::Message /* @@protoc_inserti ::google::protobuf::internal::ArenaStringPtr name_; ::google::protobuf::internal::ArenaStringPtr device_; ::google::protobuf::int64 temporary_memory_size_; - ::google::protobuf::int64 host_peak_memory_size_; + ::google::protobuf::int64 host_temp_memory_size_; + ::google::protobuf::int64 device_temp_memory_size_; ::google::protobuf::int32 id_; bool is_final_; - ::google::protobuf::int64 device_peak_memory_size_; - ::google::protobuf::int64 persisted_memory_size_; - ::google::protobuf::int64 auxiliary_memory_size_; + ::google::protobuf::int64 host_persistent_memory_size_; + ::google::protobuf::int64 device_persistent_memory_size_; ::google::protobuf::int64 compute_cost_; + ::google::protobuf::int64 compute_time_; + ::google::protobuf::int64 memory_time_; mutable int _cached_size_; friend struct protobuf_tensorflow_2fcore_2fframework_2fcost_5fgraph_2eproto::TableStruct; }; @@ -982,60 +996,60 @@ inline void CostGraphDef_Node::set_temporary_memory_size(::google::protobuf::int // @@protoc_insertion_point(field_set:tensorflow.CostGraphDef.Node.temporary_memory_size) } -// int64 host_peak_memory_size = 10; -inline void CostGraphDef_Node::clear_host_peak_memory_size() { - host_peak_memory_size_ = GOOGLE_LONGLONG(0); +// int64 host_temp_memory_size = 10; +inline void CostGraphDef_Node::clear_host_temp_memory_size() { + host_temp_memory_size_ = GOOGLE_LONGLONG(0); } -inline ::google::protobuf::int64 CostGraphDef_Node::host_peak_memory_size() const { - // @@protoc_insertion_point(field_get:tensorflow.CostGraphDef.Node.host_peak_memory_size) - return host_peak_memory_size_; +inline ::google::protobuf::int64 
CostGraphDef_Node::host_temp_memory_size() const { + // @@protoc_insertion_point(field_get:tensorflow.CostGraphDef.Node.host_temp_memory_size) + return host_temp_memory_size_; } -inline void CostGraphDef_Node::set_host_peak_memory_size(::google::protobuf::int64 value) { +inline void CostGraphDef_Node::set_host_temp_memory_size(::google::protobuf::int64 value) { - host_peak_memory_size_ = value; - // @@protoc_insertion_point(field_set:tensorflow.CostGraphDef.Node.host_peak_memory_size) + host_temp_memory_size_ = value; + // @@protoc_insertion_point(field_set:tensorflow.CostGraphDef.Node.host_temp_memory_size) } -// int64 device_peak_memory_size = 11; -inline void CostGraphDef_Node::clear_device_peak_memory_size() { - device_peak_memory_size_ = GOOGLE_LONGLONG(0); +// int64 device_temp_memory_size = 11; +inline void CostGraphDef_Node::clear_device_temp_memory_size() { + device_temp_memory_size_ = GOOGLE_LONGLONG(0); } -inline ::google::protobuf::int64 CostGraphDef_Node::device_peak_memory_size() const { - // @@protoc_insertion_point(field_get:tensorflow.CostGraphDef.Node.device_peak_memory_size) - return device_peak_memory_size_; +inline ::google::protobuf::int64 CostGraphDef_Node::device_temp_memory_size() const { + // @@protoc_insertion_point(field_get:tensorflow.CostGraphDef.Node.device_temp_memory_size) + return device_temp_memory_size_; } -inline void CostGraphDef_Node::set_device_peak_memory_size(::google::protobuf::int64 value) { +inline void CostGraphDef_Node::set_device_temp_memory_size(::google::protobuf::int64 value) { - device_peak_memory_size_ = value; - // @@protoc_insertion_point(field_set:tensorflow.CostGraphDef.Node.device_peak_memory_size) + device_temp_memory_size_ = value; + // @@protoc_insertion_point(field_set:tensorflow.CostGraphDef.Node.device_temp_memory_size) } -// int64 persisted_memory_size = 12; -inline void CostGraphDef_Node::clear_persisted_memory_size() { - persisted_memory_size_ = GOOGLE_LONGLONG(0); +// int64 
host_persistent_memory_size = 12; +inline void CostGraphDef_Node::clear_host_persistent_memory_size() { + host_persistent_memory_size_ = GOOGLE_LONGLONG(0); } -inline ::google::protobuf::int64 CostGraphDef_Node::persisted_memory_size() const { - // @@protoc_insertion_point(field_get:tensorflow.CostGraphDef.Node.persisted_memory_size) - return persisted_memory_size_; +inline ::google::protobuf::int64 CostGraphDef_Node::host_persistent_memory_size() const { + // @@protoc_insertion_point(field_get:tensorflow.CostGraphDef.Node.host_persistent_memory_size) + return host_persistent_memory_size_; } -inline void CostGraphDef_Node::set_persisted_memory_size(::google::protobuf::int64 value) { +inline void CostGraphDef_Node::set_host_persistent_memory_size(::google::protobuf::int64 value) { - persisted_memory_size_ = value; - // @@protoc_insertion_point(field_set:tensorflow.CostGraphDef.Node.persisted_memory_size) + host_persistent_memory_size_ = value; + // @@protoc_insertion_point(field_set:tensorflow.CostGraphDef.Node.host_persistent_memory_size) } -// int64 auxiliary_memory_size = 13; -inline void CostGraphDef_Node::clear_auxiliary_memory_size() { - auxiliary_memory_size_ = GOOGLE_LONGLONG(0); +// int64 device_persistent_memory_size = 16; +inline void CostGraphDef_Node::clear_device_persistent_memory_size() { + device_persistent_memory_size_ = GOOGLE_LONGLONG(0); } -inline ::google::protobuf::int64 CostGraphDef_Node::auxiliary_memory_size() const { - // @@protoc_insertion_point(field_get:tensorflow.CostGraphDef.Node.auxiliary_memory_size) - return auxiliary_memory_size_; +inline ::google::protobuf::int64 CostGraphDef_Node::device_persistent_memory_size() const { + // @@protoc_insertion_point(field_get:tensorflow.CostGraphDef.Node.device_persistent_memory_size) + return device_persistent_memory_size_; } -inline void CostGraphDef_Node::set_auxiliary_memory_size(::google::protobuf::int64 value) { +inline void 
CostGraphDef_Node::set_device_persistent_memory_size(::google::protobuf::int64 value) { - auxiliary_memory_size_ = value; - // @@protoc_insertion_point(field_set:tensorflow.CostGraphDef.Node.auxiliary_memory_size) + device_persistent_memory_size_ = value; + // @@protoc_insertion_point(field_set:tensorflow.CostGraphDef.Node.device_persistent_memory_size) } // int64 compute_cost = 9; @@ -1052,6 +1066,34 @@ inline void CostGraphDef_Node::set_compute_cost(::google::protobuf::int64 value) // @@protoc_insertion_point(field_set:tensorflow.CostGraphDef.Node.compute_cost) } +// int64 compute_time = 14; +inline void CostGraphDef_Node::clear_compute_time() { + compute_time_ = GOOGLE_LONGLONG(0); +} +inline ::google::protobuf::int64 CostGraphDef_Node::compute_time() const { + // @@protoc_insertion_point(field_get:tensorflow.CostGraphDef.Node.compute_time) + return compute_time_; +} +inline void CostGraphDef_Node::set_compute_time(::google::protobuf::int64 value) { + + compute_time_ = value; + // @@protoc_insertion_point(field_set:tensorflow.CostGraphDef.Node.compute_time) +} + +// int64 memory_time = 15; +inline void CostGraphDef_Node::clear_memory_time() { + memory_time_ = GOOGLE_LONGLONG(0); +} +inline ::google::protobuf::int64 CostGraphDef_Node::memory_time() const { + // @@protoc_insertion_point(field_get:tensorflow.CostGraphDef.Node.memory_time) + return memory_time_; +} +inline void CostGraphDef_Node::set_memory_time(::google::protobuf::int64 value) { + + memory_time_ = value; + // @@protoc_insertion_point(field_set:tensorflow.CostGraphDef.Node.memory_time) +} + // bool is_final = 7; inline void CostGraphDef_Node::clear_is_final() { is_final_ = false; diff --git a/libs/tensorflow/include/tensorflow/core/framework/cost_graph.pb_text-impl.h b/libs/tensorflow/include/tensorflow/core/framework/cost_graph.pb_text-impl.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/framework/cost_graph.pb_text.h 
b/libs/tensorflow/include/tensorflow/core/framework/cost_graph.pb_text.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/framework/device_attributes.pb_text-impl.h b/libs/tensorflow/include/tensorflow/core/framework/device_attributes.pb_text-impl.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/framework/device_attributes.pb_text.h b/libs/tensorflow/include/tensorflow/core/framework/device_attributes.pb_text.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/framework/device_base.h b/libs/tensorflow/include/tensorflow/core/framework/device_base.h index 6edbda1..8894671 100644 --- a/libs/tensorflow/include/tensorflow/core/framework/device_base.h +++ b/libs/tensorflow/include/tensorflow/core/framework/device_base.h @@ -132,6 +132,7 @@ class DeviceBase { perftools::gputools::Stream* stream = nullptr; DeviceContext* default_context = nullptr; EventMgr* event_mgr = nullptr; + int gpu_id = -1; }; // Does not take ownership. diff --git a/libs/tensorflow/include/tensorflow/core/framework/function.h b/libs/tensorflow/include/tensorflow/core/framework/function.h index 2f166ad..e273110 100644 --- a/libs/tensorflow/include/tensorflow/core/framework/function.h +++ b/libs/tensorflow/include/tensorflow/core/framework/function.h @@ -295,6 +295,14 @@ class FunctionLibraryDefinition : public OpRegistryInterface { // from 'LookUp' and included in the proto returned by 'ToProto'. Status AddFunctionDef(const FunctionDef& fdef); + // Adds gradient definition 'grad' to this function library. + // If 'grad' is successfully added, it will be accessible via 'FindGradient' + // and included in the proto returned by 'ToProto'. + Status AddGradientDef(const GradientDef& grad); + + // Adds the functions and gradients in 'other' to this function library. 
+ Status AddLibrary(const FunctionLibraryDefinition& other); + // If the gradient function for 'func' is specified explicitly in // the library, returns the gradient function name. Otherwise, // returns an empty string. @@ -320,6 +328,10 @@ class FunctionLibraryDefinition : public OpRegistryInterface { // Returns a proto representation of the state of this function library. FunctionDefLibrary ToProto() const; + const OpRegistryInterface* default_registry() const { + return default_registry_; + } + private: // TODO(cwhipkey): support shape functions in FunctionDefLibrary. struct FunctionDefAndOpRegistration { diff --git a/libs/tensorflow/include/tensorflow/core/framework/function.pb_text-impl.h b/libs/tensorflow/include/tensorflow/core/framework/function.pb_text-impl.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/framework/function.pb_text.h b/libs/tensorflow/include/tensorflow/core/framework/function.pb_text.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/framework/graph.pb_text-impl.h b/libs/tensorflow/include/tensorflow/core/framework/graph.pb_text-impl.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/framework/graph.pb_text.h b/libs/tensorflow/include/tensorflow/core/framework/graph.pb_text.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/framework/graph_transfer_info.pb.h b/libs/tensorflow/include/tensorflow/core/framework/graph_transfer_info.pb.h old mode 100644 new mode 100755 index b8a1f94..2800993 --- a/libs/tensorflow/include/tensorflow/core/framework/graph_transfer_info.pb.h +++ b/libs/tensorflow/include/tensorflow/core/framework/graph_transfer_info.pb.h @@ -29,6 +29,7 @@ #include // IWYU pragma: export #include #include +#include "tensorflow/core/framework/types.pb.h" // @@protoc_insertion_point(includes) namespace tensorflow { class GraphTransferInfo; @@ -479,6 +480,12 @@ class 
GraphTransferInfo_ConstNodeInfo : public ::google::protobuf::Message /* @@ ::google::protobuf::int32 node_id() const; void set_node_id(::google::protobuf::int32 value); + // .tensorflow.DataType dtype = 5; + void clear_dtype(); + static const int kDtypeFieldNumber = 5; + ::tensorflow::DataType dtype() const; + void set_dtype(::tensorflow::DataType value); + // @@protoc_insertion_point(class_scope:tensorflow.GraphTransferInfo.ConstNodeInfo) private: @@ -491,6 +498,7 @@ class GraphTransferInfo_ConstNodeInfo : public ::google::protobuf::Message /* @@ ::google::protobuf::internal::ArenaStringPtr name_; ::google::protobuf::internal::ArenaStringPtr data_; ::google::protobuf::int32 node_id_; + int dtype_; mutable int _cached_size_; friend struct protobuf_tensorflow_2fcore_2fframework_2fgraph_5ftransfer_5finfo_2eproto::TableStruct; }; @@ -824,6 +832,12 @@ class GraphTransferInfo_GraphInputNodeInfo : public ::google::protobuf::Message void unsafe_arena_set_allocated_name( ::std::string* name); + // .tensorflow.DataType dtype = 3; + void clear_dtype(); + static const int kDtypeFieldNumber = 3; + ::tensorflow::DataType dtype() const; + void set_dtype(::tensorflow::DataType value); + // @@protoc_insertion_point(class_scope:tensorflow.GraphTransferInfo.GraphInputNodeInfo) private: @@ -834,6 +848,7 @@ class GraphTransferInfo_GraphInputNodeInfo : public ::google::protobuf::Message ::google::protobuf::RepeatedField< ::google::protobuf::int64 > shape_; mutable int _shape_cached_byte_size_; ::google::protobuf::internal::ArenaStringPtr name_; + int dtype_; mutable int _cached_size_; friend struct protobuf_tensorflow_2fcore_2fframework_2fgraph_5ftransfer_5finfo_2eproto::TableStruct; }; @@ -944,6 +959,12 @@ class GraphTransferInfo_GraphOutputNodeInfo : public ::google::protobuf::Message void unsafe_arena_set_allocated_name( ::std::string* name); + // .tensorflow.DataType dtype = 3; + void clear_dtype(); + static const int kDtypeFieldNumber = 3; + ::tensorflow::DataType dtype() const; + 
void set_dtype(::tensorflow::DataType value); + // @@protoc_insertion_point(class_scope:tensorflow.GraphTransferInfo.GraphOutputNodeInfo) private: @@ -954,6 +975,7 @@ class GraphTransferInfo_GraphOutputNodeInfo : public ::google::protobuf::Message ::google::protobuf::RepeatedField< ::google::protobuf::int64 > shape_; mutable int _shape_cached_byte_size_; ::google::protobuf::internal::ArenaStringPtr name_; + int dtype_; mutable int _cached_size_; friend struct protobuf_tensorflow_2fcore_2fframework_2fgraph_5ftransfer_5finfo_2eproto::TableStruct; }; @@ -1589,6 +1611,20 @@ inline void GraphTransferInfo_ConstNodeInfo::unsafe_arena_set_allocated_data( // @@protoc_insertion_point(field_unsafe_arena_set_allocated:tensorflow.GraphTransferInfo.ConstNodeInfo.data) } +// .tensorflow.DataType dtype = 5; +inline void GraphTransferInfo_ConstNodeInfo::clear_dtype() { + dtype_ = 0; +} +inline ::tensorflow::DataType GraphTransferInfo_ConstNodeInfo::dtype() const { + // @@protoc_insertion_point(field_get:tensorflow.GraphTransferInfo.ConstNodeInfo.dtype) + return static_cast< ::tensorflow::DataType >(dtype_); +} +inline void GraphTransferInfo_ConstNodeInfo::set_dtype(::tensorflow::DataType value) { + + dtype_ = value; + // @@protoc_insertion_point(field_set:tensorflow.GraphTransferInfo.ConstNodeInfo.dtype) +} + // ------------------------------------------------------------------- // GraphTransferInfo_NodeInputInfo @@ -1785,6 +1821,20 @@ GraphTransferInfo_GraphInputNodeInfo::mutable_shape() { return &shape_; } +// .tensorflow.DataType dtype = 3; +inline void GraphTransferInfo_GraphInputNodeInfo::clear_dtype() { + dtype_ = 0; +} +inline ::tensorflow::DataType GraphTransferInfo_GraphInputNodeInfo::dtype() const { + // @@protoc_insertion_point(field_get:tensorflow.GraphTransferInfo.GraphInputNodeInfo.dtype) + return static_cast< ::tensorflow::DataType >(dtype_); +} +inline void GraphTransferInfo_GraphInputNodeInfo::set_dtype(::tensorflow::DataType value) { + + dtype_ = value; + // 
@@protoc_insertion_point(field_set:tensorflow.GraphTransferInfo.GraphInputNodeInfo.dtype) +} + // ------------------------------------------------------------------- // GraphTransferInfo_GraphOutputNodeInfo @@ -1885,6 +1935,20 @@ GraphTransferInfo_GraphOutputNodeInfo::mutable_shape() { return &shape_; } +// .tensorflow.DataType dtype = 3; +inline void GraphTransferInfo_GraphOutputNodeInfo::clear_dtype() { + dtype_ = 0; +} +inline ::tensorflow::DataType GraphTransferInfo_GraphOutputNodeInfo::dtype() const { + // @@protoc_insertion_point(field_get:tensorflow.GraphTransferInfo.GraphOutputNodeInfo.dtype) + return static_cast< ::tensorflow::DataType >(dtype_); +} +inline void GraphTransferInfo_GraphOutputNodeInfo::set_dtype(::tensorflow::DataType value) { + + dtype_ = value; + // @@protoc_insertion_point(field_set:tensorflow.GraphTransferInfo.GraphOutputNodeInfo.dtype) +} + // ------------------------------------------------------------------- // GraphTransferInfo diff --git a/libs/tensorflow/include/tensorflow/core/framework/graph_transfer_info.pb_text-impl.h b/libs/tensorflow/include/tensorflow/core/framework/graph_transfer_info.pb_text-impl.h old mode 100644 new mode 100755 index 7d6f5a4..fab7410 --- a/libs/tensorflow/include/tensorflow/core/framework/graph_transfer_info.pb_text-impl.h +++ b/libs/tensorflow/include/tensorflow/core/framework/graph_transfer_info.pb_text-impl.h @@ -4,6 +4,8 @@ #include "tensorflow/core/framework/graph_transfer_info.pb.h" #include "tensorflow/core/framework/graph_transfer_info.pb_text.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/framework/types.pb_text-impl.h" #include "tensorflow/core/lib/strings/proto_text_util.h" #include "tensorflow/core/lib/strings/scanner.h" diff --git a/libs/tensorflow/include/tensorflow/core/framework/graph_transfer_info.pb_text.h b/libs/tensorflow/include/tensorflow/core/framework/graph_transfer_info.pb_text.h old mode 100644 new mode 100755 diff --git 
a/libs/tensorflow/include/tensorflow/core/framework/kernel_def.pb.h b/libs/tensorflow/include/tensorflow/core/framework/kernel_def.pb.h index 05553be..b6916ef 100755 --- a/libs/tensorflow/include/tensorflow/core/framework/kernel_def.pb.h +++ b/libs/tensorflow/include/tensorflow/core/framework/kernel_def.pb.h @@ -288,10 +288,16 @@ class KernelDef : public ::google::protobuf::Message /* @@protoc_insertion_point const ::std::string& host_memory_arg(int index) const; ::std::string* mutable_host_memory_arg(int index); void set_host_memory_arg(int index, const ::std::string& value); + #if LANG_CXX11 + void set_host_memory_arg(int index, ::std::string&& value); + #endif void set_host_memory_arg(int index, const char* value); void set_host_memory_arg(int index, const char* value, size_t size); ::std::string* add_host_memory_arg(); void add_host_memory_arg(const ::std::string& value); + #if LANG_CXX11 + void add_host_memory_arg(::std::string&& value); + #endif void add_host_memory_arg(const char* value); void add_host_memory_arg(const char* value, size_t size); const ::google::protobuf::RepeatedPtrField< ::std::string>& host_memory_arg() const; @@ -662,6 +668,12 @@ inline void KernelDef::set_host_memory_arg(int index, const ::std::string& value // @@protoc_insertion_point(field_set:tensorflow.KernelDef.host_memory_arg) host_memory_arg_.Mutable(index)->assign(value); } +#if LANG_CXX11 +inline void KernelDef::set_host_memory_arg(int index, ::std::string&& value) { + // @@protoc_insertion_point(field_set:tensorflow.KernelDef.host_memory_arg) + host_memory_arg_.Mutable(index)->assign(std::move(value)); +} +#endif inline void KernelDef::set_host_memory_arg(int index, const char* value) { host_memory_arg_.Mutable(index)->assign(value); // @@protoc_insertion_point(field_set_char:tensorflow.KernelDef.host_memory_arg) @@ -679,6 +691,12 @@ inline void KernelDef::add_host_memory_arg(const ::std::string& value) { host_memory_arg_.Add()->assign(value); // 
@@protoc_insertion_point(field_add:tensorflow.KernelDef.host_memory_arg) } +#if LANG_CXX11 +inline void KernelDef::add_host_memory_arg(::std::string&& value) { + host_memory_arg_.Add()->assign(std::move(value)); + // @@protoc_insertion_point(field_add:tensorflow.KernelDef.host_memory_arg) +} +#endif inline void KernelDef::add_host_memory_arg(const char* value) { host_memory_arg_.Add()->assign(value); // @@protoc_insertion_point(field_add_char:tensorflow.KernelDef.host_memory_arg) diff --git a/libs/tensorflow/include/tensorflow/core/framework/kernel_def.pb_text-impl.h b/libs/tensorflow/include/tensorflow/core/framework/kernel_def.pb_text-impl.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/framework/kernel_def.pb_text.h b/libs/tensorflow/include/tensorflow/core/framework/kernel_def.pb_text.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/framework/log_memory.pb.h b/libs/tensorflow/include/tensorflow/core/framework/log_memory.pb.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/framework/log_memory.pb_text-impl.h b/libs/tensorflow/include/tensorflow/core/framework/log_memory.pb_text-impl.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/framework/log_memory.pb_text.h b/libs/tensorflow/include/tensorflow/core/framework/log_memory.pb_text.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/framework/node_def.pb.h b/libs/tensorflow/include/tensorflow/core/framework/node_def.pb.h old mode 100644 new mode 100755 index 7515132..a9fc153 --- a/libs/tensorflow/include/tensorflow/core/framework/node_def.pb.h +++ b/libs/tensorflow/include/tensorflow/core/framework/node_def.pb.h @@ -152,10 +152,16 @@ class NodeDef : public ::google::protobuf::Message /* @@protoc_insertion_point(c const ::std::string& input(int index) const; ::std::string* mutable_input(int index); void set_input(int index, const 
::std::string& value); + #if LANG_CXX11 + void set_input(int index, ::std::string&& value); + #endif void set_input(int index, const char* value); void set_input(int index, const char* value, size_t size); ::std::string* add_input(); void add_input(const ::std::string& value); + #if LANG_CXX11 + void add_input(::std::string&& value); + #endif void add_input(const char* value); void add_input(const char* value, size_t size); const ::google::protobuf::RepeatedPtrField< ::std::string>& input() const; @@ -398,6 +404,12 @@ inline void NodeDef::set_input(int index, const ::std::string& value) { // @@protoc_insertion_point(field_set:tensorflow.NodeDef.input) input_.Mutable(index)->assign(value); } +#if LANG_CXX11 +inline void NodeDef::set_input(int index, ::std::string&& value) { + // @@protoc_insertion_point(field_set:tensorflow.NodeDef.input) + input_.Mutable(index)->assign(std::move(value)); +} +#endif inline void NodeDef::set_input(int index, const char* value) { input_.Mutable(index)->assign(value); // @@protoc_insertion_point(field_set_char:tensorflow.NodeDef.input) @@ -415,6 +427,12 @@ inline void NodeDef::add_input(const ::std::string& value) { input_.Add()->assign(value); // @@protoc_insertion_point(field_add:tensorflow.NodeDef.input) } +#if LANG_CXX11 +inline void NodeDef::add_input(::std::string&& value) { + input_.Add()->assign(std::move(value)); + // @@protoc_insertion_point(field_add:tensorflow.NodeDef.input) +} +#endif inline void NodeDef::add_input(const char* value) { input_.Add()->assign(value); // @@protoc_insertion_point(field_add_char:tensorflow.NodeDef.input) diff --git a/libs/tensorflow/include/tensorflow/core/framework/node_def.pb_text-impl.h b/libs/tensorflow/include/tensorflow/core/framework/node_def.pb_text-impl.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/framework/node_def.pb_text.h b/libs/tensorflow/include/tensorflow/core/framework/node_def.pb_text.h old mode 100644 new mode 100755 diff --git 
a/libs/tensorflow/include/tensorflow/core/framework/numeric_op.h b/libs/tensorflow/include/tensorflow/core/framework/numeric_op.h index f24bcfe..4538ff0 100644 --- a/libs/tensorflow/include/tensorflow/core/framework/numeric_op.h +++ b/libs/tensorflow/include/tensorflow/core/framework/numeric_op.h @@ -56,9 +56,9 @@ class UnaryElementWiseOp : public UnaryOp { void Compute(OpKernelContext* context) override { // Output shape is the same as input shape. const Tensor& input = context->input(0); - Tensor* output; - OP_REQUIRES_OK(context, - context->allocate_output(0, input.shape(), &output)); + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->forward_input_or_allocate_output( + {0}, 0, input.shape(), &output)); static_cast(this)->Operate(context, input, output); } }; @@ -77,8 +77,9 @@ class BinaryElementWiseOp : public BinaryOp { return; } - Tensor* output; - OP_REQUIRES_OK(context, context->allocate_output(0, a.shape(), &output)); + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->forward_input_or_allocate_output( + {0, 1}, 0, a.shape(), &output)); // Dispatch to the descendant's Operate() function. switch (a.dims()) { diff --git a/libs/tensorflow/include/tensorflow/core/framework/op_def.pb_text-impl.h b/libs/tensorflow/include/tensorflow/core/framework/op_def.pb_text-impl.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/framework/op_def.pb_text.h b/libs/tensorflow/include/tensorflow/core/framework/op_def.pb_text.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/framework/op_kernel.h b/libs/tensorflow/include/tensorflow/core/framework/op_kernel.h index 75ad4bb..7c26b86 100644 --- a/libs/tensorflow/include/tensorflow/core/framework/op_kernel.h +++ b/libs/tensorflow/include/tensorflow/core/framework/op_kernel.h @@ -42,6 +42,7 @@ limitations under the License. 
#include "tensorflow/core/framework/unique_tensor_references.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/lib/gtl/manual_constructor.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/logging.h" @@ -206,6 +207,8 @@ class PersistentTensor { int64 NumElements() const { return tensor_.NumElements(); } + int64 AllocatedBytes() const { return tensor_.AllocatedBytes(); } + private: Tensor tensor_; }; @@ -443,8 +446,7 @@ class OpOutputList { struct TensorValue { TensorValue() : mutex_if_ref(nullptr), tensor(nullptr) {} TensorValue(Tensor* t) // NOLINT(runtime/explicit) - : mutex_if_ref(nullptr), - tensor(t) {} + : mutex_if_ref(nullptr), tensor(t) {} TensorValue(mutex* mu, Tensor* t) : mutex_if_ref(mu), tensor(t) {} Tensor* operator->() const { return tensor; } bool is_ref() const { return mutex_if_ref != nullptr; } @@ -569,8 +571,11 @@ class OpKernelContext { int num_inputs() const { return params_->inputs->size(); } DataType input_dtype(int index) const; Status input_dtype(StringPiece name, DataType* dtype) const; + MemoryType input_memory_type(int index) const; + int num_outputs() const { return outputs_.size(); } DataType expected_output_dtype(int index) const; + MemoryType output_memory_type(int index) const; // Input @@ -643,12 +648,6 @@ class OpKernelContext { Status replace_ref_input(StringPiece name, const Tensor& tensor, bool lock_held); - // Set the output Ref Tensor at output_index to be an alias of the - // input Ref Tensor at input_index. - // REQUIRES: IsRefType(input_dtype(input_index)). - // REQUIRES: IsRefType(output_dtype(output_index)). - void forward_ref_input_to_ref_output(int input_index, int output_index); - // Deletes the Tensor object used as the Ref Input at // input_index. This is not usually necessary and should be used // with caution. 
If !lock_held the input mutex will be acquired @@ -667,6 +666,70 @@ class OpKernelContext { // Usage: if (!context->ValidateInputsAreSameShape(this)) return; bool ValidateInputsAreSameShape(OpKernel* op); + // Input to output forwarding. + + // Set the output Ref Tensor at output_index to be an alias of the + // input Ref Tensor at input_index. + // REQUIRES: IsRefType(input_dtype(input_index)). + // REQUIRES: IsRefType(output_dtype(output_index)). + void forward_ref_input_to_ref_output(int input_index, int output_index); + + // Returns true when an alias to input[input_index], reshaped to output_shape, + // which is is safe to use for in-place computation was written to *output. + // Returns false if input[input_index] has a refcount greater than one, or if + // its type does not match the expected output type of output[output_index], + // or the number of elements in input[input_index] does not equal the number + // of elements in output_shape. + bool forward_input_to_output_with_shape(int input_index, int output_index, + const TensorShape& output_shape, + Tensor** output) TF_MUST_USE_RESULT; + Status forward_input_to_output_with_shape(StringPiece input_name, + StringPiece output_name, + const TensorShape& output_shape, + Tensor** output) TF_MUST_USE_RESULT; + + // Returns a pointer to a Tensor aliasing the underlying buffer backing + // input[input_index] iff + // * input[input_index] is not a ref, + // * the data type, shape, memory type, and allocator attributes of + // input[input_index] are compatible with those given in dtype, shape, + // memory_type, and attr, + // * refcount on the underlying buffer is one. + // Otherwise returns nullptr. + // NOTE: For Cuda kernels that read inputs using the __ldg() intrinsic, + // forwarding is only safe if there are no reads via __ldg() after writes + // to the same address. 
+ std::unique_ptr forward_input( + int input_index, DataType dtype, const TensorShape& shape, + MemoryType memory_type, + const AllocatorAttributes& attr) TF_MUST_USE_RESULT; + + // Tries to forward one of the inputs given in input_indices to + // output[output_index]. If none of the given inputs can be forwarded, calls + // allocate_output() to allocate a new output buffer. + Status forward_input_or_allocate_output( + gtl::ArraySlice candidate_input_indices, int output_index, + const TensorShape& output_shape, Tensor** output) TF_MUST_USE_RESULT; + Status forward_input_or_allocate_output( + gtl::ArraySlice candidate_input_names, + StringPiece output_name, const TensorShape& output_shape, + Tensor** output) TF_MUST_USE_RESULT; + + // Tries to reuse one of of the inputs given in input_indices as a temporary. + // If none of the given inputs can be forwarded, calls + // allocate_temp() to allocate a new temporary buffer. + Status forward_input_or_allocate_temp( + gtl::ArraySlice candidate_input_indices, DataType type, + const TensorShape& shape, const AllocatorAttributes& allocator_attr, + Tensor* out_temp) TF_MUST_USE_RESULT; + + Status forward_input_or_allocate_temp( + gtl::ArraySlice candidate_input_indices, DataType type, + const TensorShape& shape, Tensor* out_temp) TF_MUST_USE_RESULT { + return forward_input_or_allocate_temp(candidate_input_indices, type, shape, + AllocatorAttributes(), out_temp); + } + // Output // Returns the named list-valued output in "list", as defined in the OpDef. @@ -961,7 +1024,41 @@ class OpKernelContext { void set_output_ref(int index, mutex* mu, Tensor* tensor_for_ref); TensorValue release_output(int index); + bool track_allocations() const { return params_->track_allocations; } + bool allocate_on_host(AllocatorAttributes alloc_attr) const; + + // Records temporary memory sizes. 
+ void record_host_temp_memory_size(int64 size) { + host_temp_memory_size_ += size; + } + void record_device_temp_memory_size(int64 size) { + device_temp_memory_size_ += size; + } + + // Returns recorded size of temporary memory; + int64 host_temp_memory_size() const { return host_temp_memory_size_; } + int64 device_temp_memory_size() const { return device_temp_memory_size_; } + + // Records persistent memory allocation, size can be negative indicating + // deallocation. + void record_host_persistent_memory_allocation(int64 size, + int64 alloc_id = -1); + void record_device_persistent_memory_allocation(int64 size, + int64 alloc_id = -1); + + // Returns recorded size and ids of persistent memory. + int64 host_persistent_memory_allocated() const { + return host_persistent_memory_allocated_; + } + int64 device_persistent_memory_allocated() const { + return device_persistent_memory_allocated_; + } + std::vector host_persistent_alloc_ids() const; + std::vector device_persistent_alloc_ids() const; + private: + bool input_is_ref(int index) const; + Allocator* get_allocator(AllocatorAttributes attr); // Internal method to add a tensor's buffer to the list of buffers @@ -1002,6 +1099,13 @@ class OpKernelContext { bool is_output_dead_ = false; + int64 host_temp_memory_size_; + int64 device_temp_memory_size_; + gtl::InlinedVector host_persistent_alloc_ids_; + gtl::InlinedVector device_persistent_alloc_ids_; + int64 host_persistent_memory_allocated_; + int64 device_persistent_memory_allocated_; + TF_DISALLOW_COPY_AND_ASSIGN(OpKernelContext); }; @@ -1094,16 +1198,19 @@ class Name : public KernelDefBuilder { #define REGISTER_KERNEL_BUILDER_UNIQ_HELPER(ctr, kernel_builder, ...) \ REGISTER_KERNEL_BUILDER_UNIQ(ctr, kernel_builder, __VA_ARGS__) -#define REGISTER_KERNEL_BUILDER_UNIQ(ctr, kernel_builder, ...) \ - static ::tensorflow::kernel_factory::OpKernelRegistrar \ - registrar__body__##ctr##__object( \ - SHOULD_REGISTER_OP_KERNEL(#__VA_ARGS__) \ - ? 
::tensorflow::register_kernel::kernel_builder.Build() \ - : nullptr, \ - #__VA_ARGS__, [](::tensorflow::OpKernelConstruction* context) \ - -> ::tensorflow::OpKernel* { \ - return new __VA_ARGS__(context); \ - }); +#define REGISTER_KERNEL_BUILDER_UNIQ(ctr, kernel_builder, ...) \ + constexpr bool should_register_##ctr##__flag = \ + SHOULD_REGISTER_OP_KERNEL(#__VA_ARGS__); \ + static ::tensorflow::kernel_factory::OpKernelRegistrar \ + registrar__body__##ctr##__object( \ + should_register_##ctr##__flag \ + ? ::tensorflow::register_kernel::kernel_builder.Build() \ + : nullptr, \ + #__VA_ARGS__, \ + [](::tensorflow::OpKernelConstruction* context) \ + -> ::tensorflow::OpKernel* { \ + return new __VA_ARGS__(context); \ + }); void* GlobalKernelRegistry(); @@ -1149,7 +1256,7 @@ Status OpKernelConstruction::GetAttr(StringPiece attr_name, T* value) const { inline DataType OpKernelContext::input_dtype(int index) const { DCHECK_GE(index, 0); - DCHECK_LT(index, params_->inputs->size()); + DCHECK_LT(index, num_inputs()); const TensorValue& value((*params_->inputs)[index]); if (value.is_ref()) { return MakeRefType(value->dtype()); @@ -1158,12 +1265,28 @@ inline DataType OpKernelContext::input_dtype(int index) const { } } +inline MemoryType OpKernelContext::input_memory_type(int index) const { + DCHECK_GE(index, 0); + DCHECK_LT(index, num_inputs()); + return op_kernel().input_memory_types()[index]; +} + inline DataType OpKernelContext::expected_output_dtype(int index) const { DCHECK_GE(index, 0); - DCHECK_LT(index, params_->op_kernel->output_types().size()); + DCHECK_LT(index, num_outputs()); return params_->op_kernel->output_type(index); } +inline MemoryType OpKernelContext::output_memory_type(int index) const { + DCHECK_GE(index, 0); + DCHECK_LT(index, num_outputs()); + return op_kernel().output_memory_types()[index]; +} + +inline bool OpKernelContext::input_is_ref(int index) const { + return IsRefType(input_dtype(index)); +} + inline void 
OpKernelContext::record_tensor_reference(const Tensor& tensor) { DCHECK_EQ(params_->device->RequiresRecordingAccessedTensors(), params_->record_tensor_accesses); @@ -1183,14 +1306,14 @@ inline void OpKernelContext::retrieve_accessed_tensors( // no input if tensor == nullptr. inline bool OpKernelContext::has_input(int index) const { DCHECK_GE(index, 0); - DCHECK_LT(index, params_->inputs->size()); + DCHECK_LT(index, num_inputs()); return (*params_->inputs)[index].tensor != nullptr; } inline mutex* OpKernelContext::input_ref_mutex(int index) { DCHECK_GE(index, 0); - DCHECK_LT(index, params_->inputs->size()); - DCHECK((*params_->inputs)[index].is_ref()); + DCHECK_LT(index, num_inputs()); + DCHECK(input_is_ref(index)); return (*params_->inputs)[index].mutex_if_ref; } @@ -1202,7 +1325,7 @@ inline void OpKernelContext::NotifyUseOfPersistentTensor(const Tensor& t) { inline Tensor* OpKernelContext::mutable_output(int index) { DCHECK_GE(index, 0); - DCHECK_LT(index, outputs_.size()); + DCHECK_LT(index, num_outputs()); // No need to record_tensor_reference since the output must already // have been set by a call that did so. 
return outputs_[index].tensor; @@ -1210,12 +1333,37 @@ inline Tensor* OpKernelContext::mutable_output(int index) { inline TensorValue OpKernelContext::release_output(int index) { DCHECK_GE(index, 0); - DCHECK_LT(index, outputs_.size()); + DCHECK_LT(index, num_outputs()); TensorValue value = outputs_[index]; outputs_[index] = TensorValue(); return value; } +inline Status OpKernelContext::forward_input_or_allocate_output( + gtl::ArraySlice candidate_input_indices, int output_index, + const TensorShape& output_shape, Tensor** output) { + for (int input_index : candidate_input_indices) { + if (forward_input_to_output_with_shape(input_index, output_index, + output_shape, output)) { + return Status::OK(); + } + } + return allocate_output(output_index, output_shape, output); +} + +inline Status OpKernelContext::forward_input_or_allocate_output( + gtl::ArraySlice candidate_input_names, StringPiece output_name, + const TensorShape& output_shape, Tensor** output) { + for (const StringPiece& input_name : candidate_input_names) { + if (forward_input_to_output_with_shape(input_name, output_name, + output_shape, output) + .ok()) { + return Status::OK(); + } + } + return allocate_output(output_name, output_shape, output); +} + template T* OpKernelContext::op_device_context() { static_assert(std::is_base_of::value, diff --git a/libs/tensorflow/include/tensorflow/core/framework/partial_tensor_shape.h b/libs/tensorflow/include/tensorflow/core/framework/partial_tensor_shape.h index 1504b8c..7a70167 100644 --- a/libs/tensorflow/include/tensorflow/core/framework/partial_tensor_shape.h +++ b/libs/tensorflow/include/tensorflow/core/framework/partial_tensor_shape.h @@ -40,7 +40,7 @@ class PartialTensorShape { PartialTensorShape() : is_unknown_(true) {} /// \brief Construct a `PartialTensorShape` from the provided sizes. - /// REQUIRES: `dim_sizes[i] >= 0` + /// REQUIRES: `dim_sizes[i] >= -1`; `-1` means `unknown`. 
explicit PartialTensorShape(gtl::ArraySlice dim_sizes); PartialTensorShape(std::initializer_list dim_sizes) : PartialTensorShape(gtl::ArraySlice(dim_sizes)) {} diff --git a/libs/tensorflow/include/tensorflow/core/kernels/reader_base.h b/libs/tensorflow/include/tensorflow/core/framework/reader_base.h similarity index 92% rename from libs/tensorflow/include/tensorflow/core/kernels/reader_base.h rename to libs/tensorflow/include/tensorflow/core/framework/reader_base.h index 3cb9107..0528841 100644 --- a/libs/tensorflow/include/tensorflow/core/kernels/reader_base.h +++ b/libs/tensorflow/include/tensorflow/core/framework/reader_base.h @@ -13,14 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_KERNELS_READER_BASE_H_ -#define TENSORFLOW_KERNELS_READER_BASE_H_ +#ifndef TENSORFLOW_FRAMEWORK_READER_BASE_H_ +#define TENSORFLOW_FRAMEWORK_READER_BASE_H_ #include #include #include "tensorflow/core/framework/queue_interface.h" +#include "tensorflow/core/framework/reader_base.pb.h" #include "tensorflow/core/framework/reader_interface.h" -#include "tensorflow/core/kernels/reader_base.pb.h" #include "tensorflow/core/lib/core/stringpiece.h" namespace tensorflow { @@ -101,6 +101,12 @@ class ReaderBase : public ReaderInterface { Status RestoreBaseState(const ReaderBaseState& state); private: + // For descendants that wish to obtain the next work item in a different way. + // For implementing Read(). Dequeues the next work item from + // *queue, and if successful returns "work" (a string). May block. + virtual string GetNextWorkLocked(QueueInterface* queue, + OpKernelContext* context) const; + // Implementations of ReaderInterface methods. These ensure thread-safety // and call the methods above to do the work. 
void Read(QueueInterface* queue, string* key, string* value, @@ -118,12 +124,6 @@ class ReaderBase : public ReaderInterface { Status SerializeState(string* state) override; Status RestoreState(const string& state) override; - // For implementing Read(). Dequeues the next work item from - // *queue, and if successful updates work_, work_started_ - // (establishing work_in_progress() == true) and calls - // OnWorkStartedLocked(). May block. - void GetNextWorkLocked(QueueInterface* queue, OpKernelContext* context); - mutable mutex mu_; const string name_; int64 work_started_ = 0; diff --git a/libs/tensorflow/include/tensorflow/core/kernels/reader_base.pb.h b/libs/tensorflow/include/tensorflow/core/framework/reader_base.pb.h similarity index 95% rename from libs/tensorflow/include/tensorflow/core/kernels/reader_base.pb.h rename to libs/tensorflow/include/tensorflow/core/framework/reader_base.pb.h index d83883c..c39bd32 100755 --- a/libs/tensorflow/include/tensorflow/core/kernels/reader_base.pb.h +++ b/libs/tensorflow/include/tensorflow/core/framework/reader_base.pb.h @@ -1,8 +1,8 @@ // Generated by the protocol buffer compiler. DO NOT EDIT! -// source: tensorflow/core/kernels/reader_base.proto +// source: tensorflow/core/framework/reader_base.proto -#ifndef PROTOBUF_tensorflow_2fcore_2fkernels_2freader_5fbase_2eproto__INCLUDED -#define PROTOBUF_tensorflow_2fcore_2fkernels_2freader_5fbase_2eproto__INCLUDED +#ifndef PROTOBUF_tensorflow_2fcore_2fframework_2freader_5fbase_2eproto__INCLUDED +#define PROTOBUF_tensorflow_2fcore_2fframework_2freader_5fbase_2eproto__INCLUDED #include @@ -37,7 +37,7 @@ extern ReaderBaseStateDefaultTypeInternal _ReaderBaseState_default_instance_; namespace tensorflow { -namespace protobuf_tensorflow_2fcore_2fkernels_2freader_5fbase_2eproto { +namespace protobuf_tensorflow_2fcore_2fframework_2freader_5fbase_2eproto { // Internal implementation detail -- do not call these. 
struct TableStruct { static const ::google::protobuf::uint32 offsets[]; @@ -46,7 +46,7 @@ struct TableStruct { }; void AddDescriptors(); void InitDefaults(); -} // namespace protobuf_tensorflow_2fcore_2fkernels_2freader_5fbase_2eproto +} // namespace protobuf_tensorflow_2fcore_2fframework_2freader_5fbase_2eproto // =================================================================== @@ -173,7 +173,7 @@ class ReaderBaseState : public ::google::protobuf::Message /* @@protoc_insertion ::google::protobuf::int64 work_finished_; ::google::protobuf::int64 num_records_produced_; mutable int _cached_size_; - friend struct protobuf_tensorflow_2fcore_2fkernels_2freader_5fbase_2eproto::TableStruct; + friend struct protobuf_tensorflow_2fcore_2fframework_2freader_5fbase_2eproto::TableStruct; }; // =================================================================== @@ -300,4 +300,4 @@ inline void ReaderBaseState::unsafe_arena_set_allocated_current_work( // @@protoc_insertion_point(global_scope) -#endif // PROTOBUF_tensorflow_2fcore_2fkernels_2freader_5fbase_2eproto__INCLUDED +#endif // PROTOBUF_tensorflow_2fcore_2fframework_2freader_5fbase_2eproto__INCLUDED diff --git a/libs/tensorflow/include/tensorflow/core/framework/remote_fused_graph_execute_info.pb.h b/libs/tensorflow/include/tensorflow/core/framework/remote_fused_graph_execute_info.pb.h new file mode 100755 index 0000000..65f64b1 --- /dev/null +++ b/libs/tensorflow/include/tensorflow/core/framework/remote_fused_graph_execute_info.pb.h @@ -0,0 +1,833 @@ +// Generated by the protocol buffer compiler. DO NOT EDIT! 
+// source: tensorflow/core/framework/remote_fused_graph_execute_info.proto + +#ifndef PROTOBUF_tensorflow_2fcore_2fframework_2fremote_5ffused_5fgraph_5fexecute_5finfo_2eproto__INCLUDED +#define PROTOBUF_tensorflow_2fcore_2fframework_2fremote_5ffused_5fgraph_5fexecute_5finfo_2eproto__INCLUDED + +#include + +#include + +#if GOOGLE_PROTOBUF_VERSION < 3002000 +#error This file was generated by a newer version of protoc which is +#error incompatible with your Protocol Buffer headers. Please update +#error your headers. +#endif +#if 3002000 < GOOGLE_PROTOBUF_MIN_PROTOC_VERSION +#error This file was generated by an older version of protoc which is +#error incompatible with your Protocol Buffer headers. Please +#error regenerate this file with a newer version of protoc. +#endif + +#include +#include +#include +#include +#include +#include +#include // IWYU pragma: export +#include // IWYU pragma: export +#include +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/framework/types.pb.h" +// @@protoc_insertion_point(includes) +namespace tensorflow { +class NodeDef; +class NodeDefDefaultTypeInternal; +extern NodeDefDefaultTypeInternal _NodeDef_default_instance_; +class RemoteFusedGraphExecuteInfo; +class RemoteFusedGraphExecuteInfoDefaultTypeInternal; +extern RemoteFusedGraphExecuteInfoDefaultTypeInternal _RemoteFusedGraphExecuteInfo_default_instance_; +class RemoteFusedGraphExecuteInfo_TensorShapeTypeProto; +class RemoteFusedGraphExecuteInfo_TensorShapeTypeProtoDefaultTypeInternal; +extern RemoteFusedGraphExecuteInfo_TensorShapeTypeProtoDefaultTypeInternal _RemoteFusedGraphExecuteInfo_TensorShapeTypeProto_default_instance_; +class TensorShapeProto; +class TensorShapeProtoDefaultTypeInternal; +extern TensorShapeProtoDefaultTypeInternal _TensorShapeProto_default_instance_; +class TensorShapeProto_Dim; +class TensorShapeProto_DimDefaultTypeInternal; +extern 
TensorShapeProto_DimDefaultTypeInternal _TensorShapeProto_Dim_default_instance_; +} // namespace tensorflow + +namespace tensorflow { + +namespace protobuf_tensorflow_2fcore_2fframework_2fremote_5ffused_5fgraph_5fexecute_5finfo_2eproto { +// Internal implementation detail -- do not call these. +struct TableStruct { + static const ::google::protobuf::uint32 offsets[]; + static void InitDefaultsImpl(); + static void Shutdown(); +}; +void AddDescriptors(); +void InitDefaults(); +} // namespace protobuf_tensorflow_2fcore_2fframework_2fremote_5ffused_5fgraph_5fexecute_5finfo_2eproto + +// =================================================================== + +class RemoteFusedGraphExecuteInfo_TensorShapeTypeProto : public ::google::protobuf::Message /* @@protoc_insertion_point(class_definition:tensorflow.RemoteFusedGraphExecuteInfo.TensorShapeTypeProto) */ { + public: + RemoteFusedGraphExecuteInfo_TensorShapeTypeProto(); + virtual ~RemoteFusedGraphExecuteInfo_TensorShapeTypeProto(); + + RemoteFusedGraphExecuteInfo_TensorShapeTypeProto(const RemoteFusedGraphExecuteInfo_TensorShapeTypeProto& from); + + inline RemoteFusedGraphExecuteInfo_TensorShapeTypeProto& operator=(const RemoteFusedGraphExecuteInfo_TensorShapeTypeProto& from) { + CopyFrom(from); + return *this; + } + + inline ::google::protobuf::Arena* GetArena() const PROTOBUF_FINAL { + return GetArenaNoVirtual(); + } + inline void* GetMaybeArenaPointer() const PROTOBUF_FINAL { + return MaybeArenaPtr(); + } + static const ::google::protobuf::Descriptor* descriptor(); + static const RemoteFusedGraphExecuteInfo_TensorShapeTypeProto& default_instance(); + + static inline const RemoteFusedGraphExecuteInfo_TensorShapeTypeProto* internal_default_instance() { + return reinterpret_cast( + &_RemoteFusedGraphExecuteInfo_TensorShapeTypeProto_default_instance_); + } + + void UnsafeArenaSwap(RemoteFusedGraphExecuteInfo_TensorShapeTypeProto* other); + void Swap(RemoteFusedGraphExecuteInfo_TensorShapeTypeProto* other); + + // 
implements Message ---------------------------------------------- + + inline RemoteFusedGraphExecuteInfo_TensorShapeTypeProto* New() const PROTOBUF_FINAL { return New(NULL); } + + RemoteFusedGraphExecuteInfo_TensorShapeTypeProto* New(::google::protobuf::Arena* arena) const PROTOBUF_FINAL; + void CopyFrom(const ::google::protobuf::Message& from) PROTOBUF_FINAL; + void MergeFrom(const ::google::protobuf::Message& from) PROTOBUF_FINAL; + void CopyFrom(const RemoteFusedGraphExecuteInfo_TensorShapeTypeProto& from); + void MergeFrom(const RemoteFusedGraphExecuteInfo_TensorShapeTypeProto& from); + void Clear() PROTOBUF_FINAL; + bool IsInitialized() const PROTOBUF_FINAL; + + size_t ByteSizeLong() const PROTOBUF_FINAL; + bool MergePartialFromCodedStream( + ::google::protobuf::io::CodedInputStream* input) PROTOBUF_FINAL; + void SerializeWithCachedSizes( + ::google::protobuf::io::CodedOutputStream* output) const PROTOBUF_FINAL; + ::google::protobuf::uint8* InternalSerializeWithCachedSizesToArray( + bool deterministic, ::google::protobuf::uint8* target) const PROTOBUF_FINAL; + ::google::protobuf::uint8* SerializeWithCachedSizesToArray(::google::protobuf::uint8* output) + const PROTOBUF_FINAL { + return InternalSerializeWithCachedSizesToArray( + ::google::protobuf::io::CodedOutputStream::IsDefaultSerializationDeterministic(), output); + } + int GetCachedSize() const PROTOBUF_FINAL { return _cached_size_; } + private: + void SharedCtor(); + void SharedDtor(); + void SetCachedSize(int size) const PROTOBUF_FINAL; + void InternalSwap(RemoteFusedGraphExecuteInfo_TensorShapeTypeProto* other); + protected: + explicit RemoteFusedGraphExecuteInfo_TensorShapeTypeProto(::google::protobuf::Arena* arena); + private: + static void ArenaDtor(void* object); + inline void RegisterArenaDtor(::google::protobuf::Arena* arena); + private: + inline ::google::protobuf::Arena* GetArenaNoVirtual() const { + return _internal_metadata_.arena(); + } + inline void* MaybeArenaPtr() const { + return 
_internal_metadata_.raw_arena_ptr(); + } + public: + + ::google::protobuf::Metadata GetMetadata() const PROTOBUF_FINAL; + + // nested types ---------------------------------------------------- + + // accessors ------------------------------------------------------- + + // .tensorflow.TensorShapeProto shape = 2; + bool has_shape() const; + void clear_shape(); + static const int kShapeFieldNumber = 2; + private: + void _slow_mutable_shape(); + void _slow_set_allocated_shape( + ::google::protobuf::Arena* message_arena, ::tensorflow::TensorShapeProto** shape); + ::tensorflow::TensorShapeProto* _slow_release_shape(); + public: + const ::tensorflow::TensorShapeProto& shape() const; + ::tensorflow::TensorShapeProto* mutable_shape(); + ::tensorflow::TensorShapeProto* release_shape(); + void set_allocated_shape(::tensorflow::TensorShapeProto* shape); + ::tensorflow::TensorShapeProto* unsafe_arena_release_shape(); + void unsafe_arena_set_allocated_shape( + ::tensorflow::TensorShapeProto* shape); + + // .tensorflow.DataType dtype = 1; + void clear_dtype(); + static const int kDtypeFieldNumber = 1; + ::tensorflow::DataType dtype() const; + void set_dtype(::tensorflow::DataType value); + + // @@protoc_insertion_point(class_scope:tensorflow.RemoteFusedGraphExecuteInfo.TensorShapeTypeProto) + private: + + ::google::protobuf::internal::InternalMetadataWithArena _internal_metadata_; + friend class ::google::protobuf::Arena; + typedef void InternalArenaConstructable_; + typedef void DestructorSkippable_; + ::tensorflow::TensorShapeProto* shape_; + int dtype_; + mutable int _cached_size_; + friend struct protobuf_tensorflow_2fcore_2fframework_2fremote_5ffused_5fgraph_5fexecute_5finfo_2eproto::TableStruct; +}; +// ------------------------------------------------------------------- + +class RemoteFusedGraphExecuteInfo : public ::google::protobuf::Message /* @@protoc_insertion_point(class_definition:tensorflow.RemoteFusedGraphExecuteInfo) */ { + public: + RemoteFusedGraphExecuteInfo(); 
+ virtual ~RemoteFusedGraphExecuteInfo(); + + RemoteFusedGraphExecuteInfo(const RemoteFusedGraphExecuteInfo& from); + + inline RemoteFusedGraphExecuteInfo& operator=(const RemoteFusedGraphExecuteInfo& from) { + CopyFrom(from); + return *this; + } + + inline ::google::protobuf::Arena* GetArena() const PROTOBUF_FINAL { + return GetArenaNoVirtual(); + } + inline void* GetMaybeArenaPointer() const PROTOBUF_FINAL { + return MaybeArenaPtr(); + } + static const ::google::protobuf::Descriptor* descriptor(); + static const RemoteFusedGraphExecuteInfo& default_instance(); + + static inline const RemoteFusedGraphExecuteInfo* internal_default_instance() { + return reinterpret_cast( + &_RemoteFusedGraphExecuteInfo_default_instance_); + } + + void UnsafeArenaSwap(RemoteFusedGraphExecuteInfo* other); + void Swap(RemoteFusedGraphExecuteInfo* other); + + // implements Message ---------------------------------------------- + + inline RemoteFusedGraphExecuteInfo* New() const PROTOBUF_FINAL { return New(NULL); } + + RemoteFusedGraphExecuteInfo* New(::google::protobuf::Arena* arena) const PROTOBUF_FINAL; + void CopyFrom(const ::google::protobuf::Message& from) PROTOBUF_FINAL; + void MergeFrom(const ::google::protobuf::Message& from) PROTOBUF_FINAL; + void CopyFrom(const RemoteFusedGraphExecuteInfo& from); + void MergeFrom(const RemoteFusedGraphExecuteInfo& from); + void Clear() PROTOBUF_FINAL; + bool IsInitialized() const PROTOBUF_FINAL; + + size_t ByteSizeLong() const PROTOBUF_FINAL; + bool MergePartialFromCodedStream( + ::google::protobuf::io::CodedInputStream* input) PROTOBUF_FINAL; + void SerializeWithCachedSizes( + ::google::protobuf::io::CodedOutputStream* output) const PROTOBUF_FINAL; + ::google::protobuf::uint8* InternalSerializeWithCachedSizesToArray( + bool deterministic, ::google::protobuf::uint8* target) const PROTOBUF_FINAL; + ::google::protobuf::uint8* SerializeWithCachedSizesToArray(::google::protobuf::uint8* output) + const PROTOBUF_FINAL { + return 
InternalSerializeWithCachedSizesToArray( + ::google::protobuf::io::CodedOutputStream::IsDefaultSerializationDeterministic(), output); + } + int GetCachedSize() const PROTOBUF_FINAL { return _cached_size_; } + private: + void SharedCtor(); + void SharedDtor(); + void SetCachedSize(int size) const PROTOBUF_FINAL; + void InternalSwap(RemoteFusedGraphExecuteInfo* other); + protected: + explicit RemoteFusedGraphExecuteInfo(::google::protobuf::Arena* arena); + private: + static void ArenaDtor(void* object); + inline void RegisterArenaDtor(::google::protobuf::Arena* arena); + private: + inline ::google::protobuf::Arena* GetArenaNoVirtual() const { + return _internal_metadata_.arena(); + } + inline void* MaybeArenaPtr() const { + return _internal_metadata_.raw_arena_ptr(); + } + public: + + ::google::protobuf::Metadata GetMetadata() const PROTOBUF_FINAL; + + // nested types ---------------------------------------------------- + + typedef RemoteFusedGraphExecuteInfo_TensorShapeTypeProto TensorShapeTypeProto; + + // accessors ------------------------------------------------------- + + // repeated .tensorflow.NodeDef node = 1; + int node_size() const; + void clear_node(); + static const int kNodeFieldNumber = 1; + const ::tensorflow::NodeDef& node(int index) const; + ::tensorflow::NodeDef* mutable_node(int index); + ::tensorflow::NodeDef* add_node(); + ::google::protobuf::RepeatedPtrField< ::tensorflow::NodeDef >* + mutable_node(); + const ::google::protobuf::RepeatedPtrField< ::tensorflow::NodeDef >& + node() const; + + // repeated string graph_input_node_name = 2; + int graph_input_node_name_size() const; + void clear_graph_input_node_name(); + static const int kGraphInputNodeNameFieldNumber = 2; + const ::std::string& graph_input_node_name(int index) const; + ::std::string* mutable_graph_input_node_name(int index); + void set_graph_input_node_name(int index, const ::std::string& value); + #if LANG_CXX11 + void set_graph_input_node_name(int index, ::std::string&& value); + 
#endif + void set_graph_input_node_name(int index, const char* value); + void set_graph_input_node_name(int index, const char* value, size_t size); + ::std::string* add_graph_input_node_name(); + void add_graph_input_node_name(const ::std::string& value); + #if LANG_CXX11 + void add_graph_input_node_name(::std::string&& value); + #endif + void add_graph_input_node_name(const char* value); + void add_graph_input_node_name(const char* value, size_t size); + const ::google::protobuf::RepeatedPtrField< ::std::string>& graph_input_node_name() const; + ::google::protobuf::RepeatedPtrField< ::std::string>* mutable_graph_input_node_name(); + + // repeated string graph_output_node_name = 3; + int graph_output_node_name_size() const; + void clear_graph_output_node_name(); + static const int kGraphOutputNodeNameFieldNumber = 3; + const ::std::string& graph_output_node_name(int index) const; + ::std::string* mutable_graph_output_node_name(int index); + void set_graph_output_node_name(int index, const ::std::string& value); + #if LANG_CXX11 + void set_graph_output_node_name(int index, ::std::string&& value); + #endif + void set_graph_output_node_name(int index, const char* value); + void set_graph_output_node_name(int index, const char* value, size_t size); + ::std::string* add_graph_output_node_name(); + void add_graph_output_node_name(const ::std::string& value); + #if LANG_CXX11 + void add_graph_output_node_name(::std::string&& value); + #endif + void add_graph_output_node_name(const char* value); + void add_graph_output_node_name(const char* value, size_t size); + const ::google::protobuf::RepeatedPtrField< ::std::string>& graph_output_node_name() const; + ::google::protobuf::RepeatedPtrField< ::std::string>* mutable_graph_output_node_name(); + + // repeated .tensorflow.RemoteFusedGraphExecuteInfo.TensorShapeTypeProto default_graph_input_tensor_shape = 6; + int default_graph_input_tensor_shape_size() const; + void clear_default_graph_input_tensor_shape(); + static const int 
kDefaultGraphInputTensorShapeFieldNumber = 6; + const ::tensorflow::RemoteFusedGraphExecuteInfo_TensorShapeTypeProto& default_graph_input_tensor_shape(int index) const; + ::tensorflow::RemoteFusedGraphExecuteInfo_TensorShapeTypeProto* mutable_default_graph_input_tensor_shape(int index); + ::tensorflow::RemoteFusedGraphExecuteInfo_TensorShapeTypeProto* add_default_graph_input_tensor_shape(); + ::google::protobuf::RepeatedPtrField< ::tensorflow::RemoteFusedGraphExecuteInfo_TensorShapeTypeProto >* + mutable_default_graph_input_tensor_shape(); + const ::google::protobuf::RepeatedPtrField< ::tensorflow::RemoteFusedGraphExecuteInfo_TensorShapeTypeProto >& + default_graph_input_tensor_shape() const; + + // repeated .tensorflow.RemoteFusedGraphExecuteInfo.TensorShapeTypeProto default_graph_output_tensor_shape = 7; + int default_graph_output_tensor_shape_size() const; + void clear_default_graph_output_tensor_shape(); + static const int kDefaultGraphOutputTensorShapeFieldNumber = 7; + const ::tensorflow::RemoteFusedGraphExecuteInfo_TensorShapeTypeProto& default_graph_output_tensor_shape(int index) const; + ::tensorflow::RemoteFusedGraphExecuteInfo_TensorShapeTypeProto* mutable_default_graph_output_tensor_shape(int index); + ::tensorflow::RemoteFusedGraphExecuteInfo_TensorShapeTypeProto* add_default_graph_output_tensor_shape(); + ::google::protobuf::RepeatedPtrField< ::tensorflow::RemoteFusedGraphExecuteInfo_TensorShapeTypeProto >* + mutable_default_graph_output_tensor_shape(); + const ::google::protobuf::RepeatedPtrField< ::tensorflow::RemoteFusedGraphExecuteInfo_TensorShapeTypeProto >& + default_graph_output_tensor_shape() const; + + // string executor_name = 4; + void clear_executor_name(); + static const int kExecutorNameFieldNumber = 4; + const ::std::string& executor_name() const; + void set_executor_name(const ::std::string& value); + void set_executor_name(const char* value); + void set_executor_name(const char* value, size_t size); + ::std::string* 
mutable_executor_name(); + ::std::string* release_executor_name(); + void set_allocated_executor_name(::std::string* executor_name); + ::std::string* unsafe_arena_release_executor_name(); + void unsafe_arena_set_allocated_executor_name( + ::std::string* executor_name); + + // bytes serialized_executor_parameters = 5; + void clear_serialized_executor_parameters(); + static const int kSerializedExecutorParametersFieldNumber = 5; + const ::std::string& serialized_executor_parameters() const; + void set_serialized_executor_parameters(const ::std::string& value); + void set_serialized_executor_parameters(const char* value); + void set_serialized_executor_parameters(const void* value, size_t size); + ::std::string* mutable_serialized_executor_parameters(); + ::std::string* release_serialized_executor_parameters(); + void set_allocated_serialized_executor_parameters(::std::string* serialized_executor_parameters); + ::std::string* unsafe_arena_release_serialized_executor_parameters(); + void unsafe_arena_set_allocated_serialized_executor_parameters( + ::std::string* serialized_executor_parameters); + + // @@protoc_insertion_point(class_scope:tensorflow.RemoteFusedGraphExecuteInfo) + private: + + ::google::protobuf::internal::InternalMetadataWithArena _internal_metadata_; + friend class ::google::protobuf::Arena; + typedef void InternalArenaConstructable_; + typedef void DestructorSkippable_; + ::google::protobuf::RepeatedPtrField< ::tensorflow::NodeDef > node_; + ::google::protobuf::RepeatedPtrField< ::std::string> graph_input_node_name_; + ::google::protobuf::RepeatedPtrField< ::std::string> graph_output_node_name_; + ::google::protobuf::RepeatedPtrField< ::tensorflow::RemoteFusedGraphExecuteInfo_TensorShapeTypeProto > default_graph_input_tensor_shape_; + ::google::protobuf::RepeatedPtrField< ::tensorflow::RemoteFusedGraphExecuteInfo_TensorShapeTypeProto > default_graph_output_tensor_shape_; + ::google::protobuf::internal::ArenaStringPtr executor_name_; + 
::google::protobuf::internal::ArenaStringPtr serialized_executor_parameters_; + mutable int _cached_size_; + friend struct protobuf_tensorflow_2fcore_2fframework_2fremote_5ffused_5fgraph_5fexecute_5finfo_2eproto::TableStruct; +}; +// =================================================================== + + +// =================================================================== + +#if !PROTOBUF_INLINE_NOT_IN_HEADERS +// RemoteFusedGraphExecuteInfo_TensorShapeTypeProto + +// .tensorflow.DataType dtype = 1; +inline void RemoteFusedGraphExecuteInfo_TensorShapeTypeProto::clear_dtype() { + dtype_ = 0; +} +inline ::tensorflow::DataType RemoteFusedGraphExecuteInfo_TensorShapeTypeProto::dtype() const { + // @@protoc_insertion_point(field_get:tensorflow.RemoteFusedGraphExecuteInfo.TensorShapeTypeProto.dtype) + return static_cast< ::tensorflow::DataType >(dtype_); +} +inline void RemoteFusedGraphExecuteInfo_TensorShapeTypeProto::set_dtype(::tensorflow::DataType value) { + + dtype_ = value; + // @@protoc_insertion_point(field_set:tensorflow.RemoteFusedGraphExecuteInfo.TensorShapeTypeProto.dtype) +} + +// .tensorflow.TensorShapeProto shape = 2; +inline bool RemoteFusedGraphExecuteInfo_TensorShapeTypeProto::has_shape() const { + return this != internal_default_instance() && shape_ != NULL; +} +inline void RemoteFusedGraphExecuteInfo_TensorShapeTypeProto::clear_shape() { + if (GetArenaNoVirtual() == NULL && shape_ != NULL) delete shape_; + shape_ = NULL; +} +inline const ::tensorflow::TensorShapeProto& RemoteFusedGraphExecuteInfo_TensorShapeTypeProto::shape() const { + // @@protoc_insertion_point(field_get:tensorflow.RemoteFusedGraphExecuteInfo.TensorShapeTypeProto.shape) + return shape_ != NULL ? 
*shape_ + : *::tensorflow::TensorShapeProto::internal_default_instance(); +} +inline ::tensorflow::TensorShapeProto* RemoteFusedGraphExecuteInfo_TensorShapeTypeProto::mutable_shape() { + + if (shape_ == NULL) { + _slow_mutable_shape(); + } + // @@protoc_insertion_point(field_mutable:tensorflow.RemoteFusedGraphExecuteInfo.TensorShapeTypeProto.shape) + return shape_; +} +inline ::tensorflow::TensorShapeProto* RemoteFusedGraphExecuteInfo_TensorShapeTypeProto::release_shape() { + // @@protoc_insertion_point(field_release:tensorflow.RemoteFusedGraphExecuteInfo.TensorShapeTypeProto.shape) + + if (GetArenaNoVirtual() != NULL) { + return _slow_release_shape(); + } else { + ::tensorflow::TensorShapeProto* temp = shape_; + shape_ = NULL; + return temp; + } +} +inline void RemoteFusedGraphExecuteInfo_TensorShapeTypeProto::set_allocated_shape(::tensorflow::TensorShapeProto* shape) { + ::google::protobuf::Arena* message_arena = GetArenaNoVirtual(); + if (message_arena == NULL) { + delete shape_; + } + if (shape != NULL) { + _slow_set_allocated_shape(message_arena, &shape); + } + shape_ = shape; + if (shape) { + + } else { + + } + // @@protoc_insertion_point(field_set_allocated:tensorflow.RemoteFusedGraphExecuteInfo.TensorShapeTypeProto.shape) +} + +// ------------------------------------------------------------------- + +// RemoteFusedGraphExecuteInfo + +// repeated .tensorflow.NodeDef node = 1; +inline int RemoteFusedGraphExecuteInfo::node_size() const { + return node_.size(); +} +inline void RemoteFusedGraphExecuteInfo::clear_node() { + node_.Clear(); +} +inline const ::tensorflow::NodeDef& RemoteFusedGraphExecuteInfo::node(int index) const { + // @@protoc_insertion_point(field_get:tensorflow.RemoteFusedGraphExecuteInfo.node) + return node_.Get(index); +} +inline ::tensorflow::NodeDef* RemoteFusedGraphExecuteInfo::mutable_node(int index) { + // @@protoc_insertion_point(field_mutable:tensorflow.RemoteFusedGraphExecuteInfo.node) + return node_.Mutable(index); +} +inline 
::tensorflow::NodeDef* RemoteFusedGraphExecuteInfo::add_node() { + // @@protoc_insertion_point(field_add:tensorflow.RemoteFusedGraphExecuteInfo.node) + return node_.Add(); +} +inline ::google::protobuf::RepeatedPtrField< ::tensorflow::NodeDef >* +RemoteFusedGraphExecuteInfo::mutable_node() { + // @@protoc_insertion_point(field_mutable_list:tensorflow.RemoteFusedGraphExecuteInfo.node) + return &node_; +} +inline const ::google::protobuf::RepeatedPtrField< ::tensorflow::NodeDef >& +RemoteFusedGraphExecuteInfo::node() const { + // @@protoc_insertion_point(field_list:tensorflow.RemoteFusedGraphExecuteInfo.node) + return node_; +} + +// repeated string graph_input_node_name = 2; +inline int RemoteFusedGraphExecuteInfo::graph_input_node_name_size() const { + return graph_input_node_name_.size(); +} +inline void RemoteFusedGraphExecuteInfo::clear_graph_input_node_name() { + graph_input_node_name_.Clear(); +} +inline const ::std::string& RemoteFusedGraphExecuteInfo::graph_input_node_name(int index) const { + // @@protoc_insertion_point(field_get:tensorflow.RemoteFusedGraphExecuteInfo.graph_input_node_name) + return graph_input_node_name_.Get(index); +} +inline ::std::string* RemoteFusedGraphExecuteInfo::mutable_graph_input_node_name(int index) { + // @@protoc_insertion_point(field_mutable:tensorflow.RemoteFusedGraphExecuteInfo.graph_input_node_name) + return graph_input_node_name_.Mutable(index); +} +inline void RemoteFusedGraphExecuteInfo::set_graph_input_node_name(int index, const ::std::string& value) { + // @@protoc_insertion_point(field_set:tensorflow.RemoteFusedGraphExecuteInfo.graph_input_node_name) + graph_input_node_name_.Mutable(index)->assign(value); +} +#if LANG_CXX11 +inline void RemoteFusedGraphExecuteInfo::set_graph_input_node_name(int index, ::std::string&& value) { + // @@protoc_insertion_point(field_set:tensorflow.RemoteFusedGraphExecuteInfo.graph_input_node_name) + graph_input_node_name_.Mutable(index)->assign(std::move(value)); +} +#endif +inline void 
RemoteFusedGraphExecuteInfo::set_graph_input_node_name(int index, const char* value) { + graph_input_node_name_.Mutable(index)->assign(value); + // @@protoc_insertion_point(field_set_char:tensorflow.RemoteFusedGraphExecuteInfo.graph_input_node_name) +} +inline void RemoteFusedGraphExecuteInfo::set_graph_input_node_name(int index, const char* value, size_t size) { + graph_input_node_name_.Mutable(index)->assign( + reinterpret_cast(value), size); + // @@protoc_insertion_point(field_set_pointer:tensorflow.RemoteFusedGraphExecuteInfo.graph_input_node_name) +} +inline ::std::string* RemoteFusedGraphExecuteInfo::add_graph_input_node_name() { + // @@protoc_insertion_point(field_add_mutable:tensorflow.RemoteFusedGraphExecuteInfo.graph_input_node_name) + return graph_input_node_name_.Add(); +} +inline void RemoteFusedGraphExecuteInfo::add_graph_input_node_name(const ::std::string& value) { + graph_input_node_name_.Add()->assign(value); + // @@protoc_insertion_point(field_add:tensorflow.RemoteFusedGraphExecuteInfo.graph_input_node_name) +} +#if LANG_CXX11 +inline void RemoteFusedGraphExecuteInfo::add_graph_input_node_name(::std::string&& value) { + graph_input_node_name_.Add()->assign(std::move(value)); + // @@protoc_insertion_point(field_add:tensorflow.RemoteFusedGraphExecuteInfo.graph_input_node_name) +} +#endif +inline void RemoteFusedGraphExecuteInfo::add_graph_input_node_name(const char* value) { + graph_input_node_name_.Add()->assign(value); + // @@protoc_insertion_point(field_add_char:tensorflow.RemoteFusedGraphExecuteInfo.graph_input_node_name) +} +inline void RemoteFusedGraphExecuteInfo::add_graph_input_node_name(const char* value, size_t size) { + graph_input_node_name_.Add()->assign(reinterpret_cast(value), size); + // @@protoc_insertion_point(field_add_pointer:tensorflow.RemoteFusedGraphExecuteInfo.graph_input_node_name) +} +inline const ::google::protobuf::RepeatedPtrField< ::std::string>& +RemoteFusedGraphExecuteInfo::graph_input_node_name() const { + // 
@@protoc_insertion_point(field_list:tensorflow.RemoteFusedGraphExecuteInfo.graph_input_node_name) + return graph_input_node_name_; +} +inline ::google::protobuf::RepeatedPtrField< ::std::string>* +RemoteFusedGraphExecuteInfo::mutable_graph_input_node_name() { + // @@protoc_insertion_point(field_mutable_list:tensorflow.RemoteFusedGraphExecuteInfo.graph_input_node_name) + return &graph_input_node_name_; +} + +// repeated string graph_output_node_name = 3; +inline int RemoteFusedGraphExecuteInfo::graph_output_node_name_size() const { + return graph_output_node_name_.size(); +} +inline void RemoteFusedGraphExecuteInfo::clear_graph_output_node_name() { + graph_output_node_name_.Clear(); +} +inline const ::std::string& RemoteFusedGraphExecuteInfo::graph_output_node_name(int index) const { + // @@protoc_insertion_point(field_get:tensorflow.RemoteFusedGraphExecuteInfo.graph_output_node_name) + return graph_output_node_name_.Get(index); +} +inline ::std::string* RemoteFusedGraphExecuteInfo::mutable_graph_output_node_name(int index) { + // @@protoc_insertion_point(field_mutable:tensorflow.RemoteFusedGraphExecuteInfo.graph_output_node_name) + return graph_output_node_name_.Mutable(index); +} +inline void RemoteFusedGraphExecuteInfo::set_graph_output_node_name(int index, const ::std::string& value) { + // @@protoc_insertion_point(field_set:tensorflow.RemoteFusedGraphExecuteInfo.graph_output_node_name) + graph_output_node_name_.Mutable(index)->assign(value); +} +#if LANG_CXX11 +inline void RemoteFusedGraphExecuteInfo::set_graph_output_node_name(int index, ::std::string&& value) { + // @@protoc_insertion_point(field_set:tensorflow.RemoteFusedGraphExecuteInfo.graph_output_node_name) + graph_output_node_name_.Mutable(index)->assign(std::move(value)); +} +#endif +inline void RemoteFusedGraphExecuteInfo::set_graph_output_node_name(int index, const char* value) { + graph_output_node_name_.Mutable(index)->assign(value); + // 
@@protoc_insertion_point(field_set_char:tensorflow.RemoteFusedGraphExecuteInfo.graph_output_node_name) +} +inline void RemoteFusedGraphExecuteInfo::set_graph_output_node_name(int index, const char* value, size_t size) { + graph_output_node_name_.Mutable(index)->assign( + reinterpret_cast(value), size); + // @@protoc_insertion_point(field_set_pointer:tensorflow.RemoteFusedGraphExecuteInfo.graph_output_node_name) +} +inline ::std::string* RemoteFusedGraphExecuteInfo::add_graph_output_node_name() { + // @@protoc_insertion_point(field_add_mutable:tensorflow.RemoteFusedGraphExecuteInfo.graph_output_node_name) + return graph_output_node_name_.Add(); +} +inline void RemoteFusedGraphExecuteInfo::add_graph_output_node_name(const ::std::string& value) { + graph_output_node_name_.Add()->assign(value); + // @@protoc_insertion_point(field_add:tensorflow.RemoteFusedGraphExecuteInfo.graph_output_node_name) +} +#if LANG_CXX11 +inline void RemoteFusedGraphExecuteInfo::add_graph_output_node_name(::std::string&& value) { + graph_output_node_name_.Add()->assign(std::move(value)); + // @@protoc_insertion_point(field_add:tensorflow.RemoteFusedGraphExecuteInfo.graph_output_node_name) +} +#endif +inline void RemoteFusedGraphExecuteInfo::add_graph_output_node_name(const char* value) { + graph_output_node_name_.Add()->assign(value); + // @@protoc_insertion_point(field_add_char:tensorflow.RemoteFusedGraphExecuteInfo.graph_output_node_name) +} +inline void RemoteFusedGraphExecuteInfo::add_graph_output_node_name(const char* value, size_t size) { + graph_output_node_name_.Add()->assign(reinterpret_cast(value), size); + // @@protoc_insertion_point(field_add_pointer:tensorflow.RemoteFusedGraphExecuteInfo.graph_output_node_name) +} +inline const ::google::protobuf::RepeatedPtrField< ::std::string>& +RemoteFusedGraphExecuteInfo::graph_output_node_name() const { + // @@protoc_insertion_point(field_list:tensorflow.RemoteFusedGraphExecuteInfo.graph_output_node_name) + return graph_output_node_name_; 
+} +inline ::google::protobuf::RepeatedPtrField< ::std::string>* +RemoteFusedGraphExecuteInfo::mutable_graph_output_node_name() { + // @@protoc_insertion_point(field_mutable_list:tensorflow.RemoteFusedGraphExecuteInfo.graph_output_node_name) + return &graph_output_node_name_; +} + +// string executor_name = 4; +inline void RemoteFusedGraphExecuteInfo::clear_executor_name() { + executor_name_.ClearToEmpty(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), GetArenaNoVirtual()); +} +inline const ::std::string& RemoteFusedGraphExecuteInfo::executor_name() const { + // @@protoc_insertion_point(field_get:tensorflow.RemoteFusedGraphExecuteInfo.executor_name) + return executor_name_.Get(); +} +inline void RemoteFusedGraphExecuteInfo::set_executor_name(const ::std::string& value) { + + executor_name_.Set(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), value, GetArenaNoVirtual()); + // @@protoc_insertion_point(field_set:tensorflow.RemoteFusedGraphExecuteInfo.executor_name) +} +inline void RemoteFusedGraphExecuteInfo::set_executor_name(const char* value) { + + executor_name_.Set(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArenaNoVirtual()); + // @@protoc_insertion_point(field_set_char:tensorflow.RemoteFusedGraphExecuteInfo.executor_name) +} +inline void RemoteFusedGraphExecuteInfo::set_executor_name(const char* value, + size_t size) { + + executor_name_.Set(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArenaNoVirtual()); + // @@protoc_insertion_point(field_set_pointer:tensorflow.RemoteFusedGraphExecuteInfo.executor_name) +} +inline ::std::string* RemoteFusedGraphExecuteInfo::mutable_executor_name() { + + // @@protoc_insertion_point(field_mutable:tensorflow.RemoteFusedGraphExecuteInfo.executor_name) + return executor_name_.Mutable(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), GetArenaNoVirtual()); +} +inline ::std::string* 
RemoteFusedGraphExecuteInfo::release_executor_name() { + // @@protoc_insertion_point(field_release:tensorflow.RemoteFusedGraphExecuteInfo.executor_name) + + return executor_name_.Release(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), GetArenaNoVirtual()); +} +inline ::std::string* RemoteFusedGraphExecuteInfo::unsafe_arena_release_executor_name() { + // @@protoc_insertion_point(field_unsafe_arena_release:tensorflow.RemoteFusedGraphExecuteInfo.executor_name) + GOOGLE_DCHECK(GetArenaNoVirtual() != NULL); + + return executor_name_.UnsafeArenaRelease(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), + GetArenaNoVirtual()); +} +inline void RemoteFusedGraphExecuteInfo::set_allocated_executor_name(::std::string* executor_name) { + if (executor_name != NULL) { + + } else { + + } + executor_name_.SetAllocated(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), executor_name, + GetArenaNoVirtual()); + // @@protoc_insertion_point(field_set_allocated:tensorflow.RemoteFusedGraphExecuteInfo.executor_name) +} +inline void RemoteFusedGraphExecuteInfo::unsafe_arena_set_allocated_executor_name( + ::std::string* executor_name) { + GOOGLE_DCHECK(GetArenaNoVirtual() != NULL); + if (executor_name != NULL) { + + } else { + + } + executor_name_.UnsafeArenaSetAllocated(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), + executor_name, GetArenaNoVirtual()); + // @@protoc_insertion_point(field_unsafe_arena_set_allocated:tensorflow.RemoteFusedGraphExecuteInfo.executor_name) +} + +// bytes serialized_executor_parameters = 5; +inline void RemoteFusedGraphExecuteInfo::clear_serialized_executor_parameters() { + serialized_executor_parameters_.ClearToEmpty(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), GetArenaNoVirtual()); +} +inline const ::std::string& RemoteFusedGraphExecuteInfo::serialized_executor_parameters() const { + // @@protoc_insertion_point(field_get:tensorflow.RemoteFusedGraphExecuteInfo.serialized_executor_parameters) + 
return serialized_executor_parameters_.Get(); +} +inline void RemoteFusedGraphExecuteInfo::set_serialized_executor_parameters(const ::std::string& value) { + + serialized_executor_parameters_.Set(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), value, GetArenaNoVirtual()); + // @@protoc_insertion_point(field_set:tensorflow.RemoteFusedGraphExecuteInfo.serialized_executor_parameters) +} +inline void RemoteFusedGraphExecuteInfo::set_serialized_executor_parameters(const char* value) { + + serialized_executor_parameters_.Set(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), ::std::string(value), + GetArenaNoVirtual()); + // @@protoc_insertion_point(field_set_char:tensorflow.RemoteFusedGraphExecuteInfo.serialized_executor_parameters) +} +inline void RemoteFusedGraphExecuteInfo::set_serialized_executor_parameters(const void* value, + size_t size) { + + serialized_executor_parameters_.Set(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), GetArenaNoVirtual()); + // @@protoc_insertion_point(field_set_pointer:tensorflow.RemoteFusedGraphExecuteInfo.serialized_executor_parameters) +} +inline ::std::string* RemoteFusedGraphExecuteInfo::mutable_serialized_executor_parameters() { + + // @@protoc_insertion_point(field_mutable:tensorflow.RemoteFusedGraphExecuteInfo.serialized_executor_parameters) + return serialized_executor_parameters_.Mutable(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), GetArenaNoVirtual()); +} +inline ::std::string* RemoteFusedGraphExecuteInfo::release_serialized_executor_parameters() { + // @@protoc_insertion_point(field_release:tensorflow.RemoteFusedGraphExecuteInfo.serialized_executor_parameters) + + return serialized_executor_parameters_.Release(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), GetArenaNoVirtual()); +} +inline ::std::string* RemoteFusedGraphExecuteInfo::unsafe_arena_release_serialized_executor_parameters() { + // 
@@protoc_insertion_point(field_unsafe_arena_release:tensorflow.RemoteFusedGraphExecuteInfo.serialized_executor_parameters) + GOOGLE_DCHECK(GetArenaNoVirtual() != NULL); + + return serialized_executor_parameters_.UnsafeArenaRelease(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), + GetArenaNoVirtual()); +} +inline void RemoteFusedGraphExecuteInfo::set_allocated_serialized_executor_parameters(::std::string* serialized_executor_parameters) { + if (serialized_executor_parameters != NULL) { + + } else { + + } + serialized_executor_parameters_.SetAllocated(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), serialized_executor_parameters, + GetArenaNoVirtual()); + // @@protoc_insertion_point(field_set_allocated:tensorflow.RemoteFusedGraphExecuteInfo.serialized_executor_parameters) +} +inline void RemoteFusedGraphExecuteInfo::unsafe_arena_set_allocated_serialized_executor_parameters( + ::std::string* serialized_executor_parameters) { + GOOGLE_DCHECK(GetArenaNoVirtual() != NULL); + if (serialized_executor_parameters != NULL) { + + } else { + + } + serialized_executor_parameters_.UnsafeArenaSetAllocated(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), + serialized_executor_parameters, GetArenaNoVirtual()); + // @@protoc_insertion_point(field_unsafe_arena_set_allocated:tensorflow.RemoteFusedGraphExecuteInfo.serialized_executor_parameters) +} + +// repeated .tensorflow.RemoteFusedGraphExecuteInfo.TensorShapeTypeProto default_graph_input_tensor_shape = 6; +inline int RemoteFusedGraphExecuteInfo::default_graph_input_tensor_shape_size() const { + return default_graph_input_tensor_shape_.size(); +} +inline void RemoteFusedGraphExecuteInfo::clear_default_graph_input_tensor_shape() { + default_graph_input_tensor_shape_.Clear(); +} +inline const ::tensorflow::RemoteFusedGraphExecuteInfo_TensorShapeTypeProto& RemoteFusedGraphExecuteInfo::default_graph_input_tensor_shape(int index) const { + // 
@@protoc_insertion_point(field_get:tensorflow.RemoteFusedGraphExecuteInfo.default_graph_input_tensor_shape) + return default_graph_input_tensor_shape_.Get(index); +} +inline ::tensorflow::RemoteFusedGraphExecuteInfo_TensorShapeTypeProto* RemoteFusedGraphExecuteInfo::mutable_default_graph_input_tensor_shape(int index) { + // @@protoc_insertion_point(field_mutable:tensorflow.RemoteFusedGraphExecuteInfo.default_graph_input_tensor_shape) + return default_graph_input_tensor_shape_.Mutable(index); +} +inline ::tensorflow::RemoteFusedGraphExecuteInfo_TensorShapeTypeProto* RemoteFusedGraphExecuteInfo::add_default_graph_input_tensor_shape() { + // @@protoc_insertion_point(field_add:tensorflow.RemoteFusedGraphExecuteInfo.default_graph_input_tensor_shape) + return default_graph_input_tensor_shape_.Add(); +} +inline ::google::protobuf::RepeatedPtrField< ::tensorflow::RemoteFusedGraphExecuteInfo_TensorShapeTypeProto >* +RemoteFusedGraphExecuteInfo::mutable_default_graph_input_tensor_shape() { + // @@protoc_insertion_point(field_mutable_list:tensorflow.RemoteFusedGraphExecuteInfo.default_graph_input_tensor_shape) + return &default_graph_input_tensor_shape_; +} +inline const ::google::protobuf::RepeatedPtrField< ::tensorflow::RemoteFusedGraphExecuteInfo_TensorShapeTypeProto >& +RemoteFusedGraphExecuteInfo::default_graph_input_tensor_shape() const { + // @@protoc_insertion_point(field_list:tensorflow.RemoteFusedGraphExecuteInfo.default_graph_input_tensor_shape) + return default_graph_input_tensor_shape_; +} + +// repeated .tensorflow.RemoteFusedGraphExecuteInfo.TensorShapeTypeProto default_graph_output_tensor_shape = 7; +inline int RemoteFusedGraphExecuteInfo::default_graph_output_tensor_shape_size() const { + return default_graph_output_tensor_shape_.size(); +} +inline void RemoteFusedGraphExecuteInfo::clear_default_graph_output_tensor_shape() { + default_graph_output_tensor_shape_.Clear(); +} +inline const ::tensorflow::RemoteFusedGraphExecuteInfo_TensorShapeTypeProto& 
RemoteFusedGraphExecuteInfo::default_graph_output_tensor_shape(int index) const { + // @@protoc_insertion_point(field_get:tensorflow.RemoteFusedGraphExecuteInfo.default_graph_output_tensor_shape) + return default_graph_output_tensor_shape_.Get(index); +} +inline ::tensorflow::RemoteFusedGraphExecuteInfo_TensorShapeTypeProto* RemoteFusedGraphExecuteInfo::mutable_default_graph_output_tensor_shape(int index) { + // @@protoc_insertion_point(field_mutable:tensorflow.RemoteFusedGraphExecuteInfo.default_graph_output_tensor_shape) + return default_graph_output_tensor_shape_.Mutable(index); +} +inline ::tensorflow::RemoteFusedGraphExecuteInfo_TensorShapeTypeProto* RemoteFusedGraphExecuteInfo::add_default_graph_output_tensor_shape() { + // @@protoc_insertion_point(field_add:tensorflow.RemoteFusedGraphExecuteInfo.default_graph_output_tensor_shape) + return default_graph_output_tensor_shape_.Add(); +} +inline ::google::protobuf::RepeatedPtrField< ::tensorflow::RemoteFusedGraphExecuteInfo_TensorShapeTypeProto >* +RemoteFusedGraphExecuteInfo::mutable_default_graph_output_tensor_shape() { + // @@protoc_insertion_point(field_mutable_list:tensorflow.RemoteFusedGraphExecuteInfo.default_graph_output_tensor_shape) + return &default_graph_output_tensor_shape_; +} +inline const ::google::protobuf::RepeatedPtrField< ::tensorflow::RemoteFusedGraphExecuteInfo_TensorShapeTypeProto >& +RemoteFusedGraphExecuteInfo::default_graph_output_tensor_shape() const { + // @@protoc_insertion_point(field_list:tensorflow.RemoteFusedGraphExecuteInfo.default_graph_output_tensor_shape) + return default_graph_output_tensor_shape_; +} + +#endif // !PROTOBUF_INLINE_NOT_IN_HEADERS +// ------------------------------------------------------------------- + + +// @@protoc_insertion_point(namespace_scope) + + +} // namespace tensorflow + +// @@protoc_insertion_point(global_scope) + +#endif // PROTOBUF_tensorflow_2fcore_2fframework_2fremote_5ffused_5fgraph_5fexecute_5finfo_2eproto__INCLUDED diff --git 
a/libs/tensorflow/include/tensorflow/core/framework/remote_fused_graph_execute_info.pb_text-impl.h b/libs/tensorflow/include/tensorflow/core/framework/remote_fused_graph_execute_info.pb_text-impl.h new file mode 100755 index 0000000..4756257 --- /dev/null +++ b/libs/tensorflow/include/tensorflow/core/framework/remote_fused_graph_execute_info.pb_text-impl.h @@ -0,0 +1,44 @@ +// GENERATED FILE - DO NOT MODIFY +#ifndef tensorflow_core_framework_remote_fused_graph_execute_info_proto_IMPL_H_ +#define tensorflow_core_framework_remote_fused_graph_execute_info_proto_IMPL_H_ + +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/attr_value.pb_text-impl.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/node_def.pb_text-impl.h" +#include "tensorflow/core/framework/remote_fused_graph_execute_info.pb.h" +#include "tensorflow/core/framework/remote_fused_graph_execute_info.pb_text.h" +#include "tensorflow/core/framework/resource_handle.pb.h" +#include "tensorflow/core/framework/resource_handle.pb_text-impl.h" +#include "tensorflow/core/framework/tensor.pb.h" +#include "tensorflow/core/framework/tensor.pb_text-impl.h" +#include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/framework/tensor_shape.pb_text-impl.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/framework/types.pb_text-impl.h" +#include "tensorflow/core/lib/strings/proto_text_util.h" +#include "tensorflow/core/lib/strings/scanner.h" + +namespace tensorflow { + +namespace internal { + +void AppendProtoDebugString( + ::tensorflow::strings::ProtoTextOutput* o, + const ::tensorflow::RemoteFusedGraphExecuteInfo_TensorShapeTypeProto& msg); +bool ProtoParseFromScanner( + ::tensorflow::strings::Scanner* scanner, bool nested, bool close_curly, + ::tensorflow::RemoteFusedGraphExecuteInfo_TensorShapeTypeProto* msg); + +void AppendProtoDebugString( + ::tensorflow::strings::ProtoTextOutput* o, + 
const ::tensorflow::RemoteFusedGraphExecuteInfo& msg); +bool ProtoParseFromScanner( + ::tensorflow::strings::Scanner* scanner, bool nested, bool close_curly, + ::tensorflow::RemoteFusedGraphExecuteInfo* msg); + +} // namespace internal + +} // namespace tensorflow + +#endif // tensorflow_core_framework_remote_fused_graph_execute_info_proto_IMPL_H_ diff --git a/libs/tensorflow/include/tensorflow/core/framework/remote_fused_graph_execute_info.pb_text.h b/libs/tensorflow/include/tensorflow/core/framework/remote_fused_graph_execute_info.pb_text.h new file mode 100755 index 0000000..9e0f398 --- /dev/null +++ b/libs/tensorflow/include/tensorflow/core/framework/remote_fused_graph_execute_info.pb_text.h @@ -0,0 +1,34 @@ +// GENERATED FILE - DO NOT MODIFY +#ifndef tensorflow_core_framework_remote_fused_graph_execute_info_proto_H_ +#define tensorflow_core_framework_remote_fused_graph_execute_info_proto_H_ + +#include "tensorflow/core/framework/remote_fused_graph_execute_info.pb.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// Message-text conversion for tensorflow.RemoteFusedGraphExecuteInfo.TensorShapeTypeProto +string ProtoDebugString( + const ::tensorflow::RemoteFusedGraphExecuteInfo_TensorShapeTypeProto& msg); +string ProtoShortDebugString( + const ::tensorflow::RemoteFusedGraphExecuteInfo_TensorShapeTypeProto& msg); +bool ProtoParseFromString( + const string& s, + ::tensorflow::RemoteFusedGraphExecuteInfo_TensorShapeTypeProto* msg) + TF_MUST_USE_RESULT; + +// Message-text conversion for tensorflow.RemoteFusedGraphExecuteInfo +string ProtoDebugString( + const ::tensorflow::RemoteFusedGraphExecuteInfo& msg); +string ProtoShortDebugString( + const ::tensorflow::RemoteFusedGraphExecuteInfo& msg); +bool ProtoParseFromString( + const string& s, + ::tensorflow::RemoteFusedGraphExecuteInfo* msg) + TF_MUST_USE_RESULT; + +} // namespace tensorflow + +#endif 
// tensorflow_core_framework_remote_fused_graph_execute_info_proto_H_ diff --git a/libs/tensorflow/include/tensorflow/core/framework/resource_handle.pb.h b/libs/tensorflow/include/tensorflow/core/framework/resource_handle.pb.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/framework/resource_handle.pb_text-impl.h b/libs/tensorflow/include/tensorflow/core/framework/resource_handle.pb_text-impl.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/framework/resource_handle.pb_text.h b/libs/tensorflow/include/tensorflow/core/framework/resource_handle.pb_text.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/framework/resource_mgr.h b/libs/tensorflow/include/tensorflow/core/framework/resource_mgr.h index 882d56b..fe6e093 100644 --- a/libs/tensorflow/include/tensorflow/core/framework/resource_mgr.h +++ b/libs/tensorflow/include/tensorflow/core/framework/resource_mgr.h @@ -77,6 +77,9 @@ class ResourceBase : public core::RefCounted { public: // Returns a debug string for *this. virtual string DebugString() = 0; + + // Returns memory used by this resource. + virtual int64 MemoryUsed() const { return 0; }; }; // Container used for per-step resources. 
diff --git a/libs/tensorflow/include/tensorflow/core/framework/session_state.h b/libs/tensorflow/include/tensorflow/core/framework/session_state.h index a3eafcf..8fbe940 100644 --- a/libs/tensorflow/include/tensorflow/core/framework/session_state.h +++ b/libs/tensorflow/include/tensorflow/core/framework/session_state.h @@ -41,6 +41,8 @@ class SessionState { int64 GetNewId(); + static const char* kTensorHandleResourceTypeName; + private: mutex state_lock_; diff --git a/libs/tensorflow/include/tensorflow/core/framework/shape_inference.h b/libs/tensorflow/include/tensorflow/core/framework/shape_inference.h index fd4e25c..b7f1725 100644 --- a/libs/tensorflow/include/tensorflow/core/framework/shape_inference.h +++ b/libs/tensorflow/include/tensorflow/core/framework/shape_inference.h @@ -144,7 +144,8 @@ class InferenceContext { // Values of do not need to outlive the context. // // REQUIRES: is not NULL, and must outlive the InferenceContext. - InferenceContext(const NodeDef* node_def, const OpDef& op_def, + InferenceContext(int graph_def_version, const NodeDef* node_def, + const OpDef& op_def, const std::vector& input_shapes, const std::vector& input_tensors, const std::vector& input_tensors_as_shapes, @@ -161,7 +162,8 @@ class InferenceContext { // Values of do not need to outlive the context. // // REQUIRES: is not NULL, and must outlive the InferenceContext. 
- InferenceContext(const NodeDef* node_def, const OpDef& op_def, + InferenceContext(int graph_def_version, const NodeDef* node_def, + const OpDef& op_def, const std::vector& input_shapes, const std::vector& input_tensors, const std::vector& input_tensors_as_shapes, @@ -180,6 +182,15 @@ class InferenceContext { if (!s.ok()) { return AttachContext(s); } +#if 0 + // TODO(cwhipkey): enable this check +#ifndef NDEBUG + for (int i = 0; i < num_outputs(); ++i) { + DCHECK(output(i).IsSet()) << i << " for " << node_def().name() + << " of type " << node_def().op(); + } +#endif // NDEBUG +#endif return s; } @@ -237,7 +248,7 @@ class InferenceContext { } int32 Rank(ShapeHandle s) const { DCHECK(s.IsSet()); - return s->rank_; + return s.IsSet() ? s->rank_ : kUnknownRank; } bool RankKnown(ShapeHandle s) const { return (s.IsSet() && (Rank(s) != kUnknownRank)); @@ -353,6 +364,13 @@ class InferenceContext { Status MakeShapeFromShapeProto(const TensorShapeProto& proto, ShapeHandle* out); + // Returns in a new shape corresponding to . + Status MakeShapeFromPartialTensorShape( + const PartialTensorShape& partial_shape, ShapeHandle* out); + + // Returns in a new shape corresponding to . + Status MakeShapeFromTensorShape(const TensorShape& shape, ShapeHandle* out); + // Returns a new dimension of the given size. The returned value is owned by // this context. inline DimensionHandle MakeDim(DimensionOrConstant d) { @@ -366,6 +384,11 @@ class InferenceContext { // the value. Status MakeDimForScalarInput(int idx, DimensionHandle* out); + // Returns the NodeDef. The returned reference does not outlive the + // InferenceContext, and it should not be used after InferenceContext is + // destroyed. + const NodeDef& node_def() { return node_def_; } + // Look up the attr for the NodeDef being evaluated with name attr_name and // set *value to its value. If no attr with attr_name is found in def(), or // the attr does not have a matching type, a non-ok status will be returned. 
@@ -436,6 +459,8 @@ class InferenceContext { Status MakeShapeFromTensor(const Tensor* t, ShapeHandle tensor_shape, ShapeHandle* out); + int graph_def_version() const { return graph_def_version_; } + private: // Creates and stores shapes for use in InferenceContext. class ShapeManager { @@ -508,6 +533,7 @@ class InferenceContext { std::vector output_handle_shape_; std::vector output_handle_dtype_; + const int graph_def_version_; const NodeDef& node_def_; NameRangeMap input_name_map_; NameRangeMap output_name_map_; diff --git a/libs/tensorflow/include/tensorflow/core/framework/shape_inference_testutil.h b/libs/tensorflow/include/tensorflow/core/framework/shape_inference_testutil.h index 6406746..996281e 100644 --- a/libs/tensorflow/include/tensorflow/core/framework/shape_inference_testutil.h +++ b/libs/tensorflow/include/tensorflow/core/framework/shape_inference_testutil.h @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/public/version.h" // Contains utilities for writing tests for shape inference functions. 
@@ -34,6 +35,7 @@ struct ShapeInferenceTestOp { string name; NodeDef node_def; std::vector input_tensors; + int graph_def_version = TF_GRAPH_DEF_VERSION; }; namespace shape_inference { diff --git a/libs/tensorflow/include/tensorflow/core/framework/step_stats.pb.h b/libs/tensorflow/include/tensorflow/core/framework/step_stats.pb.h index 57d28f7..94b19b7 100755 --- a/libs/tensorflow/include/tensorflow/core/framework/step_stats.pb.h +++ b/libs/tensorflow/include/tensorflow/core/framework/step_stats.pb.h @@ -41,6 +41,9 @@ extern AllocatorMemoryUsedDefaultTypeInternal _AllocatorMemoryUsed_default_insta class DeviceStepStats; class DeviceStepStatsDefaultTypeInternal; extern DeviceStepStatsDefaultTypeInternal _DeviceStepStats_default_instance_; +class MemoryStats; +class MemoryStatsDefaultTypeInternal; +extern MemoryStatsDefaultTypeInternal _MemoryStats_default_instance_; class NodeExecStats; class NodeExecStatsDefaultTypeInternal; extern NodeExecStatsDefaultTypeInternal _NodeExecStats_default_instance_; @@ -314,6 +317,153 @@ class NodeOutput : public ::google::protobuf::Message /* @@protoc_insertion_poin }; // ------------------------------------------------------------------- +class MemoryStats : public ::google::protobuf::Message /* @@protoc_insertion_point(class_definition:tensorflow.MemoryStats) */ { + public: + MemoryStats(); + virtual ~MemoryStats(); + + MemoryStats(const MemoryStats& from); + + inline MemoryStats& operator=(const MemoryStats& from) { + CopyFrom(from); + return *this; + } + + inline ::google::protobuf::Arena* GetArena() const PROTOBUF_FINAL { + return GetArenaNoVirtual(); + } + inline void* GetMaybeArenaPointer() const PROTOBUF_FINAL { + return MaybeArenaPtr(); + } + static const ::google::protobuf::Descriptor* descriptor(); + static const MemoryStats& default_instance(); + + static inline const MemoryStats* internal_default_instance() { + return reinterpret_cast( + &_MemoryStats_default_instance_); + } + + void UnsafeArenaSwap(MemoryStats* other); 
+ void Swap(MemoryStats* other); + + // implements Message ---------------------------------------------- + + inline MemoryStats* New() const PROTOBUF_FINAL { return New(NULL); } + + MemoryStats* New(::google::protobuf::Arena* arena) const PROTOBUF_FINAL; + void CopyFrom(const ::google::protobuf::Message& from) PROTOBUF_FINAL; + void MergeFrom(const ::google::protobuf::Message& from) PROTOBUF_FINAL; + void CopyFrom(const MemoryStats& from); + void MergeFrom(const MemoryStats& from); + void Clear() PROTOBUF_FINAL; + bool IsInitialized() const PROTOBUF_FINAL; + + size_t ByteSizeLong() const PROTOBUF_FINAL; + bool MergePartialFromCodedStream( + ::google::protobuf::io::CodedInputStream* input) PROTOBUF_FINAL; + void SerializeWithCachedSizes( + ::google::protobuf::io::CodedOutputStream* output) const PROTOBUF_FINAL; + ::google::protobuf::uint8* InternalSerializeWithCachedSizesToArray( + bool deterministic, ::google::protobuf::uint8* target) const PROTOBUF_FINAL; + ::google::protobuf::uint8* SerializeWithCachedSizesToArray(::google::protobuf::uint8* output) + const PROTOBUF_FINAL { + return InternalSerializeWithCachedSizesToArray( + ::google::protobuf::io::CodedOutputStream::IsDefaultSerializationDeterministic(), output); + } + int GetCachedSize() const PROTOBUF_FINAL { return _cached_size_; } + private: + void SharedCtor(); + void SharedDtor(); + void SetCachedSize(int size) const PROTOBUF_FINAL; + void InternalSwap(MemoryStats* other); + protected: + explicit MemoryStats(::google::protobuf::Arena* arena); + private: + static void ArenaDtor(void* object); + inline void RegisterArenaDtor(::google::protobuf::Arena* arena); + private: + inline ::google::protobuf::Arena* GetArenaNoVirtual() const { + return _internal_metadata_.arena(); + } + inline void* MaybeArenaPtr() const { + return _internal_metadata_.raw_arena_ptr(); + } + public: + + ::google::protobuf::Metadata GetMetadata() const PROTOBUF_FINAL; + + // nested types 
---------------------------------------------------- + + // accessors ------------------------------------------------------- + + // repeated int64 host_persistent_tensor_alloc_ids = 5; + int host_persistent_tensor_alloc_ids_size() const; + void clear_host_persistent_tensor_alloc_ids(); + static const int kHostPersistentTensorAllocIdsFieldNumber = 5; + ::google::protobuf::int64 host_persistent_tensor_alloc_ids(int index) const; + void set_host_persistent_tensor_alloc_ids(int index, ::google::protobuf::int64 value); + void add_host_persistent_tensor_alloc_ids(::google::protobuf::int64 value); + const ::google::protobuf::RepeatedField< ::google::protobuf::int64 >& + host_persistent_tensor_alloc_ids() const; + ::google::protobuf::RepeatedField< ::google::protobuf::int64 >* + mutable_host_persistent_tensor_alloc_ids(); + + // repeated int64 device_persistent_tensor_alloc_ids = 6; + int device_persistent_tensor_alloc_ids_size() const; + void clear_device_persistent_tensor_alloc_ids(); + static const int kDevicePersistentTensorAllocIdsFieldNumber = 6; + ::google::protobuf::int64 device_persistent_tensor_alloc_ids(int index) const; + void set_device_persistent_tensor_alloc_ids(int index, ::google::protobuf::int64 value); + void add_device_persistent_tensor_alloc_ids(::google::protobuf::int64 value); + const ::google::protobuf::RepeatedField< ::google::protobuf::int64 >& + device_persistent_tensor_alloc_ids() const; + ::google::protobuf::RepeatedField< ::google::protobuf::int64 >* + mutable_device_persistent_tensor_alloc_ids(); + + // int64 host_temp_memory_size = 1; + void clear_host_temp_memory_size(); + static const int kHostTempMemorySizeFieldNumber = 1; + ::google::protobuf::int64 host_temp_memory_size() const; + void set_host_temp_memory_size(::google::protobuf::int64 value); + + // int64 device_temp_memory_size = 2; + void clear_device_temp_memory_size(); + static const int kDeviceTempMemorySizeFieldNumber = 2; + ::google::protobuf::int64 device_temp_memory_size() 
const; + void set_device_temp_memory_size(::google::protobuf::int64 value); + + // int64 host_persistent_memory_size = 3; + void clear_host_persistent_memory_size(); + static const int kHostPersistentMemorySizeFieldNumber = 3; + ::google::protobuf::int64 host_persistent_memory_size() const; + void set_host_persistent_memory_size(::google::protobuf::int64 value); + + // int64 device_persistent_memory_size = 4; + void clear_device_persistent_memory_size(); + static const int kDevicePersistentMemorySizeFieldNumber = 4; + ::google::protobuf::int64 device_persistent_memory_size() const; + void set_device_persistent_memory_size(::google::protobuf::int64 value); + + // @@protoc_insertion_point(class_scope:tensorflow.MemoryStats) + private: + + ::google::protobuf::internal::InternalMetadataWithArena _internal_metadata_; + friend class ::google::protobuf::Arena; + typedef void InternalArenaConstructable_; + typedef void DestructorSkippable_; + ::google::protobuf::RepeatedField< ::google::protobuf::int64 > host_persistent_tensor_alloc_ids_; + mutable int _host_persistent_tensor_alloc_ids_cached_byte_size_; + ::google::protobuf::RepeatedField< ::google::protobuf::int64 > device_persistent_tensor_alloc_ids_; + mutable int _device_persistent_tensor_alloc_ids_cached_byte_size_; + ::google::protobuf::int64 host_temp_memory_size_; + ::google::protobuf::int64 device_temp_memory_size_; + ::google::protobuf::int64 host_persistent_memory_size_; + ::google::protobuf::int64 device_persistent_memory_size_; + mutable int _cached_size_; + friend struct protobuf_tensorflow_2fcore_2fframework_2fstep_5fstats_2eproto::TableStruct; +}; +// ------------------------------------------------------------------- + class NodeExecStats : public ::google::protobuf::Message /* @@protoc_insertion_point(class_definition:tensorflow.NodeExecStats) */ { public: NodeExecStats(); @@ -457,6 +607,24 @@ class NodeExecStats : public ::google::protobuf::Message /* @@protoc_insertion_p void 
unsafe_arena_set_allocated_timeline_label( ::std::string* timeline_label); + // .tensorflow.MemoryStats memory_stats = 12; + bool has_memory_stats() const; + void clear_memory_stats(); + static const int kMemoryStatsFieldNumber = 12; + private: + void _slow_mutable_memory_stats(); + void _slow_set_allocated_memory_stats( + ::google::protobuf::Arena* message_arena, ::tensorflow::MemoryStats** memory_stats); + ::tensorflow::MemoryStats* _slow_release_memory_stats(); + public: + const ::tensorflow::MemoryStats& memory_stats() const; + ::tensorflow::MemoryStats* mutable_memory_stats(); + ::tensorflow::MemoryStats* release_memory_stats(); + void set_allocated_memory_stats(::tensorflow::MemoryStats* memory_stats); + ::tensorflow::MemoryStats* unsafe_arena_release_memory_stats(); + void unsafe_arena_set_allocated_memory_stats( + ::tensorflow::MemoryStats* memory_stats); + // int64 all_start_micros = 2; void clear_all_start_micros(); static const int kAllStartMicrosFieldNumber = 2; @@ -505,6 +673,7 @@ class NodeExecStats : public ::google::protobuf::Message /* @@protoc_insertion_p ::google::protobuf::RepeatedPtrField< ::tensorflow::AllocationDescription > referenced_tensor_; ::google::protobuf::internal::ArenaStringPtr node_name_; ::google::protobuf::internal::ArenaStringPtr timeline_label_; + ::tensorflow::MemoryStats* memory_stats_; ::google::protobuf::int64 all_start_micros_; ::google::protobuf::int64 op_start_rel_micros_; ::google::protobuf::int64 op_end_rel_micros_; @@ -922,6 +1091,126 @@ inline void NodeOutput::set_allocated_tensor_description(::tensorflow::TensorDe // ------------------------------------------------------------------- +// MemoryStats + +// int64 host_temp_memory_size = 1; +inline void MemoryStats::clear_host_temp_memory_size() { + host_temp_memory_size_ = GOOGLE_LONGLONG(0); +} +inline ::google::protobuf::int64 MemoryStats::host_temp_memory_size() const { + // @@protoc_insertion_point(field_get:tensorflow.MemoryStats.host_temp_memory_size) + return 
host_temp_memory_size_; +} +inline void MemoryStats::set_host_temp_memory_size(::google::protobuf::int64 value) { + + host_temp_memory_size_ = value; + // @@protoc_insertion_point(field_set:tensorflow.MemoryStats.host_temp_memory_size) +} + +// int64 device_temp_memory_size = 2; +inline void MemoryStats::clear_device_temp_memory_size() { + device_temp_memory_size_ = GOOGLE_LONGLONG(0); +} +inline ::google::protobuf::int64 MemoryStats::device_temp_memory_size() const { + // @@protoc_insertion_point(field_get:tensorflow.MemoryStats.device_temp_memory_size) + return device_temp_memory_size_; +} +inline void MemoryStats::set_device_temp_memory_size(::google::protobuf::int64 value) { + + device_temp_memory_size_ = value; + // @@protoc_insertion_point(field_set:tensorflow.MemoryStats.device_temp_memory_size) +} + +// int64 host_persistent_memory_size = 3; +inline void MemoryStats::clear_host_persistent_memory_size() { + host_persistent_memory_size_ = GOOGLE_LONGLONG(0); +} +inline ::google::protobuf::int64 MemoryStats::host_persistent_memory_size() const { + // @@protoc_insertion_point(field_get:tensorflow.MemoryStats.host_persistent_memory_size) + return host_persistent_memory_size_; +} +inline void MemoryStats::set_host_persistent_memory_size(::google::protobuf::int64 value) { + + host_persistent_memory_size_ = value; + // @@protoc_insertion_point(field_set:tensorflow.MemoryStats.host_persistent_memory_size) +} + +// int64 device_persistent_memory_size = 4; +inline void MemoryStats::clear_device_persistent_memory_size() { + device_persistent_memory_size_ = GOOGLE_LONGLONG(0); +} +inline ::google::protobuf::int64 MemoryStats::device_persistent_memory_size() const { + // @@protoc_insertion_point(field_get:tensorflow.MemoryStats.device_persistent_memory_size) + return device_persistent_memory_size_; +} +inline void MemoryStats::set_device_persistent_memory_size(::google::protobuf::int64 value) { + + device_persistent_memory_size_ = value; + // 
@@protoc_insertion_point(field_set:tensorflow.MemoryStats.device_persistent_memory_size) +} + +// repeated int64 host_persistent_tensor_alloc_ids = 5; +inline int MemoryStats::host_persistent_tensor_alloc_ids_size() const { + return host_persistent_tensor_alloc_ids_.size(); +} +inline void MemoryStats::clear_host_persistent_tensor_alloc_ids() { + host_persistent_tensor_alloc_ids_.Clear(); +} +inline ::google::protobuf::int64 MemoryStats::host_persistent_tensor_alloc_ids(int index) const { + // @@protoc_insertion_point(field_get:tensorflow.MemoryStats.host_persistent_tensor_alloc_ids) + return host_persistent_tensor_alloc_ids_.Get(index); +} +inline void MemoryStats::set_host_persistent_tensor_alloc_ids(int index, ::google::protobuf::int64 value) { + host_persistent_tensor_alloc_ids_.Set(index, value); + // @@protoc_insertion_point(field_set:tensorflow.MemoryStats.host_persistent_tensor_alloc_ids) +} +inline void MemoryStats::add_host_persistent_tensor_alloc_ids(::google::protobuf::int64 value) { + host_persistent_tensor_alloc_ids_.Add(value); + // @@protoc_insertion_point(field_add:tensorflow.MemoryStats.host_persistent_tensor_alloc_ids) +} +inline const ::google::protobuf::RepeatedField< ::google::protobuf::int64 >& +MemoryStats::host_persistent_tensor_alloc_ids() const { + // @@protoc_insertion_point(field_list:tensorflow.MemoryStats.host_persistent_tensor_alloc_ids) + return host_persistent_tensor_alloc_ids_; +} +inline ::google::protobuf::RepeatedField< ::google::protobuf::int64 >* +MemoryStats::mutable_host_persistent_tensor_alloc_ids() { + // @@protoc_insertion_point(field_mutable_list:tensorflow.MemoryStats.host_persistent_tensor_alloc_ids) + return &host_persistent_tensor_alloc_ids_; +} + +// repeated int64 device_persistent_tensor_alloc_ids = 6; +inline int MemoryStats::device_persistent_tensor_alloc_ids_size() const { + return device_persistent_tensor_alloc_ids_.size(); +} +inline void MemoryStats::clear_device_persistent_tensor_alloc_ids() { + 
device_persistent_tensor_alloc_ids_.Clear(); +} +inline ::google::protobuf::int64 MemoryStats::device_persistent_tensor_alloc_ids(int index) const { + // @@protoc_insertion_point(field_get:tensorflow.MemoryStats.device_persistent_tensor_alloc_ids) + return device_persistent_tensor_alloc_ids_.Get(index); +} +inline void MemoryStats::set_device_persistent_tensor_alloc_ids(int index, ::google::protobuf::int64 value) { + device_persistent_tensor_alloc_ids_.Set(index, value); + // @@protoc_insertion_point(field_set:tensorflow.MemoryStats.device_persistent_tensor_alloc_ids) +} +inline void MemoryStats::add_device_persistent_tensor_alloc_ids(::google::protobuf::int64 value) { + device_persistent_tensor_alloc_ids_.Add(value); + // @@protoc_insertion_point(field_add:tensorflow.MemoryStats.device_persistent_tensor_alloc_ids) +} +inline const ::google::protobuf::RepeatedField< ::google::protobuf::int64 >& +MemoryStats::device_persistent_tensor_alloc_ids() const { + // @@protoc_insertion_point(field_list:tensorflow.MemoryStats.device_persistent_tensor_alloc_ids) + return device_persistent_tensor_alloc_ids_; +} +inline ::google::protobuf::RepeatedField< ::google::protobuf::int64 >* +MemoryStats::mutable_device_persistent_tensor_alloc_ids() { + // @@protoc_insertion_point(field_mutable_list:tensorflow.MemoryStats.device_persistent_tensor_alloc_ids) + return &device_persistent_tensor_alloc_ids_; +} + +// ------------------------------------------------------------------- + // NodeExecStats // string node_name = 1; @@ -1230,6 +1519,55 @@ NodeExecStats::referenced_tensor() const { return referenced_tensor_; } +// .tensorflow.MemoryStats memory_stats = 12; +inline bool NodeExecStats::has_memory_stats() const { + return this != internal_default_instance() && memory_stats_ != NULL; +} +inline void NodeExecStats::clear_memory_stats() { + if (GetArenaNoVirtual() == NULL && memory_stats_ != NULL) delete memory_stats_; + memory_stats_ = NULL; +} +inline const ::tensorflow::MemoryStats& 
NodeExecStats::memory_stats() const { + // @@protoc_insertion_point(field_get:tensorflow.NodeExecStats.memory_stats) + return memory_stats_ != NULL ? *memory_stats_ + : *::tensorflow::MemoryStats::internal_default_instance(); +} +inline ::tensorflow::MemoryStats* NodeExecStats::mutable_memory_stats() { + + if (memory_stats_ == NULL) { + _slow_mutable_memory_stats(); + } + // @@protoc_insertion_point(field_mutable:tensorflow.NodeExecStats.memory_stats) + return memory_stats_; +} +inline ::tensorflow::MemoryStats* NodeExecStats::release_memory_stats() { + // @@protoc_insertion_point(field_release:tensorflow.NodeExecStats.memory_stats) + + if (GetArenaNoVirtual() != NULL) { + return _slow_release_memory_stats(); + } else { + ::tensorflow::MemoryStats* temp = memory_stats_; + memory_stats_ = NULL; + return temp; + } +} +inline void NodeExecStats::set_allocated_memory_stats(::tensorflow::MemoryStats* memory_stats) { + ::google::protobuf::Arena* message_arena = GetArenaNoVirtual(); + if (message_arena == NULL) { + delete memory_stats_; + } + if (memory_stats != NULL) { + _slow_set_allocated_memory_stats(message_arena, &memory_stats); + } + memory_stats_ = memory_stats; + if (memory_stats) { + + } else { + + } + // @@protoc_insertion_point(field_set_allocated:tensorflow.NodeExecStats.memory_stats) +} + // ------------------------------------------------------------------- // DeviceStepStats @@ -1373,6 +1711,8 @@ StepStats::dev_stats() const { // ------------------------------------------------------------------- +// ------------------------------------------------------------------- + // @@protoc_insertion_point(namespace_scope) diff --git a/libs/tensorflow/include/tensorflow/core/framework/step_stats.pb_text-impl.h b/libs/tensorflow/include/tensorflow/core/framework/step_stats.pb_text-impl.h old mode 100644 new mode 100755 index 3649a49..1e19d9e --- a/libs/tensorflow/include/tensorflow/core/framework/step_stats.pb_text-impl.h +++ 
b/libs/tensorflow/include/tensorflow/core/framework/step_stats.pb_text-impl.h @@ -33,6 +33,13 @@ bool ProtoParseFromScanner( ::tensorflow::strings::Scanner* scanner, bool nested, bool close_curly, ::tensorflow::NodeOutput* msg); +void AppendProtoDebugString( + ::tensorflow::strings::ProtoTextOutput* o, + const ::tensorflow::MemoryStats& msg); +bool ProtoParseFromScanner( + ::tensorflow::strings::Scanner* scanner, bool nested, bool close_curly, + ::tensorflow::MemoryStats* msg); + void AppendProtoDebugString( ::tensorflow::strings::ProtoTextOutput* o, const ::tensorflow::NodeExecStats& msg); diff --git a/libs/tensorflow/include/tensorflow/core/framework/step_stats.pb_text.h b/libs/tensorflow/include/tensorflow/core/framework/step_stats.pb_text.h old mode 100644 new mode 100755 index 0c53f87..422b21e --- a/libs/tensorflow/include/tensorflow/core/framework/step_stats.pb_text.h +++ b/libs/tensorflow/include/tensorflow/core/framework/step_stats.pb_text.h @@ -29,6 +29,16 @@ bool ProtoParseFromString( ::tensorflow::NodeOutput* msg) TF_MUST_USE_RESULT; +// Message-text conversion for tensorflow.MemoryStats +string ProtoDebugString( + const ::tensorflow::MemoryStats& msg); +string ProtoShortDebugString( + const ::tensorflow::MemoryStats& msg); +bool ProtoParseFromString( + const string& s, + ::tensorflow::MemoryStats* msg) + TF_MUST_USE_RESULT; + // Message-text conversion for tensorflow.NodeExecStats string ProtoDebugString( const ::tensorflow::NodeExecStats& msg); diff --git a/libs/tensorflow/include/tensorflow/core/framework/summary.pb_text-impl.h b/libs/tensorflow/include/tensorflow/core/framework/summary.pb_text-impl.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/framework/summary.pb_text.h b/libs/tensorflow/include/tensorflow/core/framework/summary.pb_text.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/framework/tensor.h b/libs/tensorflow/include/tensorflow/core/framework/tensor.h index 
c9ddad3..2d5e70c 100644 --- a/libs/tensorflow/include/tensorflow/core/framework/tensor.h +++ b/libs/tensorflow/include/tensorflow/core/framework/tensor.h @@ -144,6 +144,9 @@ class Tensor { /// Returns the estimated memory usage of this tensor. size_t TotalBytes() const; + // Returns the size of allocated memory for this tensor. + size_t AllocatedBytes() const; + /// Returns true iff this tensor is aligned. bool IsAligned() const { #if EIGEN_MAX_ALIGN_BYTES == 0 @@ -414,6 +417,9 @@ class Tensor { const TensorShape&); private: + // Returns true if the refcount on buf_ and any possible underlying root + // buffer is one. + bool RefCountIsOne() const; void CheckType(DataType expected_dtype) const; void CheckTypeAndIsAligned(DataType expected_dtype) const; void CheckIsAlignedAndSingleElement() const; @@ -439,6 +445,7 @@ class Tensor { friend class TensorTestHelper; // For access to set_shape template friend class CreateVariableOp; + friend class OpKernelContext; // For access to RefCountIsOne(). // Creates a tensor with the input datatype, shape and buf. 
// diff --git a/libs/tensorflow/include/tensorflow/core/framework/tensor.pb.h b/libs/tensorflow/include/tensorflow/core/framework/tensor.pb.h index 937ba1e..925a000 100755 --- a/libs/tensorflow/include/tensorflow/core/framework/tensor.pb.h +++ b/libs/tensorflow/include/tensorflow/core/framework/tensor.pb.h @@ -196,10 +196,16 @@ class TensorProto : public ::google::protobuf::Message /* @@protoc_insertion_poi const ::std::string& string_val(int index) const; ::std::string* mutable_string_val(int index); void set_string_val(int index, const ::std::string& value); + #if LANG_CXX11 + void set_string_val(int index, ::std::string&& value); + #endif void set_string_val(int index, const char* value); void set_string_val(int index, const void* value, size_t size); ::std::string* add_string_val(); void add_string_val(const ::std::string& value); + #if LANG_CXX11 + void add_string_val(::std::string&& value); + #endif void add_string_val(const char* value); void add_string_val(const void* value, size_t size); const ::google::protobuf::RepeatedPtrField< ::std::string>& string_val() const; @@ -631,6 +637,12 @@ inline void TensorProto::set_string_val(int index, const ::std::string& value) { // @@protoc_insertion_point(field_set:tensorflow.TensorProto.string_val) string_val_.Mutable(index)->assign(value); } +#if LANG_CXX11 +inline void TensorProto::set_string_val(int index, ::std::string&& value) { + // @@protoc_insertion_point(field_set:tensorflow.TensorProto.string_val) + string_val_.Mutable(index)->assign(std::move(value)); +} +#endif inline void TensorProto::set_string_val(int index, const char* value) { string_val_.Mutable(index)->assign(value); // @@protoc_insertion_point(field_set_char:tensorflow.TensorProto.string_val) @@ -648,6 +660,12 @@ inline void TensorProto::add_string_val(const ::std::string& value) { string_val_.Add()->assign(value); // @@protoc_insertion_point(field_add:tensorflow.TensorProto.string_val) } +#if LANG_CXX11 +inline void 
TensorProto::add_string_val(::std::string&& value) { + string_val_.Add()->assign(std::move(value)); + // @@protoc_insertion_point(field_add:tensorflow.TensorProto.string_val) +} +#endif inline void TensorProto::add_string_val(const char* value) { string_val_.Add()->assign(value); // @@protoc_insertion_point(field_add_char:tensorflow.TensorProto.string_val) diff --git a/libs/tensorflow/include/tensorflow/core/framework/tensor.pb_text-impl.h b/libs/tensorflow/include/tensorflow/core/framework/tensor.pb_text-impl.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/framework/tensor.pb_text.h b/libs/tensorflow/include/tensorflow/core/framework/tensor.pb_text.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/framework/tensor_description.pb_text-impl.h b/libs/tensorflow/include/tensorflow/core/framework/tensor_description.pb_text-impl.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/framework/tensor_description.pb_text.h b/libs/tensorflow/include/tensorflow/core/framework/tensor_description.pb_text.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/framework/tensor_shape.pb_text-impl.h b/libs/tensorflow/include/tensorflow/core/framework/tensor_shape.pb_text-impl.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/framework/tensor_shape.pb_text.h b/libs/tensorflow/include/tensorflow/core/framework/tensor_shape.pb_text.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/framework/tensor_slice.pb_text-impl.h b/libs/tensorflow/include/tensorflow/core/framework/tensor_slice.pb_text-impl.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/framework/tensor_slice.pb_text.h b/libs/tensorflow/include/tensorflow/core/framework/tensor_slice.pb_text.h old mode 100644 new mode 100755 diff --git 
a/libs/tensorflow/include/tensorflow/core/framework/tensor_util.h b/libs/tensorflow/include/tensorflow/core/framework/tensor_util.h index 04b5bfe..6c218b6 100644 --- a/libs/tensorflow/include/tensorflow/core/framework/tensor_util.h +++ b/libs/tensorflow/include/tensorflow/core/framework/tensor_util.h @@ -38,7 +38,8 @@ Tensor DeepCopy(const Tensor& other); // REQUIRES: Each member of 'tensors' must point to data stored in CPU memory. // REQUIRES: Each member of 'tensors' must be a Tensor of a copy-able type if it // is not appropriately memory-aligned. -Tensor Concat(const gtl::ArraySlice& tensors); +Status Concat(const gtl::ArraySlice& tensors, + Tensor* result) TF_MUST_USE_RESULT; // Splits 'tensor' into 'sizes.size()' individual tensors, along the 0th // dimension. The ith output tensor has 0th-dimension size 'sizes[i]'. @@ -50,8 +51,8 @@ Tensor Concat(const gtl::ArraySlice& tensors); // appropriately memory-aligned. // // Split() and Concat() are inverse operations. -std::vector Split(const Tensor& tensor, - const gtl::ArraySlice& sizes); +Status Split(const Tensor& tensor, const gtl::ArraySlice& sizes, + std::vector* result) TF_MUST_USE_RESULT; } // namespace tensor } // namespace tensorflow diff --git a/libs/tensorflow/include/tensorflow/core/framework/type_index.h b/libs/tensorflow/include/tensorflow/core/framework/type_index.h index dfde25c..b978d90 100644 --- a/libs/tensorflow/include/tensorflow/core/framework/type_index.h +++ b/libs/tensorflow/include/tensorflow/core/framework/type_index.h @@ -17,7 +17,7 @@ limitations under the License. #define TENSORFLOW_FRAMEWORK_TYPE_INDEX_H_ #include -#ifdef __GXX_RTTI +#if defined(__GXX_RTTI) || defined(_CPPRTTI) #include #include #endif // __GXX_RTTI @@ -30,7 +30,7 @@ namespace tensorflow { // binary sizes. The following #ifdef section provides a non-RTTI // replacement for std::type_index (with a minimal set of functions needed by // the TensorFlow framework, and more can be added if necessary). 
-#ifndef __GXX_RTTI +#if !defined(__GXX_RTTI) && !defined(_CPPRTTI) // A thin TypeIndex class that mimics std::type_index but does not use RTTI. As // a result, it does not provide the actual name of the type, and only returns a diff --git a/libs/tensorflow/include/tensorflow/core/framework/types.h b/libs/tensorflow/include/tensorflow/core/framework/types.h index 589730b..932d788 100644 --- a/libs/tensorflow/include/tensorflow/core/framework/types.h +++ b/libs/tensorflow/include/tensorflow/core/framework/types.h @@ -68,9 +68,9 @@ class DeviceType { std::ostream& operator<<(std::ostream& os, const DeviceType& d); // Convenient constants that can be passed to a DeviceType constructor -extern const char* const DEVICE_CPU; // "CPU" -extern const char* const DEVICE_GPU; // "GPU" -extern const char* const DEVICE_SYCL; // "SYCL" +TF_EXPORT extern const char* const DEVICE_CPU; // "CPU" +TF_EXPORT extern const char* const DEVICE_GPU; // "GPU" +TF_EXPORT extern const char* const DEVICE_SYCL; // "SYCL" typedef gtl::InlinedVector MemoryTypeVector; typedef gtl::ArraySlice MemoryTypeSlice; diff --git a/libs/tensorflow/include/tensorflow/core/framework/types.pb_text-impl.h b/libs/tensorflow/include/tensorflow/core/framework/types.pb_text-impl.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/framework/types.pb_text.h b/libs/tensorflow/include/tensorflow/core/framework/types.pb_text.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/framework/variable.pb_text-impl.h b/libs/tensorflow/include/tensorflow/core/framework/variable.pb_text-impl.h new file mode 100755 index 0000000..1220ef1 --- /dev/null +++ b/libs/tensorflow/include/tensorflow/core/framework/variable.pb_text-impl.h @@ -0,0 +1,32 @@ +// GENERATED FILE - DO NOT MODIFY +#ifndef tensorflow_core_framework_variable_proto_IMPL_H_ +#define tensorflow_core_framework_variable_proto_IMPL_H_ + +#include "tensorflow/core/framework/variable.pb.h" +#include 
"tensorflow/core/framework/variable.pb_text.h" +#include "tensorflow/core/lib/strings/proto_text_util.h" +#include "tensorflow/core/lib/strings/scanner.h" + +namespace tensorflow { + +namespace internal { + +void AppendProtoDebugString( + ::tensorflow::strings::ProtoTextOutput* o, + const ::tensorflow::VariableDef& msg); +bool ProtoParseFromScanner( + ::tensorflow::strings::Scanner* scanner, bool nested, bool close_curly, + ::tensorflow::VariableDef* msg); + +void AppendProtoDebugString( + ::tensorflow::strings::ProtoTextOutput* o, + const ::tensorflow::SaveSliceInfoDef& msg); +bool ProtoParseFromScanner( + ::tensorflow::strings::Scanner* scanner, bool nested, bool close_curly, + ::tensorflow::SaveSliceInfoDef* msg); + +} // namespace internal + +} // namespace tensorflow + +#endif // tensorflow_core_framework_variable_proto_IMPL_H_ diff --git a/libs/tensorflow/include/tensorflow/core/framework/variable.pb_text.h b/libs/tensorflow/include/tensorflow/core/framework/variable.pb_text.h new file mode 100755 index 0000000..f70e2c9 --- /dev/null +++ b/libs/tensorflow/include/tensorflow/core/framework/variable.pb_text.h @@ -0,0 +1,34 @@ +// GENERATED FILE - DO NOT MODIFY +#ifndef tensorflow_core_framework_variable_proto_H_ +#define tensorflow_core_framework_variable_proto_H_ + +#include "tensorflow/core/framework/variable.pb.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// Message-text conversion for tensorflow.VariableDef +string ProtoDebugString( + const ::tensorflow::VariableDef& msg); +string ProtoShortDebugString( + const ::tensorflow::VariableDef& msg); +bool ProtoParseFromString( + const string& s, + ::tensorflow::VariableDef* msg) + TF_MUST_USE_RESULT; + +// Message-text conversion for tensorflow.SaveSliceInfoDef +string ProtoDebugString( + const ::tensorflow::SaveSliceInfoDef& msg); +string ProtoShortDebugString( + const 
::tensorflow::SaveSliceInfoDef& msg); +bool ProtoParseFromString( + const string& s, + ::tensorflow::SaveSliceInfoDef* msg) + TF_MUST_USE_RESULT; + +} // namespace tensorflow + +#endif // tensorflow_core_framework_variable_proto_H_ diff --git a/libs/tensorflow/include/tensorflow/core/framework/versions.pb_text-impl.h b/libs/tensorflow/include/tensorflow/core/framework/versions.pb_text-impl.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/framework/versions.pb_text.h b/libs/tensorflow/include/tensorflow/core/framework/versions.pb_text.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/graph/costmodel.h b/libs/tensorflow/include/tensorflow/core/graph/costmodel.h index 6cb7b37..da7fab4 100644 --- a/libs/tensorflow/include/tensorflow/core/graph/costmodel.h +++ b/libs/tensorflow/include/tensorflow/core/graph/costmodel.h @@ -130,22 +130,16 @@ class CostModel { // Returns the size in bytes of temporary memory consumed by "node". Bytes TempMemorySize(const Node* node) const; - // Returns the size in bytes of host memory consumed by "node". - Bytes HostPeakMemorySize(const Node* node) const; - - // Returns the size in bytes of device memory consumed by "node". - Bytes DevicePeakMemorySize(const Node* node) const; - - // Returns the size in bytes of persisted memory consumed by "node". - Bytes PersistedMemorySize(const Node* node) const; + // Returns the size in bytes of temporary memory consumed by "node". + Bytes HostTempMemorySize(const Node* node) const; + Bytes DeviceTempMemorySize(const Node* node) const; - // Returns the size in bytes of auxiliary memory consumed by "node". - Bytes AuxiliaryMemorySize(const Node* node) const; + // Returns the size of persistent memory allocated by "node". + Bytes HostPersistentMemorySize(const Node* node) const; + Bytes DevicePersistentMemorySize(const Node* node) const; - // Records the memory allocated by allocators for a node. 
- void RecordAllocatorMemory( - const Node* node, - const protobuf::RepeatedPtrField& memory); + // Records memory stats such as temp memory and persistent memory. + void RecordMemoryStats(const Node* node, const MemoryStats& memory_stats); // Records the maximum execution time (in microseconds) of "node". void RecordMaxExecutionTime(const Node* node, Microseconds time); @@ -161,6 +155,8 @@ class CostModel { // Return the unique id of the tensor generated by "output_slot" of "node". int64 AllocationId(const Node* node, int output_slot) const; + bool IsPersistentTensor(const Node* node, int64 alloc_id) const; + // Helper routines to encapsulate static estimation heuristics // Compute an estimate of the time to copy "b" bytes over the network, @@ -211,25 +207,21 @@ class CostModel { // Maximum memory usage struct MemUsage { + MemUsage() + : temp_memory_size(-1), + host_temp_memory_size(0), + device_temp_memory_size(0), + host_persistent_memory_size(0), + device_persistent_memory_size(0) {} + // TODO(yuefengz): temp_memory_size is not being used, remove it. Bytes temp_memory_size; - // Peak memory includes temporary tensors, output tensors and persistent - // tensors. Some kernels may allocate temporary tensors on host even they - // are running on devices. - Bytes host_peak_memory_size; - Bytes device_peak_memory_size; - - // Persisted memory includes the output memory, persistent tensors. - // The current set of kernels only allocate persistent tensors on their own - // devices. - Bytes persisted_memory_size; + Bytes host_temp_memory_size; + Bytes device_temp_memory_size; - // Auxiliary memory is the momery used by resources (i.e. those in - // ResourceMgr, e.g. lookup tables) excluding their underlying persistent - // tensors (e.g. in variable containers). The auxiliary memory is usually - // allocated on host. 
- Bytes auxiliary_memory_size; + Bytes host_persistent_memory_size; + Bytes device_persistent_memory_size; gtl::InlinedVector output_port_mem; gtl::InlinedVector output_port_shape; @@ -239,6 +231,9 @@ class CostModel { std::vector > output_port_alloc_ids_; + std::set host_persistent_alloc_ids_; + std::map> persistent_alloc_ids_by_devices_; + TF_DISALLOW_COPY_AND_ASSIGN(CostModel); }; diff --git a/libs/tensorflow/include/tensorflow/core/graph/graph.h b/libs/tensorflow/include/tensorflow/core/graph/graph.h index 72884a2..4af4b0b 100644 --- a/libs/tensorflow/include/tensorflow/core/graph/graph.h +++ b/libs/tensorflow/include/tensorflow/core/graph/graph.h @@ -40,6 +40,7 @@ limitations under the License. #include #include #include +#include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/types.h" @@ -272,8 +273,20 @@ class Graph { // Constructs a graph with a single SOURCE (always id kSourceId) and a // single SINK (always id kSinkId) node, and an edge from SOURCE->SINK. // - // The graph can hold ops found in registry. + // The graph can hold ops found in registry. `registry`s lifetime must be at + // least that of the constructed graph's. explicit Graph(const OpRegistryInterface* registry); + + // Constructs a graph with a single SOURCE (always id kSourceId) and a + // single SINK (always id kSinkId) node, and an edge from SOURCE->SINK. + // + // The graph can hold ops found in `flib_def`. Unlike the constructor taking + // an OpRegistryInterface, this constructor copies the function definitions in + // `flib_def` so its lifetime may be shorter than that of the graph's. The + // OpRegistryInterface backing `flib_def` must still have the lifetime of the + // graph though. 
+ explicit Graph(const FunctionLibraryDefinition& flib_def); + ~Graph(); static const int kControlSlot; @@ -368,7 +381,8 @@ class Graph { Node* source_node() const { return FindNodeId(kSourceId); } Node* sink_node() const { return FindNodeId(kSinkId); } - const OpRegistryInterface* op_registry() const { return ops_; } + const OpRegistryInterface* op_registry() const { return &ops_; } + const FunctionLibraryDefinition& flib_def() const { return ops_; } // TODO(josh11b): uint64 hash() const; @@ -380,8 +394,8 @@ class Graph { Node* AllocateNode(Node::Properties* props, const Node* cost_node); void ReleaseNode(Node* node); - // Registry of all known ops. Not owned. - const OpRegistryInterface* const ops_; + // Registry of all known ops, including functions. + FunctionLibraryDefinition ops_; // GraphDef versions VersionDef versions_; diff --git a/libs/tensorflow/include/tensorflow/core/graph/graph_constructor.h b/libs/tensorflow/include/tensorflow/core/graph/graph_constructor.h index 186859d..4252b08 100644 --- a/libs/tensorflow/include/tensorflow/core/graph/graph_constructor.h +++ b/libs/tensorflow/include/tensorflow/core/graph/graph_constructor.h @@ -113,6 +113,8 @@ struct ImportGraphDefOptions { // with ops that are not defined in the binary calling ImportGraphDef. // Similar to the producer_op_list argument to import_graph_def in the // python API. + + // TODO(skyewm): Enable importing functions }; // Each `return_tensors` entry is the requested node and output index. The index diff --git a/libs/tensorflow/include/tensorflow/core/graph/mkl_layout_pass.h b/libs/tensorflow/include/tensorflow/core/graph/mkl_layout_pass.h new file mode 100644 index 0000000..ffe5c1e --- /dev/null +++ b/libs/tensorflow/include/tensorflow/core/graph/mkl_layout_pass.h @@ -0,0 +1,36 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// A graph pass that rewrites graph for propagating MKL layout as a tensor + +#ifndef TENSORFLOW_GRAPH_MKL_LAYOUT_PASS_H_ +#define TENSORFLOW_GRAPH_MKL_LAYOUT_PASS_H_ + +#ifdef INTEL_MKL + +#include +#include +#include "tensorflow/core/graph/graph.h" + +namespace tensorflow { +// Interface to invoke the pass for unit test +// +// Returns true if and only if 'g' is mutated. +extern bool RunMklLayoutRewritePass(std::unique_ptr* g); +} // namespace tensorflow + +#endif + +#endif // TENSORFLOW_GRAPH_MKL_LAYOUT_PASS_H_ diff --git a/libs/tensorflow/include/tensorflow/core/graph/mkl_optimizer_merge.h b/libs/tensorflow/include/tensorflow/core/graph/mkl_optimizer_merge.h index 554709e..b2caec5 100644 --- a/libs/tensorflow/include/tensorflow/core/graph/mkl_optimizer_merge.h +++ b/libs/tensorflow/include/tensorflow/core/graph/mkl_optimizer_merge.h @@ -21,20 +21,14 @@ limitations under the License. #ifdef INTEL_MKL #include -#include -#include #include #include "tensorflow/core/graph/graph.h" -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/common_runtime/optimization_registry.h" namespace tensorflow { - // Interface to invoke the pass for unit test // // Returns true if and only if 'g' is mutated. 
extern bool OptimizeNodeMerge(std::unique_ptr* g); - } // namespace tensorflow #endif // INTEL_MKL diff --git a/libs/tensorflow/include/tensorflow/core/graph/mkl_tfconversion_pass.h b/libs/tensorflow/include/tensorflow/core/graph/mkl_tfconversion_pass.h new file mode 100644 index 0000000..0562d8b --- /dev/null +++ b/libs/tensorflow/include/tensorflow/core/graph/mkl_tfconversion_pass.h @@ -0,0 +1,36 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// An optimization pass that inserts MklToTf conversion nodes in the graph + +#ifndef TENSORFLOW_CORE_GRAPH_MKL_TFCONVERSION_PASS_H_ +#define TENSORFLOW_CORE_GRAPH_MKL_TFCONVERSION_PASS_H_ + +#ifdef INTEL_MKL + +#include +#include +#include "tensorflow/core/graph/graph.h" + +namespace tensorflow { +// Interface to invoke the pass for unit test +// +// Returns true if and only if 'g' is mutated. 
+extern bool InsertMklToTfConversionNodes(std::unique_ptr* g); +} // namespace tensorflow + +#endif + +#endif // TENSORFLOW_CORE_GRAPH_MKL_TFCONVERSION_PASS_H_ diff --git a/libs/tensorflow/include/tensorflow/core/grappler/BUILD b/libs/tensorflow/include/tensorflow/core/grappler/BUILD new file mode 100644 index 0000000..0b5b4ce --- /dev/null +++ b/libs/tensorflow/include/tensorflow/core/grappler/BUILD @@ -0,0 +1,107 @@ +licenses(["notice"]) # Apache 2.0 + +exports_files(["LICENSE"]) + +filegroup( + name = "all_files", + srcs = glob( + ["**/*"], + exclude = [ + "**/METADATA", + "**/OWNERS", + ], + ), + visibility = ["//tensorflow:__subpackages__"], +) + +filegroup( + name = "android_srcs", + srcs = [ + "grappler_item.h", + "utils.cc", + "utils.h", + "//tensorflow/core/grappler/clusters:android_srcs", + "//tensorflow/core/grappler/inputs:android_srcs", + "//tensorflow/core/grappler/optimizers:android_srcs", + ], + visibility = ["//tensorflow:__subpackages__"], +) + +cc_library( + name = "utils", + srcs = ["utils.cc"], + hdrs = ["utils.h"], + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + ], +) + +cc_test( + name = "utils_test", + srcs = ["utils_test.cc"], + deps = [ + ":utils", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + +cc_library( + name = "devices", + srcs = ["devices.cc"], + hdrs = ["devices.h"], + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/core:gpu_runtime", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core:stream_executor", + ], +) + +cc_library( + name = "grappler_item", + srcs = [ + "grappler_item.cc", + ], + hdrs = ["grappler_item.h"], + visibility = ["//visibility:public"], + deps = [ + ":utils", + "//tensorflow/core:framework", + "//tensorflow/core:protos_all_cc", + ], +) + +cc_library( + name = "grappler_item_builder", + srcs = [ + "grappler_item_builder.cc", + ], + hdrs = 
["grappler_item_builder.h"], + visibility = ["//visibility:public"], + deps = [ + ":grappler_item", + ":utils", + "//tensorflow/core:framework_internal", + "//tensorflow/core:lib_internal", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/grappler/inputs:utils", + ], +) + +cc_test( + name = "grappler_item_test", + srcs = ["grappler_item_test.cc"], + deps = [ + ":grappler_item", + ":grappler_item_builder", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder", + ], +) diff --git a/libs/tensorflow/include/tensorflow/core/grappler/clusters/BUILD b/libs/tensorflow/include/tensorflow/core/grappler/clusters/BUILD new file mode 100644 index 0000000..e6ff82e --- /dev/null +++ b/libs/tensorflow/include/tensorflow/core/grappler/clusters/BUILD @@ -0,0 +1,76 @@ +licenses(["notice"]) # Apache 2.0 + +exports_files(["LICENSE"]) + +filegroup( + name = "all_files", + srcs = glob( + ["**/*"], + exclude = [ + "**/METADATA", + "**/OWNERS", + ], + ), + visibility = ["//tensorflow:__subpackages__"], +) + +filegroup( + name = "android_srcs", + srcs = glob( + [ + "cluster.*", + ], + ), + visibility = ["//tensorflow:__subpackages__"], +) + +cc_library( + name = "cluster", + srcs = ["cluster.cc"], + hdrs = [ + "cluster.h", + ], + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:session_options", + "//tensorflow/core/grappler:grappler_item", + ], +) + +cc_library( + name = "single_machine", + srcs = ["single_machine.cc"], + hdrs = [ + "single_machine.h", + ], + visibility = ["//visibility:public"], + deps = [ + ":cluster", + "//tensorflow/cc:coordinator", + "//tensorflow/cc:queue_runner", + "//tensorflow/core:core_cpu", + "//tensorflow/core:direct_session", + "//tensorflow/core:lib", + "//tensorflow/core/grappler:utils", + 
"//tensorflow/core/kernels:ops_util", + ], +) + +cc_test( + name = "single_machine_test", + srcs = ["single_machine_test.cc"], + args = ["--heap_check=local"], # The GPU tracer leaks memory + deps = [ + ":single_machine", + "//tensorflow/core:lib_proto_parsing", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core/grappler:grappler_item", + "//tensorflow/core/grappler:grappler_item_builder", + "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder", + ], +) diff --git a/libs/tensorflow/include/tensorflow/core/grappler/clusters/cluster.h b/libs/tensorflow/include/tensorflow/core/grappler/clusters/cluster.h new file mode 100644 index 0000000..45821db --- /dev/null +++ b/libs/tensorflow/include/tensorflow/core/grappler/clusters/cluster.h @@ -0,0 +1,97 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#ifndef TENSORFLOW_GRAPPLER_CLUSTERS_CLUSTER_H_
+#define TENSORFLOW_GRAPPLER_CLUSTERS_CLUSTER_H_
+
+#include
+#include
+#include
+
+#include "tensorflow/core/framework/device_attributes.pb.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/public/session_options.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// A cluster represents a collection of hardware resources available to run
+// the TensorFlow model.
+// A process can only create a single cluster at a time.
+class Cluster {
+ public:
+  explicit Cluster(int timeout_s);
+  virtual ~Cluster();
+
+  // Provision the hardware resources needed to run TensorFlow and start a
+  // TensorFlow session that can take advantage of these resources.
+  // The actual resources that are leveraged depend on the type of cluster
+  // instantiated.
+  // Returns OK iff all the requested resources could be reserved and a
+  // TensorFlow session successfully created. Returns an error otherwise.
+  // There is no graceful degradation to handle the case where only a subset
+  // of the requested resources are available.
+  virtual Status Provision() = 0;
+
+  // Whether soft placement is allowed. If allow_soft_placement is true,
+  // an op will be placed on CPU if there's no GPU implementation for the OP
+  // or if no GPU devices are known or registered or if we need to co-locate
+  // with reftype input(s) which are from CPU.
+  void AllowSoftPlacement(bool soft_placement_state);
+
+  // Set the number of steps required to warmup TensorFlow. Must be called
+  // before Provision().
+  void SetNumWarmupSteps(int num_steps);
+
+  // Disable the collection of detailed statistics.
+  void DisableDetailedStats(bool disable);
+
+  // Return the list of TensorFlow devices that are available to execute a
+  // graph.
This is empty until provision() is called.
+  const std::vector& GetDevices() const { return devices_; }
+
+  // Convenience method that returns the set of device names.
+  const std::vector GetDeviceNames() const {
+    std::vector device_names;
+    device_names.reserve(devices_.size());
+    for (const auto& device : devices_) {
+      device_names.push_back(device.name());
+    }
+    return device_names;
+  }
+
+  // Prepare the session to run the specified grappler item. This includes
+  // initializing all the model variables.
+  virtual Status Initialize(const GrapplerItem& item) = 0;
+
+  // Run the specified graph_def and return the corresponding metadata.
+  virtual Status Run(const GraphDef& graph_def,
+                     const std::vector>& feed,
+                     const std::vector& fetch,
+                     RunMetadata* metadata) = 0;
+
+ protected:
+  std::vector devices_;
+  const int timeout_s_;
+  SessionOptions options_;
+  RunOptions run_options_;
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_GRAPPLER_CLUSTERS_CLUSTER_H_
diff --git a/libs/tensorflow/include/tensorflow/core/grappler/clusters/single_machine.h b/libs/tensorflow/include/tensorflow/core/grappler/clusters/single_machine.h
new file mode 100644
index 0000000..b739b39
--- /dev/null
+++ b/libs/tensorflow/include/tensorflow/core/grappler/clusters/single_machine.h
@@ -0,0 +1,67 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/ + +#ifndef TENSORFLOW_GRAPPLER_CLUSTERS_SINGLE_MACHINE_H_ +#define TENSORFLOW_GRAPPLER_CLUSTERS_SINGLE_MACHINE_H_ + +#include "tensorflow/cc/training/coordinator.h" +#include "tensorflow/core/grappler/clusters/cluster.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/public/session.h" + +namespace tensorflow { +namespace grappler { + +// Create a simple cluster that makes available to grappler a subset of the +// nodes available on a single local computer. +class SingleMachine : public Cluster { + public: + SingleMachine(int timeout_s, int num_cpu_cores, int num_gpus); + ~SingleMachine() override; + + Status Provision() override; + Status Initialize(const GrapplerItem& item) override; + Status Run(const GraphDef& item, + const std::vector>& feed, + const std::vector& fetch, RunMetadata* metadata) override; + + private: + Status RunWithTimeout(const std::vector>& feed, + const std::vector& fetch, + RunMetadata* run_metadata); + Status ResetSession(); + Status CloseSession(bool use_timeout); + + const int num_gpus_; + std::unique_ptr session_; + std::vector queue_runner_defs_; + string last_graph_id_; + mutex last_graph_mu_; + const GraphDef* last_graph_ GUARDED_BY(last_graph_mu_) = nullptr; + std::vector init_ops_; + std::unique_ptr coordinator_; + std::unique_ptr thread_pool_; + + RunMetadata init_metadata_; + + mutex close_mu_; + bool closing_ GUARDED_BY(close_mu_); +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_GRAPPLER_CLUSTERS_SINGLE_MACHINE_H_ diff --git a/libs/tensorflow/include/tensorflow/core/grappler/costs/BUILD b/libs/tensorflow/include/tensorflow/core/grappler/costs/BUILD new file mode 100644 index 0000000..0266c4a --- /dev/null +++ b/libs/tensorflow/include/tensorflow/core/grappler/costs/BUILD @@ -0,0 +1,118 @@ +licenses(["notice"]) # Apache 2.0 + 
+exports_files(["LICENSE"]) + +filegroup( + name = "all_files", + srcs = glob( + ["**/*"], + exclude = [ + "**/METADATA", + "**/OWNERS", + ], + ), + visibility = ["//tensorflow:__subpackages__"], +) + +load( + "//tensorflow/core:platform/default/build_config.bzl", + "tf_proto_library", +) +load( + "@local_config_cuda//cuda:build_defs.bzl", + "if_cuda", +) + +tf_proto_library( + name = "op_performance_data", + srcs = ["op_performance_data.proto"], + cc_api_version = 2, + protodeps = ["//tensorflow/core:protos_all"], + visibility = ["//visibility:public"], +) + +cc_library( + name = "graph_properties", + srcs = ["graph_properties.cc"], + hdrs = ["graph_properties.h"], + visibility = ["//visibility:public"], + deps = [ + ":op_performance_data_cc", + ":utils", + "//tensorflow/core:core_cpu", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/grappler:grappler_item", + "//tensorflow/core/grappler/clusters:cluster", + ], +) + +cc_test( + name = "graph_properties_test", + srcs = ["graph_properties_test.cc"], + args = ["--heap_check=local"], # The GPU tracer leaks memory + deps = [ + ":graph_properties", + "//tensorflow/core:lib_proto_parsing", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core/grappler:grappler_item", + "//tensorflow/core/grappler:grappler_item_builder", + "//tensorflow/core/grappler/clusters:single_machine", + "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder", + ], +) + +cc_library( + name = "graph_memory", + srcs = ["graph_memory.cc"], + hdrs = ["graph_memory.h"], + visibility = ["//visibility:public"], + deps = [ + ":graph_properties", + "//tensorflow/core:framework", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/grappler:grappler_item", + "//tensorflow/core/grappler/clusters:cluster", + ], +) + +cc_test( + name = "graph_memory_test", + srcs = ["graph_memory_test.cc"], + args = ["--heap_check=local"], # The GPU tracer leaks memory + deps = [ + ":graph_memory", + 
"//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core/grappler:grappler_item", + "//tensorflow/core/grappler:grappler_item_builder", + "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder", + ], +) + +cc_library( + name = "utils", + srcs = ["utils.cc"], + hdrs = ["utils.h"], + visibility = ["//visibility:public"], + deps = [ + ":op_performance_data_cc", + "//third_party/eigen3", + "//tensorflow/core:core_cpu_internal", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + ] + if_cuda([ + "//tensorflow/core:cuda", + "@local_config_cuda//cuda:cuda_headers", + ]), +) + +cc_library( + name = "cost_estimator", + hdrs = ["cost_estimator.h"], + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/core:lib", + ], +) diff --git a/libs/tensorflow/include/tensorflow/core/grappler/costs/cost_estimator.h b/libs/tensorflow/include/tensorflow/core/grappler/costs/cost_estimator.h new file mode 100644 index 0000000..093b7e2 --- /dev/null +++ b/libs/tensorflow/include/tensorflow/core/grappler/costs/cost_estimator.h @@ -0,0 +1,149 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_GRAPPLER_COSTS_COST_ESTIMATOR_H_ +#define TENSORFLOW_GRAPPLER_COSTS_COST_ESTIMATOR_H_ + +#include +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +class GraphDef; +class CostGraphDef; + +namespace grappler { +struct GrapplerItem; + +constexpr int64 kMemoryUnknown = -1ll; +constexpr int64 kZeroMemory = 0ll; + +// Holds the set of things we might want to estimate or measure in Grappler. +// Always produce execution time. Other fields are optional depending on the +// estimator being used. +struct Costs { + // Returns a Costs structure with default values for all of the fields. + inline Costs(); + + // Builds a Costs structure with all zero values, rather than unknowns. + static inline Costs ZeroCosts(); + + struct MicroSeconds : std::chrono::microseconds { + MicroSeconds() : std::chrono::microseconds(0) {} + MicroSeconds(double d) : std::chrono::microseconds(static_cast(d)) {} + MicroSeconds(std::chrono::microseconds& d) : std::chrono::microseconds(d) {} + MicroSeconds& operator=(const std::chrono::microseconds& d) { + std::chrono::microseconds::operator=(d); + return *this; + } + }; + struct NanoSeconds : std::chrono::nanoseconds { + NanoSeconds() : std::chrono::nanoseconds(0) {} + NanoSeconds(double d) : std::chrono::nanoseconds(static_cast(d)) {} + NanoSeconds(std::chrono::nanoseconds& d) : std::chrono::nanoseconds(d) {} + NanoSeconds& operator=(const std::chrono::nanoseconds& d) { + std::chrono::nanoseconds::operator=(d); + return *this; + } + MicroSeconds asMicroSeconds() const { + std::chrono::microseconds us = + std::chrono::duration_cast(*this); + return MicroSeconds(us); + } + }; + + // We store all our times in nanoseconds. If needs be, we can always switch to + // picoseconds in the future by updating this typedef. + typedef NanoSeconds Duration; + + // Overall cost of running the graph; latency. 
+ // Mean + Duration execution_time; + Duration min_execution_time; + Duration max_execution_time; + + // Computation cost of running the graph. + Duration compute_time; + + // Memory access cost of running the graph. + Duration memory_time; + + // This field can be a very pessimistic estimate of the main memory + // requirements of a graph. For example, it might assume that all activations + // are live for all of a graph's execution. + int64 max_memory; // Maximum main memory requirement in bytes over all ops. + + // These fields are used for TPU-related estimations. They are per-op + // maximums, so each op is evaluated independently, but we want the maximum of + // the value over all ops. + int64 max_per_op_buffers; // Sum of all buffers used by the ops. + int64 max_per_op_streaming; // Ignore largest input buffer, assuming it + // streams from main memory. +}; + +inline std::ostream& operator<<(std::ostream& os, const Costs::MicroSeconds d) { + os << d.count() << "us"; + return os; +} +inline std::ostream& operator<<(std::ostream& os, const Costs::NanoSeconds d) { + os << d.count() << "ns"; + return os; +} + +Costs::Costs() { + execution_time = Duration::zero(); + compute_time = Duration::zero(); + memory_time = Duration::zero(); + max_memory = kMemoryUnknown; + max_per_op_buffers = kMemoryUnknown; + max_per_op_streaming = kMemoryUnknown; +} + +Costs Costs::ZeroCosts() { + Costs costs; + costs.execution_time = Duration::zero(); + costs.max_memory = kZeroMemory; + costs.max_per_op_buffers = kZeroMemory; + costs.max_per_op_streaming = kZeroMemory; + return costs; +} + +// Given a GrapperItem and an optimized implementation of the corresponding +// TensorFlow graph, the CostEstimator attempts to predicts the actual cost of +// running the graph. +class CostEstimator { + public: + virtual ~CostEstimator() {} + + // Initalizes the estimator for the specified grappler item. + // The estimator shouldn't be used if this function returns any status other + // that OK. 
+  virtual Status Initialize(const GrapplerItem& item) = 0;
+
+  // Predicts the cost of running the given optimized version of the grappler
+  // item.
+  // If a CostGraphDef is passed, it will be populated with detailed information
+  // about the cost of running each operation of the optimized graph.
+  // If a double value is passed, it will be set to a value that reflects the
+  // overall cost of running the graph (e.g. the latency of the computation).
+  // Returns a status that indicates whether the performance could be estimated
+  // or not.
+  virtual Status PredictCosts(const GraphDef& optimized_graph,
+                              CostGraphDef* cost_graph, Costs* cost) const = 0;
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_GRAPPLER_COSTS_COST_ESTIMATOR_H_
diff --git a/libs/tensorflow/include/tensorflow/core/grappler/costs/graph_memory.h b/libs/tensorflow/include/tensorflow/core/grappler/costs/graph_memory.h
new file mode 100644
index 0000000..a3e152a
--- /dev/null
+++ b/libs/tensorflow/include/tensorflow/core/grappler/costs/graph_memory.h
@@ -0,0 +1,61 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/ + +#ifndef TENSORFLOW_GRAPPLER_COSTS_GRAPH_MEMORY_H_ +#define TENSORFLOW_GRAPPLER_COSTS_GRAPH_MEMORY_H_ + +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/grappler/clusters/cluster.h" +#include "tensorflow/core/grappler/costs/graph_properties.h" +#include "tensorflow/core/grappler/grappler_item.h" + +namespace tensorflow { +namespace grappler { + +// Infer the worst case memory usage for a given grappler item. +class GraphMemory { + public: + explicit GraphMemory(const GrapplerItem& item) + : item_(item), worst_case_memory_usage_(-1) {} + + Status InferStatically(); + Status InferDynamically(Cluster* cluster); + Status InferFromGraphProperties(GraphProperties* properties); + + // Worst case memory usage in bytes, or -1 if the usage is unknown. + int64 GetWorstCaseMemoryUsage() const { return worst_case_memory_usage_; } + + // Best case memory usage in bytes, or -1 if the usage is unknown. + // This corresponds to the case where all the data is swapped out excepted + // that which is needed for a single node to perform its computations. 
+ int64 GetBestCaseMemoryUsage() const { return best_case_memory_usage_; } + + private: + void InferMemUsageForNodes(const std::vector& nodes, + GraphProperties* properties, int64* worst_case, + int64* best_case) const; + int64 InferMemUsageForNeighbors( + const std::vector& props) const; + + // Inputs + GrapplerItem item_; + int64 worst_case_memory_usage_; + int64 best_case_memory_usage_; +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_GRAPPLER_COSTS_GRAPH_MEMORY_H_ diff --git a/libs/tensorflow/include/tensorflow/core/grappler/costs/graph_properties.h b/libs/tensorflow/include/tensorflow/core/grappler/costs/graph_properties.h new file mode 100644 index 0000000..c49313a --- /dev/null +++ b/libs/tensorflow/include/tensorflow/core/grappler/costs/graph_properties.h @@ -0,0 +1,57 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_GRAPPLER_COSTS_GRAPH_PROPERTIES_H_ +#define TENSORFLOW_GRAPPLER_COSTS_GRAPH_PROPERTIES_H_ + +#include +#include +#include "tensorflow/core/grappler/clusters/cluster.h" +#include "tensorflow/core/grappler/costs/op_performance_data.pb.h" +#include "tensorflow/core/grappler/grappler_item.h" + +namespace tensorflow { +namespace grappler { + +// A TensorFlow model to optimize. 
+// Models are represented by the combination of a graph, one of more fetch +// nodes, and potentially a set of nodes to feed. +class GraphProperties { + public: + // Factory method for creating a GrapplerShapes from a MetaGraphDef. + // Returns nullptr if the given meta_graph cannot be converted. + explicit GraphProperties(const GrapplerItem& item) : item_(item) {} + + Status InferStatically(); + Status InferDynamically(Cluster* cluster); + + std::vector GetInputProperties( + const string& node_name) const; + std::vector GetOutputProperties( + const string& node_name) const; + string GetDeviceName(const string& node_name) const; + + private: + // Inputs + GrapplerItem item_; + std::map> input_properties_; + std::map> output_properties_; + std::map device_names_; +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_GRAPPLER_COSTS_GRAPH_PROPERTIES_H_ diff --git a/libs/tensorflow/include/tensorflow/core/grappler/costs/utils.h b/libs/tensorflow/include/tensorflow/core/grappler/costs/utils.h new file mode 100644 index 0000000..79be906 --- /dev/null +++ b/libs/tensorflow/include/tensorflow/core/grappler/costs/utils.h @@ -0,0 +1,53 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_GRAPPLER_COSTS_UTILS_H_ +#define TENSORFLOW_GRAPPLER_COSTS_UTILS_H_ + +#include +#include +#include + +#include "tensorflow/core/framework/cost_graph.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/graph/types.h" +#include "tensorflow/core/grappler/costs/op_performance_data.pb.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace grappler { + +// Returns a vector of InputProperties for 'node'. The vector will contain one +// entry for each input of 'node'. +// For each node in the graph, the 'name_to_cost' map stores a pointer to the +// corresponding cost graph node indexed by node name. +std::vector FindInputFeatures( + const NodeDef& node, + const std::unordered_map& name_to_cost); + +// Returns the DeviceProperties of the device on which 'node' runs. +OpInfo::DeviceProperties GetDeviceInfo(const CostGraphDef::Node& node); + +// Returns the DeviceProperties of the CPU on which grappler is running. +OpInfo::DeviceProperties GetLocalCPUInfo(); + +// Returns the DeviceProperties for the specified GPU attached to the server on +// which grappler is running. +OpInfo::DeviceProperties GetLocalGPUInfo(int gpu_id); + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_GRAPPLER_COSTS_UTILS_H_ diff --git a/libs/tensorflow/include/tensorflow/core/grappler/devices.h b/libs/tensorflow/include/tensorflow/core/grappler/devices.h new file mode 100644 index 0000000..329e8e2 --- /dev/null +++ b/libs/tensorflow/include/tensorflow/core/grappler/devices.h @@ -0,0 +1,38 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_GRAPPLER_DEVICES_H_ +#define TENSORFLOW_GRAPPLER_DEVICES_H_ + +#include + +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace grappler { + +// Get the number of available GPUs whose number of multiprocessors is no less +// than 8. +int GetNumAvailableGPUs(); + +// Get the number of logical CPU cores (aka hyperthreads) available. +int GetNumAvailableLogicalCPUCores(); + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_GRAPPLER_DEVICES_H_ diff --git a/libs/tensorflow/include/tensorflow/core/grappler/grappler_item.h b/libs/tensorflow/include/tensorflow/core/grappler/grappler_item.h new file mode 100644 index 0000000..e40ab66 --- /dev/null +++ b/libs/tensorflow/include/tensorflow/core/grappler/grappler_item.h @@ -0,0 +1,62 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#ifndef TENSORFLOW_GRAPPLER_GRAPPLER_ITEM_H_
+#define TENSORFLOW_GRAPPLER_GRAPPLER_ITEM_H_
+
+#include
+#include
+#include
+#include
+
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/protobuf/queue_runner.pb.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// A TensorFlow model to optimize.
+// Models are represented by the combination of a graph, one or more fetch
+// nodes, and potentially a set of nodes to feed.
+// TODO(volunteer_needed): turn this struct into a class.
+struct GrapplerItem {
+  string id;  // A unique id for this item
+
+  // Inputs
+  GraphDef graph;
+  std::vector> feed;
+  std::vector fetch;
+
+  // Initialization op(s).
+  std::vector init_ops;
+
+  // Queue runner(s) required to run the queue(s) of this model.
+  std::vector queue_runners;
+
+  // Return the set of nodes evaluated during a regular train/inference step.
+  std::vector MainOpsFanin() const;
+  // Return the set of nodes used by TensorFlow to initialize the graph.
+  std::vector InitOpsFanin() const;
+};
+
+// Return the transitive fanin of a set of terminal nodes.
+std::vector ComputeTransitiveFanin(
+    const GraphDef& graph, const std::vector& terminal_nodes);
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_GRAPPLER_GRAPPLER_ITEM_H_
diff --git a/libs/tensorflow/include/tensorflow/core/grappler/grappler_item_builder.h b/libs/tensorflow/include/tensorflow/core/grappler/grappler_item_builder.h
new file mode 100644
index 0000000..7088636
--- /dev/null
+++ b/libs/tensorflow/include/tensorflow/core/grappler/grappler_item_builder.h
@@ -0,0 +1,47 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_GRAPPLER_GRAPPLER_ITEM_BUILDER_H_
+#define TENSORFLOW_GRAPPLER_GRAPPLER_ITEM_BUILDER_H_
+
+#include <memory>
+#include <string>
+#include "tensorflow/core/grappler/grappler_item.h"
+
+namespace tensorflow {
+
+class MetaGraphDef;
+
+namespace grappler {
+
+struct ItemConfig {
+  // If true, ignore all user specified node placement.
+  bool ignore_user_placement = true;
+  // If true, ignore all user specified colocation attributes.
+  bool ignore_colocation = true;
+  // Dimension to use if a placeholder node has an _output_shapes attribute with
+  // a dimension of -1.
+  int placeholder_unknown_output_shape_dim = -1;
+};
+
+// Factory method for creating a GrapplerItem from a MetaGraphDef.
+// Returns nullptr if the given meta_graph cannot be converted.
+std::unique_ptr GrapplerItemFromMetaGraphDef( + const string& id, const MetaGraphDef& meta_graph, const ItemConfig& cfg); + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_GRAPPLER_GRAPPLER_ITEM_BUILDER_H_ diff --git a/libs/tensorflow/include/tensorflow/core/grappler/inputs/BUILD b/libs/tensorflow/include/tensorflow/core/grappler/inputs/BUILD new file mode 100644 index 0000000..d7bb4be --- /dev/null +++ b/libs/tensorflow/include/tensorflow/core/grappler/inputs/BUILD @@ -0,0 +1,83 @@ +licenses(["notice"]) # Apache 2.0 + +exports_files(["LICENSE"]) + +filegroup( + name = "all_files", + srcs = glob( + ["**/*"], + exclude = [ + "**/METADATA", + "**/OWNERS", + ], + ), + visibility = ["//tensorflow:__subpackages__"], +) + +filegroup( + name = "android_srcs", + srcs = glob( + [ + "utils.*", + ], + ), + visibility = ["//tensorflow:__subpackages__"], +) + +cc_library( + name = "utils", + srcs = [ + "utils.cc", + ], + hdrs = [ + "utils.h", + ], + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + ], +) + +cc_test( + name = "utils_test", + srcs = [ + "utils_test.cc", + ], + deps = [ + ":utils", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + +cc_library( + name = "input_yielder", + hdrs = [ + "input_yielder.h", + ], + visibility = ["//visibility:public"], + deps = [], +) + +cc_library( + name = "trivial_test_graph_input_yielder", + srcs = ["trivial_test_graph_input_yielder.cc"], + hdrs = [ + "trivial_test_graph_input_yielder.h", + ], + visibility = ["//visibility:public"], + deps = [ + ":input_yielder", + "//tensorflow/cc:cc_ops", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:tensorflow", + "//tensorflow/core/grappler:grappler_item", + "//tensorflow/core/grappler:grappler_item_builder", + "//tensorflow/core/kernels:aggregate_ops", + 
"//tensorflow/core/kernels:array", + ], +) diff --git a/libs/tensorflow/include/tensorflow/core/grappler/inputs/input_yielder.h b/libs/tensorflow/include/tensorflow/core/grappler/inputs/input_yielder.h new file mode 100644 index 0000000..c9f9082 --- /dev/null +++ b/libs/tensorflow/include/tensorflow/core/grappler/inputs/input_yielder.h @@ -0,0 +1,35 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_GRAPPLER_INPUTS_INPUT_YIELDER_H_ +#define TENSORFLOW_GRAPPLER_INPUTS_INPUT_YIELDER_H_ + +namespace tensorflow { +namespace grappler { + +struct GrapplerItem; + +// Abstract interface for yielding graphs that we want to optimize. +class InputYielder { + public: + virtual ~InputYielder() {} + + virtual bool NextItem(GrapplerItem* item) = 0; +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_GRAPPLER_INPUTS_INPUT_YIELDER_H_ diff --git a/libs/tensorflow/include/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h b/libs/tensorflow/include/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h new file mode 100644 index 0000000..4c5600c --- /dev/null +++ b/libs/tensorflow/include/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h @@ -0,0 +1,47 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_GRAPPLER_INPUTS_TRIVIAL_TEST_GRAPH_INPUT_YIELDER_H_
+#define TENSORFLOW_GRAPPLER_INPUTS_TRIVIAL_TEST_GRAPH_INPUT_YIELDER_H_
+
+#include <string>
+#include <vector>
+#include "tensorflow/core/grappler/inputs/input_yielder.h"
+
+namespace tensorflow {
+namespace grappler {
+
+class Cluster;
+class GrapplerItem;
+
+class TrivialTestGraphInputYielder : public InputYielder {
+ public:
+  TrivialTestGraphInputYielder(int num_stages, int width, int tensor_size,
+                               bool insert_queue,
+                               const std::vector<string>& device_names);
+  bool NextItem(GrapplerItem* item) override;
+
+ private:
+  const int num_stages_;
+  const int width_;
+  const int tensor_size_;
+  const bool insert_queue_;
+  std::vector<string> device_names_;
+};
+
+} // end namespace grappler
+} // end namespace tensorflow
+
+#endif  // TENSORFLOW_GRAPPLER_INPUTS_TRIVIAL_TEST_GRAPH_INPUT_YIELDER_H_
diff --git a/libs/tensorflow/include/tensorflow/core/grappler/inputs/utils.h b/libs/tensorflow/include/tensorflow/core/grappler/inputs/utils.h
new file mode 100644
index 0000000..ee65ca0
--- /dev/null
+++ b/libs/tensorflow/include/tensorflow/core/grappler/inputs/utils.h
@@ -0,0 +1,35 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_GRAPPLER_INPUTS_UTILS_H_ +#define TENSORFLOW_GRAPPLER_INPUTS_UTILS_H_ + +#include +#include + +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace grappler { + +bool FilesExist(const std::vector& files, + std::vector* status = nullptr); +bool FilesExist(const std::set& files); + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_GRAPPLER_INPUTS_UTILS_H_ diff --git a/libs/tensorflow/include/tensorflow/core/grappler/optimizers/BUILD b/libs/tensorflow/include/tensorflow/core/grappler/optimizers/BUILD new file mode 100644 index 0000000..13dc959 --- /dev/null +++ b/libs/tensorflow/include/tensorflow/core/grappler/optimizers/BUILD @@ -0,0 +1,70 @@ +licenses(["notice"]) # Apache 2.0 + +exports_files(["LICENSE"]) + +filegroup( + name = "all_files", + srcs = glob( + ["**/*"], + exclude = [ + "**/METADATA", + "**/OWNERS", + ], + ), + visibility = ["//tensorflow:__subpackages__"], +) + +filegroup( + name = "android_srcs", + srcs = glob( + [ + "*_optimizer.*", + ], + ), + visibility = ["//tensorflow:__subpackages__"], +) + +cc_library( + name = "graph_optimizer", + hdrs = [ + "graph_optimizer.h", + ], + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + ], +) + +cc_library( + name = "layout_optimizer", + srcs = ["layout_optimizer.cc"], + hdrs = [ + "layout_optimizer.h", + ], + visibility = 
["//visibility:public"], + deps = [ + ":graph_optimizer", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/grappler:grappler_item", + "//tensorflow/core/grappler:utils", + "//tensorflow/core/grappler/clusters:cluster", + ], +) + +cc_library( + name = "meta_optimizer", + srcs = ["meta_optimizer.cc"], + hdrs = [ + "meta_optimizer.h", + ], + visibility = ["//visibility:public"], + deps = [ + ":layout_optimizer", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/grappler:grappler_item", + ], +) diff --git a/libs/tensorflow/include/tensorflow/core/grappler/optimizers/graph_optimizer.h b/libs/tensorflow/include/tensorflow/core/grappler/optimizers/graph_optimizer.h new file mode 100644 index 0000000..34e126b --- /dev/null +++ b/libs/tensorflow/include/tensorflow/core/grappler/optimizers/graph_optimizer.h @@ -0,0 +1,53 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_GRAPPLER_OPTIMIZERS_GRAPH_OPTIMIZER_H_ +#define TENSORFLOW_GRAPPLER_OPTIMIZERS_GRAPH_OPTIMIZER_H_ + +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace grappler { + +class Cluster; +class GrapplerItem; + +// An abstract interface for an algorithm for generating a candidate +// optimization of a GrapplerItem for running on a cluster. +class GraphOptimizer { + public: + virtual ~GraphOptimizer() {} + + virtual string name() const = 0; + + // Routine called to allow an algorithm to propose a rewritten graph + // for the graph, feeds and fetches in "item" to run more efficiently + // on "cluster". + // Returns true iff it managed to generate a solution, false otherwise. + virtual Status Optimize(Cluster* cluster, const GrapplerItem& item, + GraphDef* optimized_graph) = 0; + + // Method invoked by the framework so that it can provide feedback + // on how well the "optimize_output" (produced as *output from a + // call to Optimize) performed. Lower "result" scores are better. + virtual void Feedback(Cluster* cluster, const GrapplerItem& item, + const GraphDef& optimized_graph, double result) = 0; +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_GRAPPLER_OPTIMIZERS_GRAPH_OPTIMIZER_H_ diff --git a/libs/tensorflow/include/tensorflow/core/grappler/optimizers/layout_optimizer.h b/libs/tensorflow/include/tensorflow/core/grappler/optimizers/layout_optimizer.h new file mode 100644 index 0000000..66dec17 --- /dev/null +++ b/libs/tensorflow/include/tensorflow/core/grappler/optimizers/layout_optimizer.h @@ -0,0 +1,42 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_GRAPPLER_OPTIMIZERS_LAYOUT_OPTIMIZER_H_ +#define TENSORFLOW_GRAPPLER_OPTIMIZERS_LAYOUT_OPTIMIZER_H_ + +#include "tensorflow/core/grappler/optimizers/graph_optimizer.h" + +namespace tensorflow { +namespace grappler { + +// Convert the NHWC layout to NCHW for Conv-related ops on GPUs. +class LayoutOptimizer : public GraphOptimizer { + public: + LayoutOptimizer() {} + ~LayoutOptimizer() override {} + + string name() const override { return "layout"; }; + + Status Optimize(Cluster* cluster, const GrapplerItem& item, + GraphDef* output) override; + + void Feedback(Cluster* cluster, const GrapplerItem& item, + const GraphDef& optimize_output, double result) override; +}; + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_GRAPPLER_OPTIMIZERS_LAYOUT_OPTIMIZER_H_ diff --git a/libs/tensorflow/include/tensorflow/core/grappler/optimizers/meta_optimizer.h b/libs/tensorflow/include/tensorflow/core/grappler/optimizers/meta_optimizer.h new file mode 100644 index 0000000..14c99a1 --- /dev/null +++ b/libs/tensorflow/include/tensorflow/core/grappler/optimizers/meta_optimizer.h @@ -0,0 +1,34 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_GRAPPLER_OPTIMIZERS_META_OPTIMIZER_H_ +#define TENSORFLOW_GRAPPLER_OPTIMIZERS_META_OPTIMIZER_H_ + +#include "tensorflow/core/grappler/grappler_item.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/protobuf/rewriter_config.pb.h" + +namespace tensorflow { +namespace grappler { + +bool MetaOptimizerEnabled(const RewriterConfig& cfg); + +Status RunMetaOptimizer(const GrapplerItem& item, const RewriterConfig& cfg, + GraphDef* optimized_graph); + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_GRAPPLER_OPTIMIZERS_META_OPTIMIZER_H_ diff --git a/libs/tensorflow/include/tensorflow/core/grappler/utils.h b/libs/tensorflow/include/tensorflow/core/grappler/utils.h new file mode 100644 index 0000000..09dff7c --- /dev/null +++ b/libs/tensorflow/include/tensorflow/core/grappler/utils.h @@ -0,0 +1,50 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#ifndef TENSORFLOW_GRAPPLER_UTILS_H_
+#define TENSORFLOW_GRAPPLER_UTILS_H_
+
+#include <functional>
+
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// Return the node name corresponding to 'name' if name is valid, or the empty
+// string otherwise.
+string NodeName(const string& name);
+
+// Get the trailing position number ":{digits}" (if any) of a node name.
+int NodePosition(const string& name);
+
+// Add a prefix to a node name
+string AddPrefixToNodeName(const string& name, const string& prefix);
+
+// Executes a 'fn' in the 'thread_pool'. The method waits for the configured
+// timeout (in milliseconds) for 'fn' to complete, before returning false.
+//
+// If returning false, the 'fn' may still continue to execute in the
+// thread-pool. It is the responsibility of the caller to reset the thread-pool
+// as appropriate.
+bool ExecuteWithTimeout(std::function fn, int64 timeout_in_ms, + thread::ThreadPool* thread_pool); + +} // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_GRAPPLER_UTILS_H_ diff --git a/libs/tensorflow/include/tensorflow/core/kernels/BUILD b/libs/tensorflow/include/tensorflow/core/kernels/BUILD index 147c26a..5faeb7d 100644 --- a/libs/tensorflow/include/tensorflow/core/kernels/BUILD +++ b/libs/tensorflow/include/tensorflow/core/kernels/BUILD @@ -17,10 +17,7 @@ licenses(["notice"]) # Apache 2.0 package_group( name = "friends", - packages = [ - "//learning/deepmind/...", - "//tensorflow/...", - ], + packages = ["//tensorflow/..."], ) load( @@ -39,6 +36,10 @@ load( "tf_proto_library", "tf_kernel_tests_linkstatic", ) +load( + "//third_party/mkl:build_defs.bzl", + "if_mkl", +) config_setting( # Add "--define tensorflow_xsmm=1" to your build command to use libxsmm for @@ -257,6 +258,21 @@ tf_cc_test( ], ) +tf_cc_test( + name = "variable_ops_test", + size = "small", + srcs = ["variable_ops_test.cc"], + deps = [ + "//tensorflow/core:all_kernels", + "//tensorflow/core:core_cpu", + "//tensorflow/core:direct_session_internal", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + tf_kernel_library( name = "stage_op", srcs = ["stage_op.cc"], @@ -299,26 +315,7 @@ cc_library( ], ) -tf_proto_library( - name = "reader_base_proto", - srcs = ["reader_base.proto"], - cc_api_version = 2, - go_api_version = 2, - java_api_version = 2, -) - -cc_library( - name = "reader_base", - srcs = ["reader_base.cc"], - hdrs = ["reader_base.h"], - deps = [ - ":reader_base_proto_cc", - "//tensorflow/core:framework", - "//tensorflow/core:lib", - ], -) - -cc_library( +tf_kernel_library( name = "record_input_op", srcs = [ "record_input_op.cc", @@ -368,18 +365,18 @@ cc_library( ], ) -# Private support libraries --------------------------------------------------- - cc_library( name = "bounds_check", 
hdrs = ["bounds_check.h"], - visibility = ["//tensorflow:__subpackages__"], + visibility = [":friends"], deps = [ "//tensorflow/core:framework_lite", "//third_party/eigen3", ], ) +# Private support libraries --------------------------------------------------- + cc_header_only_library( name = "bounds_check_lib", deps = [":bounds_check"], @@ -690,8 +687,15 @@ tf_kernel_library( tf_kernel_library( name = "transpose_op", - prefix = "transpose_op", - deps = ARRAY_DEPS, + srcs = [ + "transpose_op.cc", + ] + if_mkl([ + "mkl_transpose_op.cc", + ]), + hdrs = ["transpose_op.h"], + deps = ARRAY_DEPS + if_mkl([ + "//third_party/mkl:intel_binary_blob", + ]), ) tf_kernel_library( @@ -813,7 +817,12 @@ tf_cc_test( "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core:testlib", - ], + ] + select({ + ":xsmm": [ + "@libxsmm_archive//:xsmm_avx", + ], + "//conditions:default": [], + }), ) tf_cc_test( @@ -966,6 +975,7 @@ tf_cc_test( ":ops_testutil", ":ops_util", "//tensorflow/core:core_cpu", + "//tensorflow/core:debug_ops_op_lib", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", @@ -1100,6 +1110,19 @@ tf_kernel_library( alwayslink = 0, ) +tf_cc_test( + name = "transpose_util_test", + size = "small", + srcs = ["transpose_util_test.cc"], + deps = [ + ":transpose_functor", + "//tensorflow/core:framework", + "//tensorflow/core:tensor_testutil", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + tf_kernel_library( name = "candidate_sampler_ops", prefix = "candidate_sampler_ops", @@ -1718,6 +1741,22 @@ tf_cuda_cc_test( ], ) +tf_cuda_cc_test( + name = "resize_benchmark_test", + srcs = ["resize_op_benchmark_test.cc"], + deps = [ + ":image", + ":ops_testutil", + ":ops_util", + "//tensorflow/core:core_cpu", + "//tensorflow/core:framework", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + ], +) + cc_library( name = "io", deps = [ @@ 
-1736,12 +1775,12 @@ cc_library( IO_DEPS = [ ":ops_util", - ":reader_base", "//tensorflow/core:framework", "//tensorflow/core:io_ops_op_lib", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", + "//tensorflow/core:reader_base", "//tensorflow/core/util/tensor_bundle", ] @@ -2059,6 +2098,7 @@ cc_library( ":argmax_op", ":batch_matmul_op", ":betainc_op", + ":bincount_op", ":cast_op", ":check_numerics_op", ":cross_op", @@ -2123,7 +2163,9 @@ tf_kernel_library( tf_kernel_library( name = "fft_ops", prefix = "fft_ops", - deps = MATH_DEPS, + deps = MATH_DEPS + [ + "//tensorflow/core:spectral_ops_op_lib", + ], ) tf_kernel_library( @@ -2427,6 +2469,7 @@ tf_kernel_library( name = "depthwise_conv_op", prefix = "depthwise_conv_op", deps = [ + ":bounds_check", ":conv_ops", ":ops_util", "//tensorflow/core:core_cpu", @@ -2443,6 +2486,7 @@ tf_kernel_library( ], prefix = "depthwise_conv_grad_op", deps = [ + ":bounds_check", ":ops_util", "//tensorflow/core:core_cpu", "//tensorflow/core:framework", @@ -2553,6 +2597,17 @@ tf_kernel_library( deps = NN_DEPS, ) +tf_kernel_library( + name = "bincount_op", + prefix = "bincount_op", + deps = [ + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//third_party/eigen3", + ], +) + tf_kernel_library( name = "l2loss_op", prefix = "l2loss_op", @@ -3585,6 +3640,7 @@ filegroup( "softplus_op.h", "softsign_op.h", "spacetobatch_functor.h", + "spacetodepth_op.h", "tensor_array.h", "tile_ops_cpu_impl.h", "tile_ops_impl.h", @@ -3707,6 +3763,7 @@ filegroup( "softsign_op.cc", "spacetobatch_functor.cc", "spacetobatch_op.cc", + "spacetodepth_op.cc", "sparse_to_dense_op.cc", "stack_ops.cc", "summary_op.cc", @@ -3770,6 +3827,7 @@ filegroup( ], exclude = [ "*test.cc", + "*test_util*", "*testutil*", "*testlib*", "*main.cc", @@ -3794,7 +3852,6 @@ filegroup( "decode_jpeg_op.*", "decode_gif_op.*", "identity_reader_op.*", - "reader_base.*", "remote_fused_graph_execute_op.*", 
"fixed_length_record_reader_op.*", "whole_file_read_ops.*", @@ -3803,8 +3860,6 @@ filegroup( # Excluded due to experimental status: "debug_ops.*", "scatter_nd_op*", - # Lib CURL is not supported on Android. - "bigquery*", ], ), visibility = ["//visibility:public"], @@ -4243,7 +4298,6 @@ tf_kernel_library( "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:remote_fused_graph_ops_op_lib", - "//tensorflow/core/kernels/hexagon:graph_transferer", ], ) @@ -4251,10 +4305,67 @@ cc_library( name = "remote_fused_graph_execute_op", srcs = ["remote_fused_graph_execute_op.cc"], deps = [ + ":remote_fused_graph_execute_utils", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", - "//tensorflow/core/kernels/hexagon:graph_transferer", + "//tensorflow/core:protos_all_cc", + ], +) + +cc_library( + name = "remote_fused_graph_execute_utils", + srcs = ["remote_fused_graph_execute_utils.cc"], + hdrs = [ + "i_remote_fused_graph_executor.h", + "remote_fused_graph_execute_utils.h", + ], + deps = [ + "//tensorflow/core", + "//tensorflow/core:core_cpu", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core:protos_all_cc", + ], +) + +cc_library( + name = "remote_fused_graph_execute_op_test_utils", + srcs = ["remote_fused_graph_execute_op_test_utils.cc"], + hdrs = ["remote_fused_graph_execute_op_test_utils.h"], + deps = [ + "//tensorflow/cc:cc_ops", + "//tensorflow/cc:ops", + "//tensorflow/cc:scope", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core/kernels:cwise_op", + ], +) + +tf_cc_test( + name = "remote_fused_graph_execute_utils_test", + size = "small", + srcs = [ + "remote_fused_graph_execute_utils_test.cc", + ], + deps = [ + ":remote_fused_graph_execute_op_test_utils", + ":remote_fused_graph_execute_utils", + "//tensorflow/cc:cc_ops", + "//tensorflow/cc:scope", + "//tensorflow/core:core_cpu", + "//tensorflow/core:direct_session", + 
"//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core:ops", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + "//tensorflow/core/kernels:cwise_op", ], ) @@ -4268,7 +4379,13 @@ tf_cc_test( ":ops_testutil", ":ops_util", ":remote_fused_graph_execute_op", + ":remote_fused_graph_execute_op_test_utils", + ":remote_fused_graph_execute_utils", + "//tensorflow/cc:cc_ops", + "//tensorflow/cc:ops", + "//tensorflow/cc:scope", "//tensorflow/core:core_cpu", + "//tensorflow/core:direct_session", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", @@ -4279,6 +4396,51 @@ tf_cc_test( ], ) +if_mkl( + tf_kernel_library( + name = "mkl_matmul_op", + prefix = "mkl_matmul", + deps = [ + ":math", + "//third_party/mkl:intel_binary_blob", + ], + ), +) + +if_mkl( + tf_kernel_library( + name = "mkl_conv_op", + prefix = "mkl_conv", + deps = [ + ":bounds_check", + ":ops_util", + "//tensorflow/core:core_cpu", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core:nn_ops_op_lib", + "//third_party/mkl:intel_binary_blob", + ], + ), +) + +if_mkl( + tf_kernel_library( + name = "mkl_tfconv_op", + prefix = "mkl_tfconv", + deps = [ + ":bounds_check", + ":ops_util", + "//tensorflow/core:core_cpu", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core:nn_ops_op_lib", + "//third_party/mkl:intel_binary_blob", + ], + ), +) + # ----------------------------------------------------------------------------- # Google-internal targets. These must be at the end for syncrepo. 
diff --git a/libs/tensorflow/include/tensorflow/core/kernels/adjust_hue_op.h b/libs/tensorflow/include/tensorflow/core/kernels/adjust_hue_op.h new file mode 100644 index 0000000..5b30bd8 --- /dev/null +++ b/libs/tensorflow/include/tensorflow/core/kernels/adjust_hue_op.h @@ -0,0 +1,42 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef _TENSORFLOW_CORE_KERNELS_ADJUST_HUE_OP_H +#define _TENSORFLOW_CORE_KERNELS_ADJUST_HUE_OP_H + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +namespace functor { + +struct AdjustHueGPU { + void operator()( + GPUDevice* device, + const int64 number_of_elements, + const float* const input, + const float* const delta, + float* const output + ); +}; + +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA +#endif // _TENSORFLOW_CORE_KERNELS_ADJUST_HUE_OP_H diff --git a/libs/tensorflow/include/tensorflow/core/kernels/assign_op.h b/libs/tensorflow/include/tensorflow/core/kernels/assign_op.h index da0ccda..1d2e1c8 100644 --- a/libs/tensorflow/include/tensorflow/core/kernels/assign_op.h +++ b/libs/tensorflow/include/tensorflow/core/kernels/assign_op.h @@ -39,60 +39,88 @@ class AssignOp : public OpKernel { } void Compute(OpKernelContext* context) override { - Tensor rhs = 
context->input(1); + const Tensor& rhs = context->input(1); // We always return the input ref. context->forward_ref_input_to_ref_output(0, 0); - // If the left hand side is not initialized, or the shape of the - // right-hand side is different than the left hand side, we need - // to allocate a new tensor. + // We can't always know how this value will be used downstream, + // so make conservative assumptions in specifying constraints on + // the memory allocation attributes. + // TODO(rmlarsen): These conservative constraints make buffer + // forwarding unlikely to happen very often. Try to use graph analysis + // (possibly the InferAllocAttr pass in the executer) to improve the + // situation. + AllocatorAttributes attr; + attr.set_gpu_compatible(true); + attr.set_nic_compatible(true); + { mutex_lock l(*context->input_ref_mutex(0)); - - Tensor old_lhs = context->mutable_input(0, true); - + const Tensor& old_lhs = context->mutable_input(0, /* lock_held */ true); + const bool same_shape = old_lhs.shape().IsSameSize(rhs.shape()); if (validate_shape_) { OP_REQUIRES( - context, old_lhs.shape().IsSameSize(rhs.shape()), + context, same_shape, errors::InvalidArgument( "Assign requires shapes of both tensors to match. lhs shape= ", - old_lhs.shape().DebugString(), " rhs shape= ", - rhs.shape().DebugString())); + old_lhs.shape().DebugString(), + " rhs shape= ", rhs.shape().DebugString())); } - const bool same_shape = old_lhs.shape().IsSameSize(rhs.shape()); - if (!old_lhs.IsInitialized() || !same_shape) { - // Create new tensor whose shape matches the right hand side - // and copy then hand off to lhs. - // We can't always know how this value will be used downstream, - // so make conservative assumptions in specifying the memory - // allocation attributes. 
- AllocatorAttributes attr; - attr.set_gpu_compatible(true); - attr.set_nic_compatible(true); + // In the code below we try to minimize the amount of memory allocation + // and copying by trying the following two shortcuts: + // 1. If we can reuse the rhs buffer we avoid both a memory allocation + // and copying. + // 2. If the lhs is initialized and has the same number of elements as the + // rhs we can avoid a memory allocation. + + // 1. Try to reuse the rhs. + std::unique_ptr input_alias = context->forward_input( + 1, old_lhs.dtype(), old_lhs.shape(), DEVICE_MEMORY, attr); + if (input_alias != nullptr) { + // Transfer ownership to the ref. + context->replace_ref_input(0, *input_alias.release(), + /* lock_held */ true); + return; + } + + // 2. Try to copy into an existing buffer. + if (old_lhs.IsInitialized() && + old_lhs.shape().num_elements() == rhs.shape().num_elements()) { + // The existing lhs tensor has already been initialized and the right + // hand side can fit in the underlying buffer. + Tensor reshaped_old_lhs; + if (same_shape) { + reshaped_old_lhs = old_lhs; + } else { + CHECK(reshaped_old_lhs.CopyFrom(old_lhs, rhs.shape())); + context->replace_ref_input(0, reshaped_old_lhs, /* lock_held */ true); + } + if (use_exclusive_lock_) { + Copy(context, &reshaped_old_lhs, rhs); + return; + } + } else { + // Create a new persistent tensor whose shape matches the right hand + // side, hand off to lhs and copy the rhs into it. PersistentTensor copy; Tensor* copyTensor = nullptr; OP_REQUIRES_OK( context, context->allocate_persistent(old_lhs.dtype(), rhs.shape(), ©, ©Tensor, attr)); - Copy(context, copyTensor, rhs); - context->replace_ref_input(0, *copyTensor, true); - return; - } - - // The tensor has already been initialized and the right hand side - // matches the left hand side's shape. 
- if (use_exclusive_lock_) { - Copy(context, &old_lhs, rhs); - return; + context->replace_ref_input(0, *copyTensor, /* lock_held */ true); + if (use_exclusive_lock_) { + Copy(context, copyTensor, rhs); + return; + } } } // The tensor has already been initialized and the right hand side // matches the left hand side's shape. We have been told to do the // copy outside the lock. - Tensor old_unlocked_lhs = context->mutable_input(0, false); + Tensor old_unlocked_lhs = context->mutable_input(0, /* lock_held */ false); Copy(context, &old_unlocked_lhs, rhs); } diff --git a/libs/tensorflow/include/tensorflow/core/kernels/batch_matmul_op_impl.h b/libs/tensorflow/include/tensorflow/core/kernels/batch_matmul_op_impl.h index f2b74fc..dfc81a9 100644 --- a/libs/tensorflow/include/tensorflow/core/kernels/batch_matmul_op_impl.h +++ b/libs/tensorflow/include/tensorflow/core/kernels/batch_matmul_op_impl.h @@ -343,21 +343,70 @@ struct LaunchBatchMatMul { // C = A x B // where A, B and C are assumed to be in column major. // We want the output to be in row-major, so we can compute - // C' = B' x A' (' stands for transpose) - CublasScratchAllocator scratch_allocator(context); - bool blas_launch_status = - stream - ->ThenBlasGemmBatchedWithScratch( - blas_transpose_b, blas_transpose_a, n, m, k, - static_cast(1.0), b_ptrs, adj_y ? k : n, a_ptrs, - adj_x ? m : k, static_cast(0.0), c_ptrs, n, batch_size, - &scratch_allocator) - .ok(); - if (!blas_launch_status) { - context->SetStatus(errors::Internal( - "Blas SGEMMBatched launch failed : a.shape=", - in_x.shape().DebugString(), ", b.shape=", in_y.shape().DebugString(), - ", m=", m, ", n=", n, ", k=", k, ", batch_size=", batch_size)); + // C' = B' x A', where ' stands for transpose (not adjoint). + // TODO(yangzihao): Choose the best of the three strategies using autotune. + if (batch_size == 1) { + // This is a regular matrix*matrix or matrix*vector multiply. Avoid the + // overhead of the scratch allocator and the batch interface. 
+ if (n == 1 && + blas_transpose_b != + perftools::gputools::blas::Transpose::kConjugateTranspose && + blas_transpose_a != + perftools::gputools::blas::Transpose::kConjugateTranspose) { + // This is a matrix*vector multiply so use GEMV to compute A * b. + // Here we are multiplying in the natural order, so we have to flip + // the transposition flag to compensate for the tensor being stored + // row-major. Since GEMV doesn't provide a way to just conjugate an + // argument, we have to defer those cases to GEMM below. + auto gemv_trans_a = + blas_transpose_a == perftools::gputools::blas::Transpose::kTranspose + ? perftools::gputools::blas::Transpose::kNoTranspose + : perftools::gputools::blas::Transpose::kTranspose; + bool blas_launch_status = + stream + ->ThenBlasGemv(gemv_trans_a, adj_x ? m : k, adj_x ? k : m, + static_cast(1.0), *(a_ptrs[0]), + adj_x ? m : k, *(b_ptrs[0]), 1, + static_cast(0.0), c_ptrs[0], 1) + .ok(); + if (!blas_launch_status) { + context->SetStatus(errors::Internal( + "Blas xGEMV launch failed : a.shape=", in_x.shape().DebugString(), + ", b.shape=", in_y.shape().DebugString(), ", m=", m, ", n=", n, + ", k=", k)); + } + } else { + bool blas_launch_status = + stream + ->ThenBlasGemm(blas_transpose_b, blas_transpose_a, n, m, k, + static_cast(1.0), *(b_ptrs[0]), + adj_y ? k : n, *(a_ptrs[0]), adj_x ? m : k, + static_cast(0.0), c_ptrs[0], n) + .ok(); + if (!blas_launch_status) { + context->SetStatus(errors::Internal( + "Blas xGEMM launch failed : a.shape=", in_x.shape().DebugString(), + ", b.shape=", in_y.shape().DebugString(), ", m=", m, ", n=", n, + ", k=", k)); + } + } + } else { + CublasScratchAllocator scratch_allocator(context); + bool blas_launch_status = + stream + ->ThenBlasGemmBatchedWithScratch( + blas_transpose_b, blas_transpose_a, n, m, k, + static_cast(1.0), b_ptrs, adj_y ? k : n, a_ptrs, + adj_x ? 
m : k, static_cast(0.0), c_ptrs, n, + batch_size, &scratch_allocator) + .ok(); + if (!blas_launch_status) { + context->SetStatus(errors::Internal( + "Blas xGEMMBatched launch failed : a.shape=", + in_x.shape().DebugString(), + ", b.shape=", in_y.shape().DebugString(), ", m=", m, ", n=", n, + ", k=", k, ", batch_size=", batch_size)); + } } } }; diff --git a/libs/tensorflow/include/tensorflow/core/kernels/cloud/BUILD b/libs/tensorflow/include/tensorflow/core/kernels/cloud/BUILD deleted file mode 100644 index 710cb5a..0000000 --- a/libs/tensorflow/include/tensorflow/core/kernels/cloud/BUILD +++ /dev/null @@ -1,98 +0,0 @@ -# Description: -# BigQueryReader implementation - -package( - default_visibility = ["//visibility:private"], -) - -licenses(["notice"]) # Apache 2.0 - -load( - "//tensorflow:tensorflow.bzl", - "tf_kernel_library", - "tf_cc_test", -) - -# For platform specific build config -load( - "//tensorflow/core:platform/default/build_config.bzl", - "tf_proto_library", -) - -filegroup( - name = "all_files", - srcs = glob( - ["**/*"], - exclude = [ - "**/METADATA", - "**/OWNERS", - ], - ), - visibility = ["//tensorflow:__subpackages__"], -) - -tf_kernel_library( - name = "bigquery_reader_ops", - srcs = [ - "bigquery_reader_ops.cc", - ], - visibility = ["//visibility:public"], - deps = [ - ":bigquery_table_accessor", - ":bigquery_table_partition_proto_cc", - "//tensorflow/core:cloud_ops_op_lib", - "//tensorflow/core:framework", - "//tensorflow/core:lib", - "//tensorflow/core:lib_internal", - "//tensorflow/core:protos_all_cc", - "//tensorflow/core/kernels:reader_base", - ], -) - -cc_library( - name = "bigquery_table_accessor", - srcs = [ - "bigquery_table_accessor.cc", - ], - hdrs = [ - "bigquery_table_accessor.h", - ], - visibility = ["//visibility:public"], - deps = [ - ":bigquery_table_partition_proto_cc", - "//tensorflow/core:framework", - "//tensorflow/core:lib", - "//tensorflow/core:lib_internal", - "//tensorflow/core:protos_all_cc", - 
"//tensorflow/core/kernels:reader_base", - "//tensorflow/core/platform/cloud:google_auth_provider", - "//tensorflow/core/platform/cloud:http_request", - ], - alwayslink = 1, -) - -tf_proto_library( - name = "bigquery_table_partition_proto", - srcs = [ - "bigquery_table_partition.proto", - ], - cc_api_version = 2, -) - -tf_cc_test( - name = "bigquery_table_accessor_test", - size = "small", - srcs = [ - "bigquery_table_accessor_test.cc", - "bigquery_table_accessor_test_data.h", - ], - deps = [ - ":bigquery_table_accessor", - "//tensorflow/core:lib_internal", - "//tensorflow/core:lib_proto_parsing", - "//tensorflow/core:protos_all_cc", - "//tensorflow/core:test", - "//tensorflow/core:test_main", - "//tensorflow/core/platform/cloud:http_request_fake", - ], -) diff --git a/libs/tensorflow/include/tensorflow/core/kernels/cloud/bigquery_table_accessor.h b/libs/tensorflow/include/tensorflow/core/kernels/cloud/bigquery_table_accessor.h deleted file mode 100644 index 33d1905..0000000 --- a/libs/tensorflow/include/tensorflow/core/kernels/cloud/bigquery_table_accessor.h +++ /dev/null @@ -1,207 +0,0 @@ -/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CLOUD_BIGQUERY_PARTITION_ACCESSOR_H_ -#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CLOUD_BIGQUERY_PARTITION_ACCESSOR_H_ - -#include -#include -#include -#include "tensorflow/core/example/example.pb.h" -#include "tensorflow/core/kernels/cloud/bigquery_table_partition.pb.h" -#include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/platform/cloud/google_auth_provider.h" -#include "tensorflow/core/platform/cloud/http_request.h" - -namespace tensorflow { - -/// This class facilitates accessing BigQuery tables. -/// -/// Notes: -/// - Nested fields are not supported. -/// - BigQuery 'Record's are automatically flattened, -/// - BigQuery float type is a double but is converted to a C++ float in this -/// class. -/// - It is possible for a table snapshot to go out-of-scope in the BigQuery -/// service while accessing the table if a very old timestamp is used. For -/// exact details, see 'Table Decorators' in BigQuery docs. -class BigQueryTableAccessor { - public: - // Column types supported by BigQuery. - enum class ColumnType { - kString = 0, - kBytes, - kInteger, - kFloat, - kBoolean, - kTimestamp, - kDate, - kTime, - kDatetime, - kRecord, - kNone - }; - - /// \brief Creates a new BigQueryTableAccessor object. - // - // We do not allow relative (negative or zero) snapshot times here since we - // want to have a consistent snapshot of the table for the lifetime of this - // object. - // Use end_point if you want to connect to a different end point than the - // official BigQuery end point. Otherwise send an empty string. 
- static Status New(const string& project_id, const string& dataset_id, - const string& table_id, int64 timestamp_millis, - int64 row_buffer_size, const string& end_point, - const std::vector& columns, - const BigQueryTablePartition& partition, - std::unique_ptr* accessor); - - /// \brief Starts reading a new partition. - Status SetPartition(const BigQueryTablePartition& partition); - - /// \brief Returns true if there are more rows available in the current - /// partition. - bool Done(); - - /// \brief Returns a single row as example proto. - /// - /// This function will return an error if the table snapshot goes out of scope - /// in the BigQuery service. - Status ReadRow(int64* row_id, Example* example); - - /// \brief Returns total number of rows in the table. - int64 total_num_rows() { return total_num_rows_; } - - virtual ~BigQueryTableAccessor() {} - - private: - friend class BigQueryTableAccessorTest; - - // This struct encapsulates schema nodes for a BigQuery table. - struct SchemaNode { - SchemaNode() {} - SchemaNode(const string& name, ColumnType type) : name(name), type(type) {} - - string name; - ColumnType type; - std::vector schema_nodes; - }; - - /// If nullptr is passed for http_request_factory and auth_provider the - /// default production ones are used. This can be used by tests to override - /// these two variables. - static Status New(const string& project_id, const string& dataset_id, - const string& table_id, int64 timestamp_millis, - int64 row_buffer_size, const string& end_point, - const std::vector& columns, - const BigQueryTablePartition& partition, - std::unique_ptr auth_provider, - std::unique_ptr http_request_factory, - std::unique_ptr* accessor); - - /// \brief Constructs an object for a given table and partition. 
- BigQueryTableAccessor(const string& project_id, const string& dataset_id, - const string& table_id, int64 timestamp_millis, - int64 row_buffer_size, const string& end_point, - const std::vector& columns, - const BigQueryTablePartition& partition); - - /// Used for unit testing. - BigQueryTableAccessor( - const string& project_id, const string& dataset_id, - const string& table_id, int64 timestamp_millis, int64 row_buffer_size, - const string& end_point, const std::vector& columns, - const BigQueryTablePartition& partition, - std::unique_ptr auth_provider, - std::unique_ptr http_request_factory); - - /// \brief Parses column values for a given row. - Status ParseColumnValues(const Json::Value& value, - const SchemaNode& root_schema_node, - Example* example); - - /// \brief Reads the table schema and stores it. - Status ReadSchema(); - - /// \brief Extracts column type from a column in schema. - Status ExtractColumnType(const Json::Value& columns, - const string& column_name_prefix, SchemaNode* root); - - /// \brief Appends a single BigQuery column Value to 'example' for a given - /// column. - Status AppendValueToExample(const string& column_name, - const Json::Value& column_value, - const BigQueryTableAccessor::ColumnType type, - Example* example); - - /// \brief Resets internal counters for reading a partition. - void Reset(); - - /// \brief Helper function that returns BigQuery http endpoint prefix. - string BigQueryUriPrefix(); - - /// \brief Computes the maxResults arg to send to BigQuery. - int64 ComputeMaxResultsArg(); - - /// \brief Returns full name of the underlying table name. - string FullTableName() { - return strings::StrCat(project_id_, ":", dataset_id_, ".", table_id_, "@", - timestamp_millis_); - } - - const string project_id_; - const string dataset_id_; - const string table_id_; - - // Snapshot timestamp. - const int64 timestamp_millis_; - - // Columns that should be read. Empty means all columns. 
- const std::set columns_; - - // HTTP address of BigQuery end point to use. - const string bigquery_end_point_; - - // Describes the portion of the table that we are currently accessing. - BigQueryTablePartition partition_; - - // Total number of rows in the underlying table. - int64 total_num_rows_ = 0; - - // Offset of the first row in the underlying row_buffer_. - int64 first_buffered_row_index_ = 0; - - // Offset of the next row in the row_buffer_. -1 indicates that this index - // is invalid. - int next_row_in_buffer_ = -1; - - // This buffer holds next rows to improve performance. Its size will be - // based on how much buffering was requested. - std::vector row_buffer_; - - // If next_page is set, it will used to read next batch of data. - string next_page_token_; - - // A tree representing the schema for the underlying table. - SchemaNode schema_root_; - - std::unique_ptr auth_provider_; - std::unique_ptr http_request_factory_; - - TF_DISALLOW_COPY_AND_ASSIGN(BigQueryTableAccessor); -}; - -} // namespace tensorflow -#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CLOUD_BIGQUERY_PARTITION_ACCESSOR_H_ diff --git a/libs/tensorflow/include/tensorflow/core/kernels/cloud/bigquery_table_accessor_test_data.h b/libs/tensorflow/include/tensorflow/core/kernels/cloud/bigquery_table_accessor_test_data.h deleted file mode 100644 index e339ff2..0000000 --- a/libs/tensorflow/include/tensorflow/core/kernels/cloud/bigquery_table_accessor_test_data.h +++ /dev/null @@ -1,325 +0,0 @@ -/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CLOUD_BIGQUERY_TABLE_ACCESSOR_TEST_DATA_H_ -#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CLOUD_BIGQUERY_TABLE_ACCESSOR_TEST_DATA_H_ - -#include - -namespace tensorflow { -namespace { - -const string kSampleSchema = R"({ - "kind": "bigquery#table", - "etag": "\"4zcX32ezvFoFzxHoG04qJqKZk6c/MTQ1Nzk3NTgwNzE4Mw\"", - "id": "test-project:test-dataset.test-table", - "schema": { - "fields": [ - { - "name": "int_field", - "type": "INTEGER", - "mode": "REQUIRED" - },{ - "name": "str_field", - "type": "STRING", - "mode": "NULLABLE" - },{ - "name": "rec_field", - "type": "RECORD", - "fields": [ - { - "name": "float_field", - "type": "FLOAT", - "mode": "NULLABLE" - }] - },{ - "name": "bool_field", - "type": "BOOLEAN", - "mode": "NULLABLE" - },{ - "name": "bytes_field", - "type": "BYTES", - "mode": "NULLABLE" - },{ - "name": "timestamp_field", - "type": "TIMESTAMP", - "mode": "NULLABLE" - },{ - "name": "date_field", - "type": "DATE", - "mode": "NULLABLE" - },{ - "name": "time_field", - "type": "TIME", - "mode": "NULLABLE" - },{ - "name": "datetime_field", - "type": "DATETIME", - "mode": "NULLABLE" - }] - }, - "numRows": "4" -})"; - -const string kTestRow = R"({ - "kind": "bigquery#table", - "etag": "\"4zcX32ezvFoFzxHoG04qJqKZk6c/MTQ1Nzk3NTgwNzE4Mw\"", - "id": "test-project:test-dataset.test-table", - "rows": [ - { - "f": [ - { - "v": "1234" - },{ - "v": "" - },{ - "v": { - "f": [ - { - "v": "1.23456" - }] - } - },{ - "v": "true" - },{ - "v": "01010100101" - },{ - "v": "timestamp" - },{ - "v": "date" - },{ - "v": "time" - },{ - "v": "datetime" - }]}]})"; - -const string kBrokenTestRow = R"({ - "kind": "bigquery#table", - "etag": "\"4zcX32ezvFoFzxHoG04qJqKZk6c/MTQ1Nzk3NTgwNzE4Mw\"", - "id": "test-project:test-dataset.test-table", - "rows": [ - { - "f": [ - 
{ - "v": "1-234" // This does not parse as integer. - },{ - "v": "" - },{ - },{ - "v": "true" - },{ - "v": "01010100101" - },{ - "v": "timestamp" - },{ - "v": "date" - },{ - "v": "time" - },{ - "v": "datetime" - }]}]})"; - -const string kTestRowWithNulls = R"({ - "kind": "bigquery#table", - "etag": "\"4zcX32ezvFoFzxHoG04qJqKZk6c/MTQ1Nzk3NTgwNzE4Mw\"", - "id": "test-project:test-dataset.test-table", - "rows": [ - { - "f": [ - { - "v": "1234" - },{ - "v": "string" - },{ - "v": null - },{ - "v": "true" - },{ - "v": "01010100101" - },{ - "v": "" - },{ - "v": null - },{ - "v": null - },{ - "v": "datetime" - }]}]})"; - -// Example proto corresponding to kTestRow. -const string kTestExampleProto = R"(features { - feature { - key: "bool_field" - value { - int64_list { - value: 1 - } - } - } - feature { - key: "bytes_field" - value { - bytes_list { - value: "01010100101" - } - } - } - feature { - key: "date_field" - value { - bytes_list { - value: "date" - } - } - } - feature { - key: "datetime_field" - value { - bytes_list { - value: "datetime" - } - } - } - feature { - key: "int_field" - value { - int64_list { - value: 1234 - } - } - } - feature { - key: "rec_field.float_field" - value { - float_list { - value: 1.23456 - } - } - } - feature { - key: "str_field" - value { - bytes_list { - value: "" - } - } - } - feature { - key: "time_field" - value { - bytes_list { - value: "time" - } - } - } - feature { - key: "timestamp_field" - value { - bytes_list { - value: "timestamp" - } - } - } -} -)"; - -// Example proto corresponding to kTestRowWithNulls. 
-const string kTestExampleProtoWithNulls = R"(features { - feature { - key: "bool_field" - value { - int64_list { - value: 1 - } - } - } - feature { - key: "bytes_field" - value { - bytes_list { - value: "01010100101" - } - } - } - feature { - key: "datetime_field" - value { - bytes_list { - value: "datetime" - } - } - } - feature { - key: "int_field" - value { - int64_list { - value: 1234 - } - } - } - feature { - key: "timestamp_field" - value { - bytes_list { - value: "" - } - } - } - feature { - key: "str_field" - value { - bytes_list { - value: "string" - } - } - } -} -)"; - -// Example proto corresponding to part of kTestRow. -const string kTestPartialExampleProto = R"(features { - feature { - key: "bool_field" - value { - int64_list { - value: 1 - } - } - } - feature { - key: "rec_field.float_field" - value { - float_list { - value: 1.23456 - } - } - } -} -)"; - -const string kTestTwoRows = R"({ - "kind": "bigquery#table", - "etag": "\"4zcX32ezvFoFzxHoG04qJqKZk6c/MTQ1Nzk3NTgwNzE4Mw\"", - "pageToken": "next_page", - "id": "test-project:test-dataset.test-table", - "rows": [ - {"f": [{"v": "1111"},{},{},{},{},{},{},{},{}]}, - {"f": [{"v": "2222"},{},{},{},{},{},{},{},{}]} - ]})"; - -} // namespace -} // namepsace tensorflow - -#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CLOUD_BIGQUERY_TABLE_ACCESSOR_TEST_DATA_H_ diff --git a/libs/tensorflow/include/tensorflow/core/kernels/conditional_accumulator.h b/libs/tensorflow/include/tensorflow/core/kernels/conditional_accumulator.h index f8c340a..414891b 100644 --- a/libs/tensorflow/include/tensorflow/core/kernels/conditional_accumulator.h +++ b/libs/tensorflow/include/tensorflow/core/kernels/conditional_accumulator.h @@ -85,8 +85,10 @@ class ConditionalAccumulator void AllocateAndAssignToAccumGradFunction(OpKernelContext* ctx, const Tensor* grad) override { + // TODO(b/32704451): Don't just ignore the ::tensorflow::Status object! 
ctx->allocate_persistent(dtype_, grad->shape(), &accum_grad_persistent_, - &accum_grad_); + &accum_grad_) + .IgnoreError(); accum_grad_->flat().device(ctx->template eigen_device()) = grad->flat(); } diff --git a/libs/tensorflow/include/tensorflow/core/kernels/conv_grad_ops.h b/libs/tensorflow/include/tensorflow/core/kernels/conv_grad_ops.h index 507ffde..3ea9510 100644 --- a/libs/tensorflow/include/tensorflow/core/kernels/conv_grad_ops.h +++ b/libs/tensorflow/include/tensorflow/core/kernels/conv_grad_ops.h @@ -178,14 +178,14 @@ struct ConvBackpropSpatialDimension { int64 expanded_output_size; // Number of padding elements to be added before/after this dimension of - // the input when computing Conv2DBackpropInput. + // the input when computing Conv?DBackpropInput. int64 pad_before, pad_after; }; -// Computed dimensions for a Conv2D backpropagation. -struct Conv2DBackpropDimensions { +// Computed dimensions for a backwards convolution. +struct ConvBackpropDimensions { // Information about each spatial dimension. - ConvBackpropSpatialDimension rows, cols; + gtl::InlinedVector spatial_dims; // Batch size. int64 batch_size; @@ -194,14 +194,16 @@ struct Conv2DBackpropDimensions { int64 in_depth, out_depth; }; -// Common code between implementations of Conv2DBackpropInput and -// Conv2DBackpropFilter. Verifies that the dimensions all match, and computes -// sizes/padding for rows and columns. -Status Conv2DBackpropComputeDimensions( - StringPiece label, const TensorShape& input_shape, - const TensorShape& filter_shape, const TensorShape& out_backprop_shape, - const std::vector& strides, Padding padding, - TensorFormat data_format, Conv2DBackpropDimensions* dims); +// Common code between implementations of Conv?DBackpropInput and +// Conv?DBackpropFilter. Verifies that the dimensions all match, and computes +// sizes/padding for the spatial dimensions. 
+Status ConvBackpropComputeDimensions(StringPiece label, int num_spatial_dims, + const TensorShape& input_shape, + const TensorShape& filter_shape, + const TensorShape& out_backprop_shape, + const std::vector& strides, + Padding padding, TensorFormat data_format, + ConvBackpropDimensions* dims); } // namespace tensorflow diff --git a/libs/tensorflow/include/tensorflow/core/kernels/conv_ops_gpu.h b/libs/tensorflow/include/tensorflow/core/kernels/conv_ops_gpu.h index 072096f..34a9c8a 100644 --- a/libs/tensorflow/include/tensorflow/core/kernels/conv_ops_gpu.h +++ b/libs/tensorflow/include/tensorflow/core/kernels/conv_ops_gpu.h @@ -21,6 +21,7 @@ limitations under the License. #include #include #include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" #include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/platform/stream_executor.h" @@ -94,33 +95,28 @@ class CudnnScratchAllocator : public perftools::gputools::ScratchAllocator { // backward conv operations. 
class ConvParameters { public: - ConvParameters(int64 batch, int64 in_depths, int64 in_rows, int64 in_cols, - int64 out_depths, int64 filter_rows, int64 filter_cols, - int64 stride_rows, int64 stride_cols, int64 padding_rows, - int64 padding_cols, int device_id) + using SpatialArray = gtl::InlinedVector; + ConvParameters(int64 batch, int64 in_depths, const SpatialArray& in, + int64 out_depths, const SpatialArray& filter, + const SpatialArray& stride, const SpatialArray& padding, + const DataType& dtype, int device_id) : batch_(batch), in_depths_(in_depths), - in_rows_(in_rows), - in_cols_(in_cols), + in_(in), out_depths_(out_depths), - filter_rows_(filter_rows), - filter_cols_(filter_cols), - stride_rows_(stride_rows), - stride_cols_(stride_cols), - padding_rows_(padding_rows), - padding_cols_(padding_cols), + filter_(filter), + stride_(stride), + padding_(padding), + dtype_(dtype), device_id_(device_id) { hash_code_ = batch; hash_code_ = Hash64Combine(hash_code_, in_depths); - hash_code_ = Hash64Combine(hash_code_, in_rows); - hash_code_ = Hash64Combine(hash_code_, in_cols); + for (int64 val : in) hash_code_ = Hash64Combine(hash_code_, val); hash_code_ = Hash64Combine(hash_code_, out_depths); - hash_code_ = Hash64Combine(hash_code_, filter_rows); - hash_code_ = Hash64Combine(hash_code_, filter_cols); - hash_code_ = Hash64Combine(hash_code_, stride_rows); - hash_code_ = Hash64Combine(hash_code_, stride_cols); - hash_code_ = Hash64Combine(hash_code_, padding_rows); - hash_code_ = Hash64Combine(hash_code_, padding_cols); + for (int64 val : filter) hash_code_ = Hash64Combine(hash_code_, val); + for (int64 val : stride) hash_code_ = Hash64Combine(hash_code_, val); + for (int64 val : padding) hash_code_ = Hash64Combine(hash_code_, val); + hash_code_ = Hash64Combine(hash_code_, dtype); hash_code_ = Hash64Combine(hash_code_, device_id); } bool operator==(const ConvParameters& other) const { @@ -133,28 +129,23 @@ class ConvParameters { uint64 hash() const { return 
hash_code_; } private: - typedef std::tuple - DataType; - - DataType get_data_as_tuple() const { - return std::make_tuple(batch_, in_depths_, in_rows_, in_cols_, out_depths_, - filter_rows_, filter_cols_, stride_rows_, - stride_cols_, padding_rows_, padding_cols_, - device_id_); + typedef std::tuple + ParameterDataType; + + ParameterDataType get_data_as_tuple() const { + return std::make_tuple(batch_, in_depths_, in_, out_depths_, filter_, + stride_, padding_, dtype_, device_id_); } int64 batch_; int64 in_depths_; - int64 in_rows_; - int64 in_cols_; + SpatialArray in_; int64 out_depths_; - int64 filter_rows_; - int64 filter_cols_; - int64 stride_rows_; - int64 stride_cols_; - int64 padding_rows_; - int64 padding_cols_; + SpatialArray filter_; + SpatialArray stride_; + SpatialArray padding_; + DataType dtype_; int device_id_; uint64 hash_code_; }; diff --git a/libs/tensorflow/include/tensorflow/core/kernels/cwise_ops.h b/libs/tensorflow/include/tensorflow/core/kernels/cwise_ops.h index 3349447..fb4c649 100644 --- a/libs/tensorflow/include/tensorflow/core/kernels/cwise_ops.h +++ b/libs/tensorflow/include/tensorflow/core/kernels/cwise_ops.h @@ -753,6 +753,13 @@ struct BinaryFunctor { bool* error); }; +template +struct ApproximateEqual { + void operator()(const Device& d, typename TTypes::ConstFlat x, + typename TTypes::ConstFlat y, T tolerance, + typename TTypes::Flat z); +}; + template bool AllOne(const typename Eigen::array& a) { for (size_t i = 0; i < a.size(); ++i) { diff --git a/libs/tensorflow/include/tensorflow/core/kernels/cwise_ops_common.h b/libs/tensorflow/include/tensorflow/core/kernels/cwise_ops_common.h index c825a91..f30d889 100644 --- a/libs/tensorflow/include/tensorflow/core/kernels/cwise_ops_common.h +++ b/libs/tensorflow/include/tensorflow/core/kernels/cwise_ops_common.h @@ -48,7 +48,9 @@ class BinaryOpShared : public OpKernel { protected: struct BinaryOpState { // Sets up bcast with the shape of in0 and in1, ensures that the bcast - // is valid, 
and if so, allocates out using ctx->output(...). + // is valid, and if so, set out, either by allocating a new buffer using + // ctx->output(...) or by creating an alias for an owned input buffer for + // in-place computation. // Caller must check ctx->status() upon return for non-ok status. // If ctx->status().ok() is true, then out is guaranteed to be allocated. BinaryOpState(OpKernelContext* ctx); @@ -152,6 +154,37 @@ class BinaryOp : public BinaryOpShared { } }; +template +class ApproximateEqualOp : public OpKernel { + public: + explicit ApproximateEqualOp(OpKernelConstruction* context) + : OpKernel(context) { + float tolerance; + OP_REQUIRES_OK(context, context->GetAttr("tolerance", &tolerance)); + tolerance_ = T(tolerance); + } + void Compute(OpKernelContext* context) override { + const Tensor& x_input = context->input(0); + const Tensor& y_input = context->input(1); + OP_REQUIRES( + context, x_input.shape() == y_input.shape(), + errors::InvalidArgument("x and y must be of the same shape. ", + "x shape: ", x_input.shape().DebugString(), + ". y shape: ", y_input.shape().DebugString())); + Tensor* z_output = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(0, x_input.shape(), &z_output)); + const Device& d = context->eigen_device(); + typename TTypes::ConstFlat x(x_input.flat()); + typename TTypes::ConstFlat y(y_input.flat()); + typename TTypes::Flat z(z_output->flat()); + functor::ApproximateEqual()(d, x, y, tolerance_, z); + } + + private: + T tolerance_; +}; + // Basic coefficient-wise binary operations that are known to not require // any broadcasting. This is the case for example of the gradients of // unary operations. 
@@ -168,14 +201,18 @@ class SimpleBinaryOp : public OpKernel { void Compute(OpKernelContext* ctx) override { const Tensor& in0 = ctx->input(0); const Tensor& in1 = ctx->input(1); - - Tensor* out; - OP_REQUIRES_OK(ctx, ctx->allocate_output(0, in0.shape(), &out)); - auto out_flat = out->flat(); auto in0_flat = in0.flat(); auto in1_flat = in1.flat(); const Device& eigen_device = ctx->eigen_device(); + Tensor* out = nullptr; + if (std::is_same::value) { + OP_REQUIRES_OK(ctx, ctx->forward_input_or_allocate_output( + {0, 1}, 0, in0.shape(), &out)); + } else { + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, in0.shape(), &out)); + } + auto out_flat = out->flat(); functor::SimpleBinaryFunctor()(eigen_device, out_flat, in0_flat, in1_flat); } @@ -200,7 +237,12 @@ class UnaryOp : public OpKernel { void Compute(OpKernelContext* ctx) override { const Tensor& inp = ctx->input(0); Tensor* out = nullptr; - OP_REQUIRES_OK(ctx, ctx->allocate_output(0, inp.shape(), &out)); + if (std::is_same::value) { + OP_REQUIRES_OK(ctx, ctx->forward_input_or_allocate_output( + {0}, 0, inp.shape(), &out)); + } else { + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, inp.shape(), &out)); + } functor::UnaryFunctor()( ctx->eigen_device(), out->flat(), inp.flat()); } @@ -407,6 +449,17 @@ struct UnaryFunctor { } }; +// Partial specialization of ApproximateEqual. 
+template +struct ApproximateEqual { + void operator()(const CPUDevice& d, typename TTypes::ConstFlat x, + typename TTypes::ConstFlat y, T tolerance, + typename TTypes::Flat z) { + auto diff = x - y; + z.device(d) = diff.abs() <= tolerance; + } +}; + } // end namespace functor #define REGISTER(OP, D, N, F, T) \ diff --git a/libs/tensorflow/include/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h b/libs/tensorflow/include/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h index d0db68b..b8e23e2 100644 --- a/libs/tensorflow/include/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h +++ b/libs/tensorflow/include/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h @@ -104,6 +104,17 @@ struct BinaryFunctor { } }; +// Partial specialization of ApproximateEqual. +template +struct ApproximateEqual { + void operator()(const GPUDevice& d, typename TTypes::ConstFlat x, + typename TTypes::ConstFlat y, T tolerance, + typename TTypes::Flat z) { + auto diff = x - y; + z.device(d) = diff.abs() <= tolerance; + } +}; + // Macros to explicitly instantiate kernels on GPU for multiple types // (T0, T1, etc.) for UnaryFunctor (e.g., functor::sqrt). #define DEFINE_UNARY1(F, T) template struct UnaryFunctor > @@ -162,6 +173,12 @@ struct BinaryFunctor { DEFINE_BINARY5(F, T0, T1, T2, T3, T4); \ DEFINE_BINARY6(F, T5, T6, T7, T8, T9, T10) +#define DEFINE_APPROXIMATE_EQUAL1(T) \ + template struct ApproximateEqual; +#define DEFINE_APPROXIMATE_EQUAL2(T0, T1) \ + DEFINE_APPROXIMATE_EQUAL1(T0); \ + DEFINE_APPROXIMATE_EQUAL1(T1); + } // end namespace functor } // end namespace tensorflow diff --git a/libs/tensorflow/include/tensorflow/core/kernels/debug_ops.h b/libs/tensorflow/include/tensorflow/core/kernels/debug_ops.h index c6395f8..5437bc5 100644 --- a/libs/tensorflow/include/tensorflow/core/kernels/debug_ops.h +++ b/libs/tensorflow/include/tensorflow/core/kernels/debug_ops.h @@ -16,7 +16,9 @@ limitations under the License. 
#ifndef TENSORFLOW_KERNELS_DEBUG_OP_H_ #define TENSORFLOW_KERNELS_DEBUG_OP_H_ +#if GOOGLE_CUDA #include "tensorflow/core/common_runtime/gpu/gpu_util.h" +#endif #include "tensorflow/core/debug/debug_io_utils.h" #include "tensorflow/core/framework/device_base.h" #include "tensorflow/core/framework/op_kernel.h" @@ -92,9 +94,11 @@ class DebugIdentityOp : public OpKernel { void Compute(OpKernelContext* context) override { if (!debug_urls_.empty()) { + // TODO(b/32704451): Don't just ignore the ::tensorflow::Status object! DebugIO::PublishDebugTensor(tensor_name_, "DebugIdentity", context->input(0), - Env::Default()->NowMicros(), debug_urls_); + Env::Default()->NowMicros(), debug_urls_) + .IgnoreError(); } context->set_output(0, context->input(0)); @@ -142,8 +146,10 @@ class DebugNanCountOp : public OpKernel { output_tensor->vec()(0) = nan_count; if (!debug_urls_.empty()) { + // TODO(b/32704451): Don't just ignore the ::tensorflow::Status object! DebugIO::PublishDebugTensor(tensor_name_, "DebugNanCount", *output_tensor, - Env::Default()->NowMicros(), debug_urls_); + Env::Default()->NowMicros(), debug_urls_) + .IgnoreError(); } } @@ -162,6 +168,10 @@ class DebugNumericSummaryOp : public OpKernel { : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("tensor_name", &tensor_name_)); OP_REQUIRES_OK(context, context->GetAttr("debug_urls", &debug_urls_)); + OP_REQUIRES_OK(context, context->GetAttr("lower_bound", &lower_bound_)); + OP_REQUIRES_OK(context, context->GetAttr("upper_bound", &upper_bound_)); + OP_REQUIRES_OK(context, + context->GetAttr("mute_if_healthy", &mute_if_healthy_)); } void Compute(OpKernelContext* context) override { @@ -190,8 +200,11 @@ class DebugNumericSummaryOp : public OpKernel { const T* input_flat = input.template flat().data(); element_count = input_shape.num_elements(); + const bool is_lower_bound_custom = !Eigen::numext::isinf(lower_bound_); + const bool is_upper_bound_custom = !Eigen::numext::isinf(upper_bound_); + for (int64 i = 0; i < 
element_count; ++i) { - T x = input_flat[i]; + const double x = static_cast(input_flat[i]); if (Eigen::numext::isnan(x)) { nan_count++; } else if (Eigen::numext::isinf(x)) { @@ -201,7 +214,11 @@ class DebugNumericSummaryOp : public OpKernel { positive_inf_count++; } } else { - if (x < 0.0) { + if (is_lower_bound_custom && x <= lower_bound_) { + negative_inf_count++; + } else if (is_upper_bound_custom && x >= upper_bound_) { + positive_inf_count++; + } else if (x < 0.0) { negative_count++; } else if (x > 0.0) { positive_count++; @@ -211,7 +228,8 @@ class DebugNumericSummaryOp : public OpKernel { if (x < min) { min = x; - } else if (x > max) { + } + if (x > max) { max = x; } @@ -226,7 +244,7 @@ class DebugNumericSummaryOp : public OpKernel { // Do a second pass to compute variance. variance = 0.0; for (int64 i = 0; i < element_count; ++i) { - T x = input_flat[i]; + const double x = static_cast(input_flat[i]); if (!Eigen::numext::isnan(x) && !Eigen::numext::isinf(x)) { variance += (x - mean) * (x - mean); } @@ -241,21 +259,25 @@ class DebugNumericSummaryOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(0, shape, &output_tensor)); output_tensor->vec()(0) = static_cast(is_initialized); output_tensor->vec()(1) = static_cast(element_count); - output_tensor->vec()(2) = static_cast(negative_inf_count); - output_tensor->vec()(3) = static_cast(negative_count); - output_tensor->vec()(4) = static_cast(zero_count); - output_tensor->vec()(5) = static_cast(positive_count); - output_tensor->vec()(6) = static_cast(positive_inf_count); - output_tensor->vec()(7) = static_cast(nan_count); + output_tensor->vec()(2) = static_cast(nan_count); + output_tensor->vec()(3) = static_cast(negative_inf_count); + output_tensor->vec()(4) = static_cast(negative_count); + output_tensor->vec()(5) = static_cast(zero_count); + output_tensor->vec()(6) = static_cast(positive_count); + output_tensor->vec()(7) = static_cast(positive_inf_count); output_tensor->vec()(8) = min; 
output_tensor->vec()(9) = max; output_tensor->vec()(10) = mean; output_tensor->vec()(11) = variance; - if (!debug_urls_.empty()) { + bool mute = mute_if_healthy_ && nan_count == 0 && negative_inf_count == 0 && + positive_inf_count == 0; + if (!mute && !debug_urls_.empty()) { + // TODO(b/32704451): Don't just ignore the ::tensorflow::Status object! DebugIO::PublishDebugTensor(tensor_name_, "DebugNumericSummary", *output_tensor, Env::Default()->NowMicros(), - debug_urls_); + debug_urls_) + .IgnoreError(); } } @@ -264,6 +286,9 @@ class DebugNumericSummaryOp : public OpKernel { private: string tensor_name_; std::vector debug_urls_; + float lower_bound_; + float upper_bound_; + bool mute_if_healthy_; }; } // namespace tensorflow diff --git a/libs/tensorflow/include/tensorflow/core/kernels/depthwise_conv_op.h b/libs/tensorflow/include/tensorflow/core/kernels/depthwise_conv_op.h index ed5bf05..1960b02 100644 --- a/libs/tensorflow/include/tensorflow/core/kernels/depthwise_conv_op.h +++ b/libs/tensorflow/include/tensorflow/core/kernels/depthwise_conv_op.h @@ -18,6 +18,7 @@ limitations under the License. 
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/types.h" +#include "tensorflow/core/util/tensor_format.h" namespace tensorflow { diff --git a/libs/tensorflow/include/tensorflow/core/kernels/eigen_pooling.h b/libs/tensorflow/include/tensorflow/core/kernels/eigen_pooling.h index e13c8b9..f93921d 100644 --- a/libs/tensorflow/include/tensorflow/core/kernels/eigen_pooling.h +++ b/libs/tensorflow/include/tensorflow/core/kernels/eigen_pooling.h @@ -330,9 +330,19 @@ struct AvgPoolMeanReducer { #if (EIGEN_ARCH_i386 || EIGEN_ARCH_x86_64) && !defined(__CUDACC__) #ifdef EIGEN_VECTORIZE_AVX512 -#define pequal(a, b) \ - _mm512_maskz_set1_epi32(_mm512_cmp_ps_mask(a, b, _CMP_EQ_UQ), -1) -#define psel(a, b, false_mask) _mm512_ternarylogic_epi64(false_mask, a, b, 0xca) +#define pequal(a, b) \ + _mm512_castsi512_ps( \ + _mm512_maskz_set1_epi32(_mm512_cmp_ps_mask(a, b, _CMP_EQ_UQ), -1)) + +// The ternarylogic function immediate determines the values in the result +// In the case below, 0xd8 implies (false_mask) ? 
(b) : (a) +// For details, refer to the vpternlogd instruction table at +// http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-vol-2c-manual.pdf + +#define psel(a, b, false_mask) \ + _mm512_castsi512_ps(_mm512_ternarylogic_epi32( \ + _mm512_castps_si512(a), _mm512_castps_si512(b), \ + _mm512_castps_si512(false_mask), 0xd8)) #elif defined EIGEN_VECTORIZE_AVX #define pequal(a, b) _mm256_cmp_ps(a, b, _CMP_EQ_UQ) #define psel(a, b, false_mask) _mm256_blendv_ps(a, b, false_mask) diff --git a/libs/tensorflow/include/tensorflow/core/kernels/hexagon/BUILD b/libs/tensorflow/include/tensorflow/core/kernels/hexagon/BUILD index 1034bbd..309a2ca 100644 --- a/libs/tensorflow/include/tensorflow/core/kernels/hexagon/BUILD +++ b/libs/tensorflow/include/tensorflow/core/kernels/hexagon/BUILD @@ -65,6 +65,7 @@ tf_cc_test( "//tensorflow/core/kernels:cwise_op", "//tensorflow/core/kernels:quantized_ops", "//tensorflow/core/kernels:reduction_ops", + "//tensorflow/core/kernels:remote_fused_graph_execute_utils", "//tensorflow/core/kernels:remote_fused_graph_ops", "//tensorflow/core/kernels:reshape_op", "//tensorflow/core/kernels:softmax_op", @@ -86,7 +87,6 @@ tf_kernel_library( "hexagon_control_wrapper.h", "hexagon_ops_definitions.h", "i_graph_transfer_ops_definitions.h", - "i_soc_control_wrapper.h", ], deps = [ "//tensorflow/cc:cc_ops", @@ -96,6 +96,7 @@ tf_kernel_library( "//tensorflow/core:core_cpu", "//tensorflow/core:framework", "//tensorflow/core:lib", + "//tensorflow/core/kernels:remote_fused_graph_execute_utils", "//third_party/eigen3", ], ) @@ -109,13 +110,8 @@ cc_library( ":graph_transferer", "//tensorflow/cc:cc_ops", "//tensorflow/cc:remote_fused_graph_ops", - "//tensorflow/cc:scope", "//tensorflow/core", - "//tensorflow/core:core_cpu", - "//tensorflow/core:framework", - "//tensorflow/core:lib", "//tensorflow/tools/graph_transforms:transform_utils", - "//third_party/eigen3", ], alwayslink = 1, ) @@ -136,3 +132,28 @@ 
tf_cc_test( "//tensorflow/tools/graph_transforms:transform_utils", ], ) + +cc_library( + name = "hexagon_remote_fused_graph_executor_build", + srcs = [ + "hexagon_remote_fused_graph_executor_build.cc", + ], + deps = [ + ":graph_transferer", + "//tensorflow/core/kernels:remote_fused_graph_execute_utils", + ], + alwayslink = 1, +) + +tf_cc_test( + name = "hexagon_remote_fused_graph_executor_build_test", + size = "small", + srcs = ["hexagon_remote_fused_graph_executor_build_test.cc"], + deps = [ + ":hexagon_remote_fused_graph_executor_build", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + "//tensorflow/core/kernels:remote_fused_graph_execute_utils", + ], +) diff --git a/libs/tensorflow/include/tensorflow/core/kernels/hexagon/graph_transfer_utils.h b/libs/tensorflow/include/tensorflow/core/kernels/hexagon/graph_transfer_utils.h index a9de914..80db3bc 100644 --- a/libs/tensorflow/include/tensorflow/core/kernels/hexagon/graph_transfer_utils.h +++ b/libs/tensorflow/include/tensorflow/core/kernels/hexagon/graph_transfer_utils.h @@ -17,8 +17,11 @@ limitations under the License. 
#define TENSORFLOW_PLATFORM_HEXAGON_GRAPH_TRANSFER_UTILS_H_ #include +#include +#include #include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/remote_fused_graph_execute_info.pb.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/kernels/hexagon/graph_transferer.h" #include "tensorflow/core/platform/macros.h" @@ -30,14 +33,18 @@ class GraphTransferUtils { static std::priority_queue> GetTopNFloatResults(const float* const data, const string* const labels, const int element_count); + static void DumpTopNFloatResults(const float* const data, const string* const labels, const int element_count, const int top_n); + static RemoteFusedGraphExecuteInfo BuildRemoteFusedGraphExecuteInfo( + const GraphTransferInfo& graph_transfer_info); + static GraphDef BuildFusedGraphDef( const IGraphTransferOpsDefinitions& ops_definitions, const string& remote_graph_execute_name, - const std::vector& inputs, + const std::vector>& inputs, const std::vector& outputs, const GraphDef& def, GraphTransferer* gt); diff --git a/libs/tensorflow/include/tensorflow/core/kernels/hexagon/graph_transferer.h b/libs/tensorflow/include/tensorflow/core/kernels/hexagon/graph_transferer.h index 5c09ba5..7289e38 100644 --- a/libs/tensorflow/include/tensorflow/core/kernels/hexagon/graph_transferer.h +++ b/libs/tensorflow/include/tensorflow/core/kernels/hexagon/graph_transferer.h @@ -27,6 +27,7 @@ limitations under the License. #include "tensorflow/core/framework/shape_inference.h" #include "tensorflow/core/graph/graph.h" #include "tensorflow/core/kernels/hexagon/i_graph_transfer_ops_definitions.h" +#include "tensorflow/core/kernels/remote_fused_graph_execute_utils.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/util/padding.h" @@ -44,55 +45,32 @@ class GraphTransferer { static constexpr int MAX_SUPPORTED_RANK = 4; // TODO(satok): Remove. Use proto definition instead. 
static constexpr int SHAPE_ARRAY_SIZE = MAX_SUPPORTED_RANK; - using OutputTensorMap = std::unordered_map; - - struct InputNodeInfo { - string name; - Tensor tensor; - }; - - struct OutputTensorInfo { - std::vector output_tensors; - OutputTensorMap output_tensor_map; - }; + using TensorShapeMap = RemoteFusedGraphExecuteUtils::TensorShapeMap; GraphTransferer() = default; // Load graph structure into GraphTransferer + // TODO(satok): Pass a pair of TensorShape and DataType instead of + // Tensor as input_node_info_list. Status LoadGraphFromProto( const IGraphTransferOpsDefinitions& ops_definitions, const GraphDef& graph_def, - const std::vector& input_node_info_list, + const std::vector>& input_node_info_list, const std::vector& output_node_names, const bool shape_inference_for_unkown_shape, - const OutputTensorMap& output_tensor_map); + const TensorShapeMap& output_tensor_map); // Load graph structure into GraphTransferer from protobuf file + // TODO(satok): Pass a pair of TensorShape and DataType instead of + // Tensor as input_node_info_list. 
Status LoadGraphFromProtoFile( const IGraphTransferOpsDefinitions& ops_definitions, const string& graph_def_path, - const std::vector& input_node_info_list, + const std::vector>& input_node_info_list, const std::vector& output_node_names, const bool is_text_proto, const bool shape_inference_for_unknown_shape, const bool dry_run_for_unknown_shape, - OutputTensorInfo* output_tensor_info); - - // Dry run inference and cache the result to get memory mapping - static Status DryRunInference( - const GraphDef& graph_def, - const std::vector& input_node_info_list, - const std::vector& output_node_names, - const bool initialize_by_zero, - std::vector* output_tensors); - - // Dry run inference and fill output tensors to output tensor info - // CAVEAT: Do not add or modify output_tensors in output_tensor_info - // otherwise, address map may be broken by re-allocation inside - // std::vector - static Status DryRunInferenceForAllNode( - const GraphDef& graph_def, - const std::vector& input_node_info_list, - const bool initialize_by_zero, OutputTensorInfo* output_tensor_info); + RemoteFusedGraphExecuteUtils::TensorShapeMap* tensor_shape_map); // Sort params so that all input nodes appear before consumer nodes. 
// CAVEAT: This may be slow if the number of nodes are too large @@ -106,6 +84,9 @@ class GraphTransferer { // Return parameters for graph transfer const GraphTransferInfo& GetGraphTransferInfo() const; + // Return mutable GraphTransferInfo for graph transfer + GraphTransferInfo& GetMutableGraphTransferInfo(); + // Dump verification string of parameters to verify with offline tools void DumpVerificationStringOfNodeTransferParams() const; @@ -121,21 +102,17 @@ class GraphTransferer { int CacheNode(const Node& node); - static bool IsInputNode( - const std::vector& input_node_info_list, - const string& node_name); - bool AreAllInputsCached(const Node& node) const; - Status RegisterNode(const IGraphTransferOpsDefinitions& ops_definitions, - const ShapeRefiner& shape_refiner, - const OutputTensorMap& output_tensor_map, - const Node& node, - const std::vector& input_node_info_list, - const std::vector& output_node_names); + Status RegisterNode( + const IGraphTransferOpsDefinitions& ops_definitions, + const ShapeRefiner& shape_refiner, + const TensorShapeMap& output_tensor_map, const Node& node, + const std::vector>& input_node_info_list, + const std::vector& output_node_names); void RegisterConstantNode(const ShapeRefiner& shape_refiner, const Node& node, - const OutputTensorMap& output_tensor_map); + const TensorShapeMap& output_tensor_map); int RegisterConstantShape(const std::vector& shape); @@ -145,36 +122,36 @@ class GraphTransferer { // TODO(satok): Remove this method once generic reshape op is implemented in // SOC bool IsNodeFlattenReshape(const Node& node, - const OutputTensorMap& output_tensor_map, + const TensorShapeMap& output_tensor_map, const ShapeRefiner& shape_refiner); void RegisterNodeWithPaddingAndStrides( const IGraphTransferOpsDefinitions& ops_definitions, const ShapeRefiner& shape_refiner, - const OutputTensorMap& output_tensor_map, const Node& node); + const TensorShapeMap& output_tensor_map, const Node& node); void RegisterInputNode(const 
IGraphTransferOpsDefinitions& ops_definitions, const ShapeRefiner& shape_refiner, - const OutputTensorMap& output_tensor_map, + const TensorShapeMap& output_tensor_map, const Node& node); void RegisterFlattenNode(const IGraphTransferOpsDefinitions& ops_definitions, const ShapeRefiner& shape_refiner, - const OutputTensorMap& output_tensor_map, + const TensorShapeMap& output_tensor_map, const Node& node); void RegisterGenericNode(const IGraphTransferOpsDefinitions& ops_definitions, const ShapeRefiner& shape_refiner, - const OutputTensorMap& output_tensor_map, + const TensorShapeMap& output_tensor_map, const Node& node); Status RegisterNodeIfAllInputsAreCached( const IGraphTransferOpsDefinitions& ops_definitions, const ShapeRefiner& shape_refiner, const Node& node, const bool only_register_const_node, - const std::vector& input_node_info_list, + const std::vector>& input_node_info_list, const std::vector& output_node_names, - const OutputTensorMap& output_tensor_map); + const TensorShapeMap& output_tensor_map); void AppendNodeParams(const string& name, const int id, const string& type, const int type_id, const int padding, @@ -186,7 +163,7 @@ class GraphTransferer { const std::vector& extra_inputs); void AppendNodeOutputParams(const ShapeRefiner& shape_refiner, - const OutputTensorMap& output_tensor_map, + const TensorShapeMap& output_tensor_map, const int id, const Node& node); static std::array BuildShapeArray( @@ -195,7 +172,7 @@ class GraphTransferer { void AppendNodeParamsWithIoParams( const ShapeRefiner& shape_refiner, - const OutputTensorMap& output_tensor_map, const Node& node, + const TensorShapeMap& output_tensor_map, const Node& node, const string& name, const int id, const string& type, const int type_id, const int padding, const int inputs_size, const std::vector& extra_inputs, const int outputs_size, @@ -206,7 +183,7 @@ class GraphTransferer { static string ToPaddingDebugString(int padding); - static void CheckShape(const OutputTensorMap& 
output_tensor_map, + static void CheckShape(const TensorShapeMap& output_tensor_map, const string& node_name, const std::array& actual); @@ -215,6 +192,18 @@ class GraphTransferer { int node_id, std::unordered_map>& dep_map, std::unordered_set& completed); + // Build tensor from proto + static Status MakeTensorFromProto(const TensorProto& tensor_proto, + Tensor* tensor); + + static bool FindShapeType(const TensorShapeMap& tensor_shape_map, + const string& name, const int port, + const DataType** dt, const TensorShape** shape); + + static bool FindShapeType(const TensorShapeMap& tensor_shape_map, + const string& name, const DataType** dt, + const TensorShape** shape); + void ClearCache(); // Dump pretty print of parameters diff --git a/libs/tensorflow/include/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h b/libs/tensorflow/include/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h index 86540d3..9744888 100644 --- a/libs/tensorflow/include/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h +++ b/libs/tensorflow/include/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h @@ -20,42 +20,47 @@ limitations under the License. #include "tensorflow/core/framework/types.h" #include "tensorflow/core/kernels/hexagon/graph_transferer.h" -#include "tensorflow/core/kernels/hexagon/i_soc_control_wrapper.h" +#include "tensorflow/core/kernels/i_remote_fused_graph_executor.h" #include "tensorflow/core/platform/macros.h" namespace tensorflow { /* - HexagonControlWrapper is implementing interfaces in ISocControlWrapper. + HexagonControlWrapper is implementing interfaces in IRemoteFusedGraphExecutor. This class calls APIs on hexagon via hexagon control binary. TODO(satok): Add more documents about hexagon control binary. 
*/ -class HexagonControlWrapper final : public ISocControlWrapper { +class HexagonControlWrapper final : public IRemoteFusedGraphExecutor { public: HexagonControlWrapper() = default; int GetVersion() final; - bool Init() final; + bool Init(const RemoteFusedGraphExecuteInfo& info) final; bool Finalize() final; - bool SetupGraph(const GraphTransferer& graph_transferer) final; + bool SetupGraph() final; bool ExecuteGraph() final; bool TeardownGraph() final; - bool FillInputNode(const string& node_name, const ConstByteArray bytes) final; bool FillInputNode(const string& node_name, const Tensor& tensor) final; - bool ReadOutputNode(string node_name, std::vector* outputs) final; + bool ReadOutputNode(const string& node_name, + TensorAllocatorFunc tensor_allocator) final; + bool ReadOutputNode(const string& node_name, std::vector* outputs); private: + bool FillInputNode(const string& node_name, const ConstByteArray bytes); + // CAVEAT: Need offset as HVX library reserves some ids static constexpr int NODE_ID_OFFSET = 0x10000; static GraphTransferInfo::NodeInfo* FindNodeInfo( const string& node_name, GraphTransferInfo* graph_transfer_info); + const RemoteFusedGraphExecuteInfo* execute_info_{}; + GraphTransferer graph_transferer_{}; // Dummy float array for input node. // TODO(satok): Use actual data passed by FillInputNode and remove - std::vector dummy_input_float_; + std::vector dummy_input_float_{}; // Dummy byte array for cosnt node. 
// TODO(satok): Remove - std::unordered_map> dummy_const_data_; + std::unordered_map> dummy_const_data_{}; TF_DISALLOW_COPY_AND_ASSIGN(HexagonControlWrapper); }; diff --git a/libs/tensorflow/include/tensorflow/core/kernels/hexagon/i_soc_control_wrapper.h b/libs/tensorflow/include/tensorflow/core/kernels/i_remote_fused_graph_executor.h similarity index 50% rename from libs/tensorflow/include/tensorflow/core/kernels/hexagon/i_soc_control_wrapper.h rename to libs/tensorflow/include/tensorflow/core/kernels/i_remote_fused_graph_executor.h index 86d01b3..fe62a25 100644 --- a/libs/tensorflow/include/tensorflow/core/kernels/hexagon/i_soc_control_wrapper.h +++ b/libs/tensorflow/include/tensorflow/core/kernels/i_remote_fused_graph_executor.h @@ -1,4 +1,4 @@ -/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); vcyou may not use this file except in compliance with the License. @@ -13,62 +13,60 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_I_SOC_CONTROL_WRAPPER_H_ -#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_I_SOC_CONTROL_WRAPPER_H_ +#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_I_REMOTE_GRAPH_EXECUTOR_H_ +#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_I_REMOTE_GRAPH_EXECUTOR_H_ +#include "tensorflow/core/framework/remote_fused_graph_execute_info.pb.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/types.h" -#include "tensorflow/core/kernels/hexagon/graph_transferer.h" #include "tensorflow/core/platform/macros.h" namespace tensorflow { -class ISocControlWrapper { +class IRemoteFusedGraphExecutor { public: using ByteArray = std::tuple; using ConstByteArray = std::tuple; + using TensorAllocatorFunc = std::function; - ISocControlWrapper() = default; - virtual ~ISocControlWrapper() = default; + IRemoteFusedGraphExecutor() = default; + virtual ~IRemoteFusedGraphExecutor() = default; - // Return version of SOC controller library. - // This function is mainly for a debug purpose to verify SOC controller. + // Return version of executor. + // This function is mainly for a debug purpose to verify version of + // executor info. virtual int GetVersion() = 0; - // Initialize SOC. This function should be called before + // Initialize executor. This function is called before // starting graph transfer. - virtual bool Init() = 0; + virtual bool Init(const RemoteFusedGraphExecuteInfo& info) = 0; - // Finalize SOC. This function should be called when all graph executions + // Finalize executor. This function is called when all graph executions // are finished. 
virtual bool Finalize() = 0; - // Setup graph on SOC - virtual bool SetupGraph(const GraphTransferer &graph_transferer) = 0; + // Setup graph + virtual bool SetupGraph() = 0; - // Execute graph on SOC + // Execute graph virtual bool ExecuteGraph() = 0; - // Teardown Graph on SOC + // Teardown Graph virtual bool TeardownGraph() = 0; - // Fill input node's output on SOC with ByteArray - virtual bool FillInputNode(const string& node_name, - const ConstByteArray bytes) = 0; - - // Fill input node's output on SOC with Tensor + // Fill input node's output with Tensor virtual bool FillInputNode(const string& node_name, const Tensor& tensor) = 0; - // Read output node's outputs on SOC - virtual bool ReadOutputNode(string node_name, - std::vector *outputs) = 0; + // Read output node's outputs as ByteArrays + virtual bool ReadOutputNode(const string& node_name, + TensorAllocatorFunc tensor_allocator) = 0; private: - TF_DISALLOW_COPY_AND_ASSIGN(ISocControlWrapper); + TF_DISALLOW_COPY_AND_ASSIGN(IRemoteFusedGraphExecutor); }; } // namespace tensorflow -#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_I_SOC_CONTROL_WRAPPER_H_ +#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_I_REMOTE_GRAPH_EXECUTOR_H_ diff --git a/libs/tensorflow/include/tensorflow/core/kernels/linalg_ops_common.h b/libs/tensorflow/include/tensorflow/core/kernels/linalg_ops_common.h index 8a606ed..ab4142a 100644 --- a/libs/tensorflow/include/tensorflow/core/kernels/linalg_ops_common.h +++ b/libs/tensorflow/include/tensorflow/core/kernels/linalg_ops_common.h @@ -108,6 +108,10 @@ class LinearAlgebraOp : public OpKernel { : static_cast(cost); } + // Returns true if it is safe to forward (alias) input to output buffer + // and expect the kernel to perform the computation inplace. 
+ virtual bool EnableInputForwarding() const { return true; } + using Matrix = Eigen::Matrix; using ConstMatrixMap = Eigen::Map; @@ -127,7 +131,7 @@ class LinearAlgebraOp : public OpKernel { MatrixMaps* outputs) = 0; private: - using TensorInputs = gtl::InlinedVector; + using TensorInputs = gtl::InlinedVector; using TensorOutputs = gtl::InlinedVector; // This function maps slices (matrices) of the input and output tensors using diff --git a/libs/tensorflow/include/tensorflow/core/kernels/lookup_table_op.h b/libs/tensorflow/include/tensorflow/core/kernels/lookup_table_op.h index 062de04..41123a3 100644 --- a/libs/tensorflow/include/tensorflow/core/kernels/lookup_table_op.h +++ b/libs/tensorflow/include/tensorflow/core/kernels/lookup_table_op.h @@ -58,6 +58,10 @@ class LookupTableOp : public OpKernel { container->Unref(); return ctx->status(); } + if (ctx->track_allocations()) { + ctx->record_device_persistent_memory_allocation( + container->MemoryUsed()); + } *ret = container; return Status::OK(); }; diff --git a/libs/tensorflow/include/tensorflow/core/kernels/queue_base.h b/libs/tensorflow/include/tensorflow/core/kernels/queue_base.h index 79b479b..0a0e51a 100644 --- a/libs/tensorflow/include/tensorflow/core/kernels/queue_base.h +++ b/libs/tensorflow/include/tensorflow/core/kernels/queue_base.h @@ -143,7 +143,7 @@ class QueueBase : public QueueInterface { const DataTypeVector component_dtypes_; const std::vector component_shapes_; const string name_; - mutex mu_; + mutable mutex mu_; bool closed_ GUARDED_BY(mu_); struct Attempt; diff --git a/libs/tensorflow/include/tensorflow/core/kernels/record_yielder.h b/libs/tensorflow/include/tensorflow/core/kernels/record_yielder.h index 503644f..44f7c95 100644 --- a/libs/tensorflow/include/tensorflow/core/kernels/record_yielder.h +++ b/libs/tensorflow/include/tensorflow/core/kernels/record_yielder.h @@ -142,7 +142,7 @@ class RecordYielder { // any. 
return stop_ || !status_.ok() || (epoch_end_ && !buf_.empty()) || (!epoch_end_ && - buf_.size() >= std::max(1, opts_.bufsize / 2)); + buf_.size() >= std::max(1, opts_.bufsize / 2)); } void MainLoop(); diff --git a/libs/tensorflow/include/tensorflow/core/kernels/reduction_ops_common.h b/libs/tensorflow/include/tensorflow/core/kernels/reduction_ops_common.h index 19071b4..0cd9c25 100644 --- a/libs/tensorflow/include/tensorflow/core/kernels/reduction_ops_common.h +++ b/libs/tensorflow/include/tensorflow/core/kernels/reduction_ops_common.h @@ -151,19 +151,16 @@ class ReductionOp : public OpKernel { OP_REQUIRES_OK(ctx, helper.Simplify(data, axes, keep_dims_)); CHECK_GE(helper.ndims(), 0); - // The real output shape will be assigned below. - TensorShape empty_shape; - Tensor* out = nullptr; - OP_REQUIRES_OK(ctx, ctx->allocate_output(0, empty_shape, &out)); - if (helper.ndims() == 0 || (helper.ndims() == 1 && !helper.reduce_first_axis())) { // Special case. Reduces nothing. It is unclear why this is // necessary, but tests fail without it. Look into why this // case occurs. - if (!out->CopyFrom(data, helper.out_shape())) { + Tensor out; + if (!out.CopyFrom(data, helper.out_shape())) { ctx->SetStatus(errors::Internal("Error during reduction copy.")); } + ctx->set_output(0, out); return; } @@ -174,8 +171,9 @@ class ReductionOp : public OpKernel { // A temporary tensor whose size matches the size of the reduced // output. Tensor tmp_out; - OP_REQUIRES_OK(ctx, ctx->allocate_temp(out->dtype(), helper.out_reshape(), - &tmp_out, alloc_attr)); + OP_REQUIRES_OK( + ctx, ctx->allocate_temp(ctx->expected_output_dtype(0), + helper.out_reshape(), &tmp_out, alloc_attr)); typedef functor::ReduceFunctor Functor; Constants constants; @@ -233,9 +231,19 @@ class ReductionOp : public OpKernel { // Set the real output using the contents of the reduction but the // real expected output shape. The number of elements should // match between the two shapes. 
- if (!out->CopyFrom(tmp_out, helper.out_shape())) { + Tensor out; + if (!out.CopyFrom(tmp_out, helper.out_shape())) { ctx->SetStatus(errors::Internal("Error during reduction copy.")); } + if (ctx->track_allocations()) { + // The temporary memory becomes the output memory. + if (ctx->allocate_on_host(alloc_attr)) { + ctx->record_host_temp_memory_size(-out.AllocatedBytes()); + } else { + ctx->record_device_temp_memory_size(-out.AllocatedBytes()); + } + } + ctx->set_output(0, out); } private: diff --git a/libs/tensorflow/include/tensorflow/core/kernels/remote_fused_graph_execute_op_test_utils.h b/libs/tensorflow/include/tensorflow/core/kernels/remote_fused_graph_execute_op_test_utils.h new file mode 100644 index 0000000..70d758e --- /dev/null +++ b/libs/tensorflow/include/tensorflow/core/kernels/remote_fused_graph_execute_op_test_utils.h @@ -0,0 +1,42 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_REMOTE_FUSED_GRAPH_EXECUTE_OP_TEST_UTILS_H_ +#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_REMOTE_FUSED_GRAPH_EXECUTE_OP_TEST_UTILS_H_ + +#include "tensorflow/cc/framework/ops.h" +#include "tensorflow/cc/framework/scope.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/macros.h" + +namespace tensorflow { + +// RemoteFusedGraphExecuteOpTestUtils is a set of utilities in tests for +// RemoteFusedGraphExecuteOp. +class RemoteFusedGraphExecuteOpTestUtils { + public: + static Output BuildAddOp(const Scope& scope, const Input& x, const Input& y); + static GraphDef BuildAddGraph(const string& name0, const float val0, + const string& name1, const float val1, + const string& name_out); + + private: + RemoteFusedGraphExecuteOpTestUtils() = delete; + TF_DISALLOW_COPY_AND_ASSIGN(RemoteFusedGraphExecuteOpTestUtils); +}; + +} // namespace tensorflow + +#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_REMOTE_FUSED_GRAPH_EXECUTE_OP_TEST_UTILS_H_ diff --git a/libs/tensorflow/include/tensorflow/core/kernels/remote_fused_graph_execute_utils.h b/libs/tensorflow/include/tensorflow/core/kernels/remote_fused_graph_execute_utils.h new file mode 100644 index 0000000..7c198fb --- /dev/null +++ b/libs/tensorflow/include/tensorflow/core/kernels/remote_fused_graph_execute_utils.h @@ -0,0 +1,125 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_REMOTE_FUSED_GRAPH_EXECUTE_UTILS_H_ +#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_REMOTE_FUSED_GRAPH_EXECUTE_UTILS_H_ + +#include + +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/remote_fused_graph_execute_info.pb.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/graph/graph_constructor.h" +#include "tensorflow/core/kernels/i_remote_fused_graph_executor.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/macros.h" + +namespace tensorflow { + +// RemoteFusedGraphExecuteUtils provides APIs to register and get builder +// functions for IRemoteFusedGraphExecutor. +class RemoteFusedGraphExecuteUtils { + public: + static constexpr const char* const ATTR_OUTPUT_DATA_TYPES = + "_output_data_types"; + static constexpr const char* const ATTR_OUTPUT_SHAPES = "_output_shapes"; + + using ExecutorBuildFunc = std::function* executor)>; + // Registrar class for IRemoteFusedGraphExecutor. + class ExecutorBuildRegistrar { + public: + ExecutorBuildRegistrar(const string& name, ExecutorBuildFunc func); + + private: + TF_DISALLOW_COPY_AND_ASSIGN(ExecutorBuildRegistrar); + }; + using ExecutorBuildRegistry = std::map; + + using TensorShapeType = std::pair; + using TensorShapeMap = + std::unordered_multimap>; + + // Return registered ExecutorBuildFunc for given name. + static const ExecutorBuildFunc* GetExecutorBuildFunc(const string& name); + + // To determine shapes of output tensors of all nodes, dryrun the graph. + // This function supplies memory allocation information when loading + // the graph. This function is used to verify shape inference and actual + // output shape. 
+ static Status DryRunInference( + const GraphDef& graph_def, + const std::vector>& input_node_info_list, + const std::vector& output_node_names, + const bool initialize_by_zero, + std::vector* output_tensors); + + // Dry run inference to obtain shapes for all nodes. + // CAVEAT: Do not add or modify output_tensors in output_tensor_info + // otherwise, address map may be broken by re-allocation inside + // std::vector. + static Status DryRunInferenceForAllNode( + const GraphDef& graph_def, + const std::vector>& input_node_info_list, + const bool initialize_by_zero, TensorShapeMap* tensor_shape_map); + + static bool IsInputNode( + const std::vector>& input_node_info_list, + const string& node_name); + + static void ConvertToTensorShapeMap( + const std::vector>& input_node_info_list, + const std::vector& output_node_names, + const std::vector& output_tensors, + TensorShapeMap* tensor_shape_map); + + static Status MakeTensorFromProto(const TensorProto& tensor_proto, + Tensor* tensor); + + static bool AddOutputTensorShapeType(const std::vector& data_types, + const std::vector& shapes, + NodeDef* node_def); + + static Status AddOutputTensorShapeTypeByTensorShapeMap( + const TensorShapeMap& tensor_shape_map, NodeDef* node_def); + + static Status PropagateShapeInference( + const GraphDef& graph_def, + const std::vector>& input_node_info_list, + Graph* graph, ShapeRefiner* shape_refiner); + + static Status BuildTensorShapeMapFromGraph(const Graph& graph, + const ShapeRefiner& shape_refiner, + TensorShapeMap* tensor_shape_map); + + static const TensorShapeType* GetTensorShapeType( + const TensorShapeMap& tensor_shape_map, const string& node_name); + + static const TensorShapeType* GetTensorShapeType( + const TensorShapeMap& tensor_shape_map, const string& node_name, + const int port); + + private: + static void EmplaceTensorShapeType(const string& name, const Tensor& tensor, + TensorShapeMap* tensor_shape_map); + + static ExecutorBuildRegistry* GetExecutorBuildRegistry(); + 
+ TF_DISALLOW_COPY_AND_ASSIGN(RemoteFusedGraphExecuteUtils); +}; +} // namespace tensorflow + +#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_REMOTE_FUSED_GRAPH_EXECUTE_UTILS_H_ diff --git a/libs/tensorflow/include/tensorflow/core/kernels/softmax_op.h b/libs/tensorflow/include/tensorflow/core/kernels/softmax_op.h index dc61e26..2ae7795 100644 --- a/libs/tensorflow/include/tensorflow/core/kernels/softmax_op.h +++ b/libs/tensorflow/include/tensorflow/core/kernels/softmax_op.h @@ -40,8 +40,8 @@ class SoftmaxOp : public OpKernel { OP_REQUIRES(context, TensorShapeUtils::IsMatrix(logits_in.shape()), errors::InvalidArgument("logits must be 2-dimensional")); Tensor* softmax_out = nullptr; - OP_REQUIRES_OK( - context, context->allocate_output(0, logits_in.shape(), &softmax_out)); + OP_REQUIRES_OK(context, context->forward_input_or_allocate_output( + {0}, 0, logits_in.shape(), &softmax_out)); if (logits_in.NumElements()) { functor::SoftmaxFunctor functor; functor(context->eigen_device(), logits_in.matrix(), diff --git a/libs/tensorflow/include/tensorflow/core/kernels/sparse_conditional_accumulator.h b/libs/tensorflow/include/tensorflow/core/kernels/sparse_conditional_accumulator.h index 8956009..2c1bffb 100644 --- a/libs/tensorflow/include/tensorflow/core/kernels/sparse_conditional_accumulator.h +++ b/libs/tensorflow/include/tensorflow/core/kernels/sparse_conditional_accumulator.h @@ -172,8 +172,10 @@ class SparseConditionalAccumulator } // Assign values to accum_val_tensor + // TODO(b/32704451): Don't just ignore the ::tensorflow::Status object! 
ctx->allocate_persistent(dtype_, grad_val->shape(), accum_val_persistent_, - &accum_val_); + &accum_val_) + .IgnoreError(); accum_val_->flat().device(ctx->template eigen_device()) = grad_val->flat(); diff --git a/libs/tensorflow/include/tensorflow/core/kernels/sparse_matmul_op.h b/libs/tensorflow/include/tensorflow/core/kernels/sparse_matmul_op.h index 61bd659..bff6a0c 100644 --- a/libs/tensorflow/include/tensorflow/core/kernels/sparse_matmul_op.h +++ b/libs/tensorflow/include/tensorflow/core/kernels/sparse_matmul_op.h @@ -255,12 +255,13 @@ EIGEN_STRONG_INLINE Packet8d pbroadcast_second(const Packet8d& a_in) { } template <> EIGEN_STRONG_INLINE Packet8d pbroadcast_third(const Packet8d& a_in) { - Packet2d a = _mm512_extractf32x4_ps(a_in, 1); + Packet2d a = _mm256_extractf128_pd(_mm512_castpd512_pd256(a_in), 1); return _mm512_broadcastsd_pd(a); } template <> EIGEN_STRONG_INLINE Packet8d pbroadcast_fourth(const Packet8d& a_in) { - Packet2d a = _mm_permute_pd(_mm512_extractf32x4_ps(a_in, 1), 3); + Packet2d a = + _mm_permute_pd(_mm256_extractf128_pd(_mm512_castpd512_pd256(a_in), 1), 3); return _mm512_broadcastsd_pd(a); } template <> @@ -417,14 +418,17 @@ EIGEN_STRONG_INLINE Packet8f pbroadcast_fourth(const Packet8f& a) { template EIGEN_DEVICE_FUNC inline Packet16f pexpand_bf16_l(const Packet16f& from) { - return _mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm512_castsi512_si256(from)), - 16); + return _mm512_castsi512_ps(_mm512_slli_epi32( + _mm512_cvtepu16_epi32(_mm512_castsi512_si256(_mm512_castps_si512(from))), + 16)); } template EIGEN_DEVICE_FUNC inline Packet16f pexpand_bf16_u(const Packet16f& from) { - return _mm512_slli_epi32( - _mm512_cvtepu16_epi32(_mm512_extractf64x4_pd(from, 1)), 16); + return _mm512_castsi512_ps( + _mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_castpd_si256( + _mm512_extractf64x4_pd(_mm512_castps_pd(from), 1))), + 16)); } #endif diff --git a/libs/tensorflow/include/tensorflow/core/kernels/strided_slice_op_impl.h 
b/libs/tensorflow/include/tensorflow/core/kernels/strided_slice_op_impl.h index 93cede3..d820db3 100644 --- a/libs/tensorflow/include/tensorflow/core/kernels/strided_slice_op_impl.h +++ b/libs/tensorflow/include/tensorflow/core/kernels/strided_slice_op_impl.h @@ -277,8 +277,10 @@ class HandleStridedSliceAssignCase { #if GOOGLE_CUDA TF_CALL_GPU_PROXY_TYPES(PREVENT_FOR_N_GPU); +TF_CALL_complex64(PREVENT_FOR_N_GPU); TF_CALL_GPU_NUMBER_TYPES(DECLARE_FOR_N_GPU); +TF_CALL_complex64(DECLARE_FOR_N_GPU); DECLARE_FOR_N_GPU(int32); #endif // END GOOGLE_CUDA diff --git a/libs/tensorflow/include/tensorflow/core/kernels/transpose_functor.h b/libs/tensorflow/include/tensorflow/core/kernels/transpose_functor.h index 99f7d8e..91e614e 100644 --- a/libs/tensorflow/include/tensorflow/core/kernels/transpose_functor.h +++ b/libs/tensorflow/include/tensorflow/core/kernels/transpose_functor.h @@ -35,6 +35,9 @@ Status DoTranspose(const Device& device, const Tensor& in, // Implementation details. namespace internal { +typedef gtl::InlinedVector TransposeDimsVec; +typedef gtl::InlinedVector TransposePermsVec; + // Helper to compute 'strides' given a tensor 'shape'. I.e., // strides[i] = prod(shape.dim_size[(i+1):]) template @@ -47,6 +50,57 @@ void ComputeStride(const TensorShape& shape, Index* strides) { } } +// Helper function that takes a tensor shape, a permutation, combines the +// neighboring shapes if their indices in the permutation are consecutive. +// The function outputs the combined shape and new permutation. +// Example: Tensor shape {2, 3, 4, 5, 120} and permutation {0, 4, 1, 2, 3} will +// produce new shape {2, 60, 120} and new permutation {0, 2, 1}. +inline void ReduceTransposeDimensions(const TensorShape& shape, + gtl::ArraySlice perm, + TransposePermsVec* new_perm, + TransposeDimsVec* new_dims) { + CHECK_EQ(shape.dims(), perm.size()); + if (shape.dims() == 1) { + // If input dimension is already 1, no need to reduce dimension. 
+ new_perm->resize(1); + (*new_perm)[0] = perm[0]; + (*new_dims)[0] = shape.dim_size(0); + return; + } + TransposePermsVec new_dim_position(shape.dims(), -1); + TransposeDimsVec combined_dims(shape.dims(), 0); + int cur_head = perm[0]; + new_dim_position[cur_head] = 0; + combined_dims[0] = shape.dim_size(cur_head); + int dim_idx = 0; + for (int perm_idx = 1; perm_idx < shape.dims(); ++perm_idx) { + // If two indices in permutation are consecutive numbers, combine their + // dimensions. + if (cur_head + 1 == perm[perm_idx]) { + cur_head = perm[perm_idx]; + combined_dims[dim_idx] *= shape.dim_size(cur_head); + } else { + // Else start a new dimension. + cur_head = perm[perm_idx]; + dim_idx++; + new_dim_position[cur_head] = dim_idx; + combined_dims[dim_idx] = shape.dim_size(cur_head); + } + } + // Compact the new permutations and dimension sizes. + new_perm->resize(dim_idx + 1); + new_dims->resize(dim_idx + 1); + dim_idx = 0; + for (int i = 0; i < new_dim_position.size(); ++i) { + if (new_dim_position[i] >= 0) { + int new_perm_idx = new_dim_position[i]; + (*new_perm)[dim_idx] = new_perm_idx; + (*new_dims)[dim_idx] = combined_dims[new_perm_idx]; + dim_idx++; + } + } +} + // Device-specific naive implementation for transpose. 
template void TransposeSimple(const Device& d, const Tensor& in, diff --git a/libs/tensorflow/include/tensorflow/core/kernels/transpose_op.h b/libs/tensorflow/include/tensorflow/core/kernels/transpose_op.h index 5f40bce..a69eecc 100644 --- a/libs/tensorflow/include/tensorflow/core/kernels/transpose_op.h +++ b/libs/tensorflow/include/tensorflow/core/kernels/transpose_op.h @@ -41,6 +41,17 @@ class TransposeCpuOp : public TransposeOp { gtl::ArraySlice perm, Tensor* out) override; }; +#ifdef INTEL_MKL +class MklTransposeCpuOp : public TransposeOp { + public: + explicit MklTransposeCpuOp(OpKernelConstruction* ctx) : TransposeOp(ctx) {} + + protected: + Status DoTranspose(OpKernelContext* ctx, const Tensor& in, + gtl::ArraySlice perm, Tensor* out) override; +}; +#endif // INTEL_MKL + class TransposeGpuOp : public TransposeOp { public: explicit TransposeGpuOp(OpKernelConstruction* ctx) : TransposeOp(ctx) {} diff --git a/libs/tensorflow/include/tensorflow/core/kernels/typed_queue.h b/libs/tensorflow/include/tensorflow/core/kernels/typed_queue.h index b09b60f..0d608d9 100644 --- a/libs/tensorflow/include/tensorflow/core/kernels/typed_queue.h +++ b/libs/tensorflow/include/tensorflow/core/kernels/typed_queue.h @@ -16,9 +16,13 @@ limitations under the License. #ifndef TENSORFLOW_CORE_KERNELS_TYPED_QUEUE_H_ #define TENSORFLOW_CORE_KERNELS_TYPED_QUEUE_H_ +#include +#include #include +#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/kernels/queue_base.h" +#include "tensorflow/core/platform/mutex.h" namespace tensorflow { @@ -34,6 +38,8 @@ class TypedQueue : public QueueBase { virtual Status Initialize(); // Must be called before any other method. 
+ int64 MemoryUsed() const override; + protected: std::vector queues_ GUARDED_BY(mu_); }; // class TypedQueue @@ -65,6 +71,52 @@ Status TypedQueue::Initialize() { return Status::OK(); } +namespace { + +template +int64 SizeOf(const SubQueue& sq) { + static_assert(sizeof(SubQueue) != sizeof(SubQueue), "SubQueue size unknown."); + return 0; +} + +template <> +int64 SizeOf(const std::deque& sq) { + if (sq.empty()) { + return 0; + } + return sq.size() * sq.front().AllocatedBytes(); +} + +template <> +int64 SizeOf(const std::vector& sq) { + if (sq.empty()) { + return 0; + } + return sq.size() * sq.front().AllocatedBytes(); +} + +using TensorPair = std::pair; + +template +int64 SizeOf(const std::priority_queue& sq) { + if (sq.empty()) { + return 0; + } + return sq.size() * (sizeof(TensorPair) + sq.top().second.AllocatedBytes()); +} + +} // namespace + +template +int64 TypedQueue::MemoryUsed() const { + int memory_size = 0; + mutex_lock l(mu_); + for (const auto& sq : queues_) { + memory_size += SizeOf(sq); + } + return memory_size; +} + } // namespace tensorflow #endif // TENSORFLOW_CORE_KERNELS_TYPED_QUEUE_H_ diff --git a/libs/tensorflow/include/tensorflow/core/kernels/variable_ops.h b/libs/tensorflow/include/tensorflow/core/kernels/variable_ops.h index 2839c3d..8c173a4 100644 --- a/libs/tensorflow/include/tensorflow/core/kernels/variable_ops.h +++ b/libs/tensorflow/include/tensorflow/core/kernels/variable_ops.h @@ -16,6 +16,7 @@ limitations under the License. 
#ifndef TENSORFLOW_KERNELS_VARIABLE_OPS_H_ #define TENSORFLOW_KERNELS_VARIABLE_OPS_H_ +#include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/resource_mgr.h" @@ -154,6 +155,15 @@ class DestroyTemporaryVariableOp : public OpKernel { OP_REQUIRES(context, rm, errors::Internal("No per-step resource manager.")); OP_REQUIRES_OK(context, rm->Delete( context->step_container()->name(), var_name_)); + if (context->track_allocations()) { + if (context->allocate_on_host(AllocatorAttributes())) { + context->record_host_persistent_memory_allocation( + -tmpvar.AllocatedBytes()); + } else { + context->record_device_persistent_memory_allocation( + -tmpvar.AllocatedBytes()); + } + } } private: diff --git a/libs/tensorflow/include/tensorflow/core/lib/core/error_codes.pb_text-impl.h b/libs/tensorflow/include/tensorflow/core/lib/core/error_codes.pb_text-impl.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/lib/core/error_codes.pb_text.h b/libs/tensorflow/include/tensorflow/core/lib/core/error_codes.pb_text.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/lib/core/status.h b/libs/tensorflow/include/tensorflow/core/lib/core/status.h index 1dde1b4..e345a5d 100644 --- a/libs/tensorflow/include/tensorflow/core/lib/core/status.h +++ b/libs/tensorflow/include/tensorflow/core/lib/core/status.h @@ -22,9 +22,15 @@ limitations under the License. #include "tensorflow/core/lib/core/error_codes.pb.h" #include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" namespace tensorflow { +#if defined(__clang__) +// Only clang supports warn_unused_result as a type annotation. +class TF_MUST_USE_RESULT Status; +#endif + /// @ingroup core /// Denotes success or failure of a call in Tensorflow. 
class Status { @@ -112,8 +118,19 @@ std::ostream& operator<<(std::ostream& os, const Status& x); typedef std::function StatusCallback; -#define TF_CHECK_OK(val) CHECK_EQ(::tensorflow::Status::OK(), (val)) -#define TF_QCHECK_OK(val) QCHECK_EQ(::tensorflow::Status::OK(), (val)) +extern tensorflow::string* TfCheckOpHelperOutOfLine( + const ::tensorflow::Status& v, const char* msg); +inline tensorflow::string* TfCheckOpHelper(::tensorflow::Status v, + const char* msg) { + if (v.ok()) return nullptr; + return TfCheckOpHelperOutOfLine(v, msg); +} +#define TF_CHECK_OK(val) \ + while (::tensorflow::string* _result = TfCheckOpHelper(val, #val)) \ + LOG(FATAL) << *(_result) +#define TF_QCHECK_OK(val) \ + while (::tensorflow::string* _result = TfCheckOpHelper(val, #val)) \ + LOG(QFATAL) << *(_result) // DEBUG only version of TF_CHECK_OK. Compiler still parses 'val' even in opt // mode. diff --git a/libs/tensorflow/include/tensorflow/core/lib/core/threadpool.h b/libs/tensorflow/include/tensorflow/core/lib/core/threadpool.h index 4de1177..251d588 100644 --- a/libs/tensorflow/include/tensorflow/core/lib/core/threadpool.h +++ b/libs/tensorflow/include/tensorflow/core/lib/core/threadpool.h @@ -27,24 +27,36 @@ namespace thread { class ThreadPool { public: - // Construct a pool that contains "num_threads" threads with specified "name". - // env->StartThread() is used to create individual threads. + // Constructs a pool that contains "num_threads" threads with specified + // "name". env->StartThread() is used to create individual threads with the + // given ThreadOptions. If "low_latency_hint" is true the thread pool + // implementation may use it as a hint that lower latency if preferred at the + // cost of higher CPU usage, e.g. by letting one or more idle threads spin + // wait. Conversely, if the threadpool is used to schedule high-latency + // operations like I/O the hint should be set to false. 
// // REQUIRES: num_threads > 0 + ThreadPool(Env* env, const ThreadOptions& thread_options, const string& name, + int num_threads, bool low_latency_hint); + + // Constructs a pool for low-latency ops that contains "num_threads" threads + // with specified "name". env->StartThread() is used to create individual + // threads. + // REQUIRES: num_threads > 0 ThreadPool(Env* env, const string& name, int num_threads); - // Construct a pool that contains "num_threads" threads with specified "name". - // env->StartThread() is used to create individual threads. - // + // Constructs a pool for low-latency ops that contains "num_threads" threads + // with specified "name". env->StartThread() is used to create individual + // threads with the given ThreadOptions. // REQUIRES: num_threads > 0 ThreadPool(Env* env, const ThreadOptions& thread_options, const string& name, int num_threads); - // Wait until all scheduled work has finished and then destroy the + // Waits until all scheduled work has finished and then destroy the // set of threads. ~ThreadPool(); - // Schedule fn() for execution in the pool of threads. + // Schedules fn() for execution in the pool of threads. void Schedule(std::function fn); // ParallelFor shards the "total" units of work assuming each unit of work @@ -60,7 +72,7 @@ class ThreadPool { void ParallelFor(int64 total, int64 cost_per_unit, std::function fn); - // Shard the "total" units of work. For more details, see "ParallelFor". + // Shards the "total" units of work. For more details, see "ParallelFor". // // The function is passed a thread_id between 0 and NumThreads() *inclusive*. 
// This is because some work can happen on the caller thread while the threads diff --git a/libs/tensorflow/include/tensorflow/core/lib/gtl/optional.h b/libs/tensorflow/include/tensorflow/core/lib/gtl/optional.h new file mode 100644 index 0000000..f80b5c1 --- /dev/null +++ b/libs/tensorflow/include/tensorflow/core/lib/gtl/optional.h @@ -0,0 +1,876 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_LIB_GTL_OPTIONAL_H_ +#define TENSORFLOW_LIB_GTL_OPTIONAL_H_ + +#include +#include +#include +#include +#include + +#include "tensorflow/core/platform/logging.h" + +namespace tensorflow { +namespace gtl { + +// A value of type gtl::optional holds either a value of T or an +// "empty" value. When it holds a value of T, it stores it as a direct +// subobject, so sizeof(optional) is approximately sizeof(T)+1. The interface +// is based on the upcoming std::optional, and gtl::optional is +// designed to be cheaply drop-in replaceable by std::optional, once it is +// rolled out. +// +// This implementation is based on the specification in the latest draft as of +// 2017-01-05, section 20.6. +// +// Differences between gtl::optional and std::optional include: +// - constexpr not used for nonconst member functions. +// (dependency on some differences between C++11 and C++14.) +// - nullopt and in_place are not constexpr. 
We need the inline variable +// support in C++17 for external linkage. +// - CHECK instead of throwing std::bad_optional_access. +// - optional::swap() and swap() relies on std::is_(nothrow_)swappable +// which is introduced in C++17. So we assume is_swappable is always true +// and is_nothrow_swappable is same as std::is_trivial. +// - make_optional cannot be constexpr due to absence of guaranteed copy +// elision. +// +// Synopsis: +// +// #include "tensorflow/core/lib/gtl/optional.h" +// +// tensorflow::gtl::optional f() { +// string result; +// if (...) { +// ... +// result = ...; +// return result; +// } else { +// ... +// return tensorflow::gtl::nullopt; +// } +// } +// +// int main() { +// tensorflow::gtl::optional optstr = f(); +// if (optstr) { +// // non-empty +// print(optstr.value()); +// } else { +// // empty +// error(); +// } +// } +template +class optional; + +// The tag constant `in_place` is used as the first parameter of an optional +// constructor to indicate that the remaining arguments should be forwarded +// to the underlying T constructor. +struct in_place_t {}; +extern const in_place_t in_place; + +// The tag constant `nullopt` is used to indicate an empty optional in +// certain functions, such as construction or assignment. +struct nullopt_t { + struct init_t {}; + static init_t init; + // It must not be default-constructible to avoid ambiguity for opt = {}. 
+ // Note the non-const reference, it is to eliminate ambiguity for code like: + // struct S { int value; }; + // + // void Test() { + // optional opt; + // opt = {{}}; + // } + explicit constexpr nullopt_t(init_t& /*unused*/) {} // NOLINT +}; +extern const nullopt_t nullopt; + +namespace internal_optional { + +// define forward locally because std::forward is not constexpr until C++14 +template +constexpr T&& forward(typename std::remove_reference::type& + t) noexcept { // NOLINT(runtime/references) + return static_cast(t); +} + +struct empty_struct {}; +// This class stores the data in optional. +// It is specialized based on whether T is trivially destructible. +// This is the specialization for non trivially destructible type. +template ::value> +class optional_data_dtor_base { + protected: + // Whether there is data or not. + bool engaged_; + // data storage + union { + empty_struct dummy_; + T data_; + }; + + void destruct() noexcept { + if (engaged_) { + data_.~T(); + engaged_ = false; + } + } + + // dummy_ must be initialized for constexpr constructor + constexpr optional_data_dtor_base() noexcept : engaged_(false), dummy_{} {} + + template + constexpr explicit optional_data_dtor_base(in_place_t, Args&&... args) + : engaged_(true), data_(internal_optional::forward(args)...) {} + + ~optional_data_dtor_base() { destruct(); } +}; + +// Specialization for trivially destructible type. +template +class optional_data_dtor_base { + protected: + // Whether there is data or not. + bool engaged_; + // data storage + union { + empty_struct dummy_; + T data_; + }; + void destruct() noexcept { engaged_ = false; } + + // dummy_ must be initialized for constexpr constructor + constexpr optional_data_dtor_base() noexcept : engaged_(false), dummy_{} {} + + template + constexpr explicit optional_data_dtor_base(in_place_t, Args&&... args) + : engaged_(true), data_(internal_optional::forward(args)...) 
{} + + ~optional_data_dtor_base() = default; +}; + +template +class optional_data : public optional_data_dtor_base { + protected: + using base = optional_data_dtor_base; + using base::base; + + T* pointer() { return &this->data_; } + + constexpr const T* pointer() const { return &this->data_; } + + template + void construct(Args&&... args) { + new (pointer()) T(std::forward(args)...); + this->engaged_ = true; + } + + template + void assign(U&& u) { + if (this->engaged_) { + this->data_ = std::forward(u); + } else { + construct(std::forward(u)); + } + } + + optional_data() = default; + + optional_data(const optional_data& rhs) { + if (rhs.engaged_) { + construct(rhs.data_); + } + } + + optional_data(optional_data&& rhs) noexcept( + std::is_nothrow_move_constructible::value) { + if (rhs.engaged_) { + construct(std::move(rhs.data_)); + } + } + + optional_data& operator=(const optional_data& rhs) { + if (rhs.engaged_) { + assign(rhs.data_); + } else { + this->destruct(); + } + return *this; + } + + optional_data& operator=(optional_data&& rhs) noexcept( + std::is_nothrow_move_assignable::value&& + std::is_nothrow_move_constructible::value) { + if (rhs.engaged_) { + assign(std::move(rhs.data_)); + } else { + this->destruct(); + } + return *this; + } +}; + +// ordered by level of restriction, from low to high. +// copyable implies movable. +enum class copy_traits { copyable = 0, movable = 1, non_movable = 2 }; + +// base class for enabling/disabling copy/move constructor. 
+template +class optional_ctor_base; + +template <> +class optional_ctor_base { + public: + constexpr optional_ctor_base() = default; + optional_ctor_base(const optional_ctor_base&) = default; + optional_ctor_base(optional_ctor_base&&) = default; + optional_ctor_base& operator=(const optional_ctor_base&) = default; + optional_ctor_base& operator=(optional_ctor_base&&) = default; +}; + +template <> +class optional_ctor_base { + public: + constexpr optional_ctor_base() = default; + optional_ctor_base(const optional_ctor_base&) = delete; + optional_ctor_base(optional_ctor_base&&) = default; + optional_ctor_base& operator=(const optional_ctor_base&) = default; + optional_ctor_base& operator=(optional_ctor_base&&) = default; +}; + +template <> +class optional_ctor_base { + public: + constexpr optional_ctor_base() = default; + optional_ctor_base(const optional_ctor_base&) = delete; + optional_ctor_base(optional_ctor_base&&) = delete; + optional_ctor_base& operator=(const optional_ctor_base&) = default; + optional_ctor_base& operator=(optional_ctor_base&&) = default; +}; + +// base class for enabling/disabling copy/move assignment. 
+template +class optional_assign_base; + +template <> +class optional_assign_base { + public: + constexpr optional_assign_base() = default; + optional_assign_base(const optional_assign_base&) = default; + optional_assign_base(optional_assign_base&&) = default; + optional_assign_base& operator=(const optional_assign_base&) = default; + optional_assign_base& operator=(optional_assign_base&&) = default; +}; + +template <> +class optional_assign_base { + public: + constexpr optional_assign_base() = default; + optional_assign_base(const optional_assign_base&) = default; + optional_assign_base(optional_assign_base&&) = default; + optional_assign_base& operator=(const optional_assign_base&) = delete; + optional_assign_base& operator=(optional_assign_base&&) = default; +}; + +template <> +class optional_assign_base { + public: + constexpr optional_assign_base() = default; + optional_assign_base(const optional_assign_base&) = default; + optional_assign_base(optional_assign_base&&) = default; + optional_assign_base& operator=(const optional_assign_base&) = delete; + optional_assign_base& operator=(optional_assign_base&&) = delete; +}; + +template +constexpr copy_traits get_ctor_copy_traits() { + return std::is_copy_constructible::value + ? copy_traits::copyable + : std::is_move_constructible::value ? copy_traits::movable + : copy_traits::non_movable; +} + +template +constexpr copy_traits get_assign_copy_traits() { + return std::is_copy_assignable::value && + std::is_copy_constructible::value + ? copy_traits::copyable + : std::is_move_assignable::value && + std::is_move_constructible::value + ? copy_traits::movable + : copy_traits::non_movable; +} + +// Whether T is constructible or convertible from optional. 
+template +struct is_constructible_convertible_from_optional + : std::integral_constant< + bool, std::is_constructible&>::value || + std::is_constructible&&>::value || + std::is_constructible&>::value || + std::is_constructible&&>::value || + std::is_convertible&, T>::value || + std::is_convertible&&, T>::value || + std::is_convertible&, T>::value || + std::is_convertible&&, T>::value> {}; + +// Whether T is constructible or convertible or assignable from optional. +template +struct is_constructible_convertible_assignable_from_optional + : std::integral_constant< + bool, is_constructible_convertible_from_optional::value || + std::is_assignable&>::value || + std::is_assignable&&>::value || + std::is_assignable&>::value || + std::is_assignable&&>::value> {}; + +} // namespace internal_optional + +template +class optional : private internal_optional::optional_data, + private internal_optional::optional_ctor_base< + internal_optional::get_ctor_copy_traits()>, + private internal_optional::optional_assign_base< + internal_optional::get_assign_copy_traits()> { + using data_base = internal_optional::optional_data; + + public: + typedef T value_type; + + // [optional.ctor], constructors + + // A default constructed optional holds the empty value, NOT a default + // constructed T. + constexpr optional() noexcept {} + + // An optional initialized with `nullopt` holds the empty value. + constexpr optional(nullopt_t) noexcept {} // NOLINT(runtime/explicit) + + // Copy constructor, standard semantics. + optional(const optional& src) = default; + + // Move constructor, standard semantics. + optional(optional&& src) = default; + + // optional(in_place, arg1, arg2, arg3) constructs a non-empty optional + // with an in-place constructed value of T(arg1,arg2,arg3). + // TODO(b/34201852): Add std::is_constructible SFINAE. + template + constexpr explicit optional(in_place_t, Args&&... args) + : data_base(in_place_t(), internal_optional::forward(args)...) 
{} + + // optional(in_place, {arg1, arg2, arg3}) constructs a non-empty optional + // with an in-place list-initialized value of T({arg1, arg2, arg3}). + template &, Args&&...>::value>::type> + constexpr explicit optional(in_place_t, std::initializer_list il, + Args&&... args) + : data_base(in_place_t(), il, internal_optional::forward(args)...) { + } + + template < + typename U = T, + typename std::enable_if< + std::is_constructible::value && + !std::is_same::type>::value && + !std::is_same, typename std::decay::type>::value && + std::is_convertible::value, + bool>::type = false> + constexpr optional(U&& v) // NOLINT + : data_base(in_place_t(), internal_optional::forward(v)) {} + + template < + typename U = T, + typename std::enable_if< + std::is_constructible::value && + !std::is_same::type>::value && + !std::is_same, typename std::decay::type>::value && + !std::is_convertible::value, + bool>::type = false> + explicit constexpr optional(U&& v) + : data_base(in_place_t(), internal_optional::forward(v)) {} + + // Converting copy constructor (implicit) + template < + typename U, + typename std::enable_if< + std::is_constructible::value && + !internal_optional::is_constructible_convertible_from_optional< + T, U>::value && + std::is_convertible::value, + bool>::type = false> + optional(const optional& rhs) { // NOLINT + if (rhs) { + this->construct(*rhs); + } + } + + // Converting copy constructor (explicit) + template < + typename U, + typename std::enable_if< + std::is_constructible::value && + !internal_optional::is_constructible_convertible_from_optional< + T, U>::value && + !std::is_convertible::value, + bool>::type = false> + explicit optional(const optional& rhs) { + if (rhs) { + this->construct(*rhs); + } + } + + // Converting move constructor (implicit) + template < + typename U, + typename std::enable_if< + std::is_constructible::value && + !internal_optional::is_constructible_convertible_from_optional< + T, U>::value && + std::is_convertible::value, + 
bool>::type = false> + optional(optional&& rhs) { // NOLINT + if (rhs) { + this->construct(std::move(*rhs)); + } + } + + // Converting move constructor (explicit) + template < + typename U, + typename std::enable_if< + std::is_constructible::value && + !internal_optional::is_constructible_convertible_from_optional< + T, U>::value && + !std::is_convertible::value, + bool>::type = false> + explicit optional(optional&& rhs) { + if (rhs) { + this->construct(std::move(*rhs)); + } + } + + // [optional.dtor], destructor, trivial if T is trivially destructible. + ~optional() = default; + + // [optional.assign], assignment + + // Assignment from nullopt: opt = nullopt + optional& operator=(nullopt_t) noexcept { + this->destruct(); + return *this; + } + + // Copy assigment, standard semantics. + optional& operator=(const optional& src) = default; + + // Move assignment, standard semantics. + optional& operator=(optional&& src) = default; + + // Value assignment + template < + typename U = T, + typename = typename std::enable_if< + !std::is_same, typename std::decay::type>::value && + (!std::is_scalar::value || + !std::is_same::type>::value) && + std::is_constructible::value && + std::is_assignable::value>::type> + optional& operator=(U&& v) { + this->assign(std::forward(v)); + return *this; + } + + template ::value && + std::is_assignable::value && + !internal_optional:: + is_constructible_convertible_assignable_from_optional< + T, U>::value>::type> + optional& operator=(const optional& rhs) { + if (rhs) { + this->assign(*rhs); + } else { + this->destruct(); + } + return *this; + } + + template ::value && + std::is_assignable::value && + !internal_optional:: + is_constructible_convertible_assignable_from_optional< + T, U>::value>::type> + optional& operator=(optional&& rhs) { + if (rhs) { + this->assign(std::move(*rhs)); + } else { + this->destruct(); + } + return *this; + } + + // [optional.mod], modifiers + // Destroys the inner T value if one is present. 
+ void reset() noexcept { this->destruct(); } + + // Emplace reconstruction. (Re)constructs the underlying T in-place with the + // given arguments forwarded: + // + // optional opt; + // opt.emplace(arg1,arg2,arg3); (Constructs Foo(arg1,arg2,arg3)) + // + // If the optional is non-empty, and the `args` refer to subobjects of the + // current object, then behaviour is undefined. This is because the current + // object will be destructed before the new object is constructed with `args`. + // + template ::value>::type> + void emplace(Args&&... args) { + this->destruct(); + this->construct(std::forward(args)...); + } + + // Emplace reconstruction with initializer-list. See immediately above. + template &, Args&&...>::value>::type> + void emplace(std::initializer_list il, Args&&... args) { + this->destruct(); + this->construct(il, std::forward(args)...); + } + + // [optional.swap], swap + // Swap, standard semantics. + void swap(optional& rhs) noexcept( + std::is_nothrow_move_constructible::value&& + std::is_trivial::value) { + if (*this) { + if (rhs) { + using std::swap; + swap(**this, *rhs); + } else { + rhs.construct(std::move(**this)); + this->destruct(); + } + } else { + if (rhs) { + this->construct(std::move(*rhs)); + rhs.destruct(); + } else { + // no effect (swap(disengaged, disengaged)) + } + } + } + + // [optional.observe], observers + // You may use `*opt`, and `opt->m`, to access the underlying T value and T's + // member `m`, respectively. If the optional is empty, behaviour is + // undefined. 
+ constexpr const T* operator->() const { return this->pointer(); } + T* operator->() { + assert(this->engaged_); + return this->pointer(); + } + constexpr const T& operator*() const & { return reference(); } + T& operator*() & { + assert(this->engaged_); + return reference(); + } + constexpr const T&& operator*() const && { return std::move(reference()); } + T&& operator*() && { + assert(this->engaged_); + return std::move(reference()); + } + + // In a bool context an optional will return false if and only if it is + // empty. + // + // if (opt) { + // // do something with opt.value(); + // } else { + // // opt is empty + // } + // + constexpr explicit operator bool() const noexcept { return this->engaged_; } + + // Returns false if and only if *this is empty. + constexpr bool has_value() const noexcept { return this->engaged_; } + + // Use `opt.value()` to get a reference to underlying value. The constness + // and lvalue/rvalue-ness of `opt` is preserved to the view of the T + // subobject. + const T& value() const & { + CHECK(*this) << "Bad optional access"; + return reference(); + } + T& value() & { + CHECK(*this) << "Bad optional access"; + return reference(); + } + T&& value() && { // NOLINT(build/c++11) + CHECK(*this) << "Bad optional access"; + return std::move(reference()); + } + const T&& value() const && { // NOLINT(build/c++11) + CHECK(*this) << "Bad optional access"; + return std::move(reference()); + } + + // Use `opt.value_or(val)` to get either the value of T or the given default + // `val` in the empty case. + template + constexpr T value_or(U&& v) const & { + return static_cast(*this) ? **this + : static_cast(std::forward(v)); + } + template + T value_or(U&& v) && { // NOLINT(build/c++11) + return static_cast(*this) ? std::move(**this) + : static_cast(std::forward(v)); + } + + private: + // Private accessors for internal storage viewed as reference to T. 
+ constexpr const T& reference() const { return *this->pointer(); } + T& reference() { return *(this->pointer()); } + + // T constaint checks. You can't have an optional of nullopt_t, in_place_t or + // a reference. + static_assert( + !std::is_same::type>::value, + "optional is not allowed."); + static_assert( + !std::is_same::type>::value, + "optional is not allowed."); + static_assert(!std::is_reference::value, + "optional is not allowed."); +}; + +// [optional.specalg] +// Swap, standard semantics. +// This function shall not participate in overload resolution unless +// is_move_constructible_v is true and is_swappable_v is true. +// NOTE: we assume is_swappable is always true. There will be a compiling error +// if T is actually not Swappable. +template ::value, + bool>::type = false> +void swap(optional& a, optional& b) noexcept(noexcept(a.swap(b))) { + a.swap(b); +} + +// NOTE: make_optional cannot be constexpr in C++11 because the copy/move +// constructor is not constexpr and we don't have guaranteed copy elision +// util C++17. But they are still declared constexpr for consistency with +// the standard. + +// make_optional(v) creates a non-empty optional where the type T is deduced +// from v. Can also be explicitly instantiated as make_optional(v). +template +constexpr optional::type> make_optional(T&& v) { + return optional::type>(std::forward(v)); +} + +template +constexpr optional make_optional(Args&&... args) { + return optional(in_place_t(), internal_optional::forward(args)...); +} + +template +constexpr optional make_optional(std::initializer_list il, + Args&&... args) { + return optional(in_place_t(), il, + internal_optional::forward(args)...); +} + +// Relational operators. Empty optionals are considered equal to each +// other and less than non-empty optionals. Supports relations between +// optional and optional, between optional and T, and between +// optional and nullopt. +// Note: We're careful to support T having non-bool relationals. 
+ +// Relational operators [optional.relops] +// The C++17 (N4606) "Returns:" statements are translated into code +// in an obvious way here, and the original text retained as function docs. +// Returns: If bool(x) != bool(y), false; otherwise if bool(x) == false, true; +// otherwise *x == *y. +template +constexpr bool operator==(const optional& x, const optional& y) { + return static_cast(x) != static_cast(y) + ? false + : static_cast(x) == false ? true : *x == *y; +} +// Returns: If bool(x) != bool(y), true; otherwise, if bool(x) == false, false; +// otherwise *x != *y. +template +constexpr bool operator!=(const optional& x, const optional& y) { + return static_cast(x) != static_cast(y) + ? true + : static_cast(x) == false ? false : *x != *y; +} +// Returns: If !y, false; otherwise, if !x, true; otherwise *x < *y. +template +constexpr bool operator<(const optional& x, const optional& y) { + return !y ? false : !x ? true : *x < *y; +} +// Returns: If !x, false; otherwise, if !y, true; otherwise *x > *y. +template +constexpr bool operator>(const optional& x, const optional& y) { + return !x ? false : !y ? true : *x > *y; +} +// Returns: If !x, true; otherwise, if !y, false; otherwise *x <= *y. +template +constexpr bool operator<=(const optional& x, const optional& y) { + return !x ? true : !y ? false : *x <= *y; +} +// Returns: If !y, true; otherwise, if !x, false; otherwise *x >= *y. +template +constexpr bool operator>=(const optional& x, const optional& y) { + return !y ? true : !x ? false : *x >= *y; +} + +// Comparison with nullopt [optional.nullops] +// The C++17 (N4606) "Returns:" statements are used directly here. 
+template +constexpr bool operator==(const optional& x, nullopt_t) noexcept { + return !x; +} +template +constexpr bool operator==(nullopt_t, const optional& x) noexcept { + return !x; +} +template +constexpr bool operator!=(const optional& x, nullopt_t) noexcept { + return static_cast(x); +} +template +constexpr bool operator!=(nullopt_t, const optional& x) noexcept { + return static_cast(x); +} +template +constexpr bool operator<(const optional& x, nullopt_t) noexcept { + return false; +} +template +constexpr bool operator<(nullopt_t, const optional& x) noexcept { + return static_cast(x); +} +template +constexpr bool operator<=(const optional& x, nullopt_t) noexcept { + return !x; +} +template +constexpr bool operator<=(nullopt_t, const optional& x) noexcept { + return true; +} +template +constexpr bool operator>(const optional& x, nullopt_t) noexcept { + return static_cast(x); +} +template +constexpr bool operator>(nullopt_t, const optional& x) noexcept { + return false; +} +template +constexpr bool operator>=(const optional& x, nullopt_t) noexcept { + return true; +} +template +constexpr bool operator>=(nullopt_t, const optional& x) noexcept { + return !x; +} + +// Comparison with T [optional.comp_with_t] +// The C++17 (N4606) "Equivalent to:" statements are used directly here. +template +constexpr bool operator==(const optional& x, const T& v) { + return static_cast(x) ? *x == v : false; +} +template +constexpr bool operator==(const T& v, const optional& x) { + return static_cast(x) ? v == *x : false; +} +template +constexpr bool operator!=(const optional& x, const T& v) { + return static_cast(x) ? *x != v : true; +} +template +constexpr bool operator!=(const T& v, const optional& x) { + return static_cast(x) ? v != *x : true; +} +template +constexpr bool operator<(const optional& x, const T& v) { + return static_cast(x) ? *x < v : true; +} +template +constexpr bool operator<(const T& v, const optional& x) { + return static_cast(x) ? 
v < *x : false; +} +template +constexpr bool operator<=(const optional& x, const T& v) { + return static_cast(x) ? *x <= v : true; +} +template +constexpr bool operator<=(const T& v, const optional& x) { + return static_cast(x) ? v <= *x : false; +} +template +constexpr bool operator>(const optional& x, const T& v) { + return static_cast(x) ? *x > v : false; +} +template +constexpr bool operator>(const T& v, const optional& x) { + return static_cast(x) ? v > *x : true; +} +template +constexpr bool operator>=(const optional& x, const T& v) { + return static_cast(x) ? *x >= v : false; +} +template +constexpr bool operator>=(const T& v, const optional& x) { + return static_cast(x) ? v >= *x : true; +} + +} // namespace gtl +} // namespace tensorflow + +namespace std { + +// Normally std::hash specializations are not recommended in tensorflow code, +// but we allow this as it is following a standard library component. +template +struct hash<::tensorflow::gtl::optional> { + size_t operator()(const ::tensorflow::gtl::optional& opt) const { + if (opt) { + return hash()(*opt); + } else { + return static_cast(0x297814aaad196e6dULL); + } + } +}; + +} // namespace std + +#endif // TENSORFLOW_LIB_GTL_OPTIONAL_H_ diff --git a/libs/tensorflow/include/tensorflow/core/ops/compat/BUILD b/libs/tensorflow/include/tensorflow/core/ops/compat/BUILD index fdc53f4..c590343 100644 --- a/libs/tensorflow/include/tensorflow/core/ops/compat/BUILD +++ b/libs/tensorflow/include/tensorflow/core/ops/compat/BUILD @@ -18,6 +18,7 @@ cc_library( hdrs = ["op_compatibility_lib.h"], visibility = ["//visibility:public"], deps = [ + "//tensorflow/core:debug_ops_op_lib", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:ops", diff --git a/libs/tensorflow/include/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/libs/tensorflow/include/tensorflow/core/ops/compat/ops_history.v1.pbtxt index 0061fbc..201a77c 100644 --- 
a/libs/tensorflow/include/tensorflow/core/ops/compat/ops_history.v1.pbtxt +++ b/libs/tensorflow/include/tensorflow/core/ops/compat/ops_history.v1.pbtxt @@ -1255,6 +1255,51 @@ op { } } } +op { + name: "ApproximateEqual" + input_arg { + name: "x" + type_attr: "T" + } + input_arg { + name: "y" + type_attr: "T" + } + output_arg { + name: "z" + type: DT_BOOL + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "tolerance" + type: "float" + default_value { + f: 1e-05 + } + } + is_commutative: true +} op { name: "ArgMax" input_arg { @@ -2997,6 +3042,37 @@ op { } } } +op { + name: "Bincount" + input_arg { + name: "arr" + type: DT_INT32 + } + input_arg { + name: "size" + type: DT_INT32 + } + input_arg { + name: "weights" + type_attr: "T" + } + output_arg { + name: "bins" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + type: DT_FLOAT + type: DT_DOUBLE + } + } + } +} op { name: "Bitcast" input_arg { @@ -4626,6 +4702,58 @@ op { } allows_uninitialized_input: true } +op { + name: "DebugNumericSummary" + input_arg { + name: "input" + type_attr: "T" + } + output_arg { + name: "output" + type: DT_DOUBLE + } + attr { + name: "T" + type: "type" + } + attr { + name: "tensor_name" + type: "string" + default_value { + s: "" + } + } + attr { + name: "debug_urls" + type: "list(string)" + default_value { + list { + } + } + } + attr { + name: "lower_bound" + type: "float" + default_value { + f: -inf + } + } + attr { + name: "upper_bound" + type: "float" + default_value { + f: inf + } + } + attr { + name: "mute_if_healthy" + type: "bool" + default_value { + b: false + } + } + allows_uninitialized_input: true +} op { name: 
"DecodeBase64" input_arg { @@ -4986,6 +5114,101 @@ op { } } } +op { + name: "DepthwiseConv2dNative" + input_arg { + name: "input" + type_attr: "T" + } + input_arg { + name: "filter" + type_attr: "T" + } + output_arg { + name: "output" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + } + } + } + attr { + name: "strides" + type: "list(int)" + } + attr { + name: "padding" + type: "string" + allowed_values { + list { + s: "SAME" + s: "VALID" + } + } + } + attr { + name: "data_format" + type: "string" + default_value { + s: "NHWC" + } + allowed_values { + list { + s: "NHWC" + s: "NCHW" + } + } + } +} +op { + name: "DepthwiseConv2dNativeBackpropFilter" + input_arg { + name: "input" + type_attr: "T" + } + input_arg { + name: "filter_sizes" + type: DT_INT32 + } + input_arg { + name: "out_backprop" + type_attr: "T" + } + output_arg { + name: "output" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + } + } + } + attr { + name: "strides" + type: "list(int)" + } + attr { + name: "padding" + type: "string" + allowed_values { + list { + s: "SAME" + s: "VALID" + } + } + } +} op { name: "DepthwiseConv2dNativeBackpropFilter" input_arg { @@ -5028,6 +5251,62 @@ op { } } } + attr { + name: "data_format" + type: "string" + default_value { + s: "NHWC" + } + allowed_values { + list { + s: "NHWC" + s: "NCHW" + } + } + } +} +op { + name: "DepthwiseConv2dNativeBackpropInput" + input_arg { + name: "input_sizes" + type: DT_INT32 + } + input_arg { + name: "filter" + type_attr: "T" + } + input_arg { + name: "out_backprop" + type_attr: "T" + } + output_arg { + name: "output" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + } + } + } + attr { + name: "strides" + type: "list(int)" + } + attr { + name: "padding" + type: "string" + allowed_values { + list { + s: "SAME" + s: "VALID" + } 
+ } + } } op { name: "DepthwiseConv2dNativeBackpropInput" @@ -5071,6 +5350,19 @@ op { } } } + attr { + name: "data_format" + type: "string" + default_value { + s: "NHWC" + } + allowed_values { + list { + s: "NHWC" + s: "NCHW" + } + } + } } op { name: "Dequantize" @@ -7245,6 +7537,40 @@ op { type: "type" } } +op { + name: "GetSessionHandle" + input_arg { + name: "value" + type_attr: "T" + } + output_arg { + name: "handle" + type: DT_STRING + } + attr { + name: "T" + type: "type" + } + deprecation { + version: 23 + } +} +op { + name: "GetSessionHandleV2" + input_arg { + name: "value" + type_attr: "T" + } + output_arg { + name: "handle" + type: DT_RESOURCE + } + attr { + name: "T" + type: "type" + } + is_stateful: true +} op { name: "GetSessionTensor" input_arg { @@ -7454,6 +7780,51 @@ op { type: DT_COMPLEX64 } } +op { + name: "IRFFT" + input_arg { + name: "input" + type: DT_COMPLEX64 + } + input_arg { + name: "fft_length" + type: DT_INT32 + } + output_arg { + name: "output" + type: DT_FLOAT + } +} +op { + name: "IRFFT2D" + input_arg { + name: "input" + type: DT_COMPLEX64 + } + input_arg { + name: "fft_length" + type: DT_INT32 + } + output_arg { + name: "output" + type: DT_FLOAT + } +} +op { + name: "IRFFT3D" + input_arg { + name: "input" + type: DT_COMPLEX64 + } + input_arg { + name: "fft_length" + type: DT_INT32 + } + output_arg { + name: "output" + type: DT_FLOAT + } +} op { name: "Identity" input_arg { @@ -11121,6 +11492,65 @@ op { version: 21 } } +op { + name: "QuantizeAndDequantize" + input_arg { + name: "input" + type_attr: "T" + } + output_arg { + name: "output" + type_attr: "T" + } + attr { + name: "signed_input" + type: "bool" + default_value { + b: true + } + } + attr { + name: "num_bits" + type: "int" + default_value { + i: 8 + } + } + attr { + name: "range_given" + type: "bool" + default_value { + b: false + } + } + attr { + name: "input_min" + type: "float" + default_value { + f: 0 + } + } + attr { + name: "input_max" + type: "float" + default_value { + 
f: 0 + } + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + } + } + } + deprecation { + version: 22 + } +} op { name: "QuantizeAndDequantizeV2" input_arg { @@ -12527,6 +12957,51 @@ op { } is_stateful: true } +op { + name: "RFFT" + input_arg { + name: "input" + type: DT_FLOAT + } + input_arg { + name: "fft_length" + type: DT_INT32 + } + output_arg { + name: "output" + type: DT_COMPLEX64 + } +} +op { + name: "RFFT2D" + input_arg { + name: "input" + type: DT_FLOAT + } + input_arg { + name: "fft_length" + type: DT_INT32 + } + output_arg { + name: "output" + type: DT_COMPLEX64 + } +} +op { + name: "RFFT3D" + input_arg { + name: "input" + type: DT_FLOAT + } + input_arg { + name: "fft_length" + type: DT_INT32 + } + output_arg { + name: "output" + type: DT_COMPLEX64 + } +} op { name: "RGBToHSV" input_arg { diff --git a/libs/tensorflow/include/tensorflow/core/ops/ops.pbtxt b/libs/tensorflow/include/tensorflow/core/ops/ops.pbtxt index fa27edb..bcd8bd5 100644 --- a/libs/tensorflow/include/tensorflow/core/ops/ops.pbtxt +++ b/libs/tensorflow/include/tensorflow/core/ops/ops.pbtxt @@ -1454,6 +1454,52 @@ op { summary: "Update \'*var\' according to the RMSProp algorithm." 
description: "Note that in dense implementation of this algorithm, ms and mom will\nupdate even if the grad is zero, but in this sparse implementation, ms\nand mom will not update in iterations during which the grad is zero.\n\nmean_square = decay * mean_square + (1-decay) * gradient ** 2\nDelta = learning_rate * gradient / sqrt(mean_square + epsilon)\n\nms <- rho * ms_{t-1} + (1-rho) * grad * grad\nmom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)\nvar <- var - mom" } +op { + name: "ApproximateEqual" + input_arg { + name: "x" + type_attr: "T" + } + input_arg { + name: "y" + type_attr: "T" + } + output_arg { + name: "z" + type: DT_BOOL + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "tolerance" + type: "float" + default_value { + f: 1e-05 + } + } + summary: "Returns the truth value of abs(x-y) < tolerance element-wise." + is_commutative: true +} op { name: "ArgMax" input_arg { @@ -3391,6 +3437,43 @@ op { summary: "Adds `bias` to `value`." description: "This is a deprecated version of BiasAdd and will be soon removed.\n\nThis is a special case of `tf.add` where `bias` is restricted to be 1-D.\nBroadcasting is supported, so `value` may have any number of dimensions." } +op { + name: "Bincount" + input_arg { + name: "arr" + description: "int32 `Tensor`." + type: DT_INT32 + } + input_arg { + name: "size" + description: "non-negative int32 scalar `Tensor`." + type: DT_INT32 + } + input_arg { + name: "weights" + description: "is an int32, int64, float32, or float64 `Tensor` with the same\nshape as `arr`, or a length-0 `Tensor`, in which case it acts as all weights\nequal to 1." 
+ type_attr: "T" + } + output_arg { + name: "bins" + description: "1D `Tensor` with length equal to `size`. The counts or summed weights for\neach value in the range [0, size)." + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + type: DT_FLOAT + type: DT_DOUBLE + } + } + } + summary: "Counts the number of occurrences of each value in an integer array." + description: "Outputs a vector with length `size` and the same dtype as `weights`. If\n`weights` are empty, then index `i` stores the number of times the value `i` is\ncounted in `arr`. If `weights` are non-empty, then index `i` stores the sum of\nthe value in `weights` at each index where the corresponding value in `arr` is\n`i`.\n\nValues in `arr` outside of the range [0, size) are ignored." +} op { name: "Bitcast" input_arg { @@ -4135,14 +4218,17 @@ op { name: "Conv2D" input_arg { name: "input" + description: "A 4-D tensor. The dimension order is interpreted according to the value\nof `data_format`, see below for details." type_attr: "T" } input_arg { name: "filter" + description: "A 4-D tensor of shape\n`[filter_height, filter_width, in_channels, out_channels]`" type_attr: "T" } output_arg { name: "output" + description: "A 4-D tensor. The dimension order is determined by the value of\n`data_format`, see below for details." type_attr: "T" } attr { @@ -4159,7 +4245,7 @@ op { attr { name: "strides" type: "list(int)" - description: "1-D of length 4. The stride of the sliding window for each dimension\nof `input`. Must be in the same order as the dimension specified with format." + description: "1-D tensor of length 4. The stride of the sliding window for each\ndimension of `input`. The dimension order is determined by the value of\n `data_format`, see below for details." } attr { name: "use_cudnn_on_gpu" @@ -4185,7 +4271,7 @@ op { default_value { s: "NHWC" } - description: "Specify the data format of the input and output data. 
With the\ndefault format \"NHWC\", the data is stored in the order of:\n [batch, in_height, in_width, in_channels].\nAlternatively, the format could be \"NCHW\", the data storage order of:\n [batch, in_channels, in_height, in_width]." + description: "Specify the data format of the input and output data. With the\ndefault format \"NHWC\", the data is stored in the order of:\n [batch, height, width, channels].\nAlternatively, the format could be \"NCHW\", the data storage order of:\n [batch, channels, height, width]." allowed_values { list { s: "NHWC" @@ -5200,7 +5286,7 @@ op { } output_arg { name: "output" - description: "A double tensor of shape [12], the elements of which are:\n [0]: is initialized (1.0) or not (0.0).\n [1]: total number of elements\n [2]: -inf count\n [3]: negative element count (excluding -inf)\n [4]: zero element count\n [5]: positive element count (excluding +inf)\n [6]: +inf element count\n [7]: NaN element count\nOutput elements [1:8] are all zero, if the tensor is uninitialized.\n [8]: minimum of all non-inf and non-NaN elements.\n If uninitialized or no such element exists: +inf.\n [9]: maximum of all non-inf and non-NaN elements.\n If uninitialized or no such element exists: -inf.\n [10]: mean of all non-inf and non-NaN elements.\n If uninitialized or no such element exists: NaN.\n [11]: variance of all non-inf and non-NaN elements.\n If uninitialized or no such element exists: NaN." + description: "A double tensor of shape [12], the elements of which are:\n [0]: is initialized (1.0) or not (0.0).\n [1]: total number of elements\n [2]: NaN element count\n [3]: generalized -inf count: elements <= lower_bound. lower_bound is -inf by\n default.\n [4]: negative element count (excluding -inf), if lower_bound is the default\n -inf. Otherwise, this is the count of elements > lower_bound and < 0.\n [5]: zero element count\n [6]: positive element count (excluding +inf), if upper_bound is the default\n -inf. 
Otherwise, this is the count of elements < upper_bound and > 0.\n [7]: generalized +inf count, elements >= upper_bound. upper_bound is +inf by\n default.\nOutput elements [1:8] are all zero, if the tensor is uninitialized.\n [8]: minimum of all non-inf and non-NaN elements.\n If uninitialized or no such element exists: +inf.\n [9]: maximum of all non-inf and non-NaN elements.\n If uninitialized or no such element exists: -inf.\n [10]: mean of all non-inf and non-NaN elements.\n If uninitialized or no such element exists: NaN.\n [11]: variance of all non-inf and non-NaN elements.\n If uninitialized or no such element exists: NaN." type: DT_DOUBLE } attr { @@ -5224,6 +5310,30 @@ op { } description: "List of URLs to debug targets, e.g.,\nfile:///foo/tfdbg_dump, grpc:://localhost:11011" } + attr { + name: "lower_bound" + type: "float" + default_value { + f: -inf + } + description: "(float) The lower bound <= which values will be included in the\ngeneralized -inf count. Default: -inf." + } + attr { + name: "upper_bound" + type: "float" + default_value { + f: inf + } + description: "(float) The upper bound >= which values will be included in the\ngeneralized +inf count. Default: +inf." + } + attr { + name: "mute_if_healthy" + type: "bool" + default_value { + b: false + } + description: "(bool) Do not send data to the debug URLs unless at least one\nof elements [2], [3] and [7] (i.e., the nan count and the generalized -inf and\ninf counts) is non-zero." + } summary: "Debug Numeric Summary Op." description: "Provide a basic summary of numeric value types, range and distribution." allows_uninitialized_input: true @@ -5647,6 +5757,20 @@ op { } } } + attr { + name: "data_format" + type: "string" + default_value { + s: "NHWC" + } + description: "Specify the data format of the input and output data. 
With the\ndefault format \"NHWC\", the data is stored in the order of:\n [batch, height, width, channels].\nAlternatively, the format could be \"NCHW\", the data storage order of:\n [batch, channels, height, width]." + allowed_values { + list { + s: "NHWC" + s: "NCHW" + } + } + } summary: "Computes a 2-D depthwise convolution given 4-D `input` and `filter` tensors." description: "Given an input tensor of shape `[batch, in_height, in_width, in_channels]`\nand a filter / kernel tensor of shape\n`[filter_height, filter_width, in_channels, channel_multiplier]`, containing\n`in_channels` convolutional filters of depth 1, `depthwise_conv2d` applies\na different filter to each input channel (expanding from 1 channel to\n`channel_multiplier` channels for each), then concatenates the results\ntogether. Thus, the output has `in_channels * channel_multiplier` channels.\n\nfor k in 0..in_channels-1\n for q in 0..channel_multiplier-1\n output[b, i, j, k * channel_multiplier + q] =\n sum_{di, dj} input[b, strides[1] * i + di, strides[2] * j + dj, k] *\n filter[di, dj, k, q]\n\nMust have `strides[0] = strides[3] = 1`. For the most common case of the same\nhorizontal and vertices strides, `strides = [1, stride, stride, 1]`." } @@ -5654,7 +5778,7 @@ op { name: "DepthwiseConv2dNativeBackpropFilter" input_arg { name: "input" - description: "4-D with shape `[batch, in_height, in_width, in_channels]`." + description: "4-D with shape based on `data_format`. For example, if\n`data_format` is \'NHWC\' then `input` is a 4-D `[batch, in_height,\nin_width, in_channels]` tensor." type_attr: "T" } input_arg { @@ -5664,7 +5788,7 @@ op { } input_arg { name: "out_backprop" - description: "4-D with shape `[batch, out_height, out_width, out_channels]`.\nGradients w.r.t. the output of the convolution." + description: "4-D with shape based on `data_format`.\nFor example, if `data_format` is \'NHWC\' then\nout_backprop shape is `[batch, out_height, out_width, out_channels]`.\nGradients w.r.t. 
the output of the convolution." type_attr: "T" } output_arg { @@ -5698,13 +5822,27 @@ op { } } } + attr { + name: "data_format" + type: "string" + default_value { + s: "NHWC" + } + description: "Specify the data format of the input and output data. With the\ndefault format \"NHWC\", the data is stored in the order of:\n [batch, height, width, channels].\nAlternatively, the format could be \"NCHW\", the data storage order of:\n [batch, channels, height, width]." + allowed_values { + list { + s: "NHWC" + s: "NCHW" + } + } + } summary: "Computes the gradients of depthwise convolution with respect to the filter." } op { name: "DepthwiseConv2dNativeBackpropInput" input_arg { name: "input_sizes" - description: "An integer vector representing the shape of `input`,\nwhere `input` is a 4-D `[batch, height, width, channels]` tensor." + description: "An integer vector representing the shape of `input`, based\non `data_format`. For example, if `data_format` is \'NHWC\' then\n `input` is a 4-D `[batch, height, width, channels]` tensor." type: DT_INT32 } input_arg { @@ -5714,12 +5852,12 @@ op { } input_arg { name: "out_backprop" - description: "4-D with shape `[batch, out_height, out_width, out_channels]`.\nGradients w.r.t. the output of the convolution." + description: "4-D with shape based on `data_format`.\nFor example, if `data_format` is \'NHWC\' then\nout_backprop shape is `[batch, out_height, out_width, out_channels]`.\nGradients w.r.t. the output of the convolution." type_attr: "T" } output_arg { name: "output" - description: "4-D with shape `[batch, in_height, in_width, in_channels]`. Gradient\nw.r.t. the input of the convolution." + description: "4-D with shape according to `data_format`. For example, if\n`data_format` is \'NHWC\', output shape is `[batch, in_height,\nin_width, in_channels]`. Gradient w.r.t. the input of the\nconvolution." 
type_attr: "T" } attr { @@ -5748,6 +5886,20 @@ op { } } } + attr { + name: "data_format" + type: "string" + default_value { + s: "NHWC" + } + description: "Specify the data format of the input and output data. With the\ndefault format \"NHWC\", the data is stored in the order of:\n [batch, height, width, channels].\nAlternatively, the format could be \"NCHW\", the data storage order of:\n [batch, channels, height, width]." + allowed_values { + list { + s: "NHWC" + s: "NCHW" + } + } + } summary: "Computes the gradients of depthwise convolution with respect to the input." } op { @@ -6777,7 +6929,7 @@ op { } input_arg { name: "offsets" - description: "A 2-D integer tensor of shape `[batch_size, 2]` containing\nthe x, y locations of the center of each window." + description: "A 2-D integer tensor of shape `[batch_size, 2]` containing\nthe y, x locations of the center of each window." type: DT_FLOAT } output_arg { @@ -6884,7 +7036,7 @@ op { } output_arg { name: "output" - description: "A complex64 tensor of the same shape as `input`. The inner-most\ndimension of `input` is replaced with its 1D Fourier Transform." + description: "A complex64 tensor of the same shape as `input`. The inner-most\n dimension of `input` is replaced with its 1D Fourier Transform.\n\n@compatibility(numpy)\nEquivalent to np.fft.fft\n@end_compatibility" type: DT_COMPLEX64 } summary: "Compute the 1-dimensional discrete Fourier Transform over the inner-most" @@ -6899,7 +7051,7 @@ op { } output_arg { name: "output" - description: "A complex64 tensor of the same shape as `input`. The inner-most 2\n dimensions of `input` are replaced with their 2D Fourier Transform.\n\n@compatibility(numpy)\nEquivalent to np.fft2\n@end_compatibility" + description: "A complex64 tensor of the same shape as `input`. 
The inner-most 2\n dimensions of `input` are replaced with their 2D Fourier Transform.\n\n@compatibility(numpy)\nEquivalent to np.fft.fft2\n@end_compatibility" type: DT_COMPLEX64 } summary: "Compute the 2-dimensional discrete Fourier Transform over the inner-most" @@ -6914,7 +7066,7 @@ op { } output_arg { name: "output" - description: "A complex64 tensor of the same shape as `input`. The inner-most 3\n dimensions of `input` are replaced with their 3D Fourier Transform.\n\n@compatibility(numpy)\nEquivalent to np.fft3\n@end_compatibility" + description: "A complex64 tensor of the same shape as `input`. The inner-most 3\n dimensions of `input` are replaced with their 3D Fourier Transform.\n\n@compatibility(numpy)\nEquivalent to np.fft.fftn with 3 dimensions.\n@end_compatibility" type: DT_COMPLEX64 } summary: "Compute the 3-dimensional discrete Fourier Transform over the inner-most 3" @@ -8227,19 +8379,39 @@ op { name: "GetSessionHandle" input_arg { name: "value" - description: "The tensor to be stored." type_attr: "T" } output_arg { name: "handle" - description: "The handle for the tensor stored in the session state." type: DT_STRING } attr { name: "T" type: "type" } + deprecation { + version: 23 + explanation: "Use GetSessionHandleV2" + } +} +op { + name: "GetSessionHandleV2" + input_arg { + name: "value" + description: "The tensor to be stored." + type_attr: "T" + } + output_arg { + name: "handle" + description: "The handle for the tensor stored in the session state, represented\nas a ResourceHandle object." + type: DT_RESOURCE + } + attr { + name: "T" + type: "type" + } summary: "Store the input tensor in the state of the current session." + is_stateful: true } op { name: "GetSessionTensor" @@ -8451,7 +8623,7 @@ op { } output_arg { name: "output" - description: "A complex64 tensor of the same shape as `input`. The inner-most\ndimension of `input` is replaced with its inverse 1D Fourier Transform." + description: "A complex64 tensor of the same shape as `input`. 
The inner-most\n dimension of `input` is replaced with its inverse 1D Fourier Transform.\n\n@compatibility(numpy)\nEquivalent to np.fft.ifft\n@end_compatibility" type: DT_COMPLEX64 } summary: "Compute the inverse 1-dimensional discrete Fourier Transform over the inner-most" @@ -8466,7 +8638,7 @@ op { } output_arg { name: "output" - description: "A complex64 tensor of the same shape as `input`. The inner-most 2\n dimensions of `input` are replaced with their inverse 2D Fourier Transform.\n\n@compatibility(numpy)\nEquivalent to np.ifft2\n@end_compatibility" + description: "A complex64 tensor of the same shape as `input`. The inner-most 2\n dimensions of `input` are replaced with their inverse 2D Fourier Transform.\n\n@compatibility(numpy)\nEquivalent to np.fft.ifft2\n@end_compatibility" type: DT_COMPLEX64 } summary: "Compute the inverse 2-dimensional discrete Fourier Transform over the inner-most" @@ -8481,12 +8653,72 @@ op { } output_arg { name: "output" - description: "A complex64 tensor of the same shape as `input`. The inner-most 3\n dimensions of `input` are replaced with their inverse 3D Fourier Transform.\n\n@compatibility(numpy)\nEquivalent to np.fft3\n@end_compatibility" + description: "A complex64 tensor of the same shape as `input`. The inner-most 3\n dimensions of `input` are replaced with their inverse 3D Fourier Transform.\n\n@compatibility(numpy)\nEquivalent to np.fft.ifftn with 3 dimensions.\n@end_compatibility" type: DT_COMPLEX64 } summary: "Compute the inverse 3-dimensional discrete Fourier Transform over the inner-most" description: "3 dimensions of `input`." } +op { + name: "IRFFT" + input_arg { + name: "input" + description: "A complex64 tensor." + type: DT_COMPLEX64 + } + input_arg { + name: "fft_length" + description: "An int32 tensor of shape [1]. The FFT length." + type: DT_INT32 + } + output_arg { + name: "output" + description: "A float32 tensor of the same rank as `input`. 
The inner-most\n dimension of `input` is replaced with the `fft_length` samples of its inverse\n 1D Fourier Transform.\n\n@compatibility(numpy)\nEquivalent to np.fft.irfft\n@end_compatibility" + type: DT_FLOAT + } + summary: "Compute the inverse 1-dimensional discrete Fourier Transform of a real-valued" + description: "signal over the inner-most dimension of `input`.\n\nThe inner-most dimension of `input` is assumed to be the result of `RFFT`: the\n`fft_length / 2 + 1` unique components of the DFT of a real-valued signal. If\n`fft_length` is not provided, it is computed from the size of the inner-most\ndimension of `input` (`fft_length = 2 * (inner - 1)`). If the FFT length used to\ncompute `input` is odd, it should be provided since it cannot be inferred\nproperly." +} +op { + name: "IRFFT2D" + input_arg { + name: "input" + description: "A complex64 tensor." + type: DT_COMPLEX64 + } + input_arg { + name: "fft_length" + description: "An int32 tensor of shape [2]. The FFT length for each dimension." + type: DT_INT32 + } + output_arg { + name: "output" + description: "A float32 tensor of the same rank as `input`. The inner-most 2\n dimensions of `input` are replaced with the `fft_length` samples of their\n inverse 2D Fourier Transform.\n\n@compatibility(numpy)\nEquivalent to np.fft.irfft2\n@end_compatibility" + type: DT_FLOAT + } + summary: "Compute the inverse 2-dimensional discrete Fourier Transform of a real-valued" + description: "signal over the inner-most 2 dimensions of `input`.\n\nThe inner-most 2 dimensions of `input` are assumed to be the result of `RFFT2D`:\nThe inner-most dimension contains the `fft_length / 2 + 1` unique components of\nthe DFT of a real-valued signal. If `fft_length` is not provided, it is computed\nfrom the size of the inner-most 2 dimensions of `input`. If the FFT length used\nto compute `input` is odd, it should be provided since it cannot be inferred\nproperly." 
+} +op { + name: "IRFFT3D" + input_arg { + name: "input" + description: "A complex64 tensor." + type: DT_COMPLEX64 + } + input_arg { + name: "fft_length" + description: "An int32 tensor of shape [3]. The FFT length for each dimension." + type: DT_INT32 + } + output_arg { + name: "output" + description: "A float32 tensor of the same rank as `input`. The inner-most 3\n dimensions of `input` are replaced with the `fft_length` samples of their\n inverse 3D real Fourier Transform.\n\n@compatibility(numpy)\nEquivalent to np.irfftn with 3 dimensions.\n@end_compatibility" + type: DT_FLOAT + } + summary: "Compute the inverse 3-dimensional discrete Fourier Transform of a real-valued" + description: "signal over the inner-most 3 dimensions of `input`.\n\nThe inner-most 3 dimensions of `input` are assumed to be the result of `RFFT3D`:\nThe inner-most dimension contains the `fft_length / 2 + 1` unique components of\nthe DFT of a real-valued signal. If `fft_length` is not provided, it is computed\nfrom the size of the inner-most 3 dimensions of `input`. If the FFT length used\nto compute `input` is odd, it should be provided since it cannot be inferred\nproperly." +} op { name: "Identity" input_arg { @@ -12556,7 +12788,6 @@ op { name: "QuantizeAndDequantize" input_arg { name: "input" - description: "Tensor to quantize and then dequantize." type_attr: "T" } output_arg { @@ -12569,7 +12800,6 @@ op { default_value { b: true } - description: "If the quantization is signed or unsigned." } attr { name: "num_bits" @@ -12577,7 +12807,6 @@ op { default_value { i: 8 } - description: "The bitwidth of the quantization." } attr { name: "range_given" @@ -12585,7 +12814,6 @@ op { default_value { b: false } - description: "If the range is given or should be computed from the tensor." } attr { name: "input_min" @@ -12593,7 +12821,6 @@ op { default_value { f: 0 } - description: "If range is given, this is the min of the range." 
} attr { name: "input_max" @@ -12601,7 +12828,6 @@ op { default_value { f: 0 } - description: "If range is given, this is the max of the range." } attr { name: "T" @@ -12613,10 +12839,9 @@ op { } } } - summary: "Quantizes then dequantizes a tensor." - description: "This op simulates the precision loss from the quantized forward pass by:\n1. Quantizing the tensor to fixed point numbers, which should match the target\n quantization method when it is used in inference.\n2. Dequantizing it back to floating point numbers for the following ops, most\n likely matmul.\n\nThere are different ways to quantize. This version does not use the full range\nof the output type, choosing to elide the lowest possible value for symmetry\n(e.g., output range is -127 to 127, not -128 to 127 for signed 8 bit\nquantization), so that 0.0 maps to 0.\n\nTo perform this op, we first find the range of values in our tensor. The range\nwe use is always centered on 0, so we find m such that\n\n1. m = max(abs(input_min), abs(input_max)) if range_given is true,\n2. m = max(max(abs(min_elem(input)), abs(max_elem(input))) otherwise.\n\nOur input tensor range is then [-m, m].\n\nNext, we choose our fixed-point quantization buckets, [min_fixed, max_fixed].\nIf signed_input is true, this is\n\n [min_fixed, max_fixed ] =\n [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1].\n\nOtherwise, if signed_input is false, the fixed-point range is\n\n [min_fixed, max_fixed] = [0, (1 << num_bits) - 1].\n\nFrom this we compute our scaling factor, s:\n\n s = (max_fixed - min_fixed) / (2 * m).\n\nNow we can quantize and dequantize the elements of our tensor. An element e\nis transformed into e\':\n\n e\' = (e * s).round_to_nearest() / s.\n\nNote that we have a different number of buckets in the signed vs. unsigned\ncases. For example, if num_bits == 8, we get 254 buckets in the signed case\nvs. 255 in the unsigned case.\n\nFor example, suppose num_bits = 8 and m = 1. 
Then\n\n [min_fixed, max_fixed] = [-127, 127], and\n s = (127 + 127) / 2 = 127.\n\nGiven the vector {-1, -0.5, 0, 0.3}, this is quantized to\n{-127, -63, 0, 38}, and dequantized to {-1, -63.0/127, 0, 38.0/127}." + summary: "Use QuantizeAndDequantizeV2 instead." deprecation { - version: 21 + version: 22 explanation: "Replaced by QuantizeAndDequantizeV2" } } @@ -14245,6 +14470,66 @@ op { summary: "Computes the number of elements in the given queue." is_stateful: true } +op { + name: "RFFT" + input_arg { + name: "input" + description: "A float32 tensor." + type: DT_FLOAT + } + input_arg { + name: "fft_length" + description: "An int32 tensor of shape [1]. The FFT length." + type: DT_INT32 + } + output_arg { + name: "output" + description: "A complex64 tensor of the same rank as `input`. The inner-most\n dimension of `input` is replaced with the `fft_length / 2 + 1` unique\n frequency components of its 1D Fourier Transform.\n\n@compatibility(numpy)\nEquivalent to np.fft.rfft\n@end_compatibility" + type: DT_COMPLEX64 + } + summary: "Compute the 1-dimensional discrete Fourier Transform of a real-valued signal" + description: "over the inner-most dimension of `input`.\n\nSince the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the\n`fft_length / 2 + 1` unique components of the FFT: the zero-frequency term,\nfollowed by the `fft_length / 2` positive-frequency terms." +} +op { + name: "RFFT2D" + input_arg { + name: "input" + description: "A float32 tensor." + type: DT_FLOAT + } + input_arg { + name: "fft_length" + description: "An int32 tensor of shape [2]. The FFT length for each dimension." + type: DT_INT32 + } + output_arg { + name: "output" + description: "A complex64 tensor of the same rank as `input`. The inner-most 2\n dimensions of `input` are replaced with their 2D Fourier Transform. 
The\n inner-most dimension contains `fft_length / 2 + 1` unique frequency\n components.\n\n@compatibility(numpy)\nEquivalent to np.fft.rfft2\n@end_compatibility" + type: DT_COMPLEX64 + } + summary: "Compute the 2-dimensional discrete Fourier Transform of a real-valued signal" + description: "over the inner-most 2 dimensions of `input`.\n\nSince the DFT of a real signal is Hermitian-symmetric, `RFFT2D` only returns the\n`fft_length / 2 + 1` unique components of the FFT for the inner-most dimension\nof `output`: the zero-frequency term, followed by the `fft_length / 2`\npositive-frequency terms." +} +op { + name: "RFFT3D" + input_arg { + name: "input" + description: "A float32 tensor." + type: DT_FLOAT + } + input_arg { + name: "fft_length" + description: "An int32 tensor of shape [3]. The FFT length for each dimension." + type: DT_INT32 + } + output_arg { + name: "output" + description: "A complex64 tensor of the same rank as `input`. The inner-most 3\n dimensions of `input` are replaced with the their 3D Fourier Transform. The\n inner-most dimension contains `fft_length / 2 + 1` unique frequency\n components.\n\n@compatibility(numpy)\nEquivalent to np.fft.rfftn with 3 dimensions.\n@end_compatibility" + type: DT_COMPLEX64 + } + summary: "Compute the 3-dimensional discrete Fourier Transform of a real-valued signal" + description: "over the inner-most 3 dimensions of `input`.\n\nSince the DFT of a real signal is Hermitian-symmetric, `RFFT3D` only returns the\n`fft_length / 2 + 1` unique components of the FFT for the inner-most dimension\nof `output`: the zero-frequency term, followed by the `fft_length / 2`\npositive-frequency terms." +} op { name: "RGBToHSV" input_arg { @@ -18957,7 +19242,7 @@ op { } } summary: "Computes the maximum along segments of a tensor." 
- description: "Read [the section on Segmentation](../../api_docs/python/math_ops.md#segmentation)\nfor an explanation of segments.\n\nComputes a tensor such that\n\\\\(output_i = \\max_j(data_j)\\\\) where `max` is over `j` such\nthat `segment_ids[j] == i`.\n\n
\n\n
" + description: "Read [the section on Segmentation](../../api_docs/python/math_ops.md#segmentation)\nfor an explanation of segments.\n\nComputes a tensor such that\n\\\\(output_i = \\max_j(data_j)\\\\) where `max` is over `j` such\nthat `segment_ids[j] == i`.\n\nIf the max is empty for a given segment ID `i`, `output[i] = 0`.\n\n
\n\n
" } op { name: "SegmentMean" @@ -19003,7 +19288,7 @@ op { } } summary: "Computes the mean along segments of a tensor." - description: "Read [the section on\nSegmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation\nof segments.\n\nComputes a tensor such that\n\\\\(output_i = \\frac{\\sum_j data_j}{N}\\\\) where `mean` is\nover `j` such that `segment_ids[j] == i` and `N` is the total number of\nvalues summed.\n\n
\n\n
" + description: "Read [the section on\nSegmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation\nof segments.\n\nComputes a tensor such that\n\\\\(output_i = \\frac{\\sum_j data_j}{N}\\\\) where `mean` is\nover `j` such that `segment_ids[j] == i` and `N` is the total number of\nvalues summed.\n\nIf the mean is empty for a given segment ID `i`, `output[i] = 0`.\n\n
\n\n
" } op { name: "SegmentMin" @@ -19049,7 +19334,7 @@ op { } } summary: "Computes the minimum along segments of a tensor." - description: "Read [the section on\nSegmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation\nof segments.\n\nComputes a tensor such that\n\\\\(output_i = \\min_j(data_j)\\\\) where `min` is over `j` such\nthat `segment_ids[j] == i`.\n\n
\n\n
" + description: "Read [the section on\nSegmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation\nof segments.\n\nComputes a tensor such that\n\\\\(output_i = \\min_j(data_j)\\\\) where `min` is over `j` such\nthat `segment_ids[j] == i`.\n\nIf the min is empty for a given segment ID `i`, `output[i] = 0`.\n\n
\n\n
" } op { name: "SegmentProd" @@ -19100,7 +19385,7 @@ op { } } summary: "Computes the product along segments of a tensor." - description: "Read [the section on\nSegmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation\nof segments.\n\nComputes a tensor such that\n\\\\(output_i = \\prod_j data_j\\\\) where the product is over `j` such\nthat `segment_ids[j] == i`.\n\n
\n\n
" + description: "Read [the section on\nSegmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation\nof segments.\n\nComputes a tensor such that\n\\\\(output_i = \\prod_j data_j\\\\) where the product is over `j` such\nthat `segment_ids[j] == i`.\n\nIf the product is empty for a given segment ID `i`, `output[i] = 1`.\n\n
\n\n
" } op { name: "SegmentSum" @@ -19151,7 +19436,7 @@ op { } } summary: "Computes the sum along segments of a tensor." - description: "Read [the section on Segmentation](../../api_docs/python/math_ops.md#segmentation)\nfor an explanation of segments.\n\nComputes a tensor such that\n\\\\(output_i = \\sum_j data_j\\\\) where sum is over `j` such\nthat `segment_ids[j] == i`.\n\n
\n\n
" + description: "Read [the section on Segmentation](../../api_docs/python/math_ops.md#segmentation)\nfor an explanation of segments.\n\nComputes a tensor such that\n\\\\(output_i = \\sum_j data_j\\\\) where sum is over `j` such\nthat `segment_ids[j] == i`.\n\nIf the sum is empty for a given segment ID `i`, `output[i] = 0`.\n\n
\n\n
" } op { name: "Select" @@ -23123,7 +23408,7 @@ op { description: "The key for the keyed hash function passed as a list of two uint64\nelements." } summary: "Converts each string in the input Tensor to its hash mod by a number of buckets." - description: "The hash function is deterministic on the content of the string within the\nprocess. The hash function is a keyed hash function, where attribute `key`\ndefines the key of the hash function. `key` is an array of 2 elements.\n\nA strong hash is important when inputs may be malicious, e.g. URLs with\nadditional components. Adversaries could try to make their inputs hash to the\nsame bucket for a denial-of-service attack or to skew the results. A strong\nhash prevents this by making it dificult, if not infeasible, to compute inputs\nthat hash to the same bucket. This comes at a cost of roughly 4x higher compute\ntime than `tf.string_to_hash_bucket_fast`." + description: "The hash function is deterministic on the content of the string within the\nprocess. The hash function is a keyed hash function, where attribute `key`\ndefines the key of the hash function. `key` is an array of 2 elements.\n\nA strong hash is important when inputs may be malicious, e.g. URLs with\nadditional components. Adversaries could try to make their inputs hash to the\nsame bucket for a denial-of-service attack or to skew the results. A strong\nhash prevents this by making it difficult, if not infeasible, to compute inputs\nthat hash to the same bucket. This comes at a cost of roughly 4x higher compute\ntime than `tf.string_to_hash_bucket_fast`." 
} op { name: "StringToNumber" diff --git a/libs/tensorflow/include/tensorflow/core/platform/cloud/BUILD b/libs/tensorflow/include/tensorflow/core/platform/cloud/BUILD index 84b0bda..4df88b8 100644 --- a/libs/tensorflow/include/tensorflow/core/platform/cloud/BUILD +++ b/libs/tensorflow/include/tensorflow/core/platform/cloud/BUILD @@ -38,6 +38,7 @@ cc_library( ":google_auth_provider", ":http_request", ":retrying_file_system", + ":retrying_utils", ":time_util", "//tensorflow/core:framework_headers_lib", "//tensorflow/core:lib_internal", @@ -239,3 +240,16 @@ tf_cc_test( "//tensorflow/core:test_main", ], ) + +tf_cc_test( + name = "retrying_utils_test", + size = "small", + srcs = ["retrying_utils_test.cc"], + deps = [ + ":retrying_utils", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) diff --git a/libs/tensorflow/include/tensorflow/core/platform/cloud/gcs_file_system.h b/libs/tensorflow/include/tensorflow/core/platform/cloud/gcs_file_system.h index 28e6a85..6a6437f 100644 --- a/libs/tensorflow/include/tensorflow/core/platform/cloud/gcs_file_system.h +++ b/libs/tensorflow/include/tensorflow/core/platform/cloud/gcs_file_system.h @@ -35,7 +35,7 @@ class GcsFileSystem : public FileSystem { GcsFileSystem(); GcsFileSystem(std::unique_ptr auth_provider, std::unique_ptr http_request_factory, - size_t read_ahead_bytes, int32 max_upload_attempts); + size_t read_ahead_bytes, int64 initial_retry_delay_usec); Status NewRandomAccessFile( const string& filename, @@ -114,9 +114,8 @@ class GcsFileSystem : public FileSystem { // RandomAccessFile implementation. Defaults to 256Mb. const size_t read_ahead_bytes_ = 256 * 1024 * 1024; - // The max number of attempts to upload a file to GCS using the resumable - // upload API. - const int32 max_upload_attempts_ = 5; + // The initial delay for exponential backoffs when retrying failed calls. 
+ const int64 initial_retry_delay_usec_ = 1000000L; TF_DISALLOW_COPY_AND_ASSIGN(GcsFileSystem); }; diff --git a/libs/tensorflow/include/tensorflow/core/platform/cloud/retrying_utils.h b/libs/tensorflow/include/tensorflow/core/platform/cloud/retrying_utils.h index 19bbf51..99ab216 100644 --- a/libs/tensorflow/include/tensorflow/core/platform/cloud/retrying_utils.h +++ b/libs/tensorflow/include/tensorflow/core/platform/cloud/retrying_utils.h @@ -32,6 +32,17 @@ class RetryingUtils { /// If all retries failed, returns the last error status. static Status CallWithRetries(const std::function& f, const int64 initial_delay_microseconds); + /// sleep_usec is a function that sleeps for the given number of microseconds. + static Status CallWithRetries(const std::function& f, + const int64 initial_delay_microseconds, + const std::function& sleep_usec); + /// \brief A retrying wrapper for a function that deletes a resource. + /// + /// The function takes care of the scenario when a delete operation + /// returns a failure but succeeds under the hood: if a retry returns + /// NOT_FOUND, the whole operation is considered a success. + static Status DeleteWithRetries(const std::function& delete_func, + const int64 initial_delay_microseconds); }; } // namespace tensorflow diff --git a/libs/tensorflow/include/tensorflow/core/platform/env.h b/libs/tensorflow/include/tensorflow/core/platform/env.h index 29b6b2f..1b7e024 100644 --- a/libs/tensorflow/include/tensorflow/core/platform/env.h +++ b/libs/tensorflow/include/tensorflow/core/platform/env.h @@ -136,6 +136,12 @@ class Env { /// Returns OK if the named path exists and NOT_FOUND otherwise. Status FileExists(const string& fname); + /// Returns true if all the listed files exist, false otherwise. + /// if status is not null, populate the vector with a detailed status + /// for each file. 
+ bool FilesExist(const std::vector& files, + std::vector* status); + /// \brief Stores in *result the names of the children of the specified /// directory. The names are relative to "dir". /// @@ -394,8 +400,9 @@ namespace register_file_system { template struct Register { Register(Env* env, const string& scheme) { - env->RegisterFileSystem(scheme, - []() -> FileSystem* { return new Factory; }); + // TODO(b/32704451): Don't just ignore the ::tensorflow::Status object! + env->RegisterFileSystem(scheme, []() -> FileSystem* { return new Factory; }) + .IgnoreError(); } }; diff --git a/libs/tensorflow/include/tensorflow/core/platform/file_system.h b/libs/tensorflow/include/tensorflow/core/platform/file_system.h index b249976..903df96 100644 --- a/libs/tensorflow/include/tensorflow/core/platform/file_system.h +++ b/libs/tensorflow/include/tensorflow/core/platform/file_system.h @@ -105,6 +105,12 @@ class FileSystem { /// Returns OK if the named path exists and NOT_FOUND otherwise. virtual Status FileExists(const string& fname) = 0; + /// Returns true if all the listed files exist, false otherwise. + /// if status is not null, populate the vector with a detailed status + /// for each file. + virtual bool FilesExist(const std::vector& files, + std::vector* status); + /// \brief Returns the immediate children in the given directory. /// /// The returned paths are relative to 'dir'. diff --git a/libs/tensorflow/include/tensorflow/core/platform/macros.h b/libs/tensorflow/include/tensorflow/core/platform/macros.h index aad3589..b6fb18b 100644 --- a/libs/tensorflow/include/tensorflow/core/platform/macros.h +++ b/libs/tensorflow/include/tensorflow/core/platform/macros.h @@ -53,6 +53,17 @@ limitations under the License. 
#define TF_SCANF_ATTRIBUTE(string_index, first_to_check) #endif +// Control visiblity outside .so +#if defined(COMPILER_MSVC) +# ifdef TF_COMPILE_LIBRARY +# define TF_EXPORT __declspec(dllexport) +# else +# define TF_EXPORT __declspec(dllimport) +# endif // TF_COMPILE_LIBRARY +#else +# define TF_EXPORT __attribute__((visibility("default"))) +#endif // COMPILER_MSVC + // GCC can be told that a certain branch is not likely to be taken (for // instance, a CHECK failure), and use that information in static analysis. // Giving it this information can help it optimize for the common case in diff --git a/libs/tensorflow/include/tensorflow/core/platform/prefetch.h b/libs/tensorflow/include/tensorflow/core/platform/prefetch.h index cc6f05d..81e1a52 100644 --- a/libs/tensorflow/include/tensorflow/core/platform/prefetch.h +++ b/libs/tensorflow/include/tensorflow/core/platform/prefetch.h @@ -44,7 +44,9 @@ void prefetch(const void* x); // --------------------------------------------------------------------------- template inline void prefetch(const void* x) { -#if defined(__llvm__) || defined(COMPILER_GCC) +// Check of COMPILER_GCC macro below is kept only for backward-compatibility +// reasons. COMPILER_GCC3 is the macro that actually enables prefetch. +#if defined(__llvm__) || defined(COMPILER_GCC) || defined(COMPILER_GCC3) __builtin_prefetch(x, 0, hint); #else // You get no effect. Feel free to add more sections above. diff --git a/libs/tensorflow/include/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.h b/libs/tensorflow/include/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.h index 09c365d..8604b01 100644 --- a/libs/tensorflow/include/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.h +++ b/libs/tensorflow/include/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.h @@ -22,6 +22,9 @@ limitations under the License. 
#include "tensorflow/core/platform/profile_utils/i_cpu_utils_helper.h" #include "tensorflow/core/platform/types.h" +#if defined(__ANDROID__) && (__ANDROID_API__ >= 21) && \ + (defined(__ARM_ARCH_7A__) || defined(__aarch64__)) + struct perf_event_attr; namespace tensorflow { @@ -58,4 +61,7 @@ class AndroidArmV7ACpuUtilsHelper : public ICpuUtilsHelper { } // profile_utils } // tensorflow +#endif // defined(__ANDROID__) && (__ANDROID_API__ >= 21) && + // (defined(__ARM_ARCH_7A__) || defined(__aarch64__)) + #endif // TENSORFLOW_PLATFORM_PROFILEUTILS_ANDROID_ARMV7A_CPU_UTILS_HELPER_H__ diff --git a/libs/tensorflow/include/tensorflow/core/platform/profile_utils/clock_cycle_profiler.h b/libs/tensorflow/include/tensorflow/core/platform/profile_utils/clock_cycle_profiler.h index 876bb9c..de4eec2 100644 --- a/libs/tensorflow/include/tensorflow/core/platform/profile_utils/clock_cycle_profiler.h +++ b/libs/tensorflow/include/tensorflow/core/platform/profile_utils/clock_cycle_profiler.h @@ -16,6 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_PLATFORM_PROFILE_UTILS_CLOCK_CYCLE_PROFILER_H_ #define TENSORFLOW_PLATFORM_PROFILE_UTILS_CLOCK_CYCLE_PROFILER_H_ +#include + #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/profile_utils/cpu_utils.h" diff --git a/libs/tensorflow/include/tensorflow/core/platform/profile_utils/cpu_utils.h b/libs/tensorflow/include/tensorflow/core/platform/profile_utils/cpu_utils.h index 2d80f2e..19471ec 100644 --- a/libs/tensorflow/include/tensorflow/core/platform/profile_utils/cpu_utils.h +++ b/libs/tensorflow/include/tensorflow/core/platform/profile_utils/cpu_utils.h @@ -53,11 +53,7 @@ class CpuUtils { // is less than 2 ^ 61. 
static inline uint64 GetCurrentClockCycle() { #if defined(__ANDROID__) -#if defined(__ARM_ARCH_7A__) && (__ANDROID_API__ >= 21) return GetCpuUtilsHelperSingletonInstance().GetCurrentClockCycle(); -#else // defined(__ARM_ARCH_7A__) && (__ANDROID_API__ >= 21) - return DUMMY_CYCLE_CLOCK; -#endif // defined(__ARM_ARCH_7A__) && (__ANDROID_API__ >= 21) // ---------------------------------------------------------------- #elif defined(__x86_64__) || defined(__amd64__) uint64_t high, low; diff --git a/libs/tensorflow/include/tensorflow/core/platform/protobuf.h b/libs/tensorflow/include/tensorflow/core/platform/protobuf.h index c7a72ee..288d091 100644 --- a/libs/tensorflow/include/tensorflow/core/platform/protobuf.h +++ b/libs/tensorflow/include/tensorflow/core/platform/protobuf.h @@ -25,7 +25,7 @@ limitations under the License. // TensorFlow code should use the ::tensorflow::protobuf namespace to // refer to all protobuf APIs. -#if defined(PLATFORM_GOOGLE) +#if defined(PLATFORM_GOOGLE) && !defined(USE_DEFAULT_PROTOBUF) #include "tensorflow/core/platform/google/protobuf.h" #else #include "tensorflow/core/platform/default/protobuf.h" diff --git a/libs/tensorflow/include/tensorflow/core/platform/tracing.h b/libs/tensorflow/include/tensorflow/core/platform/tracing.h index 3723ae1..a34b23e 100644 --- a/libs/tensorflow/include/tensorflow/core/platform/tracing.h +++ b/libs/tensorflow/include/tensorflow/core/platform/tracing.h @@ -197,6 +197,12 @@ class Tracing::ScopedAnnotation { // label string is only done if tracing is enabled. ScopedAnnotation(StringPiece name_part1, StringPiece name_part2); + // Returns true iff scoped annotations are active. 
+ static bool Enabled() { + auto e = Tracing::engine(); + return e && e->IsEnabled(); + } + private: std::unique_ptr annotation_; }; diff --git a/libs/tensorflow/include/tensorflow/core/platform/windows/cpu_info.h b/libs/tensorflow/include/tensorflow/core/platform/windows/cpu_info.h index 77a1946..d6e78db 100644 --- a/libs/tensorflow/include/tensorflow/core/platform/windows/cpu_info.h +++ b/libs/tensorflow/include/tensorflow/core/platform/windows/cpu_info.h @@ -16,6 +16,9 @@ limitations under the License. #ifndef TENSORFLOW_PLATFORM_WINDOWS_CPU_INFO_H_ #define TENSORFLOW_PLATFORM_WINDOWS_CPU_INFO_H_ +// included so __cpuidex function is available for GETCPUID on Windows +#include + // Byte order defines provided by gcc. MSVC doesn't define those so // we define them here. // We assume that all windows platform out there are little endian. diff --git a/libs/tensorflow/include/tensorflow/core/platform/windows/intrinsics_port.h b/libs/tensorflow/include/tensorflow/core/platform/windows/intrinsics_port.h index a4fa1e9..e52f5b1 100644 --- a/libs/tensorflow/include/tensorflow/core/platform/windows/intrinsics_port.h +++ b/libs/tensorflow/include/tensorflow/core/platform/windows/intrinsics_port.h @@ -24,6 +24,9 @@ limitations under the License. 
#include "tensorflow/core/platform/types.h" #define _mm_load_pd1 _mm_load1_pd + +// only define these intrinsics if immintrin.h doesn't have them (VS2015 and earlier) +#if _MSC_VER < 1910 static inline int _mm256_extract_epi32(__m256i a, const int i) { @@ -39,3 +42,4 @@ _mm256_insert_epi32(__m256i a, int b, const int i) } #endif #endif +#endif diff --git a/libs/tensorflow/include/tensorflow/core/protobuf/config.pb.h b/libs/tensorflow/include/tensorflow/core/protobuf/config.pb.h old mode 100644 new mode 100755 index 31d9900..8bd3062 --- a/libs/tensorflow/include/tensorflow/core/protobuf/config.pb.h +++ b/libs/tensorflow/include/tensorflow/core/protobuf/config.pb.h @@ -35,6 +35,7 @@ #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/step_stats.pb.h" #include "tensorflow/core/protobuf/debug.pb.h" +#include "tensorflow/core/protobuf/rewriter_config.pb.h" // @@protoc_insertion_point(includes) namespace tensorflow { class AllocatorMemoryUsed; @@ -73,6 +74,9 @@ extern GraphDefDefaultTypeInternal _GraphDef_default_instance_; class GraphOptions; class GraphOptionsDefaultTypeInternal; extern GraphOptionsDefaultTypeInternal _GraphOptions_default_instance_; +class MemoryStats; +class MemoryStatsDefaultTypeInternal; +extern MemoryStatsDefaultTypeInternal _MemoryStats_default_instance_; class NodeExecStats; class NodeExecStatsDefaultTypeInternal; extern NodeExecStatsDefaultTypeInternal _NodeExecStats_default_instance_; @@ -85,6 +89,9 @@ extern OptimizerOptionsDefaultTypeInternal _OptimizerOptions_default_instance_; class RPCOptions; class RPCOptionsDefaultTypeInternal; extern RPCOptionsDefaultTypeInternal _RPCOptions_default_instance_; +class RewriterConfig; +class RewriterConfigDefaultTypeInternal; +extern RewriterConfigDefaultTypeInternal _RewriterConfig_default_instance_; class RunMetadata; class RunMetadataDefaultTypeInternal; extern RunMetadataDefaultTypeInternal _RunMetadata_default_instance_; @@ -616,6 +623,24 @@ class GraphOptions : 
public ::google::protobuf::Message /* @@protoc_insertion_po void unsafe_arena_set_allocated_optimizer_options( ::tensorflow::OptimizerOptions* optimizer_options); + // .tensorflow.RewriterConfig rewrite_options = 10; + bool has_rewrite_options() const; + void clear_rewrite_options(); + static const int kRewriteOptionsFieldNumber = 10; + private: + void _slow_mutable_rewrite_options(); + void _slow_set_allocated_rewrite_options( + ::google::protobuf::Arena* message_arena, ::tensorflow::RewriterConfig** rewrite_options); + ::tensorflow::RewriterConfig* _slow_release_rewrite_options(); + public: + const ::tensorflow::RewriterConfig& rewrite_options() const; + ::tensorflow::RewriterConfig* mutable_rewrite_options(); + ::tensorflow::RewriterConfig* release_rewrite_options(); + void set_allocated_rewrite_options(::tensorflow::RewriterConfig* rewrite_options); + ::tensorflow::RewriterConfig* unsafe_arena_release_rewrite_options(); + void unsafe_arena_set_allocated_rewrite_options( + ::tensorflow::RewriterConfig* rewrite_options); + // int64 build_cost_model = 4; void clear_build_cost_model(); static const int kBuildCostModelFieldNumber = 4; @@ -666,6 +691,7 @@ class GraphOptions : public ::google::protobuf::Message /* @@protoc_insertion_po typedef void InternalArenaConstructable_; typedef void DestructorSkippable_; ::tensorflow::OptimizerOptions* optimizer_options_; + ::tensorflow::RewriterConfig* rewrite_options_; ::google::protobuf::int64 build_cost_model_; ::google::protobuf::int64 build_cost_model_after_; bool enable_recv_scheduling_; @@ -985,10 +1011,16 @@ class ConfigProto : public ::google::protobuf::Message /* @@protoc_insertion_poi const ::std::string& device_filters(int index) const; ::std::string* mutable_device_filters(int index); void set_device_filters(int index, const ::std::string& value); + #if LANG_CXX11 + void set_device_filters(int index, ::std::string&& value); + #endif void set_device_filters(int index, const char* value); void set_device_filters(int 
index, const char* value, size_t size); ::std::string* add_device_filters(); void add_device_filters(const ::std::string& value); + #if LANG_CXX11 + void add_device_filters(::std::string&& value); + #endif void add_device_filters(const char* value); void add_device_filters(const char* value, size_t size); const ::google::protobuf::RepeatedPtrField< ::std::string>& device_filters() const; @@ -1868,6 +1900,55 @@ inline void GraphOptions::set_timeline_step(::google::protobuf::int32 value) { // @@protoc_insertion_point(field_set:tensorflow.GraphOptions.timeline_step) } +// .tensorflow.RewriterConfig rewrite_options = 10; +inline bool GraphOptions::has_rewrite_options() const { + return this != internal_default_instance() && rewrite_options_ != NULL; +} +inline void GraphOptions::clear_rewrite_options() { + if (GetArenaNoVirtual() == NULL && rewrite_options_ != NULL) delete rewrite_options_; + rewrite_options_ = NULL; +} +inline const ::tensorflow::RewriterConfig& GraphOptions::rewrite_options() const { + // @@protoc_insertion_point(field_get:tensorflow.GraphOptions.rewrite_options) + return rewrite_options_ != NULL ? 
*rewrite_options_ + : *::tensorflow::RewriterConfig::internal_default_instance(); +} +inline ::tensorflow::RewriterConfig* GraphOptions::mutable_rewrite_options() { + + if (rewrite_options_ == NULL) { + _slow_mutable_rewrite_options(); + } + // @@protoc_insertion_point(field_mutable:tensorflow.GraphOptions.rewrite_options) + return rewrite_options_; +} +inline ::tensorflow::RewriterConfig* GraphOptions::release_rewrite_options() { + // @@protoc_insertion_point(field_release:tensorflow.GraphOptions.rewrite_options) + + if (GetArenaNoVirtual() != NULL) { + return _slow_release_rewrite_options(); + } else { + ::tensorflow::RewriterConfig* temp = rewrite_options_; + rewrite_options_ = NULL; + return temp; + } +} +inline void GraphOptions::set_allocated_rewrite_options(::tensorflow::RewriterConfig* rewrite_options) { + ::google::protobuf::Arena* message_arena = GetArenaNoVirtual(); + if (message_arena == NULL) { + delete rewrite_options_; + } + if (rewrite_options != NULL) { + _slow_set_allocated_rewrite_options(message_arena, &rewrite_options); + } + rewrite_options_ = rewrite_options; + if (rewrite_options) { + + } else { + + } + // @@protoc_insertion_point(field_set_allocated:tensorflow.GraphOptions.rewrite_options) +} + // ------------------------------------------------------------------- // ThreadPoolOptionProto @@ -2033,6 +2114,12 @@ inline void ConfigProto::set_device_filters(int index, const ::std::string& valu // @@protoc_insertion_point(field_set:tensorflow.ConfigProto.device_filters) device_filters_.Mutable(index)->assign(value); } +#if LANG_CXX11 +inline void ConfigProto::set_device_filters(int index, ::std::string&& value) { + // @@protoc_insertion_point(field_set:tensorflow.ConfigProto.device_filters) + device_filters_.Mutable(index)->assign(std::move(value)); +} +#endif inline void ConfigProto::set_device_filters(int index, const char* value) { device_filters_.Mutable(index)->assign(value); // 
@@protoc_insertion_point(field_set_char:tensorflow.ConfigProto.device_filters) @@ -2050,6 +2137,12 @@ inline void ConfigProto::add_device_filters(const ::std::string& value) { device_filters_.Add()->assign(value); // @@protoc_insertion_point(field_add:tensorflow.ConfigProto.device_filters) } +#if LANG_CXX11 +inline void ConfigProto::add_device_filters(::std::string&& value) { + device_filters_.Add()->assign(std::move(value)); + // @@protoc_insertion_point(field_add:tensorflow.ConfigProto.device_filters) +} +#endif inline void ConfigProto::add_device_filters(const char* value) { device_filters_.Add()->assign(value); // @@protoc_insertion_point(field_add_char:tensorflow.ConfigProto.device_filters) diff --git a/libs/tensorflow/include/tensorflow/core/protobuf/config.pb_text-impl.h b/libs/tensorflow/include/tensorflow/core/protobuf/config.pb_text-impl.h old mode 100644 new mode 100755 index 5386443..f1044b7 --- a/libs/tensorflow/include/tensorflow/core/protobuf/config.pb_text-impl.h +++ b/libs/tensorflow/include/tensorflow/core/protobuf/config.pb_text-impl.h @@ -36,6 +36,8 @@ #include "tensorflow/core/protobuf/config.pb_text.h" #include "tensorflow/core/protobuf/debug.pb.h" #include "tensorflow/core/protobuf/debug.pb_text-impl.h" +#include "tensorflow/core/protobuf/rewriter_config.pb.h" +#include "tensorflow/core/protobuf/rewriter_config.pb_text-impl.h" namespace tensorflow { diff --git a/libs/tensorflow/include/tensorflow/core/protobuf/config.pb_text.h b/libs/tensorflow/include/tensorflow/core/protobuf/config.pb_text.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/protobuf/control_flow.pb.h b/libs/tensorflow/include/tensorflow/core/protobuf/control_flow.pb.h old mode 100644 new mode 100755 index 2a43558..601a7ea --- a/libs/tensorflow/include/tensorflow/core/protobuf/control_flow.pb.h +++ b/libs/tensorflow/include/tensorflow/core/protobuf/control_flow.pb.h @@ -148,10 +148,16 @@ class ValuesDef : public ::google::protobuf::Message 
/* @@protoc_insertion_point const ::std::string& values(int index) const; ::std::string* mutable_values(int index); void set_values(int index, const ::std::string& value); + #if LANG_CXX11 + void set_values(int index, ::std::string&& value); + #endif void set_values(int index, const char* value); void set_values(int index, const char* value, size_t size); ::std::string* add_values(); void add_values(const ::std::string& value); + #if LANG_CXX11 + void add_values(::std::string&& value); + #endif void add_values(const char* value); void add_values(const char* value, size_t size); const ::google::protobuf::RepeatedPtrField< ::std::string>& values() const; @@ -438,10 +444,16 @@ class WhileContextDef : public ::google::protobuf::Message /* @@protoc_insertion const ::std::string& loop_exit_names(int index) const; ::std::string* mutable_loop_exit_names(int index); void set_loop_exit_names(int index, const ::std::string& value); + #if LANG_CXX11 + void set_loop_exit_names(int index, ::std::string&& value); + #endif void set_loop_exit_names(int index, const char* value); void set_loop_exit_names(int index, const char* value, size_t size); ::std::string* add_loop_exit_names(); void add_loop_exit_names(const ::std::string& value); + #if LANG_CXX11 + void add_loop_exit_names(::std::string&& value); + #endif void add_loop_exit_names(const char* value); void add_loop_exit_names(const char* value, size_t size); const ::google::protobuf::RepeatedPtrField< ::std::string>& loop_exit_names() const; @@ -587,6 +599,12 @@ inline void ValuesDef::set_values(int index, const ::std::string& value) { // @@protoc_insertion_point(field_set:tensorflow.ValuesDef.values) values_.Mutable(index)->assign(value); } +#if LANG_CXX11 +inline void ValuesDef::set_values(int index, ::std::string&& value) { + // @@protoc_insertion_point(field_set:tensorflow.ValuesDef.values) + values_.Mutable(index)->assign(std::move(value)); +} +#endif inline void ValuesDef::set_values(int index, const char* value) { 
values_.Mutable(index)->assign(value); // @@protoc_insertion_point(field_set_char:tensorflow.ValuesDef.values) @@ -604,6 +622,12 @@ inline void ValuesDef::add_values(const ::std::string& value) { values_.Add()->assign(value); // @@protoc_insertion_point(field_add:tensorflow.ValuesDef.values) } +#if LANG_CXX11 +inline void ValuesDef::add_values(::std::string&& value) { + values_.Add()->assign(std::move(value)); + // @@protoc_insertion_point(field_add:tensorflow.ValuesDef.values) +} +#endif inline void ValuesDef::add_values(const char* value) { values_.Add()->assign(value); // @@protoc_insertion_point(field_add_char:tensorflow.ValuesDef.values) @@ -1235,6 +1259,12 @@ inline void WhileContextDef::set_loop_exit_names(int index, const ::std::string& // @@protoc_insertion_point(field_set:tensorflow.WhileContextDef.loop_exit_names) loop_exit_names_.Mutable(index)->assign(value); } +#if LANG_CXX11 +inline void WhileContextDef::set_loop_exit_names(int index, ::std::string&& value) { + // @@protoc_insertion_point(field_set:tensorflow.WhileContextDef.loop_exit_names) + loop_exit_names_.Mutable(index)->assign(std::move(value)); +} +#endif inline void WhileContextDef::set_loop_exit_names(int index, const char* value) { loop_exit_names_.Mutable(index)->assign(value); // @@protoc_insertion_point(field_set_char:tensorflow.WhileContextDef.loop_exit_names) @@ -1252,6 +1282,12 @@ inline void WhileContextDef::add_loop_exit_names(const ::std::string& value) { loop_exit_names_.Add()->assign(value); // @@protoc_insertion_point(field_add:tensorflow.WhileContextDef.loop_exit_names) } +#if LANG_CXX11 +inline void WhileContextDef::add_loop_exit_names(::std::string&& value) { + loop_exit_names_.Add()->assign(std::move(value)); + // @@protoc_insertion_point(field_add:tensorflow.WhileContextDef.loop_exit_names) +} +#endif inline void WhileContextDef::add_loop_exit_names(const char* value) { loop_exit_names_.Add()->assign(value); // 
@@protoc_insertion_point(field_add_char:tensorflow.WhileContextDef.loop_exit_names) diff --git a/libs/tensorflow/include/tensorflow/core/protobuf/debug.pb.h b/libs/tensorflow/include/tensorflow/core/protobuf/debug.pb.h old mode 100644 new mode 100755 index 8db7896..a1032fe --- a/libs/tensorflow/include/tensorflow/core/protobuf/debug.pb.h +++ b/libs/tensorflow/include/tensorflow/core/protobuf/debug.pb.h @@ -139,10 +139,16 @@ class DebugTensorWatch : public ::google::protobuf::Message /* @@protoc_insertio const ::std::string& debug_ops(int index) const; ::std::string* mutable_debug_ops(int index); void set_debug_ops(int index, const ::std::string& value); + #if LANG_CXX11 + void set_debug_ops(int index, ::std::string&& value); + #endif void set_debug_ops(int index, const char* value); void set_debug_ops(int index, const char* value, size_t size); ::std::string* add_debug_ops(); void add_debug_ops(const ::std::string& value); + #if LANG_CXX11 + void add_debug_ops(::std::string&& value); + #endif void add_debug_ops(const char* value); void add_debug_ops(const char* value, size_t size); const ::google::protobuf::RepeatedPtrField< ::std::string>& debug_ops() const; @@ -155,10 +161,16 @@ class DebugTensorWatch : public ::google::protobuf::Message /* @@protoc_insertio const ::std::string& debug_urls(int index) const; ::std::string* mutable_debug_urls(int index); void set_debug_urls(int index, const ::std::string& value); + #if LANG_CXX11 + void set_debug_urls(int index, ::std::string&& value); + #endif void set_debug_urls(int index, const char* value); void set_debug_urls(int index, const char* value, size_t size); ::std::string* add_debug_urls(); void add_debug_urls(const ::std::string& value); + #if LANG_CXX11 + void add_debug_urls(::std::string&& value); + #endif void add_debug_urls(const char* value); void add_debug_urls(const char* value, size_t size); const ::google::protobuf::RepeatedPtrField< ::std::string>& debug_urls() const; @@ -184,6 +196,12 @@ class 
DebugTensorWatch : public ::google::protobuf::Message /* @@protoc_insertio ::google::protobuf::int32 output_slot() const; void set_output_slot(::google::protobuf::int32 value); + // bool tolerate_debug_op_creation_failures = 5; + void clear_tolerate_debug_op_creation_failures(); + static const int kTolerateDebugOpCreationFailuresFieldNumber = 5; + bool tolerate_debug_op_creation_failures() const; + void set_tolerate_debug_op_creation_failures(bool value); + // @@protoc_insertion_point(class_scope:tensorflow.DebugTensorWatch) private: @@ -195,6 +213,7 @@ class DebugTensorWatch : public ::google::protobuf::Message /* @@protoc_insertio ::google::protobuf::RepeatedPtrField< ::std::string> debug_urls_; ::google::protobuf::internal::ArenaStringPtr node_name_; ::google::protobuf::int32 output_slot_; + bool tolerate_debug_op_creation_failures_; mutable int _cached_size_; friend struct protobuf_tensorflow_2fcore_2fprotobuf_2fdebug_2eproto::TableStruct; }; @@ -416,6 +435,12 @@ inline void DebugTensorWatch::set_debug_ops(int index, const ::std::string& valu // @@protoc_insertion_point(field_set:tensorflow.DebugTensorWatch.debug_ops) debug_ops_.Mutable(index)->assign(value); } +#if LANG_CXX11 +inline void DebugTensorWatch::set_debug_ops(int index, ::std::string&& value) { + // @@protoc_insertion_point(field_set:tensorflow.DebugTensorWatch.debug_ops) + debug_ops_.Mutable(index)->assign(std::move(value)); +} +#endif inline void DebugTensorWatch::set_debug_ops(int index, const char* value) { debug_ops_.Mutable(index)->assign(value); // @@protoc_insertion_point(field_set_char:tensorflow.DebugTensorWatch.debug_ops) @@ -433,6 +458,12 @@ inline void DebugTensorWatch::add_debug_ops(const ::std::string& value) { debug_ops_.Add()->assign(value); // @@protoc_insertion_point(field_add:tensorflow.DebugTensorWatch.debug_ops) } +#if LANG_CXX11 +inline void DebugTensorWatch::add_debug_ops(::std::string&& value) { + debug_ops_.Add()->assign(std::move(value)); + // 
@@protoc_insertion_point(field_add:tensorflow.DebugTensorWatch.debug_ops) +} +#endif inline void DebugTensorWatch::add_debug_ops(const char* value) { debug_ops_.Add()->assign(value); // @@protoc_insertion_point(field_add_char:tensorflow.DebugTensorWatch.debug_ops) @@ -471,6 +502,12 @@ inline void DebugTensorWatch::set_debug_urls(int index, const ::std::string& val // @@protoc_insertion_point(field_set:tensorflow.DebugTensorWatch.debug_urls) debug_urls_.Mutable(index)->assign(value); } +#if LANG_CXX11 +inline void DebugTensorWatch::set_debug_urls(int index, ::std::string&& value) { + // @@protoc_insertion_point(field_set:tensorflow.DebugTensorWatch.debug_urls) + debug_urls_.Mutable(index)->assign(std::move(value)); +} +#endif inline void DebugTensorWatch::set_debug_urls(int index, const char* value) { debug_urls_.Mutable(index)->assign(value); // @@protoc_insertion_point(field_set_char:tensorflow.DebugTensorWatch.debug_urls) @@ -488,6 +525,12 @@ inline void DebugTensorWatch::add_debug_urls(const ::std::string& value) { debug_urls_.Add()->assign(value); // @@protoc_insertion_point(field_add:tensorflow.DebugTensorWatch.debug_urls) } +#if LANG_CXX11 +inline void DebugTensorWatch::add_debug_urls(::std::string&& value) { + debug_urls_.Add()->assign(std::move(value)); + // @@protoc_insertion_point(field_add:tensorflow.DebugTensorWatch.debug_urls) +} +#endif inline void DebugTensorWatch::add_debug_urls(const char* value) { debug_urls_.Add()->assign(value); // @@protoc_insertion_point(field_add_char:tensorflow.DebugTensorWatch.debug_urls) @@ -507,6 +550,20 @@ DebugTensorWatch::mutable_debug_urls() { return &debug_urls_; } +// bool tolerate_debug_op_creation_failures = 5; +inline void DebugTensorWatch::clear_tolerate_debug_op_creation_failures() { + tolerate_debug_op_creation_failures_ = false; +} +inline bool DebugTensorWatch::tolerate_debug_op_creation_failures() const { + // 
@@protoc_insertion_point(field_get:tensorflow.DebugTensorWatch.tolerate_debug_op_creation_failures) + return tolerate_debug_op_creation_failures_; +} +inline void DebugTensorWatch::set_tolerate_debug_op_creation_failures(bool value) { + + tolerate_debug_op_creation_failures_ = value; + // @@protoc_insertion_point(field_set:tensorflow.DebugTensorWatch.tolerate_debug_op_creation_failures) +} + // ------------------------------------------------------------------- // DebugOptions diff --git a/libs/tensorflow/include/tensorflow/core/protobuf/debug.pb_text-impl.h b/libs/tensorflow/include/tensorflow/core/protobuf/debug.pb_text-impl.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/protobuf/debug.pb_text.h b/libs/tensorflow/include/tensorflow/core/protobuf/debug.pb_text.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/protobuf/meta_graph.pb.h b/libs/tensorflow/include/tensorflow/core/protobuf/meta_graph.pb.h index 5339da0..83ef54d 100755 --- a/libs/tensorflow/include/tensorflow/core/protobuf/meta_graph.pb.h +++ b/libs/tensorflow/include/tensorflow/core/protobuf/meta_graph.pb.h @@ -208,10 +208,16 @@ class MetaGraphDef_MetaInfoDef : public ::google::protobuf::Message /* @@protoc_ const ::std::string& tags(int index) const; ::std::string* mutable_tags(int index); void set_tags(int index, const ::std::string& value); + #if LANG_CXX11 + void set_tags(int index, ::std::string&& value); + #endif void set_tags(int index, const char* value); void set_tags(int index, const char* value, size_t size); ::std::string* add_tags(); void add_tags(const ::std::string& value); + #if LANG_CXX11 + void add_tags(::std::string&& value); + #endif void add_tags(const char* value); void add_tags(const char* value, size_t size); const ::google::protobuf::RepeatedPtrField< ::std::string>& tags() const; @@ -606,10 +612,16 @@ class CollectionDef_NodeList : public ::google::protobuf::Message /* @@protoc_in const ::std::string& 
value(int index) const; ::std::string* mutable_value(int index); void set_value(int index, const ::std::string& value); + #if LANG_CXX11 + void set_value(int index, ::std::string&& value); + #endif void set_value(int index, const char* value); void set_value(int index, const char* value, size_t size); ::std::string* add_value(); void add_value(const ::std::string& value); + #if LANG_CXX11 + void add_value(::std::string&& value); + #endif void add_value(const char* value); void add_value(const char* value, size_t size); const ::google::protobuf::RepeatedPtrField< ::std::string>& value() const; @@ -714,10 +726,16 @@ class CollectionDef_BytesList : public ::google::protobuf::Message /* @@protoc_i const ::std::string& value(int index) const; ::std::string* mutable_value(int index); void set_value(int index, const ::std::string& value); + #if LANG_CXX11 + void set_value(int index, ::std::string&& value); + #endif void set_value(int index, const char* value); void set_value(int index, const void* value, size_t size); ::std::string* add_value(); void add_value(const ::std::string& value); + #if LANG_CXX11 + void add_value(::std::string&& value); + #endif void add_value(const char* value); void add_value(const void* value, size_t size); const ::google::protobuf::RepeatedPtrField< ::std::string>& value() const; @@ -1867,6 +1885,12 @@ inline void MetaGraphDef_MetaInfoDef::set_tags(int index, const ::std::string& v // @@protoc_insertion_point(field_set:tensorflow.MetaGraphDef.MetaInfoDef.tags) tags_.Mutable(index)->assign(value); } +#if LANG_CXX11 +inline void MetaGraphDef_MetaInfoDef::set_tags(int index, ::std::string&& value) { + // @@protoc_insertion_point(field_set:tensorflow.MetaGraphDef.MetaInfoDef.tags) + tags_.Mutable(index)->assign(std::move(value)); +} +#endif inline void MetaGraphDef_MetaInfoDef::set_tags(int index, const char* value) { tags_.Mutable(index)->assign(value); // @@protoc_insertion_point(field_set_char:tensorflow.MetaGraphDef.MetaInfoDef.tags) @@ 
-1884,6 +1908,12 @@ inline void MetaGraphDef_MetaInfoDef::add_tags(const ::std::string& value) { tags_.Add()->assign(value); // @@protoc_insertion_point(field_add:tensorflow.MetaGraphDef.MetaInfoDef.tags) } +#if LANG_CXX11 +inline void MetaGraphDef_MetaInfoDef::add_tags(::std::string&& value) { + tags_.Add()->assign(std::move(value)); + // @@protoc_insertion_point(field_add:tensorflow.MetaGraphDef.MetaInfoDef.tags) +} +#endif inline void MetaGraphDef_MetaInfoDef::add_tags(const char* value) { tags_.Add()->assign(value); // @@protoc_insertion_point(field_add_char:tensorflow.MetaGraphDef.MetaInfoDef.tags) @@ -2279,6 +2309,12 @@ inline void CollectionDef_NodeList::set_value(int index, const ::std::string& va // @@protoc_insertion_point(field_set:tensorflow.CollectionDef.NodeList.value) value_.Mutable(index)->assign(value); } +#if LANG_CXX11 +inline void CollectionDef_NodeList::set_value(int index, ::std::string&& value) { + // @@protoc_insertion_point(field_set:tensorflow.CollectionDef.NodeList.value) + value_.Mutable(index)->assign(std::move(value)); +} +#endif inline void CollectionDef_NodeList::set_value(int index, const char* value) { value_.Mutable(index)->assign(value); // @@protoc_insertion_point(field_set_char:tensorflow.CollectionDef.NodeList.value) @@ -2296,6 +2332,12 @@ inline void CollectionDef_NodeList::add_value(const ::std::string& value) { value_.Add()->assign(value); // @@protoc_insertion_point(field_add:tensorflow.CollectionDef.NodeList.value) } +#if LANG_CXX11 +inline void CollectionDef_NodeList::add_value(::std::string&& value) { + value_.Add()->assign(std::move(value)); + // @@protoc_insertion_point(field_add:tensorflow.CollectionDef.NodeList.value) +} +#endif inline void CollectionDef_NodeList::add_value(const char* value) { value_.Add()->assign(value); // @@protoc_insertion_point(field_add_char:tensorflow.CollectionDef.NodeList.value) @@ -2338,6 +2380,12 @@ inline void CollectionDef_BytesList::set_value(int index, const ::std::string& v // 
@@protoc_insertion_point(field_set:tensorflow.CollectionDef.BytesList.value) value_.Mutable(index)->assign(value); } +#if LANG_CXX11 +inline void CollectionDef_BytesList::set_value(int index, ::std::string&& value) { + // @@protoc_insertion_point(field_set:tensorflow.CollectionDef.BytesList.value) + value_.Mutable(index)->assign(std::move(value)); +} +#endif inline void CollectionDef_BytesList::set_value(int index, const char* value) { value_.Mutable(index)->assign(value); // @@protoc_insertion_point(field_set_char:tensorflow.CollectionDef.BytesList.value) @@ -2355,6 +2403,12 @@ inline void CollectionDef_BytesList::add_value(const ::std::string& value) { value_.Add()->assign(value); // @@protoc_insertion_point(field_add:tensorflow.CollectionDef.BytesList.value) } +#if LANG_CXX11 +inline void CollectionDef_BytesList::add_value(::std::string&& value) { + value_.Add()->assign(std::move(value)); + // @@protoc_insertion_point(field_add:tensorflow.CollectionDef.BytesList.value) +} +#endif inline void CollectionDef_BytesList::add_value(const char* value) { value_.Add()->assign(value); // @@protoc_insertion_point(field_add_char:tensorflow.CollectionDef.BytesList.value) diff --git a/libs/tensorflow/include/tensorflow/core/protobuf/named_tensor.pb.h b/libs/tensorflow/include/tensorflow/core/protobuf/named_tensor.pb.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/protobuf/queue_runner.pb.h b/libs/tensorflow/include/tensorflow/core/protobuf/queue_runner.pb.h index a8f4379..6c317e5 100755 --- a/libs/tensorflow/include/tensorflow/core/protobuf/queue_runner.pb.h +++ b/libs/tensorflow/include/tensorflow/core/protobuf/queue_runner.pb.h @@ -139,10 +139,16 @@ class QueueRunnerDef : public ::google::protobuf::Message /* @@protoc_insertion_ const ::std::string& enqueue_op_name(int index) const; ::std::string* mutable_enqueue_op_name(int index); void set_enqueue_op_name(int index, const ::std::string& value); + #if LANG_CXX11 + void 
set_enqueue_op_name(int index, ::std::string&& value); + #endif void set_enqueue_op_name(int index, const char* value); void set_enqueue_op_name(int index, const char* value, size_t size); ::std::string* add_enqueue_op_name(); void add_enqueue_op_name(const ::std::string& value); + #if LANG_CXX11 + void add_enqueue_op_name(::std::string&& value); + #endif void add_enqueue_op_name(const char* value); void add_enqueue_op_name(const char* value, size_t size); const ::google::protobuf::RepeatedPtrField< ::std::string>& enqueue_op_name() const; @@ -309,6 +315,12 @@ inline void QueueRunnerDef::set_enqueue_op_name(int index, const ::std::string& // @@protoc_insertion_point(field_set:tensorflow.QueueRunnerDef.enqueue_op_name) enqueue_op_name_.Mutable(index)->assign(value); } +#if LANG_CXX11 +inline void QueueRunnerDef::set_enqueue_op_name(int index, ::std::string&& value) { + // @@protoc_insertion_point(field_set:tensorflow.QueueRunnerDef.enqueue_op_name) + enqueue_op_name_.Mutable(index)->assign(std::move(value)); +} +#endif inline void QueueRunnerDef::set_enqueue_op_name(int index, const char* value) { enqueue_op_name_.Mutable(index)->assign(value); // @@protoc_insertion_point(field_set_char:tensorflow.QueueRunnerDef.enqueue_op_name) @@ -326,6 +338,12 @@ inline void QueueRunnerDef::add_enqueue_op_name(const ::std::string& value) { enqueue_op_name_.Add()->assign(value); // @@protoc_insertion_point(field_add:tensorflow.QueueRunnerDef.enqueue_op_name) } +#if LANG_CXX11 +inline void QueueRunnerDef::add_enqueue_op_name(::std::string&& value) { + enqueue_op_name_.Add()->assign(std::move(value)); + // @@protoc_insertion_point(field_add:tensorflow.QueueRunnerDef.enqueue_op_name) +} +#endif inline void QueueRunnerDef::add_enqueue_op_name(const char* value) { enqueue_op_name_.Add()->assign(value); // @@protoc_insertion_point(field_add_char:tensorflow.QueueRunnerDef.enqueue_op_name) diff --git a/libs/tensorflow/include/tensorflow/core/protobuf/queue_runner.pb_text-impl.h 
b/libs/tensorflow/include/tensorflow/core/protobuf/queue_runner.pb_text-impl.h new file mode 100755 index 0000000..447e704 --- /dev/null +++ b/libs/tensorflow/include/tensorflow/core/protobuf/queue_runner.pb_text-impl.h @@ -0,0 +1,27 @@ +// GENERATED FILE - DO NOT MODIFY +#ifndef tensorflow_core_protobuf_queue_runner_proto_IMPL_H_ +#define tensorflow_core_protobuf_queue_runner_proto_IMPL_H_ + +#include "tensorflow/core/lib/core/error_codes.pb.h" +#include "tensorflow/core/lib/core/error_codes.pb_text-impl.h" +#include "tensorflow/core/lib/strings/proto_text_util.h" +#include "tensorflow/core/lib/strings/scanner.h" +#include "tensorflow/core/protobuf/queue_runner.pb.h" +#include "tensorflow/core/protobuf/queue_runner.pb_text.h" + +namespace tensorflow { + +namespace internal { + +void AppendProtoDebugString( + ::tensorflow::strings::ProtoTextOutput* o, + const ::tensorflow::QueueRunnerDef& msg); +bool ProtoParseFromScanner( + ::tensorflow::strings::Scanner* scanner, bool nested, bool close_curly, + ::tensorflow::QueueRunnerDef* msg); + +} // namespace internal + +} // namespace tensorflow + +#endif // tensorflow_core_protobuf_queue_runner_proto_IMPL_H_ diff --git a/libs/tensorflow/include/tensorflow/core/protobuf/queue_runner.pb_text.h b/libs/tensorflow/include/tensorflow/core/protobuf/queue_runner.pb_text.h new file mode 100755 index 0000000..0642018 --- /dev/null +++ b/libs/tensorflow/include/tensorflow/core/protobuf/queue_runner.pb_text.h @@ -0,0 +1,24 @@ +// GENERATED FILE - DO NOT MODIFY +#ifndef tensorflow_core_protobuf_queue_runner_proto_H_ +#define tensorflow_core_protobuf_queue_runner_proto_H_ + +#include "tensorflow/core/protobuf/queue_runner.pb.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// Message-text conversion for tensorflow.QueueRunnerDef +string ProtoDebugString( + const ::tensorflow::QueueRunnerDef& msg); +string 
ProtoShortDebugString( + const ::tensorflow::QueueRunnerDef& msg); +bool ProtoParseFromString( + const string& s, + ::tensorflow::QueueRunnerDef* msg) + TF_MUST_USE_RESULT; + +} // namespace tensorflow + +#endif // tensorflow_core_protobuf_queue_runner_proto_H_ diff --git a/libs/tensorflow/include/tensorflow/core/debug/debug_service.pb.h b/libs/tensorflow/include/tensorflow/core/protobuf/rewriter_config.pb.h old mode 100644 new mode 100755 similarity index 53% rename from libs/tensorflow/include/tensorflow/core/debug/debug_service.pb.h rename to libs/tensorflow/include/tensorflow/core/protobuf/rewriter_config.pb.h index 1b90c65..2a03750 --- a/libs/tensorflow/include/tensorflow/core/debug/debug_service.pb.h +++ b/libs/tensorflow/include/tensorflow/core/protobuf/rewriter_config.pb.h @@ -1,8 +1,8 @@ // Generated by the protocol buffer compiler. DO NOT EDIT! -// source: tensorflow/core/debug/debug_service.proto +// source: tensorflow/core/protobuf/rewriter_config.proto -#ifndef PROTOBUF_tensorflow_2fcore_2fdebug_2fdebug_5fservice_2eproto__INCLUDED -#define PROTOBUF_tensorflow_2fcore_2fdebug_2fdebug_5fservice_2eproto__INCLUDED +#ifndef PROTOBUF_tensorflow_2fcore_2fprotobuf_2frewriter_5fconfig_2eproto__INCLUDED +#define PROTOBUF_tensorflow_2fcore_2fprotobuf_2frewriter_5fconfig_2eproto__INCLUDED #include @@ -28,29 +28,16 @@ #include // IWYU pragma: export #include // IWYU pragma: export #include -#include "tensorflow/core/util/event.pb.h" // @@protoc_insertion_point(includes) namespace tensorflow { -class Event; -class EventDefaultTypeInternal; -extern EventDefaultTypeInternal _Event_default_instance_; -class EventReply; -class EventReplyDefaultTypeInternal; -extern EventReplyDefaultTypeInternal _EventReply_default_instance_; -class LogMessage; -class LogMessageDefaultTypeInternal; -extern LogMessageDefaultTypeInternal _LogMessage_default_instance_; -class SessionLog; -class SessionLogDefaultTypeInternal; -extern SessionLogDefaultTypeInternal 
_SessionLog_default_instance_; -class TaggedRunMetadata; -class TaggedRunMetadataDefaultTypeInternal; -extern TaggedRunMetadataDefaultTypeInternal _TaggedRunMetadata_default_instance_; +class RewriterConfig; +class RewriterConfigDefaultTypeInternal; +extern RewriterConfigDefaultTypeInternal _RewriterConfig_default_instance_; } // namespace tensorflow namespace tensorflow { -namespace protobuf_tensorflow_2fcore_2fdebug_2fdebug_5fservice_2eproto { +namespace protobuf_tensorflow_2fcore_2fprotobuf_2frewriter_5fconfig_2eproto { // Internal implementation detail -- do not call these. struct TableStruct { static const ::google::protobuf::uint32 offsets[]; @@ -59,41 +46,48 @@ struct TableStruct { }; void AddDescriptors(); void InitDefaults(); -} // namespace protobuf_tensorflow_2fcore_2fdebug_2fdebug_5fservice_2eproto +} // namespace protobuf_tensorflow_2fcore_2fprotobuf_2frewriter_5fconfig_2eproto // =================================================================== -class EventReply : public ::google::protobuf::Message /* @@protoc_insertion_point(class_definition:tensorflow.EventReply) */ { +class RewriterConfig : public ::google::protobuf::Message /* @@protoc_insertion_point(class_definition:tensorflow.RewriterConfig) */ { public: - EventReply(); - virtual ~EventReply(); + RewriterConfig(); + virtual ~RewriterConfig(); - EventReply(const EventReply& from); + RewriterConfig(const RewriterConfig& from); - inline EventReply& operator=(const EventReply& from) { + inline RewriterConfig& operator=(const RewriterConfig& from) { CopyFrom(from); return *this; } + inline ::google::protobuf::Arena* GetArena() const PROTOBUF_FINAL { + return GetArenaNoVirtual(); + } + inline void* GetMaybeArenaPointer() const PROTOBUF_FINAL { + return MaybeArenaPtr(); + } static const ::google::protobuf::Descriptor* descriptor(); - static const EventReply& default_instance(); + static const RewriterConfig& default_instance(); - static inline const EventReply* internal_default_instance() { - return 
reinterpret_cast( - &_EventReply_default_instance_); + static inline const RewriterConfig* internal_default_instance() { + return reinterpret_cast( + &_RewriterConfig_default_instance_); } - void Swap(EventReply* other); + void UnsafeArenaSwap(RewriterConfig* other); + void Swap(RewriterConfig* other); // implements Message ---------------------------------------------- - inline EventReply* New() const PROTOBUF_FINAL { return New(NULL); } + inline RewriterConfig* New() const PROTOBUF_FINAL { return New(NULL); } - EventReply* New(::google::protobuf::Arena* arena) const PROTOBUF_FINAL; + RewriterConfig* New(::google::protobuf::Arena* arena) const PROTOBUF_FINAL; void CopyFrom(const ::google::protobuf::Message& from) PROTOBUF_FINAL; void MergeFrom(const ::google::protobuf::Message& from) PROTOBUF_FINAL; - void CopyFrom(const EventReply& from); - void MergeFrom(const EventReply& from); + void CopyFrom(const RewriterConfig& from); + void MergeFrom(const RewriterConfig& from); void Clear() PROTOBUF_FINAL; bool IsInitialized() const PROTOBUF_FINAL; @@ -114,13 +108,18 @@ class EventReply : public ::google::protobuf::Message /* @@protoc_insertion_poin void SharedCtor(); void SharedDtor(); void SetCachedSize(int size) const PROTOBUF_FINAL; - void InternalSwap(EventReply* other); + void InternalSwap(RewriterConfig* other); + protected: + explicit RewriterConfig(::google::protobuf::Arena* arena); + private: + static void ArenaDtor(void* object); + inline void RegisterArenaDtor(::google::protobuf::Arena* arena); private: inline ::google::protobuf::Arena* GetArenaNoVirtual() const { - return NULL; + return _internal_metadata_.arena(); } inline void* MaybeArenaPtr() const { - return NULL; + return _internal_metadata_.raw_arena_ptr(); } public: @@ -130,12 +129,22 @@ class EventReply : public ::google::protobuf::Message /* @@protoc_insertion_poin // accessors ------------------------------------------------------- - // @@protoc_insertion_point(class_scope:tensorflow.EventReply) + 
// bool optimize_tensor_layout = 1; + void clear_optimize_tensor_layout(); + static const int kOptimizeTensorLayoutFieldNumber = 1; + bool optimize_tensor_layout() const; + void set_optimize_tensor_layout(bool value); + + // @@protoc_insertion_point(class_scope:tensorflow.RewriterConfig) private: ::google::protobuf::internal::InternalMetadataWithArena _internal_metadata_; + friend class ::google::protobuf::Arena; + typedef void InternalArenaConstructable_; + typedef void DestructorSkippable_; + bool optimize_tensor_layout_; mutable int _cached_size_; - friend struct protobuf_tensorflow_2fcore_2fdebug_2fdebug_5fservice_2eproto::TableStruct; + friend struct protobuf_tensorflow_2fcore_2fprotobuf_2frewriter_5fconfig_2eproto::TableStruct; }; // =================================================================== @@ -143,7 +152,21 @@ class EventReply : public ::google::protobuf::Message /* @@protoc_insertion_poin // =================================================================== #if !PROTOBUF_INLINE_NOT_IN_HEADERS -// EventReply +// RewriterConfig + +// bool optimize_tensor_layout = 1; +inline void RewriterConfig::clear_optimize_tensor_layout() { + optimize_tensor_layout_ = false; +} +inline bool RewriterConfig::optimize_tensor_layout() const { + // @@protoc_insertion_point(field_get:tensorflow.RewriterConfig.optimize_tensor_layout) + return optimize_tensor_layout_; +} +inline void RewriterConfig::set_optimize_tensor_layout(bool value) { + + optimize_tensor_layout_ = value; + // @@protoc_insertion_point(field_set:tensorflow.RewriterConfig.optimize_tensor_layout) +} #endif // !PROTOBUF_INLINE_NOT_IN_HEADERS @@ -154,4 +177,4 @@ class EventReply : public ::google::protobuf::Message /* @@protoc_insertion_poin // @@protoc_insertion_point(global_scope) -#endif // PROTOBUF_tensorflow_2fcore_2fdebug_2fdebug_5fservice_2eproto__INCLUDED +#endif // PROTOBUF_tensorflow_2fcore_2fprotobuf_2frewriter_5fconfig_2eproto__INCLUDED diff --git 
a/libs/tensorflow/include/tensorflow/core/protobuf/rewriter_config.pb_text-impl.h b/libs/tensorflow/include/tensorflow/core/protobuf/rewriter_config.pb_text-impl.h new file mode 100755 index 0000000..0e39d16 --- /dev/null +++ b/libs/tensorflow/include/tensorflow/core/protobuf/rewriter_config.pb_text-impl.h @@ -0,0 +1,25 @@ +// GENERATED FILE - DO NOT MODIFY +#ifndef tensorflow_core_protobuf_rewriter_config_proto_IMPL_H_ +#define tensorflow_core_protobuf_rewriter_config_proto_IMPL_H_ + +#include "tensorflow/core/lib/strings/proto_text_util.h" +#include "tensorflow/core/lib/strings/scanner.h" +#include "tensorflow/core/protobuf/rewriter_config.pb.h" +#include "tensorflow/core/protobuf/rewriter_config.pb_text.h" + +namespace tensorflow { + +namespace internal { + +void AppendProtoDebugString( + ::tensorflow::strings::ProtoTextOutput* o, + const ::tensorflow::RewriterConfig& msg); +bool ProtoParseFromScanner( + ::tensorflow::strings::Scanner* scanner, bool nested, bool close_curly, + ::tensorflow::RewriterConfig* msg); + +} // namespace internal + +} // namespace tensorflow + +#endif // tensorflow_core_protobuf_rewriter_config_proto_IMPL_H_ diff --git a/libs/tensorflow/include/tensorflow/core/protobuf/rewriter_config.pb_text.h b/libs/tensorflow/include/tensorflow/core/protobuf/rewriter_config.pb_text.h new file mode 100755 index 0000000..e352448 --- /dev/null +++ b/libs/tensorflow/include/tensorflow/core/protobuf/rewriter_config.pb_text.h @@ -0,0 +1,24 @@ +// GENERATED FILE - DO NOT MODIFY +#ifndef tensorflow_core_protobuf_rewriter_config_proto_H_ +#define tensorflow_core_protobuf_rewriter_config_proto_H_ + +#include "tensorflow/core/protobuf/rewriter_config.pb.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// Message-text conversion for tensorflow.RewriterConfig +string ProtoDebugString( + const ::tensorflow::RewriterConfig& msg); +string 
ProtoShortDebugString( + const ::tensorflow::RewriterConfig& msg); +bool ProtoParseFromString( + const string& s, + ::tensorflow::RewriterConfig* msg) + TF_MUST_USE_RESULT; + +} // namespace tensorflow + +#endif // tensorflow_core_protobuf_rewriter_config_proto_H_ diff --git a/libs/tensorflow/include/tensorflow/core/protobuf/saved_model.pb.h b/libs/tensorflow/include/tensorflow/core/protobuf/saved_model.pb.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/protobuf/saver.pb_text-impl.h b/libs/tensorflow/include/tensorflow/core/protobuf/saver.pb_text-impl.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/protobuf/saver.pb_text.h b/libs/tensorflow/include/tensorflow/core/protobuf/saver.pb_text.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/protobuf/tensor_bundle.pb.h b/libs/tensorflow/include/tensorflow/core/protobuf/tensor_bundle.pb.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/protobuf/tensor_bundle.pb_text-impl.h b/libs/tensorflow/include/tensorflow/core/protobuf/tensor_bundle.pb_text-impl.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/protobuf/tensor_bundle.pb_text.h b/libs/tensorflow/include/tensorflow/core/protobuf/tensor_bundle.pb_text.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/protobuf/tensorflow_server.pb.h b/libs/tensorflow/include/tensorflow/core/protobuf/tensorflow_server.pb.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/public/session.h b/libs/tensorflow/include/tensorflow/core/public/session.h index b2f998c..eaa076f 100644 --- a/libs/tensorflow/include/tensorflow/core/public/session.h +++ b/libs/tensorflow/include/tensorflow/core/public/session.h @@ -188,10 +188,26 @@ Status NewSession(const SessionOptions& options, Session** out_session); /// \brief Resets resource containers 
associated with a target. /// +/// Reset() allows misbehaving or slow sessions to be aborted and closed, and +/// causes their resources eventually to be released. Reset() does not wait +/// for the computations in old sessions to cease; it merely starts the +/// process of tearing them down. However, if a new session is started after +/// a Reset(), the new session is isolated from changes that old sessions +/// (started prior to the Reset()) may continue to make to resources, provided +/// all those resources are in containers listed in "containers". +/// +/// Old sessions may continue to have side-effects on resources not in +/// containers listed in "containers", and thus may affect future +/// sessions' results in ways that are hard to predict. Thus, if well-defined +/// behaviour is desired, it is recommended that all containers be listed in +/// "containers". +/// /// `containers` is a vector of string representation of resource container /// names. When a resource container is reset, the resources held by the /// container will be released. In particular, all Variables in the container -/// will become undefined. +/// will become undefined. If the "containers" vector is empty, the default +/// container is assumed. If the "containers" vector is non-empty, the +/// default container should be listed explicitly. /// /// If Reset succeeds, this function will return `OK()`. Otherwise, this /// function will return an error status. diff --git a/libs/tensorflow/include/tensorflow/core/public/version.h b/libs/tensorflow/include/tensorflow/core/public/version.h index 9c7c4b3..5ce475a 100644 --- a/libs/tensorflow/include/tensorflow/core/public/version.h +++ b/libs/tensorflow/include/tensorflow/core/public/version.h @@ -19,7 +19,7 @@ limitations under the License. // TensorFlow uses semantic versioning, see http://semver.org/. 
#define TF_MAJOR_VERSION 1 -#define TF_MINOR_VERSION 0 +#define TF_MINOR_VERSION 1 #define TF_PATCH_VERSION 0 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1", @@ -77,8 +77,11 @@ limitations under the License. // (08dec2016) // 20. Catch all version 1.0 changes to Python API generation. SplitV is now // used for tf.split, ReverseV2 is now used by tf.reverse, ConcatV2 is -// now used by tf.concat_v2 (and soon tf.concat). Graphs use flooring +// now used by tf.concat. Graphs use flooring // division and mod semantics. TensorArrayV3. (12dec2016) +// Also considered the version for when it is required for reduction +// ops' indices to be scalar or vector, and not higher rank. +// Some earlier graph def versions allowed this. // 21. Dropped FunctionDef.Node support, switched to node_def introduced // in version 12. (11jan2017) diff --git a/libs/tensorflow/include/tensorflow/core/util/ctc/ctc_beam_search.h b/libs/tensorflow/include/tensorflow/core/util/ctc/ctc_beam_search.h index f638019..28b52ab 100644 --- a/libs/tensorflow/include/tensorflow/core/util/ctc/ctc_beam_search.h +++ b/libs/tensorflow/include/tensorflow/core/util/ctc/ctc_beam_search.h @@ -20,6 +20,7 @@ limitations under the License. #include #include "third_party/eigen3/Eigen/Core" +#include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/gtl/top_n.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" @@ -92,10 +93,10 @@ class CTCBeamSearchDecoder : public CTCDecoder { ~CTCBeamSearchDecoder() override {} // Run the hibernating beam search algorithm on the given input. 
- void Decode(const CTCDecoder::SequenceLength& seq_len, - const std::vector& input, - std::vector* output, - CTCDecoder::ScoreOutput* scores) override; + Status Decode(const CTCDecoder::SequenceLength& seq_len, + const std::vector& input, + std::vector* output, + CTCDecoder::ScoreOutput* scores) override; // Calculate the next step of the beam search and update the internal state. template @@ -116,8 +117,8 @@ class CTCBeamSearchDecoder : public CTCDecoder { void Reset(); // Extract the top n paths at current time step - void TopPaths(int n, std::vector>* paths, - std::vector* log_probs, bool merge_repeated) const; + Status TopPaths(int n, std::vector>* paths, + std::vector* log_probs, bool merge_repeated) const; private: int beam_width_; @@ -143,7 +144,7 @@ class CTCBeamSearchDecoder : public CTCDecoder { }; template -void CTCBeamSearchDecoder::Decode( +Status CTCBeamSearchDecoder::Decode( const CTCDecoder::SequenceLength& seq_len, const std::vector& input, std::vector* output, ScoreOutput* scores) { @@ -151,6 +152,17 @@ void CTCBeamSearchDecoder::Decode( std::vector> beams; std::vector beam_log_probabilities; int top_n = output->size(); + if (std::any_of(output->begin(), output->end(), + [this](const CTCDecoder::Output& output) -> bool { + return output.size() < this->batch_size_; + })) { + return errors::InvalidArgument( + "output needs to be of size at least (top_n, batch_size)."); + } + if (scores->rows() < batch_size_ || scores->cols() < top_n) { + return errors::InvalidArgument( + "scores needs to be of size at least (batch_size, top_n)."); + } for (int b = 0; b < batch_size_; ++b) { int seq_len_b = seq_len[b]; @@ -172,7 +184,11 @@ void CTCBeamSearchDecoder::Decode( leaves_.push(entry); } - TopPaths(top_n, &beams, &beam_log_probabilities, merge_repeated_); + Status status = + TopPaths(top_n, &beams, &beam_log_probabilities, merge_repeated_); + if (!status.ok()) { + return status; + } CHECK_EQ(top_n, beam_log_probabilities.size()); CHECK_EQ(beams.size(), 
beam_log_probabilities.size()); @@ -183,6 +199,7 @@ void CTCBeamSearchDecoder::Decode( (*scores)(b, i) = -beam_log_probabilities[i]; } } // for (int b... + return Status::OK(); } template @@ -206,7 +223,7 @@ void CTCBeamSearchDecoder::Step( // max element is 0, per normalization above label_selection_input_min = std::max(label_selection_input_min, -label_selection_margin_); - }; + } // Extract the beams sorted in decreasing new probability CHECK_EQ(num_classes_, input.size()); @@ -328,14 +345,18 @@ void CTCBeamSearchDecoder::Reset() { } template -void CTCBeamSearchDecoder::TopPaths( +Status CTCBeamSearchDecoder::TopPaths( int n, std::vector>* paths, std::vector* log_probs, bool merge_repeated) const { CHECK_NOTNULL(paths)->clear(); CHECK_NOTNULL(log_probs)->clear(); - CHECK_LE(n, beam_width_) << "Requested more paths than the beam width."; - CHECK_LE(n, leaves_.size()) << "Less leaves in the beam search " - << "than requested. Have you called Step()?"; + if (n > beam_width_) { + return errors::InvalidArgument("requested more paths than the beam width."); + } + if (n > leaves_.size()) { + return errors::InvalidArgument( + "Less leaves in the beam search than requested."); + } gtl::TopN top_branches(n); @@ -351,6 +372,7 @@ void CTCBeamSearchDecoder::TopPaths( paths->push_back(e->LabelSeq(merge_repeated)); log_probs->push_back(e->newp.total); } + return Status::OK(); } } // namespace ctc diff --git a/libs/tensorflow/include/tensorflow/core/util/ctc/ctc_decoder.h b/libs/tensorflow/include/tensorflow/core/util/ctc/ctc_decoder.h index 77b91d8..294419e 100644 --- a/libs/tensorflow/include/tensorflow/core/util/ctc/ctc_decoder.h +++ b/libs/tensorflow/include/tensorflow/core/util/ctc/ctc_decoder.h @@ -17,6 +17,8 @@ limitations under the License. 
#define TENSORFLOW_CORE_UTIL_CTC_CTC_DECODER_H_ #include "third_party/eigen3/Eigen/Core" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" namespace tensorflow { namespace ctc { @@ -47,9 +49,9 @@ class CTCDecoder { // - input[t].rows(b) - t = 0 to timesteps; b = 0 t batch_size_ // - output.size() specifies the number of beams to be returned. // - scores(b, i) - b = 0 to batch_size; i = 0 to output.size() - virtual void Decode(const SequenceLength& seq_len, - const std::vector& input, - std::vector* output, ScoreOutput* scores) = 0; + virtual Status Decode(const SequenceLength& seq_len, + const std::vector& input, + std::vector* output, ScoreOutput* scores) = 0; int batch_size() { return batch_size_; } int num_classes() { return num_classes_; } @@ -68,10 +70,18 @@ class CTCGreedyDecoder : public CTCDecoder { CTCGreedyDecoder(int num_classes, int batch_size, bool merge_repeated) : CTCDecoder(num_classes, batch_size, merge_repeated) {} - void Decode(const CTCDecoder::SequenceLength& seq_len, - const std::vector& input, - std::vector* output, - CTCDecoder::ScoreOutput* scores) override { + Status Decode(const CTCDecoder::SequenceLength& seq_len, + const std::vector& input, + std::vector* output, + CTCDecoder::ScoreOutput* scores) override { + if (output->empty() || (*output)[0].size() < batch_size_) { + return errors::InvalidArgument( + "output needs to be of size at least (1, batch_size)."); + } + if (scores->rows() < batch_size_ || scores->cols() == 0) { + return errors::InvalidArgument( + "scores needs to be of size at least (batch_size, 1)."); + } // For each batch entry, identify the transitions for (int b = 0; b < batch_size_; ++b) { int seq_len_b = seq_len[b]; @@ -93,6 +103,7 @@ class CTCGreedyDecoder : public CTCDecoder { prev_class_ix = max_class_ix; } } + return Status::OK(); } }; diff --git a/libs/tensorflow/include/tensorflow/core/util/memmapped_file_system.pb.h 
b/libs/tensorflow/include/tensorflow/core/util/memmapped_file_system.pb.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/util/memmapped_file_system.pb_text-impl.h b/libs/tensorflow/include/tensorflow/core/util/memmapped_file_system.pb_text-impl.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/util/memmapped_file_system.pb_text.h b/libs/tensorflow/include/tensorflow/core/util/memmapped_file_system.pb_text.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/util/mkl_util.h b/libs/tensorflow/include/tensorflow/core/util/mkl_util.h new file mode 100644 index 0000000..6d09995 --- /dev/null +++ b/libs/tensorflow/include/tensorflow/core/util/mkl_util.h @@ -0,0 +1,296 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_UTIL_MKL_UTIL_H_ +#define TENSORFLOW_CORE_UTIL_MKL_UTIL_H_ +#ifdef INTEL_MKL +#include "third_party/mkl/include/mkl_dnn.h" +#include "third_party/mkl/include/mkl_dnn_types.h" +#include "third_party/mkl/include/mkl_service.h" + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/util/tensor_format.h" + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" + +// The file contains a number of utility classes and functions used by MKL +// enabled kernels + +namespace tensorflow { + +// This class encapsulates all the meta data that is associated with an MKL +// tensor. A tensor is an MKL tensor if it was created as the result of an +// MKL operation, and did not go through a conversion to a standard +// Tensorflow tensor. 
+ +class MklShape { + public: + MklShape() {} + TF_DISALLOW_COPY_AND_ASSIGN(MklShape); // Cannot copy + + ~MklShape() { + if (sizes_) delete[] sizes_; + if (strides_) delete[] strides_; + if (mklLayout_) CHECK_EQ(dnnLayoutDelete_F32(mklLayout_), E_SUCCESS); + if (tfLayout_) CHECK_EQ(dnnLayoutDelete_F32(tfLayout_), E_SUCCESS); + } + + const bool IsMklTensor() const { return isMklTensor_; } + + void SetMklTensor(const bool isMklTensor) { isMklTensor_ = isMklTensor; } + + void SetMklLayout(const void* primitive, size_t resourceType) { + CHECK_EQ( + dnnLayoutCreateFromPrimitive_F32(&mklLayout_, (dnnPrimitive_t)primitive, + (dnnResourceType_t)resourceType), + E_SUCCESS); + } + + void SetTfLayout(const size_t dimension, const size_t* sizes, + const size_t* strides) { + dimension_ = dimension; + if (dimension > 0) { // MKl doesn't support dimension 0 + sizes_ = new size_t[dimension]; + strides_ = new size_t[dimension]; + + for (int ii = 0; ii < dimension; ii++) { + sizes_[ii] = sizes[ii]; + strides_[ii] = strides[ii]; + } + CHECK_EQ(dnnLayoutCreate_F32(&tfLayout_, dimension, sizes, strides), + E_SUCCESS); + } + } + + const dnnLayout_t GetMklLayout() const { return mklLayout_; } + const dnnLayout_t GetTfLayout() const { return tfLayout_; } + const dnnLayout_t GetCurLayout() const { + return isMklTensor_ ? 
mklLayout_ : tfLayout_; + } + size_t GetDimension() const { return dimension_; } + const size_t* GetSizes() const { return sizes_; } + const size_t* GetStrides() const { return strides_; } + + void GetConvertedFlatData(dnnLayout_t targetLayout, void* input, + void* output) const { + dnnLayout_t curLayout; + if (isMklTensor_) + curLayout = mklLayout_; + else + curLayout = tfLayout_; + dnnPrimitive_t convert; + CHECK_EQ(dnnConversionCreate_F32(&convert, curLayout, targetLayout), + E_SUCCESS); + CHECK_EQ(dnnConversionExecute_F32(convert, input, output), E_SUCCESS); + CHECK_EQ(dnnDelete_F32(convert), E_SUCCESS); + } + +// The following methods are used for serializing and de-serializing the +// contents of the mklshape object. +// The data is serialized in this order +// isMklTensor_ +// dimension_ +// sizes +// strides +// mklLayout_ +// tfLayout_ + +#define SIZE_OF_MKL_DNN_BUF \ + (dnnLayoutSerializationBufferSize_F32()) // Size of buffer needed to + // serialize dnn_layout pointer + +// Size of buffer to hold the serialized object, the size is computed as follows +// sizeof(isMklTensor_) + sizeof(dimension_) + sizeof(sizes) + sizeof(strides) +// + sizeof(mklLayout_ buffer) + sizeof(tfLayout_ buffer) + +#define SIZE_OF_MKL_SERIAL_DATA(dims) \ + (2 * sizeof(size_t) + 2 * dims * sizeof(size_t) + 2 * SIZE_OF_MKL_DNN_BUF) + +// First we need to define some macro for offsets into the serial buffer where +// different elements of Mklshape is written/read from + +#define IS_MKL_TENSOR_OFFSET 0 +// Location from start of buffer where isMklTensor_ is serialized +#define DIMS_OFFSET \ + (IS_MKL_TENSOR_OFFSET + sizeof(size_t)) // Location of dimension_ +#define SIZES_OFFSET(dims) \ + (DIMS_OFFSET + \ + sizeof(size_t)) // Location of sizes. Note dim is not used here, left here + // to make macros consistent. 
+#define STRIDES_OFFSET(dims) \ + (SIZES_OFFSET(dims) + dims * sizeof(size_t)) // Location of strides +#define MKL_LAYOUT_OFFSET(dims) \ + (STRIDES_OFFSET(dims) + dims * sizeof(size_t)) // Location of mklLayout_ +#define TF_LAYOUT_OFFSET(dims) \ + (MKL_LAYOUT_OFFSET(dims) + SIZE_OF_MKL_DNN_BUF) // Location of tfLayout_ + + // TODO(agramesh1) make sure to create a const to share with rewrite pass + // for min size of MKL metadata tensor. + + void DeSerializeMklShape(const unsigned char* buf, size_t buf_size) { + CHECK(buf_size >= sizeof(size_t)) << "Bufsize too small in DeSerialize"; + // Make sure buffer holds at least isMklTensor_ + isMklTensor_ = + *reinterpret_cast(buf + IS_MKL_TENSOR_OFFSET) != 0; + + if (isMklTensor_) { // If it is an MKL Tensor then read the rest + dimension_ = *(reinterpret_cast(buf + DIMS_OFFSET)); + CHECK(buf_size >= SIZE_OF_MKL_SERIAL_DATA(dimension_)) + << "Bufsize too small in DeSerialize"; + sizes_ = new size_t[dimension_]; + strides_ = new size_t[dimension_]; + for (int i = 0; i < dimension_; i++) { + sizes_[i] = + reinterpret_cast(buf + SIZES_OFFSET(dimension_))[i]; + strides_[i] = reinterpret_cast( + buf + STRIDES_OFFSET(dimension_))[i]; + } + CHECK_EQ(dnnLayoutDeserialize_F32(&mklLayout_, + buf + MKL_LAYOUT_OFFSET(dimension_)), + E_SUCCESS); + CHECK_EQ(dnnLayoutDeserialize_F32(&tfLayout_, + buf + TF_LAYOUT_OFFSET(dimension_)), + E_SUCCESS); + } + } + + void SerializeMklShape(unsigned char* buf, size_t buf_size) const { + CHECK(buf_size >= SIZE_OF_MKL_SERIAL_DATA(dimension_)) + << "Bufsize too small to Serialize"; + *reinterpret_cast(buf + IS_MKL_TENSOR_OFFSET) = + isMklTensor_ ? 
1 : 0; + if (isMklTensor_) { + *(reinterpret_cast(buf + DIMS_OFFSET)) = dimension_; + for (int i = 0; i < dimension_; i++) { + reinterpret_cast(buf + SIZES_OFFSET(dimension_))[i] = + sizes_[i]; + reinterpret_cast(buf + STRIDES_OFFSET(dimension_))[i] = + strides_[i]; + } + CHECK_EQ(dnnLayoutSerialize_F32(mklLayout_, + buf + MKL_LAYOUT_OFFSET(dimension_)), + E_SUCCESS); + CHECK_EQ( + dnnLayoutSerialize_F32(tfLayout_, buf + TF_LAYOUT_OFFSET(dimension_)), + E_SUCCESS); + } + } + + private: + bool isMklTensor_ = + false; // Flag to indicate if the tensor is an MKL tensor or not + dnnLayout_t mklLayout_ = nullptr; // Pointer to the MKL layout + dnnLayout_t tfLayout_ = nullptr; // Pointer to layout of corresponding + // Tensorflow tensor, used when conversion from MKL to standard tensor + size_t dimension_ = 0; + size_t* sizes_ = nullptr; // Required by MKL for conversions + size_t* strides_ = nullptr; // Required by MKL for conversions +}; + +int inline GetTensorDataIndex(int n) { + return 2 * n; // index corresponding to nth input/output tensor +} + +int inline GetTensorMetaDataIndex(int n) { + // index corresponding to meta data of nth input/output tensor + return 2 * n + 1; +} +// Get the MKL shape from the second string tensor +inline void GetMklShape(OpKernelContext* ctext, int n, MklShape* mklshape) { + mklshape->DeSerializeMklShape( + ctext->input(GetTensorMetaDataIndex(n)).flat().data(), + ctext->input(GetTensorMetaDataIndex(n)).flat().size() * + sizeof(uint8)); +} + +// Gets the actual input +inline const Tensor& MklGetInput(OpKernelContext* ctext, int n) { + return ctext->input(GetTensorDataIndex(n)); +} + +// Allocate the output tensor, create a second output tensor that will contain +// the MKL shape serialized +inline void AllocateOutputSetMklshape(OpKernelContext* ctext, int n, + Tensor** output, + const TensorShape& tfshape, + const MklShape& mklshape) { + Tensor* second_tensor = nullptr; + TensorShape second_shape; + 
second_shape.AddDim(SIZE_OF_MKL_SERIAL_DATA(mklshape.GetDimension())); + OP_REQUIRES_OK( + ctext, ctext->allocate_output(GetTensorDataIndex(n), tfshape, output)); + OP_REQUIRES_OK(ctext, ctext->allocate_output(GetTensorMetaDataIndex(n), + second_shape, &second_tensor)); + mklshape.SerializeMklShape( + second_tensor->flat().data(), + second_tensor->flat().size() * sizeof(uint8)); +} + +// Allocates a temp tensor and returns the data buffer for temporary storage. +// Currently +// we only support F32, will need to templatize if other types are added +inline void AllocTmpBuffer(OpKernelContext* context, Tensor* tensor_out, + dnnLayout_t lt_buff, void** buf_out) { + TensorShape tf_shape; + + tf_shape.AddDim( + dnnLayoutGetMemorySize_F32(static_cast(lt_buff)) / + sizeof(float) + + 1); + OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum::v(), + tf_shape, tensor_out)); + *buf_out = static_cast(tensor_out->flat().data()); +} + +inline void GetStridesFromSizes(TensorFormat data_format, size_t* strides, + const size_t* sizes) { + // MKL requires strides in NCHW + if (data_format == FORMAT_NHWC) { + strides[0] = sizes[2]; + strides[1] = sizes[0] * sizes[2]; + strides[2] = 1; + strides[3] = sizes[0] * sizes[1] * sizes[2]; + } else { + strides[0] = 1; + strides[1] = sizes[0]; + strides[2] = sizes[0] * sizes[1]; + strides[3] = sizes[0] * sizes[1] * sizes[2]; + } +} + +namespace mkl_layer_registry { + +static const char* kMklLayerLabel = "MklLayer"; +static const string kMklLayerLabelPattern = "label='MklLayer'"; + +// Check whether opname is registered as MKL-compliant in the registry. 
+// +// @input: name of the op +// @return: true if opname is registered as Mkl layer op +static inline bool IsMklLayer(const std::string& op_name) { + string kernel = KernelsRegisteredForOp(op_name); + return kernel.find(kMklLayerLabelPattern) != string::npos; +} + +} // namespace mkl_layer_registry + +} // namespace tensorflow +#endif // INTEL_MKL +#endif // TENSORFLOW_CORE_UTIL_MKL_UTIL_H_ diff --git a/libs/tensorflow/include/tensorflow/core/util/presized_cuckoo_map.h b/libs/tensorflow/include/tensorflow/core/util/presized_cuckoo_map.h index cf3b8cf..e7dab83 100644 --- a/libs/tensorflow/include/tensorflow/core/util/presized_cuckoo_map.h +++ b/libs/tensorflow/include/tensorflow/core/util/presized_cuckoo_map.h @@ -132,6 +132,10 @@ class PresizedCuckooMap { FindInBucket(k, fast_map_to_buckets(h2(tk)), out); } + int64 MemoryUsed() const { + return sizeof(PresizedCuckooMap) + sizeof(CuckooPathQueue); + } + private: static constexpr int kSlotsPerBucket = 4; diff --git a/libs/tensorflow/include/tensorflow/core/util/reporter.h b/libs/tensorflow/include/tensorflow/core/util/reporter.h index f69e5b5..bcae122 100644 --- a/libs/tensorflow/include/tensorflow/core/util/reporter.h +++ b/libs/tensorflow/include/tensorflow/core/util/reporter.h @@ -75,7 +75,8 @@ class TestReporter { Status Benchmark(int64 iters, double cpu_time, double wall_time, double throughput); - ~TestReporter() { Close(); } // Autoclose in destructor. + // TODO(b/32704451): Don't just ignore the ::tensorflow::Status object! + ~TestReporter() { Close().IgnoreError(); } // Autoclose in destructor. 
private: static string GetLogEnv() { diff --git a/libs/tensorflow/include/tensorflow/core/util/saved_tensor_slice.pb_text-impl.h b/libs/tensorflow/include/tensorflow/core/util/saved_tensor_slice.pb_text-impl.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/util/saved_tensor_slice.pb_text.h b/libs/tensorflow/include/tensorflow/core/util/saved_tensor_slice.pb_text.h old mode 100644 new mode 100755 diff --git a/libs/tensorflow/include/tensorflow/core/util/tensor_bundle/tensor_bundle.h b/libs/tensorflow/include/tensorflow/core/util/tensor_bundle/tensor_bundle.h index 6a8104a..bca3910 100644 --- a/libs/tensorflow/include/tensorflow/core/util/tensor_bundle/tensor_bundle.h +++ b/libs/tensorflow/include/tensorflow/core/util/tensor_bundle/tensor_bundle.h @@ -207,6 +207,12 @@ class BundleReader { // REQUIRES: status().ok() Status Lookup(StringPiece key, Tensor* val) TF_MUST_USE_RESULT; + // Looks up the slices of the tensor keyed by "key". On OK, "slices" + // is non-empty if and only if the tensor is a partitioned tensor. + // REQUIRES: status().ok() + Status LookupTensorSlices(StringPiece key, std::vector* slices) + TF_MUST_USE_RESULT; + // Looks up a specific slice of a partitioned tensor. // It is only required that the stored slices cover the requested slice, // namely "slice_spec" is a subset of the union of the stored slices. diff --git a/libs/tensorflow/include/tensorflow/core/util/tensor_format.h b/libs/tensorflow/include/tensorflow/core/util/tensor_format.h index a21dee5..7d8e4b1 100644 --- a/libs/tensorflow/include/tensorflow/core/util/tensor_format.h +++ b/libs/tensorflow/include/tensorflow/core/util/tensor_format.h @@ -35,6 +35,39 @@ bool FormatFromString(const string& format_str, TensorFormat* format); // Convert a tensor format into string. string ToString(TensorFormat format); +// Returns the index of the batch dimension. 
+inline int GetTensorBatchDimIndex(int num_dims, TensorFormat format) { + if (format == FORMAT_NHWC || format == FORMAT_NCHW) { + return 0; + } else { + LOG(FATAL) << "Unknown format " << format; + } +} + +// Returns the index of the feature dimension. +inline int GetTensorFeatureDimIndex(int num_dims, TensorFormat format) { + if (format == FORMAT_NHWC) { + return num_dims - 1; + } else if (format == FORMAT_NCHW) { + return 1; + } else { + LOG(FATAL) << "Unknown format " << format; + } +} + +// Returns the index of the `dim`-th spatial dimension. +inline int GetTensorSpatialDimIndex(int num_dims, TensorFormat format, + int dim) { + CHECK(dim >= 0 && dim < num_dims - 2) << dim << " " << num_dims; + if (format == FORMAT_NHWC) { + return dim + 1; + } else if (format == FORMAT_NCHW) { + return dim + 2; + } else { + LOG(FATAL) << "Unknown format " << format; + } +} + // Return the position index from a format given a dimension specification with // a char. The chars can be N (batch), C (channels), H (y), W (x), or // 0 .. (NDIMS-1). 
@@ -58,7 +91,7 @@ inline int32 GetTensorDimIndex(TensorFormat format, char dimension) { return 1 + NDIMS; default: LOG(FATAL) << "Invalid dimension: " << dimension; - return -1; // Avoid compiler warning about missing return value + return -1; // Avoid compiler warning about missing return value } } else if (format == FORMAT_NCHW) { switch (dimension) { @@ -78,11 +111,11 @@ inline int32 GetTensorDimIndex(TensorFormat format, char dimension) { return NDIMS + 1; default: LOG(FATAL) << "Invalid dimension: " << dimension; - return -1; // Avoid compiler warning about missing return value + return -1; // Avoid compiler warning about missing return value } } else { LOG(FATAL) << "Invalid format: " << static_cast(format); - return -1; // Avoid compiler warning about missing return value + return -1; // Avoid compiler warning about missing return value } } diff --git a/libs/tensorflow/include/tensorflow/core/util/test_log.pb.h b/libs/tensorflow/include/tensorflow/core/util/test_log.pb.h old mode 100644 new mode 100755 index a7dc517..5b30d2b --- a/libs/tensorflow/include/tensorflow/core/util/test_log.pb.h +++ b/libs/tensorflow/include/tensorflow/core/util/test_log.pb.h @@ -29,6 +29,7 @@ #include // IWYU pragma: export #include #include +#include #include #include // @@protoc_insertion_point(includes) @@ -94,6 +95,29 @@ void AddDescriptors(); void InitDefaults(); } // namespace protobuf_tensorflow_2fcore_2futil_2ftest_5flog_2eproto +enum TestResults_BenchmarkType { + TestResults_BenchmarkType_UNKNOWN = 0, + TestResults_BenchmarkType_CPP_MICROBENCHMARK = 1, + TestResults_BenchmarkType_PYTHON_BENCHMARK = 2, + TestResults_BenchmarkType_ANDROID_BENCHMARK = 3, + TestResults_BenchmarkType_TestResults_BenchmarkType_INT_MIN_SENTINEL_DO_NOT_USE_ = ::google::protobuf::kint32min, + TestResults_BenchmarkType_TestResults_BenchmarkType_INT_MAX_SENTINEL_DO_NOT_USE_ = ::google::protobuf::kint32max +}; +bool TestResults_BenchmarkType_IsValid(int value); +const TestResults_BenchmarkType 
TestResults_BenchmarkType_BenchmarkType_MIN = TestResults_BenchmarkType_UNKNOWN; +const TestResults_BenchmarkType TestResults_BenchmarkType_BenchmarkType_MAX = TestResults_BenchmarkType_ANDROID_BENCHMARK; +const int TestResults_BenchmarkType_BenchmarkType_ARRAYSIZE = TestResults_BenchmarkType_BenchmarkType_MAX + 1; + +const ::google::protobuf::EnumDescriptor* TestResults_BenchmarkType_descriptor(); +inline const ::std::string& TestResults_BenchmarkType_Name(TestResults_BenchmarkType value) { + return ::google::protobuf::internal::NameOfEnum( + TestResults_BenchmarkType_descriptor(), value); +} +inline bool TestResults_BenchmarkType_Parse( + const ::std::string& name, TestResults_BenchmarkType* value) { + return ::google::protobuf::internal::ParseNamedEnum( + TestResults_BenchmarkType_descriptor(), name, value); +} // =================================================================== class EntryValue : public ::google::protobuf::Message /* @@protoc_insertion_point(class_definition:tensorflow.EntryValue) */ { @@ -581,10 +605,16 @@ class BuildConfiguration : public ::google::protobuf::Message /* @@protoc_insert const ::std::string& cc_flags(int index) const; ::std::string* mutable_cc_flags(int index); void set_cc_flags(int index, const ::std::string& value); + #if LANG_CXX11 + void set_cc_flags(int index, ::std::string&& value); + #endif void set_cc_flags(int index, const char* value); void set_cc_flags(int index, const char* value, size_t size); ::std::string* add_cc_flags(); void add_cc_flags(const ::std::string& value); + #if LANG_CXX11 + void add_cc_flags(::std::string&& value); + #endif void add_cc_flags(const char* value); void add_cc_flags(const char* value, size_t size); const ::google::protobuf::RepeatedPtrField< ::std::string>& cc_flags() const; @@ -597,10 +627,16 @@ class BuildConfiguration : public ::google::protobuf::Message /* @@protoc_insert const ::std::string& opts(int index) const; ::std::string* mutable_opts(int index); void set_opts(int index, 
const ::std::string& value); + #if LANG_CXX11 + void set_opts(int index, ::std::string&& value); + #endif void set_opts(int index, const char* value); void set_opts(int index, const char* value, size_t size); ::std::string* add_opts(); void add_opts(const ::std::string& value); + #if LANG_CXX11 + void add_opts(::std::string&& value); + #endif void add_opts(const char* value); void add_opts(const char* value, size_t size); const ::google::protobuf::RepeatedPtrField< ::std::string>& opts() const; @@ -1808,10 +1844,16 @@ class RunConfiguration : public ::google::protobuf::Message /* @@protoc_insertio const ::std::string& argument(int index) const; ::std::string* mutable_argument(int index); void set_argument(int index, const ::std::string& value); + #if LANG_CXX11 + void set_argument(int index, ::std::string&& value); + #endif void set_argument(int index, const char* value); void set_argument(int index, const char* value, size_t size); ::std::string* add_argument(); void add_argument(const ::std::string& value); + #if LANG_CXX11 + void add_argument(::std::string&& value); + #endif void add_argument(const char* value); void add_argument(const char* value, size_t size); const ::google::protobuf::RepeatedPtrField< ::std::string>& argument() const; @@ -1907,6 +1949,36 @@ class TestResults : public ::google::protobuf::Message /* @@protoc_insertion_poi // nested types ---------------------------------------------------- + typedef TestResults_BenchmarkType BenchmarkType; + static const BenchmarkType UNKNOWN = + TestResults_BenchmarkType_UNKNOWN; + static const BenchmarkType CPP_MICROBENCHMARK = + TestResults_BenchmarkType_CPP_MICROBENCHMARK; + static const BenchmarkType PYTHON_BENCHMARK = + TestResults_BenchmarkType_PYTHON_BENCHMARK; + static const BenchmarkType ANDROID_BENCHMARK = + TestResults_BenchmarkType_ANDROID_BENCHMARK; + static inline bool BenchmarkType_IsValid(int value) { + return TestResults_BenchmarkType_IsValid(value); + } + static const BenchmarkType 
BenchmarkType_MIN = + TestResults_BenchmarkType_BenchmarkType_MIN; + static const BenchmarkType BenchmarkType_MAX = + TestResults_BenchmarkType_BenchmarkType_MAX; + static const int BenchmarkType_ARRAYSIZE = + TestResults_BenchmarkType_BenchmarkType_ARRAYSIZE; + static inline const ::google::protobuf::EnumDescriptor* + BenchmarkType_descriptor() { + return TestResults_BenchmarkType_descriptor(); + } + static inline const ::std::string& BenchmarkType_Name(BenchmarkType value) { + return TestResults_BenchmarkType_Name(value); + } + static inline bool BenchmarkType_Parse(const ::std::string& name, + BenchmarkType* value) { + return TestResults_BenchmarkType_Parse(name, value); + } + // accessors ------------------------------------------------------- // string target = 1; @@ -2039,6 +2111,12 @@ class TestResults : public ::google::protobuf::Message /* @@protoc_insertion_poi double run_time() const; void set_run_time(double value); + // .tensorflow.TestResults.BenchmarkType benchmark_type = 10; + void clear_benchmark_type(); + static const int kBenchmarkTypeFieldNumber = 10; + ::tensorflow::TestResults_BenchmarkType benchmark_type() const; + void set_benchmark_type(::tensorflow::TestResults_BenchmarkType value); + // @@protoc_insertion_point(class_scope:tensorflow.TestResults) private: @@ -2055,6 +2133,7 @@ class TestResults : public ::google::protobuf::Message /* @@protoc_insertion_poi ::tensorflow::RunConfiguration* run_configuration_; ::google::protobuf::int64 start_time_; double run_time_; + int benchmark_type_; mutable int _cached_size_; friend struct protobuf_tensorflow_2fcore_2futil_2ftest_5flog_2eproto::TableStruct; }; @@ -2482,6 +2561,12 @@ inline void BuildConfiguration::set_cc_flags(int index, const ::std::string& val // @@protoc_insertion_point(field_set:tensorflow.BuildConfiguration.cc_flags) cc_flags_.Mutable(index)->assign(value); } +#if LANG_CXX11 +inline void BuildConfiguration::set_cc_flags(int index, ::std::string&& value) { + // 
@@protoc_insertion_point(field_set:tensorflow.BuildConfiguration.cc_flags) + cc_flags_.Mutable(index)->assign(std::move(value)); +} +#endif inline void BuildConfiguration::set_cc_flags(int index, const char* value) { cc_flags_.Mutable(index)->assign(value); // @@protoc_insertion_point(field_set_char:tensorflow.BuildConfiguration.cc_flags) @@ -2499,6 +2584,12 @@ inline void BuildConfiguration::add_cc_flags(const ::std::string& value) { cc_flags_.Add()->assign(value); // @@protoc_insertion_point(field_add:tensorflow.BuildConfiguration.cc_flags) } +#if LANG_CXX11 +inline void BuildConfiguration::add_cc_flags(::std::string&& value) { + cc_flags_.Add()->assign(std::move(value)); + // @@protoc_insertion_point(field_add:tensorflow.BuildConfiguration.cc_flags) +} +#endif inline void BuildConfiguration::add_cc_flags(const char* value) { cc_flags_.Add()->assign(value); // @@protoc_insertion_point(field_add_char:tensorflow.BuildConfiguration.cc_flags) @@ -2537,6 +2628,12 @@ inline void BuildConfiguration::set_opts(int index, const ::std::string& value) // @@protoc_insertion_point(field_set:tensorflow.BuildConfiguration.opts) opts_.Mutable(index)->assign(value); } +#if LANG_CXX11 +inline void BuildConfiguration::set_opts(int index, ::std::string&& value) { + // @@protoc_insertion_point(field_set:tensorflow.BuildConfiguration.opts) + opts_.Mutable(index)->assign(std::move(value)); +} +#endif inline void BuildConfiguration::set_opts(int index, const char* value) { opts_.Mutable(index)->assign(value); // @@protoc_insertion_point(field_set_char:tensorflow.BuildConfiguration.opts) @@ -2554,6 +2651,12 @@ inline void BuildConfiguration::add_opts(const ::std::string& value) { opts_.Add()->assign(value); // @@protoc_insertion_point(field_add:tensorflow.BuildConfiguration.opts) } +#if LANG_CXX11 +inline void BuildConfiguration::add_opts(::std::string&& value) { + opts_.Add()->assign(std::move(value)); + // @@protoc_insertion_point(field_add:tensorflow.BuildConfiguration.opts) +} +#endif 
inline void BuildConfiguration::add_opts(const char* value) { opts_.Add()->assign(value); // @@protoc_insertion_point(field_add_char:tensorflow.BuildConfiguration.opts) @@ -4204,6 +4307,12 @@ inline void RunConfiguration::set_argument(int index, const ::std::string& value // @@protoc_insertion_point(field_set:tensorflow.RunConfiguration.argument) argument_.Mutable(index)->assign(value); } +#if LANG_CXX11 +inline void RunConfiguration::set_argument(int index, ::std::string&& value) { + // @@protoc_insertion_point(field_set:tensorflow.RunConfiguration.argument) + argument_.Mutable(index)->assign(std::move(value)); +} +#endif inline void RunConfiguration::set_argument(int index, const char* value) { argument_.Mutable(index)->assign(value); // @@protoc_insertion_point(field_set_char:tensorflow.RunConfiguration.argument) @@ -4221,6 +4330,12 @@ inline void RunConfiguration::add_argument(const ::std::string& value) { argument_.Add()->assign(value); // @@protoc_insertion_point(field_add:tensorflow.RunConfiguration.argument) } +#if LANG_CXX11 +inline void RunConfiguration::add_argument(::std::string&& value) { + argument_.Add()->assign(std::move(value)); + // @@protoc_insertion_point(field_add:tensorflow.RunConfiguration.argument) +} +#endif inline void RunConfiguration::add_argument(const char* value) { argument_.Add()->assign(value); // @@protoc_insertion_point(field_add_char:tensorflow.RunConfiguration.argument) @@ -4649,6 +4764,20 @@ inline void TestResults::unsafe_arena_set_allocated_name( // @@protoc_insertion_point(field_unsafe_arena_set_allocated:tensorflow.TestResults.name) } +// .tensorflow.TestResults.BenchmarkType benchmark_type = 10; +inline void TestResults::clear_benchmark_type() { + benchmark_type_ = 0; +} +inline ::tensorflow::TestResults_BenchmarkType TestResults::benchmark_type() const { + // @@protoc_insertion_point(field_get:tensorflow.TestResults.benchmark_type) + return static_cast< ::tensorflow::TestResults_BenchmarkType >(benchmark_type_); +} 
+inline void TestResults::set_benchmark_type(::tensorflow::TestResults_BenchmarkType value) { + + benchmark_type_ = value; + // @@protoc_insertion_point(field_set:tensorflow.TestResults.benchmark_type) +} + #endif // !PROTOBUF_INLINE_NOT_IN_HEADERS // ------------------------------------------------------------------- @@ -4684,6 +4813,20 @@ inline void TestResults::unsafe_arena_set_allocated_name( } // namespace tensorflow +#ifndef SWIG +namespace google { +namespace protobuf { + +template <> struct is_proto_enum< ::tensorflow::TestResults_BenchmarkType> : ::google::protobuf::internal::true_type {}; +template <> +inline const EnumDescriptor* GetEnumDescriptor< ::tensorflow::TestResults_BenchmarkType>() { + return ::tensorflow::TestResults_BenchmarkType_descriptor(); +} + +} // namespace protobuf +} // namespace google +#endif // SWIG + // @@protoc_insertion_point(global_scope) #endif // PROTOBUF_tensorflow_2fcore_2futil_2ftest_5flog_2eproto__INCLUDED diff --git a/libs/tensorflow/include/third_party/eigen3/BUILD b/libs/tensorflow/include/third_party/eigen3/BUILD index 4cf530e..f38a267 100644 --- a/libs/tensorflow/include/third_party/eigen3/BUILD +++ b/libs/tensorflow/include/third_party/eigen3/BUILD @@ -13,6 +13,7 @@ exports_files(["LICENSE"]) # INTEL_MKL start load("//tensorflow:tensorflow.bzl", "if_mkl") + # INTEL_MKL end load("//tensorflow:tensorflow.bzl", "if_mkl") diff --git a/libs/tensorflow/include/third_party/eigen3/unsupported/Eigen/CXX11/FixedPoint b/libs/tensorflow/include/third_party/eigen3/unsupported/Eigen/CXX11/FixedPoint index b0a73aa..eb604d3 100644 --- a/libs/tensorflow/include/third_party/eigen3/unsupported/Eigen/CXX11/FixedPoint +++ b/libs/tensorflow/include/third_party/eigen3/unsupported/Eigen/CXX11/FixedPoint @@ -31,7 +31,7 @@ #include "src/FixedPoint/FixedPointTypes.h" // Use optimized implementations whenever available -#ifdef EIGEN_VECTORIZE_AVX512 +#if defined (EIGEN_VECTORIZE_AVX512DQ) || defined (EIGEN_VECTORIZE_AVX512BW) #include 
"src/FixedPoint/PacketMathAVX512.h" #include "src/FixedPoint/TypeCastingAVX512.h" diff --git a/libs/tensorflow/include/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h b/libs/tensorflow/include/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h index 98deb17..078be83 100644 --- a/libs/tensorflow/include/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h +++ b/libs/tensorflow/include/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h @@ -11,6 +11,13 @@ typedef struct Packet32q8i { Packet32q8i(__m256i val) : val(val) {} } Packet32q8i; +typedef struct Packet16q16i { + __m256i val; + operator __m256i() const { return val; } + Packet16q16i(); + Packet16q16i(__m256i val) : val(val) {} +} Packet16q16i; + typedef struct Packet32q8u { __m256i val; operator __m256i() const { return val; } @@ -32,6 +39,13 @@ typedef struct Packet16q8u { Packet16q8u(__m128i val) : val(val) {} } Packet16q8u; +typedef struct Packet8q16i { + __m128i val; + operator __m128i() const { return val; } + Packet8q16i(); + Packet8q16i(__m128i val) : val(val) {} +} Packet8q16i; + typedef struct Packet8q32i { __m256i val; operator __m256i() const { return val; } @@ -92,6 +106,28 @@ struct packet_traits : default_packet_traits { }; }; template <> +struct packet_traits : default_packet_traits { + typedef Packet16q16i type; + typedef Packet8q16i half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 16, + }; + enum { + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 0, + HasSetLinear = 0 + }; +}; +template <> struct packet_traits : default_packet_traits { typedef Packet8q32i type; typedef Packet4q32i half; @@ -122,6 +158,12 @@ struct unpacket_traits { enum { size = 32, alignment=Aligned32 }; }; template <> +struct unpacket_traits { + typedef QInt16 type; + typedef Packet8q16i half; + enum { size = 16, alignment=Aligned32 }; 
+}; +template <> struct unpacket_traits { typedef QUInt8 type; typedef Packet16q8u half; @@ -146,6 +188,11 @@ EIGEN_STRONG_INLINE Packet32q8u ploadu(const QUInt8* from) { reinterpret_cast(from)); } template <> +EIGEN_STRONG_INLINE Packet16q16i ploadu(const QInt16* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256( + reinterpret_cast(from)); +} +template <> EIGEN_STRONG_INLINE Packet8q32i ploadu(const QInt32* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256( reinterpret_cast(from)); @@ -163,6 +210,11 @@ EIGEN_STRONG_INLINE Packet32q8u pload(const QUInt8* from) { reinterpret_cast(from)); } template <> +EIGEN_STRONG_INLINE Packet16q16i pload(const QInt16* from) { + EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256( + reinterpret_cast(from)); +} +template <> EIGEN_STRONG_INLINE Packet8q32i pload(const QInt32* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256( reinterpret_cast(from)); @@ -180,6 +232,11 @@ EIGEN_STRONG_INLINE void pstoreu(QUInt8* to, const Packet32q8u& from) { reinterpret_cast<__m256i*>(to), from.val); } template <> +EIGEN_STRONG_INLINE void pstoreu(QInt16* to, const Packet16q16i& from) { + EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256( + reinterpret_cast<__m256i*>(to), from.val); +} +template <> EIGEN_STRONG_INLINE void pstoreu(QInt32* to, const Packet8q32i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256( reinterpret_cast<__m256i*>(to), from.val); @@ -192,6 +249,11 @@ EIGEN_STRONG_INLINE void pstore(QInt32* to, const Packet8q32i& from) { from.val); } template <> +EIGEN_STRONG_INLINE void pstore(QInt16* to, const Packet16q16i& from) { + EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), + from.val); +} +template <> EIGEN_STRONG_INLINE void pstore(QUInt8* to, const Packet32q8u& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), from.val); @@ -208,6 +270,10 @@ EIGEN_STRONG_INLINE QInt32 pfirst(const Packet8q32i& a) { return 
_mm_cvtsi128_si32(_mm256_castsi256_si128(a)); } template <> +EIGEN_STRONG_INLINE QInt16 pfirst(const Packet16q16i& a) { + return _mm256_extract_epi16(a.val, 0); +} +template <> EIGEN_STRONG_INLINE QUInt8 pfirst(const Packet32q8u& a) { return static_cast(_mm256_extract_epi8(a.val, 0)); } @@ -237,6 +303,10 @@ EIGEN_STRONG_INLINE Packet8q32i padd(const Packet8q32i& a, return _mm256_add_epi32(a.val, b.val); } template <> +EIGEN_STRONG_INLINE Packet16q16i pset1(const QInt16& from) { + return _mm256_set1_epi16(from.value); +} +template <> EIGEN_STRONG_INLINE Packet8q32i psub(const Packet8q32i& a, const Packet8q32i& b) { return _mm256_sub_epi32(a.val, b.val); @@ -264,6 +334,17 @@ EIGEN_STRONG_INLINE Packet8q32i pmax(const Packet8q32i& a, return _mm256_max_epi32(a.val, b.val); } +template <> +EIGEN_STRONG_INLINE Packet16q16i pmin(const Packet16q16i& a, + const Packet16q16i& b) { + return _mm256_min_epi16(a.val, b.val); +} +template <> +EIGEN_STRONG_INLINE Packet16q16i pmax(const Packet16q16i& a, + const Packet16q16i& b) { + return _mm256_max_epi16(a.val, b.val); +} + template <> EIGEN_STRONG_INLINE Packet32q8u pmin(const Packet32q8u& a, const Packet32q8u& b) { @@ -304,6 +385,23 @@ EIGEN_STRONG_INLINE QInt32 predux_max(const Packet8q32i& a) { _mm256_max_epi32(tmp, _mm256_shuffle_epi32(tmp, 1))); } +template <> +EIGEN_STRONG_INLINE QInt16 predux_min(const Packet16q16i& a) { + __m256i tmp = _mm256_min_epi16(a, _mm256_permute2f128_si256(a, a, 1)); + tmp = + _mm256_min_epi16(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + tmp = _mm256_min_epi16(tmp, _mm256_shuffle_epi32(tmp, 1)); + return std::min(_mm256_extract_epi16(tmp, 0), _mm256_extract_epi16(tmp, 1)); +} +template <> +EIGEN_STRONG_INLINE QInt16 predux_max(const Packet16q16i& a) { + __m256i tmp = _mm256_max_epi16(a, _mm256_permute2f128_si256(a, a, 1)); + tmp = + _mm256_max_epi16(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + tmp = _mm256_max_epi16(tmp, _mm256_shuffle_epi32(tmp, 1)); + return 
std::max(_mm256_extract_epi16(tmp, 0), _mm256_extract_epi16(tmp, 1)); +} + template <> EIGEN_STRONG_INLINE QUInt8 predux_min(const Packet32q8u& a) { __m256i tmp = _mm256_min_epu8(a, _mm256_permute2f128_si256(a, a, 1)); diff --git a/libs/tensorflow/include/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h b/libs/tensorflow/include/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h index b754bbf..7a222fd 100644 --- a/libs/tensorflow/include/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h +++ b/libs/tensorflow/include/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h @@ -127,25 +127,25 @@ template <> struct unpacket_traits { typedef QInt8 type; typedef Packet32q8i half; - enum { size = 64 }; + enum { size = 64, alignment=Aligned64 }; }; template <> struct unpacket_traits { typedef QInt16 type; typedef Packet16q16i half; - enum { size = 32 }; + enum { size = 32, alignment=Aligned64 }; }; template <> struct unpacket_traits { typedef QUInt8 type; typedef Packet32q8u half; - enum { size = 64 }; + enum { size = 64, alignment=Aligned64 }; }; template <> struct unpacket_traits { typedef QInt32 type; typedef Packet8q32i half; - enum { size = 16 }; + enum { size = 16, alignment=Aligned64 }; }; // Unaligned load @@ -457,7 +457,7 @@ EIGEN_STRONG_INLINE QInt16 predux_max(const Packet32q16i& a) { std::uint32_t w = pfirst( _mm_max_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); - return std::min({ + return std::max({ static_cast(w >> 16), static_cast(w) }); @@ -493,7 +493,7 @@ EIGEN_STRONG_INLINE QUInt8 predux_max(const Packet64q8u& a) { std::uint32_t w = pfirst( _mm_max_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); - return std::min({ + return std::max({ static_cast(w >> 24), static_cast(w >> 16), static_cast(w >> 8), diff --git a/libs/tensorflow/include/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Activations.h 
b/libs/tensorflow/include/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Activations.h index 94d616f..cbcce9e 100644 --- a/libs/tensorflow/include/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Activations.h +++ b/libs/tensorflow/include/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Activations.h @@ -80,11 +80,11 @@ struct functor_traits > { } // namespace internal /** - * \ingroup CXX11_NeuralNetworks_Module - * \brief Template functor to clip the the magnitude of the first scalar. - * - * \sa class CwiseBinaryOp, MatrixBase::Clip - */ + * \ingroup CXX11_NeuralNetworks_Module + * \brief Template functor to clip the magnitude of the first scalar. + * + * \sa class CwiseBinaryOp, MatrixBase::Clip + */ template struct scalar_clip_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_clip_op) diff --git a/libs/tensorflow/include/unsupported/Eigen/CXX11/src/Tensor/README.md b/libs/tensorflow/include/unsupported/Eigen/CXX11/src/Tensor/README.md index fbb7f3b..38cdb9c 100644 --- a/libs/tensorflow/include/unsupported/Eigen/CXX11/src/Tensor/README.md +++ b/libs/tensorflow/include/unsupported/Eigen/CXX11/src/Tensor/README.md @@ -75,16 +75,16 @@ large enough to hold all the data. // Map a tensor of ints on top of stack-allocated storage. int storage[128]; // 2 x 4 x 2 x 8 = 128 - TensorMap t_4d(storage, 2, 4, 2, 8); + TensorMap> t_4d(storage, 2, 4, 2, 8); // The same storage can be viewed as a different tensor. // You can also pass the sizes as an array. - TensorMap t_2d(storage, 16, 8); + TensorMap> t_2d(storage, 16, 8); // You can also map fixed-size tensors. Here we get a 1d view of // the 2d fixed-size tensor. 
Tensor> t_4x3; - TensorMap t_12(t_4x3, 12); + TensorMap> t_12(t_4x3, 12); #### Class TensorRef diff --git a/libs/tensorflow/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/libs/tensorflow/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h index d65dbb4..c04b784 100644 --- a/libs/tensorflow/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h +++ b/libs/tensorflow/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -529,7 +529,6 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh float2 rhs_shmem2[][8], const Index m_size, const Index n_size, const Index k_size, const Index base_m, const Index base_n) { - typedef float Scalar; // prefetch registers float4 lhs_pf0, rhs_pf0; @@ -540,27 +539,27 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh } -#define prefetch_lhs(reg, row, col) \ - if (!CHECK_LHS_BOUNDARY) { \ - if (col < k_size) { \ - reg =lhs.loadPacket(row, col); \ - } \ - } else { \ - if (col < k_size) { \ - if (row + 3 < m_size) { \ - reg =lhs.loadPacket(row, col); \ - } else if (row + 2 < m_size) { \ - reg.x =lhs(row + 0, col); \ - reg.y =lhs(row + 1, col); \ - reg.z =lhs(row + 2, col); \ - } else if (row + 1 < m_size) { \ - reg.x =lhs(row + 0, col); \ - reg.y =lhs(row + 1, col); \ - } else if (row < m_size) { \ - reg.x =lhs(row + 0, col); \ - } \ - } \ - } \ +#define prefetch_lhs(reg, row, col) \ + if (!CHECK_LHS_BOUNDARY) { \ + if (col < k_size) { \ + reg =lhs.template loadPacket(row, col); \ + } \ + } else { \ + if (col < k_size) { \ + if (row + 3 < m_size) { \ + reg =lhs.template loadPacket(row, col); \ + } else if (row + 2 < m_size) { \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + reg.z =lhs(row + 2, col); \ + } else if (row + 1 < m_size) { \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + } else if (row < m_size) { \ + reg.x =lhs(row + 0, col); \ + } \ + } \ + } \ Index lhs_vert = 
base_m+threadIdx.x*4; @@ -578,7 +577,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh if (!CHECK_RHS_BOUNDARY) { if ((rhs_vert + 3) < k_size) { // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); } else if (rhs_vert + 2 < k_size) { // just CHECK_RHS_BOUNDARY rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); @@ -593,7 +592,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh } else { if (rhs_horiz0 < n_size) { if ((rhs_vert + 3) < k_size) { - rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); } else if ((rhs_vert + 2) < k_size) { rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); @@ -766,7 +765,6 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, float2 rhs_shmem2[][8], const Index m_size, const Index n_size, const Index k_size, const Index base_m, const Index base_n) { - typedef float Scalar; // prefetch registers float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3; @@ -790,37 +788,37 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, if (!CHECK_LHS_BOUNDARY) { if ((threadIdx.y/4+k+24) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); - lhs_pf3 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+24)); + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf3 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+24)); } else if ((threadIdx.y/4+k+16) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.loadPacket(lhs_vert, 
(threadIdx.y/4+k+16)); + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); } else if ((threadIdx.y/4+k+8) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); } else if ((threadIdx.y/4+k) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); } } else { // just CHECK_LHS_BOUNDARY if (lhs_vert + 3 < m_size) { if ((threadIdx.y/4+k+24) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); - lhs_pf3 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+24)); + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf3 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+24)); } else if ((threadIdx.y/4+k+16) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); } else if ((threadIdx.y/4+k+8) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); } else if 
((threadIdx.y/4+k) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); } } else if (lhs_vert + 2 < m_size) { if ((threadIdx.y/4+k+24) < k_size) { @@ -909,8 +907,8 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, if (!CHECK_RHS_BOUNDARY) { if ((rhs_vert + 3) < k_size) { // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); - rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz1); + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + rhs_pf1 = rhs.template loadPacket(rhs_vert, rhs_horiz1); } else if (rhs_vert + 2 < k_size) { // just CHECK_RHS_BOUNDARY rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); @@ -932,8 +930,8 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, if (rhs_horiz1 < n_size) { if ((rhs_vert + 3) < k_size) { // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); - rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz1); + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + rhs_pf1 = rhs.template loadPacket(rhs_vert, rhs_horiz1); } else if (rhs_vert + 2 < k_size) { // just CHECK_RHS_BOUNDARY rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); @@ -954,7 +952,7 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, } else if (rhs_horiz0 < n_size) { if ((rhs_vert + 3) < k_size) { // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); } else if ((rhs_vert + 2) < k_size) { // just CHECK_RHS_BOUNDARY rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); @@ -1137,9 +1135,6 @@ EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs, typedef float2 LHS_MEM[64][32]; typedef float2 RHS_MEM[128][8]; - typedef float2 LHS_MEM16x16[32][16]; - typedef float2 RHS_MEM16x16[64][8]; - const Index m_block_idx = blockIdx.x; const Index n_block_idx = blockIdx.y; diff --git 
a/libs/tensorflow/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/libs/tensorflow/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index e6cee11..be8d693 100644 --- a/libs/tensorflow/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/libs/tensorflow/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -217,7 +217,10 @@ struct GpuDevice { EIGEN_UNUSED_VARIABLE(err) assert(err == cudaSuccess); #else - eigen_assert(false && "The default device should be used instead to generate kernel code"); + EIGEN_UNUSED_VARIABLE(dst); + EIGEN_UNUSED_VARIABLE(src); + EIGEN_UNUSED_VARIABLE(n); + eigen_assert(false && "The default device should be used instead to generate kernel code"); #endif } diff --git a/libs/tensorflow/include/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/libs/tensorflow/include/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index c841786..e341e2e 100644 --- a/libs/tensorflow/include/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/libs/tensorflow/include/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -11,6 +11,17 @@ #ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H #define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H +// clang is incompatible with the CUDA syntax wrt making a kernel a class friend, +// so we'll use a macro to make clang happy. 
+#ifndef KERNEL_FRIEND +#if defined(__clang__) && defined(__CUDA__) +#define KERNEL_FRIEND friend __global__ +#else +#define KERNEL_FRIEND friend +#endif +#endif + + namespace Eigen { @@ -681,15 +692,15 @@ struct TensorEvaluator, template friend struct internal::FullReducerShard; #endif #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) - template friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); + template KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); #ifdef EIGEN_HAS_CUDA_FP16 - template friend void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); - template friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*); - template friend void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*); + template KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); + template KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*); + template KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*); #endif - template friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); + template KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); - template friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); + template KERNEL_FRIEND void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); #endif #if defined(EIGEN_USE_SYCL) diff --git a/libs/tensorflow/include/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/libs/tensorflow/include/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 65638b6..edb0ab2 100644 --- a/libs/tensorflow/include/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ 
b/libs/tensorflow/include/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -287,7 +287,6 @@ struct FullReductionLauncher< void>::type> { static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) { typedef typename Self::Index Index; - typedef typename Self::CoeffReturnType Scalar; const int block_size = 256; const int num_per_thread = 128; const int num_blocks = divup(num_coeffs, block_size * num_per_thread); diff --git a/libs/tensorflow/include/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/libs/tensorflow/include/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h index f8121d1..e6a666f 100644 --- a/libs/tensorflow/include/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +++ b/libs/tensorflow/include/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -31,12 +31,12 @@ namespace Eigen { * * \sa Tensor */ -template class TensorStorage; +template class TensorStorage; // Pure fixed-size storage -template -class TensorStorage +template +class TensorStorage { private: static const std::size_t Size = FixedDimensions::total_size; @@ -66,7 +66,7 @@ class TensorStorage // pure dynamic -template +template class TensorStorage, Options_> { public: @@ -126,7 +126,7 @@ class TensorStorage, Options_> } else m_data = 0; - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) } m_dimensions = nbDimensions; } diff --git a/libs/tensorflow/include/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/libs/tensorflow/include/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h index 3523e7c..d23f2e4 100644 --- a/libs/tensorflow/include/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h +++ b/libs/tensorflow/include/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h @@ -23,6 +23,7 @@ struct static_val { template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val(const T& v) { + EIGEN_UNUSED_VARIABLE(v); eigen_assert(v == n); } }; diff --git 
a/libs/tensorflow/include/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/libs/tensorflow/include/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h index ed1a761..9dcc9da 100644 --- a/libs/tensorflow/include/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h +++ b/libs/tensorflow/include/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h @@ -20,7 +20,13 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { typedef RunQueue Queue; NonBlockingThreadPoolTempl(int num_threads, Environment env = Environment()) - : env_(env), + : NonBlockingThreadPoolTempl(num_threads, true, env) {} + + NonBlockingThreadPoolTempl(int num_threads, bool allow_spinning, + Environment env = Environment()) + : num_threads_(num_threads), + allow_spinning_(allow_spinning), + env_(env), threads_(num_threads), queues_(num_threads), coprimes_(num_threads), @@ -30,18 +36,18 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { done_(false), cancelled_(false), ec_(waiters_) { - waiters_.resize(num_threads); + waiters_.resize(num_threads_); - // Calculate coprimes of num_threads. + // Calculate coprimes of num_threads_. // Coprimes are used for a random walk over all threads in Steal // and NonEmptyQueueIndex. Iteration is based on the fact that if we take // a walk starting thread index t and calculate num_threads - 1 subsequent // indices as (t + coprime) % num_threads, we will cover all threads without // repetitions (effectively getting a presudo-random permutation of thread // indices). - for (int i = 1; i <= num_threads; i++) { + for (int i = 1; i <= num_threads_; i++) { unsigned a = i; - unsigned b = num_threads; + unsigned b = num_threads_; // If GCD(a, b) == 1, then a and b are coprimes. 
while (b != 0) { unsigned tmp = a; @@ -52,10 +58,10 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { coprimes_.push_back(i); } } - for (int i = 0; i < num_threads; i++) { + for (int i = 0; i < num_threads_; i++) { queues_.push_back(new Queue()); } - for (int i = 0; i < num_threads; i++) { + for (int i = 0; i < num_threads_; i++) { threads_.push_back(env_.CreateThread([this, i]() { WorkerLoop(i); })); } } @@ -77,8 +83,8 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { } // Join threads explicitly to avoid destruction order issues. - for (size_t i = 0; i < threads_.size(); i++) delete threads_[i]; - for (size_t i = 0; i < threads_.size(); i++) delete queues_[i]; + for (size_t i = 0; i < num_threads_; i++) delete threads_[i]; + for (size_t i = 0; i < num_threads_; i++) delete queues_[i]; } void Schedule(std::function fn) { @@ -125,7 +131,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { } int NumThreads() const final { - return static_cast(threads_.size()); + return num_threads_; } int CurrentThreadId() const final { @@ -149,6 +155,8 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { }; Environment env_; + const int num_threads_; + const bool allow_spinning_; MaxSizeVector threads_; MaxSizeVector queues_; MaxSizeVector coprimes_; @@ -167,36 +175,62 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { pt->thread_id = thread_id; Queue* q = queues_[thread_id]; EventCount::Waiter* waiter = &waiters_[thread_id]; - while (!cancelled_) { - Task t = q->PopFront(); - if (!t.f) { - t = Steal(); + // TODO(dvyukov,rmlarsen): The time spent in Steal() is proportional + // to num_threads_ and we assume that new work is scheduled at a + // constant rate, so we set spin_count to 5000 / num_threads_. The + // constant was picked based on a fair dice roll, tune it. + const int spin_count = + allow_spinning_ && num_threads_ > 0 ? 
5000 / num_threads_ : 0; + if (num_threads_ == 1) { + // For num_threads_ == 1 there is no point in going through the expensive + // steal loop. Moreover, since Steal() calls PopBack() on the victim + // queues it might reverse the order in which ops are executed compared to + // the order in which they are scheduled, which tends to be + // counter-productive for the types of I/O workloads the single thread + // pools tend to be used for. + while (!cancelled_) { + Task t = q->PopFront(); + for (int i = 0; i < spin_count && !t.f; i++) { + if (!cancelled_.load(std::memory_order_relaxed)) { + t = q->PopFront(); + } + } if (!t.f) { - // Leave one thread spinning. This reduces latency. - // TODO(dvyukov): 1000 iterations is based on fair dice roll, tune it. - // Also, the time it takes to attempt to steal work 1000 times depends - // on the size of the thread pool. However the speed at which the user - // of the thread pool submit tasks is independent of the size of the - // pool. Consider a time based limit instead. - if (!spinning_ && !spinning_.exchange(true)) { - for (int i = 0; i < 1000 && !t.f; i++) { - if (!cancelled_.load(std::memory_order_relaxed)) { - t = Steal(); - } else { - return; - } - } - spinning_ = false; + if (!WaitForWork(waiter, &t)) { + return; } + } + if (t.f) { + env_.ExecuteTask(t); + } + } + } else { + while (!cancelled_) { + Task t = q->PopFront(); + if (!t.f) { + t = Steal(); if (!t.f) { - if (!WaitForWork(waiter, &t)) { - return; + // Leave one thread spinning. This reduces latency. 
+ if (allow_spinning_ && !spinning_ && !spinning_.exchange(true)) { + for (int i = 0; i < spin_count && !t.f; i++) { + if (!cancelled_.load(std::memory_order_relaxed)) { + t = Steal(); + } else { + return; + } + } + spinning_ = false; + } + if (!t.f) { + if (!WaitForWork(waiter, &t)) { + return; + } } } } - } - if (t.f) { - env_.ExecuteTask(t); + if (t.f) { + env_.ExecuteTask(t); + } } } } @@ -244,7 +278,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { // If we are shutting down and all worker threads blocked without work, // that's we are done. blocked_++; - if (done_ && blocked_ == threads_.size()) { + if (done_ && blocked_ == num_threads_) { ec_.CancelWait(waiter); // Almost done, but need to re-check queues. // Consider that all queues are empty and all worker threads are preempted diff --git a/libs/tensorflow/include/unsupported/Eigen/CXX11/src/util/EmulateArray.h b/libs/tensorflow/include/unsupported/Eigen/CXX11/src/util/EmulateArray.h index 03169d5..573ca43 100644 --- a/libs/tensorflow/include/unsupported/Eigen/CXX11/src/util/EmulateArray.h +++ b/libs/tensorflow/include/unsupported/Eigen/CXX11/src/util/EmulateArray.h @@ -169,6 +169,7 @@ template class array { #if EIGEN_HAS_VARIADIC_TEMPLATES EIGEN_DEVICE_FUNC array(std::initializer_list l) : dummy() { + EIGEN_UNUSED_VARIABLE(l); eigen_assert(l.size() == 0); } #endif diff --git a/libs/tensorflow/include/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h b/libs/tensorflow/include/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h old mode 100644 new mode 100755 index 50fedf6..d280886 --- a/libs/tensorflow/include/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h +++ b/libs/tensorflow/include/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h @@ -108,7 +108,9 @@ class AutoDiffScalar template AutoDiffScalar(const AutoDiffScalar& other #ifndef EIGEN_PARSED_BY_DOXYGEN - , typename internal::enable_if::type>::Scalar>::value,void*>::type = 0 + , typename internal::enable_if< + 
internal::is_same::type>::Scalar>::value + && internal::is_convertible::value , void*>::type = 0 #endif ) : m_value(other.value()), m_derivatives(other.derivatives()) diff --git a/libs/tensorflow/include/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h b/libs/tensorflow/include/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h index 9ad2b9c..bb6d9e1 100644 --- a/libs/tensorflow/include/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h +++ b/libs/tensorflow/include/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h @@ -61,10 +61,11 @@ struct MatrixExponentialScalingOp * After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Padé * approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$. */ -template -void matrix_exp_pade3(const MatrixType &A, MatrixType &U, MatrixType &V) +template +void matrix_exp_pade3(const MatA& A, MatU& U, MatV& V) { - typedef typename NumTraits::Scalar>::Real RealScalar; + typedef typename MatA::PlainObject MatrixType; + typedef typename NumTraits::Scalar>::Real RealScalar; const RealScalar b[] = {120.L, 60.L, 12.L, 1.L}; const MatrixType A2 = A * A; const MatrixType tmp = b[3] * A2 + b[1] * MatrixType::Identity(A.rows(), A.cols()); @@ -77,9 +78,10 @@ void matrix_exp_pade3(const MatrixType &A, MatrixType &U, MatrixType &V) * After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Padé * approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$. */ -template -void matrix_exp_pade5(const MatrixType &A, MatrixType &U, MatrixType &V) +template +void matrix_exp_pade5(const MatA& A, MatU& U, MatV& V) { + typedef typename MatA::PlainObject MatrixType; typedef typename NumTraits::Scalar>::Real RealScalar; const RealScalar b[] = {30240.L, 15120.L, 3360.L, 420.L, 30.L, 1.L}; const MatrixType A2 = A * A; @@ -94,9 +96,10 @@ void matrix_exp_pade5(const MatrixType &A, MatrixType &U, MatrixType &V) * After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Padé * approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$. 
*/ -template -void matrix_exp_pade7(const MatrixType &A, MatrixType &U, MatrixType &V) +template +void matrix_exp_pade7(const MatA& A, MatU& U, MatV& V) { + typedef typename MatA::PlainObject MatrixType; typedef typename NumTraits::Scalar>::Real RealScalar; const RealScalar b[] = {17297280.L, 8648640.L, 1995840.L, 277200.L, 25200.L, 1512.L, 56.L, 1.L}; const MatrixType A2 = A * A; @@ -114,9 +117,10 @@ void matrix_exp_pade7(const MatrixType &A, MatrixType &U, MatrixType &V) * After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Padé * approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$. */ -template -void matrix_exp_pade9(const MatrixType &A, MatrixType &U, MatrixType &V) +template +void matrix_exp_pade9(const MatA& A, MatU& U, MatV& V) { + typedef typename MatA::PlainObject MatrixType; typedef typename NumTraits::Scalar>::Real RealScalar; const RealScalar b[] = {17643225600.L, 8821612800.L, 2075673600.L, 302702400.L, 30270240.L, 2162160.L, 110880.L, 3960.L, 90.L, 1.L}; @@ -135,9 +139,10 @@ void matrix_exp_pade9(const MatrixType &A, MatrixType &U, MatrixType &V) * After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Padé * approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$. */ -template -void matrix_exp_pade13(const MatrixType &A, MatrixType &U, MatrixType &V) +template +void matrix_exp_pade13(const MatA& A, MatU& U, MatV& V) { + typedef typename MatA::PlainObject MatrixType; typedef typename NumTraits::Scalar>::Real RealScalar; const RealScalar b[] = {64764752532480000.L, 32382376266240000.L, 7771770303897600.L, 1187353796428800.L, 129060195264000.L, 10559470521600.L, 670442572800.L, @@ -162,9 +167,10 @@ void matrix_exp_pade13(const MatrixType &A, MatrixType &U, MatrixType &V) * This function activates only if your long double is double-double or quadruple. 
*/ #if LDBL_MANT_DIG > 64 -template -void matrix_exp_pade17(const MatrixType &A, MatrixType &U, MatrixType &V) +template +void matrix_exp_pade17(const MatA& A, MatU& U, MatV& V) { + typedef typename MatA::PlainObject MatrixType; typedef typename NumTraits::Scalar>::Real RealScalar; const RealScalar b[] = {830034394580628357120000.L, 415017197290314178560000.L, 100610229646136770560000.L, 15720348382208870400000.L, @@ -342,9 +348,10 @@ struct matrix_exp_computeUV * \param arg argument of matrix exponential (should be plain object) * \param result variable in which result will be stored */ -template -void matrix_exp_compute(const MatrixType& arg, ResultType &result) +template +void matrix_exp_compute(const ArgType& arg, ResultType &result) { + typedef typename ArgType::PlainObject MatrixType; #if LDBL_MANT_DIG > 112 // rarely happens typedef typename traits::Scalar Scalar; typedef typename NumTraits::Real RealScalar; @@ -354,11 +361,11 @@ void matrix_exp_compute(const MatrixType& arg, ResultType &result) return; } #endif - typename MatrixType::PlainObject U, V; + MatrixType U, V; int squarings; matrix_exp_computeUV::run(arg, U, V, squarings); // Pade approximant is (U+V) / (-U+V) - typename MatrixType::PlainObject numer = U + V; - typename MatrixType::PlainObject denom = -U + V; + MatrixType numer = U + V; + MatrixType denom = -U + V; result = denom.partialPivLu().solve(numer); for (int i=0; i