From 358c37bea2e39f0c53706f8d08fa19b83bcaace8 Mon Sep 17 00:00:00 2001
From: Graham Neubig <neubig@gmail.com>
Date: Mon, 10 Jul 2017 08:33:50 -0400
Subject: [PATCH 1/3] Finished separating nodes

Former-commit-id: cd2a8c77af19d24f5493aca98e941406f98c8556
---
 doc/source/code_style.rst                     |   24 +-
 dynet/CMakeLists.txt                          |   59 +-
 ...nary-arith.cu => gpu-nodes-arith-unary.cu} |    2 +-
 dynet/nodes-activations.cc                    |  181 ++
 dynet/nodes-arith-const.cc                    |  111 +
 dynet/nodes-arith-cwise.cc                    |  188 ++
 dynet/nodes-arith-scalar.cc                   |  173 ++
 dynet/nodes-arith-sum.cc                      |  348 +++
 ...es-unary-arith.cc => nodes-arith-unary.cc} |   33 +
 dynet/nodes-common.cc                         |  996 --------
 dynet/nodes-concat.cc                         |  148 ++
 dynet/nodes-const.cc                          |   42 +
 dynet/nodes-conv.cc                           |  313 +--
 dynet/nodes-conv.h                            |   39 +-
 dynet/nodes-dropout.cc                        |  187 ++
 dynet/nodes-flow.cc                           |  151 ++
 dynet/nodes-linalg.cc                         |  224 ++
 dynet/nodes-logsumexp.cc                      |  115 +
 dynet/nodes-losses.cc                         |  123 +
 dynet/nodes-minmax.cc                         |  252 ++
 dynet/nodes-moments.cc                        |  440 ++++
 dynet/nodes-normalization.cc                  |   54 +
 dynet/nodes-random.cc                         |  184 ++
 dynet/nodes-select.cc                         |  333 +++
 dynet/nodes-softmaxes.cc                      |  362 +++
 dynet/nodes-trig.cc                           |   43 +
 dynet/nodes.cc                                | 2200 -----------------
 dynet/nodes.h                                 |   21 +-
 28 files changed, 3876 insertions(+), 3470 deletions(-)
 rename dynet/{gpu-nodes-unary-arith.cu => gpu-nodes-arith-unary.cu} (76%)
 create mode 100644 dynet/nodes-activations.cc
 create mode 100644 dynet/nodes-arith-const.cc
 create mode 100644 dynet/nodes-arith-cwise.cc
 create mode 100644 dynet/nodes-arith-scalar.cc
 create mode 100644 dynet/nodes-arith-sum.cc
 rename dynet/{nodes-unary-arith.cc => nodes-arith-unary.cc} (87%)
 delete mode 100644 dynet/nodes-common.cc
 create mode 100644 dynet/nodes-concat.cc
 create mode 100644 dynet/nodes-const.cc
 create mode 100644 dynet/nodes-dropout.cc
 create mode 100644 dynet/nodes-flow.cc
 create mode 100644 dynet/nodes-linalg.cc
 create mode 100644 dynet/nodes-logsumexp.cc
 create mode 100644 dynet/nodes-losses.cc
 create mode 100644 dynet/nodes-minmax.cc
 create mode 100644 dynet/nodes-moments.cc
 create mode 100644 dynet/nodes-normalization.cc
 create mode 100644 dynet/nodes-random.cc
 create mode 100644 dynet/nodes-select.cc
 create mode 100644 dynet/nodes-softmaxes.cc
 create mode 100644 dynet/nodes-trig.cc
 delete mode 100644 dynet/nodes.cc

diff --git a/doc/source/code_style.rst b/doc/source/code_style.rst
index b28ebbc8e..a65416258 100644
--- a/doc/source/code_style.rst
+++ b/doc/source/code_style.rst
@@ -8,7 +8,29 @@ Coding Tips
 One of the most common things that one will want to do to modify DyNet is to add a new operation
 to calculate a new function.
 You can find more information on how to do so at the end of the tutorial slides
-`here <http://phontron.com/slides/emnlp2016-dynet-tutorial-part1.pdf>`_.
+`here <http://phontron.com/slides/emnlp2016-dynet-tutorial-part1.pdf>`_ (note that some file
+names are old).
+
+Taking a look at the existing operations in the ``nodes-XXX.h`` and ``nodes-XXX.cc`` files
+is the best guide to creating new operations. Here are some fine-grained tips for
+those who want to dive into the process.
+
+1. ``fx`` is a pointer to the (preallocated) location for the result
+   of forward to be stored
+2. ``fx`` is not initialized, so after calling forward ``fx`` must contain the correct answer
+3. ``dEdxi`` MUST **ACCUMULATE** a result since multiple calls to forward may depend on
+   the same ``x_i``. Even a trivial operation such as Identity must be implemented as ``dEdx1 += dEdf``.
+4. scalar results of forward are placed in ``fx.v[0]``
+5. DyNet manages its own memory, not Eigen, and it is configured with the
+   ``EIGEN_NO_MALLOC`` option. If you get an error about Eigen attempting to allocate
+   memory, it is (probably) because of an implicit creation of a temporary variable.
+   If you really do need a temporary variable, its capacity must be requested by
+   ``Node::aux_storage_size``.
+
+And here are some notes on debugging problems with new operations:
+
+1. ``fx`` is uninitialized when forward is called -- are you relying on it being 0?
+2. ``dEdxi`` must accumulate (see point 3 above!)
 
 Coding Practices
 ----------------
diff --git a/dynet/CMakeLists.txt b/dynet/CMakeLists.txt
index 160ad6aa0..5888c7332 100644
--- a/dynet/CMakeLists.txt
+++ b/dynet/CMakeLists.txt
@@ -3,11 +3,11 @@
 set(dynet_library_SRCS
     aligned-mem-pool.cc
     cfsm-builder.cc
-    dynet.cc
     deep-lstm.cc
     devices.cc
     dict.cc
     dim.cc
+    dynet.cc
     exec.cc
     expr.cc
     fast-lstm.cc
@@ -17,34 +17,51 @@ set(dynet_library_SRCS
     gru.cc
     hsm-builder.cc
     init.cc
+    io.cc
     lstm.cc
     mem.cc
     model.cc
-    nodes.cc
-    nodes-common.cc
+    nodes-activations.cc
+    nodes-affinetransform.cc
+    nodes-arith-const.cc
+    nodes-arith-cwise.cc
+    nodes-arith-scalar.cc
+    nodes-arith-sum.cc
+    nodes-arith-unary.cc
+    nodes-concat.cc
+    nodes-const.cc
     nodes-contract.cc
     nodes-conv.cc
     nodes-conv2d.cc
+    nodes-dropout.cc
+    nodes-flow.cc
+    nodes-hinge.cc
+    nodes-linalg.cc
+    nodes-logsumexp.cc
+    nodes-losses.cc
+    nodes-matrixmultiply.cc
     nodes-maxpooling2d.cc
+    nodes-minmax.cc
+    nodes-moments.cc
+    nodes-normalization.cc
+    nodes-norms.cc
     nodes-pickneglogsoftmax.cc
-    nodes-matrixmultiply.cc
-    nodes-hinge.cc
-    nodes-affinetransform.cc
+    nodes-random.cc
+    nodes-select.cc
     nodes-similarities.cc
-    nodes-norms.cc
-    nodes-unary-arith.cc
+    nodes-softmaxes.cc
+    nodes-trig.cc
     param-init.cc
     param-nodes.cc
     pretrain.cc
-    rnn.cc
     rnn-state-machine.cc
+    rnn.cc
     saxe-init.cc
     shadow-params.cc
     tensor.cc
     training.cc
     treelstm.cc
     weight-decay.cc
-    io.cc
 )
 if(ENABLE_BOOST)
   list(APPEND dynet_library_SRCS mp.cc)
@@ -53,14 +70,18 @@ endif()
 # Headers:
 set(dynet_library_HDRS
     aligned-mem-pool.h
-    cfsm-builder.h
-    cudnn-ops.h
     c2w.h
-    dynet.h
+    cfsm-builder.h
+    cuda-matrix-multiply.h
     cuda.h
+    cudnn-ops.h
+    deep-lstm.h
     devices.h
     dict.h
     dim.h
+    dynet-helper.h
+    dynet.h
+    except.h
     exec.h
     expr.h
     fast-lstm.h
@@ -68,31 +89,35 @@ set(dynet_library_HDRS
     globals.h
     gpu-kernels.h
     gpu-ops.h
+    grad-check.h
     graph.h
     gru.h
     hsm-builder.h
     init.h
+    io.h
     lstm.h
     mem.h
     model.h
-    nodes.h
     nodes-contract.h
     nodes-conv.h
+    nodes-macros.h
+    nodes.h
     op-helper.h
+    param-init.h
     param-nodes.h
+    pretrain.h
     rnn-state-machine.h
     rnn.h
     saxe-init.h
     shadow-params.h
+    sig.h
     simd-functors.h
+    str-util.h
     tensor.h
     timing.h
     training.h
     treelstm.h
-    except.h
-    nodes-macros.h
     weight-decay.h
-    io.h
 )
 if(ENABLE_BOOST)
   list(APPEND dynet_library_HDRS mp.h)
diff --git a/dynet/gpu-nodes-unary-arith.cu b/dynet/gpu-nodes-arith-unary.cu
similarity index 76%
rename from dynet/gpu-nodes-unary-arith.cu
rename to dynet/gpu-nodes-arith-unary.cu
index b53030f48..15198bef2 100644
--- a/dynet/gpu-nodes-unary-arith.cu
+++ b/dynet/gpu-nodes-arith-unary.cu
@@ -1,3 +1,3 @@
 // This is a dummy file that contains the same content as nodes-unary-arith.cc but compiled
 // on CUDA
-#include "nodes-unary-arith.cc"
+#include "nodes-arith-unary.cc"
diff --git a/dynet/nodes-activations.cc b/dynet/nodes-activations.cc
new file mode 100644
index 000000000..678626ba9
--- /dev/null
+++ b/dynet/nodes-activations.cc
@@ -0,0 +1,181 @@
+#include "dynet/nodes.h"
+
+#include "dynet/nodes-macros.h"
+#include "dynet/functors.h"
+#include "dynet/simd-functors.h"
+
+using namespace std;
+
+namespace dynet {
+
+// ************* Rectify *************
+
+#ifndef __CUDACC__
+
+string Rectify::as_string(const vector<string>& arg_names) const {
+  ostringstream s;
+  s << "ReLU(" << arg_names[0] << ')';
+  return s.str();
+}
+
+Dim Rectify::dim_forward(const vector<Dim>& xs) const {
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in Rectify");
+  return xs[0];
+}
+
+#endif
+
+template<class MyDevice>
+void Rectify::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed dimension check in Rectify::forward");
+  fx.tvec().device(*dev.edevice) = xs[0]->tvec().cwiseMax(0.f);
+}
+
+template<class MyDevice>
+void Rectify::backward_dev_impl(const MyDevice & dev,
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  dEdxi.tvec().device(*dev.edevice) += fx.tvec().binaryExpr(dEdf.tvec(), FRectifyBackward());
+}
+DYNET_NODE_INST_DEV_IMPL(Rectify)
+
+// ************* LogisticSigmoid *************
+
+#ifndef __CUDACC__
+
+string LogisticSigmoid::as_string(const vector<string>& arg_names) const {
+  ostringstream s;
+  s << "\\sigma(" << arg_names[0] << ')';
+  return s.str();
+}
+
+Dim LogisticSigmoid::dim_forward(const vector<Dim>& xs) const {  // output Dim is the single input's Dim, unchanged
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in LogisticSigmoid");  // same macro as the other dim_forward checks in this file; guards the xs[0] access below
+  return xs[0];
+}
+
+#endif
+
+template<class MyDevice>
+void LogisticSigmoid::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  DYNET_ASSERT(xs.size() == 1, "Failed dimension check in LogisticSigmoid::forward");
+  fx.tvec().device(*dev.edevice) = xs[0]->tvec().unaryExpr(scalar_logistic_sigmoid_op<float>());
+}
+
+template<class MyDevice>
+void LogisticSigmoid::backward_dev_impl(const MyDevice & dev,
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  dEdxi.tvec().device(*dev.edevice) += fx.tvec().binaryExpr(dEdf.tvec(), scalar_logistic_sigmoid_backward_op<float>());
+}
+DYNET_NODE_INST_DEV_IMPL(LogisticSigmoid)
+
+// ************* SoftSign *************
+
+#ifndef __CUDACC__
+
+string SoftSign::as_string(const vector<string>& arg_names) const {
+  ostringstream s;
+  s << "softsign(" << arg_names[0] << ')';
+  return s.str();
+}
+
+Dim SoftSign::dim_forward(const vector<Dim>& xs) const {
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in SoftSign");
+  DYNET_ARG_CHECK(LooksLikeVector(xs[0]), "Bad input dimensions in SoftSign: " << xs);
+  return xs[0];
+}
+
+#endif
+
+template<class MyDevice>
+void SoftSign::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed dimension check in SoftSign::forward");
+  fx.tvec().device(*dev.edevice) = xs[0]->tvec().unaryExpr(FSoftSign());
+}
+
+template<class MyDevice>
+void SoftSign::backward_dev_impl(const MyDevice & dev,
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  dEdxi.tvec().device(*dev.edevice) += fx.tvec().binaryExpr(dEdf.tvec(), FSoftSignBackward());
+}
+DYNET_NODE_INST_DEV_IMPL(SoftSign)
+
+// ************* Erf *************
+
+#ifndef __CUDACC__
+
+string Erf::as_string(const vector<string>& arg_names) const {
+  ostringstream s;
+  s << "erf(" << arg_names[0] << ')';
+  return s.str();
+}
+
+Dim Erf::dim_forward(const vector<Dim>& xs) const {
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in Erf")
+  return xs[0];
+}
+
+#endif
+
+template<class MyDevice>
+void Erf::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  fx.tvec().device(*dev.edevice) = xs[0]->tvec().erf();
+}
+
+template<class MyDevice>
+void Erf::backward_dev_impl(const MyDevice & dev,
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  dEdxi.tvec().device(*dev.edevice) += xs[0]->tvec().binaryExpr(dEdf.tvec(), scalar_erf_backward_op<float>());
+}
+DYNET_NODE_INST_DEV_IMPL(Erf)
+
+// ************* ExponentialLinearUnit *************
+
+#ifndef __CUDACC__
+
+string ExponentialLinearUnit::as_string(const vector<string>& arg_names) const {
+  ostringstream s;
+  s << "ELU(" << arg_names[0] << ", lambda=" << lambda << ", alpha=" << alpha << ')';
+  return s.str();
+}
+
+Dim ExponentialLinearUnit::dim_forward(const vector<Dim>& xs) const {
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in ExponentialLinearUnit");
+  return xs[0];
+}
+
+#endif
+
+template<class MyDevice>
+void ExponentialLinearUnit::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed dimension check in ExponentialLinearUnit::forward");  // exactly one input is expected
+  fx.tvec().device(*dev.edevice) = xs[0]->tvec().unaryExpr(FELUForward(alpha, lambda));  // fx = FELUForward(alpha, lambda) applied to every element of xs[0]
+}
+
+template<class MyDevice>
+void ExponentialLinearUnit::backward_dev_impl(const MyDevice & dev,
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  dEdxi.tvec().device(*dev.edevice) += xs[0]->tvec().binaryExpr(dEdf.tvec(), FELUBackward(alpha, lambda));
+}
+DYNET_NODE_INST_DEV_IMPL(ExponentialLinearUnit)
+
+}
diff --git a/dynet/nodes-arith-const.cc b/dynet/nodes-arith-const.cc
new file mode 100644
index 000000000..ce7818e71
--- /dev/null
+++ b/dynet/nodes-arith-const.cc
@@ -0,0 +1,111 @@
+#include "dynet/nodes.h"
+
+#include "dynet/nodes-macros.h"
+#include "dynet/functors.h"
+#include "dynet/simd-functors.h"
+
+using namespace std;
+
+namespace dynet {
+
+// ************* ConstantMinusX *************
+
+#ifndef __CUDACC__
+
+string ConstantMinusX::as_string(const vector<string>& arg_names) const {
+  ostringstream s;
+  s << c << " - " << arg_names[0];
+  return s.str();
+}
+
+Dim ConstantMinusX::dim_forward(const vector<Dim>& xs) const {
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in ConstantMinusX")
+  return xs[0];
+}
+
+#endif
+
+template<class MyDevice>
+void ConstantMinusX::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  fx.tvec().device(*dev.edevice) = xs[0]->tvec().unaryExpr(const_minus_op<float>(c));
+}
+
+template<class MyDevice>
+void ConstantMinusX::backward_dev_impl(const MyDevice & dev,
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  dEdxi.tvec().device(*dev.edevice) -= dEdf.tvec();
+}
+DYNET_NODE_INST_DEV_IMPL(ConstantMinusX)
+
+// ************* ConstantPlusX *************
+
+#ifndef __CUDACC__
+
+string ConstantPlusX::as_string(const vector<string>& arg_names) const {
+  ostringstream s;
+  s << c << " + " << arg_names[0];
+  return s.str();
+}
+
+Dim ConstantPlusX::dim_forward(const vector<Dim>& xs) const {
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in ConstantPlusX")
+  return xs[0];
+}
+
+#endif
+
+template<class MyDevice>
+void ConstantPlusX::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  fx.tvec().device(*dev.edevice) = xs[0]->tvec().unaryExpr(const_add_op<float>(c));
+}
+
+template<class MyDevice>
+void ConstantPlusX::backward_dev_impl(const MyDevice & dev,
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  dEdxi.tvec().device(*dev.edevice) += dEdf.tvec();
+}
+DYNET_NODE_INST_DEV_IMPL(ConstantPlusX)
+
+// ************* ConstScalarMultiply *************
+
+#ifndef __CUDACC__
+
+string ConstScalarMultiply::as_string(const vector<string>& arg_names) const {
+  ostringstream s;
+  s << arg_names[0] << " * " << alpha;
+  return s.str();
+}
+
+Dim ConstScalarMultiply::dim_forward(const vector<Dim>& xs) const {
+  DYNET_ARG_CHECK(xs.size() == 1, "ConstScalarMultiply expects one argument: " << xs);
+  return xs[0];
+}
+
+#endif
+
+template<class MyDevice>
+void ConstScalarMultiply::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  fx.tvec().device(*dev.edevice) = xs[0]->tvec() * alpha;
+}
+
+template<class MyDevice>
+void ConstScalarMultiply::backward_dev_impl(const MyDevice & dev,
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  DYNET_ASSERT(i == 0, "Failed dimension check in ConstScalarMultiply");
+  dEdxi.tvec().device(*dev.edevice) += dEdf.tvec() * alpha;
+}
+DYNET_NODE_INST_DEV_IMPL(ConstScalarMultiply)
+
+}
diff --git a/dynet/nodes-arith-cwise.cc b/dynet/nodes-arith-cwise.cc
new file mode 100644
index 000000000..26ac47b0e
--- /dev/null
+++ b/dynet/nodes-arith-cwise.cc
@@ -0,0 +1,188 @@
+#include "dynet/nodes.h"
+
+#include "dynet/nodes-macros.h"
+
+using namespace std;
+
+namespace dynet {
+
+// ************* CwiseMultiply *************
+
+#ifndef __CUDACC__
+
+string CwiseMultiply::as_string(const vector<string>& arg_names) const {
+  ostringstream s;
+  s << arg_names[0] << " \\cdot " << arg_names[1];
+  return s.str();
+}
+
+Dim CwiseMultiply::dim_forward(const vector<Dim>& xs) const {
+  DYNET_ARG_CHECK(xs.size() == 2, "Failed input count check in CwiseMultiply")
+  Dim d = xs[0].truncate();
+  DYNET_ARG_CHECK(d.single_batch() == xs[1].truncate().single_batch(),
+                          "Mismatched input dimensions in CwiseMultiply: " << xs);
+  d.bd = max(xs[1].bd, d.bd);
+  return d;
+}
+
+int CwiseMultiply::autobatch_sig(const ComputationGraph & cg, SigMap &sm) const {
+  // TODO: This does not handle the case where dimensions differ
+  Sig s(nt::cmult);
+  return cg.nodes[args[0]]->dim == cg.nodes[args[1]]->dim ? sm.get_idx(s) : 0;
+}
+
+std::vector<int> CwiseMultiply::autobatch_concat(const ComputationGraph & cg) const {
+  return vector<int>(2, 1);
+}
+
+#endif
+
+template<class MyDevice>
+void CwiseMultiply::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  DYNET_ASSERT(xs.size() == 2, "Failed dimension check in CwiseMultiply::forward (cmult)");
+  if(xs[0]->d.bd == xs[1]->d.bd) {
+    fx.tvec().device(*dev.edevice) = xs[0]->tvec() * xs[1]->tvec();
+  } else {
+    Eigen::array<int, 2> bcast; bcast[0] = 1; bcast[1] = fx.d.bd;
+    if(xs[0]->d.bd == 1)
+      fx.tbvec().device(*dev.edevice) = xs[0]->tbvec().broadcast(bcast) * xs[1]->tbvec();
+    else
+      fx.tbvec().device(*dev.edevice) = xs[0]->tbvec() * xs[1]->tbvec().broadcast(bcast);
+  }
+}
+
+template<class MyDevice>
+void CwiseMultiply::backward_dev_impl(const MyDevice & dev,
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  DYNET_ASSERT(i < 2, "Failed dimension check in CwiseMultiply::backward (cmult)");
+  if(xs[0]->d.bd == xs[1]->d.bd) {  // both inputs have the same batch count: flat element-wise product
+    dEdxi.tvec().device(*dev.edevice) += dEdf.tvec() * xs[1-i]->tvec();  // accumulate dEdf * (the other input)
+  } else if(xs[1-i]->d.bd == 1) {  // the other input is the single-batch one: repeat it to match dEdf
+    Eigen::array<int, 2> bcast; bcast[0] = 1; bcast[1] = fx.d.bd;  // replicate fx.d.bd times along axis 1, once along axis 0
+    dEdxi.tbvec().device(*dev.edevice) += dEdf.tbvec() * xs[1-i]->tbvec().broadcast(bcast);
+  } else {  // remaining case; presumably xs[i] is the single-batch input here -- TODO confirm
+    Eigen::array<int, 1> red_axis; red_axis[0] = 1;  // reduce over axis 1, the axis the forward pass broadcasts along
+    dEdxi.tvec().device(*dev.edevice) += (dEdf.tbvec() * xs[1-i]->tbvec()).sum(red_axis);  // sum per-batch gradients into the flat gradient
+  }
+}
+DYNET_NODE_INST_DEV_IMPL(CwiseMultiply)
+
+// ************* CwiseQuotient *************
+
+#ifndef __CUDACC__
+
+string CwiseQuotient::as_string(const vector<string>& arg_names) const {
+  ostringstream s;
+  s << arg_names[0] << " / " << arg_names[1];
+  return s.str();
+}
+
+Dim CwiseQuotient::dim_forward(const vector<Dim>& xs) const {
+  DYNET_ARG_CHECK(xs.size() == 2, "Failed input count check in CwiseQuotient")
+  Dim d = xs[0].truncate();
+  DYNET_ARG_CHECK(d.single_batch() == xs[1].truncate().single_batch(), "Bad input dimensions in CwiseQuotient: " << xs);
+  d.bd = max(xs[1].bd, d.bd);
+  return d;
+}
+
+#endif
+
+template<class MyDevice>
+void CwiseQuotient::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  DYNET_ASSERT(xs.size() == 2, "Failed dimension check in CwiseQuotient::forward (cdiv)");
+  if(xs[0]->d.bd == xs[1]->d.bd) {
+    fx.tvec().device(*dev.edevice) = xs[0]->tvec() / xs[1]->tvec();
+  } else if(xs[0]->d.bd == 1) {
+    Eigen::array<int, 2> bcast; bcast[0] = 1; bcast[1] = fx.d.bd;
+    fx.tb<1>().device(*dev.edevice) = xs[0]->tb<1>().broadcast(bcast) / xs[1]->tb<1>();
+  } else {
+    Eigen::array<int, 2> bcast; bcast[0] = 1; bcast[1] = fx.d.bd;
+    fx.tb<1>().device(*dev.edevice) = xs[0]->tb<1>() / xs[1]->tb<1>().broadcast(bcast);
+  }
+}
+
+template<class MyDevice>
+void CwiseQuotient::backward_dev_impl(const MyDevice & dev,
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  DYNET_ASSERT(i < 2, "Failed dimension check in CwiseQuotient::backward (cdiv)");  // NOTE(review): branches below use tb<1>()/t<1>() where CwiseMultiply uses tbvec()/tvec() -- confirm this is valid for inputs with more than one dimension
+  if (i == 0) {  // gradient w.r.t. the numerator xs[0]: accumulate dEdf / xs[1]
+    if(xs[0]->d.bd == xs[1]->d.bd) {  // same batch count on both inputs: flat element-wise quotient
+      dEdxi.tvec().device(*dev.edevice) += dEdf.tvec() / xs[1]->tvec();
+    } else if(xs[1]->d.bd == 1) {  // single-batch denominator: repeat it fx.d.bd times along axis 1
+      Eigen::array<int, 2> bcast; bcast[0] = 1; bcast[1] = fx.d.bd;
+      dEdxi.tb<1>().device(*dev.edevice) += dEdf.tb<1>() / xs[1]->tb<1>().broadcast(bcast);
+    } else {  // remaining case; presumably the numerator is the single-batch input -- TODO confirm
+      Eigen::array<int, 1> red_axis; red_axis[0] = 1;  // sum the per-batch contributions over axis 1
+      dEdxi.t<1>().device(*dev.edevice) += (dEdf.tb<1>() / xs[1]->tb<1>()).sum(red_axis);
+    }
+  } else { // i = 1: gradient w.r.t. the denominator xs[1]; subtracts dEdf / xs[1]^2 * xs[0]
+    if(xs[0]->d.bd == xs[1]->d.bd) {  // same batch count on both inputs
+      dEdxi.tvec().device(*dev.edevice) -= dEdf.tvec() / xs[1]->tvec().square() * xs[0]->tvec();
+    } else if(xs[1]->d.bd == 1) {  // single-batch denominator: broadcast its square, then reduce over axis 1
+      Eigen::array<int, 2> bcast; bcast[0] = 1; bcast[1] = fx.d.bd;
+      Eigen::array<int, 1> red_axis; red_axis[0] = 1;
+      dEdxi.t<1>().device(*dev.edevice) -= (dEdf.tb<1>() / xs[1]->tb<1>().square().broadcast(bcast) * xs[0]->tb<1>()).sum(red_axis);
+    } else {  // remaining case; presumably the numerator is single-batch and is broadcast instead -- TODO confirm
+      Eigen::array<int, 2> bcast; bcast[0] = 1; bcast[1] = fx.d.bd;
+      dEdxi.tb<1>().device(*dev.edevice) -= dEdf.tb<1>() / xs[1]->tb<1>().square() * xs[0]->tb<1>().broadcast(bcast);
+    }
+  }
+}
+DYNET_NODE_INST_DEV_IMPL(CwiseQuotient)
+
+// ************* Pow *************
+
+#ifndef __CUDACC__
+
+string Pow::as_string(const vector<string>& arg_names) const {
+  ostringstream s;
+  s << arg_names[0] << " ** " << arg_names[1];
+  return s.str();
+}
+
+Dim Pow::dim_forward(const vector<Dim>& xs) const {
+  DYNET_ARG_CHECK(xs.size() == 2, "Failed input count check in Pow")
+  Dim d = xs[0].truncate();
+  DYNET_ARG_CHECK(xs[1].truncate().single_batch().size() == 1, "Bad input dimensions in Pow: " << xs);
+  return d;
+}
+
+#endif
+
+template<class MyDevice>
+void Pow::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  DYNET_ARG_CHECK(xs.size() == 2, "Failed dimension check in Pow::forward");
+  fx.tvec().device(*dev.edevice) = xs[0]->tvec().pow(as_scalar(*xs[1]));
+}
+
+template<class MyDevice>
+void Pow::backward_dev_impl(const MyDevice & dev,
+                            const vector<const Tensor*>& xs,
+                            const Tensor& fx,
+                            const Tensor& dEdf,
+                            unsigned i,
+                            Tensor& dEdxi) const {
+  DYNET_ARG_CHECK(xs.size() == 2, "Failed dimension check in Pow::backward");
+  real x2 = as_scalar(*xs[1]);  // the exponent, read from the second input as a scalar
+  if (i == 0) {  // gradient w.r.t. the base xs[0]
+    dEdxi.tvec().device(*dev.edevice) += xs[0]->tvec().pow(x2 - 1) * dEdf.tvec() * x2;  // accumulate x2 * base^(x2-1) * dEdf
+  } else {  // gradient w.r.t. the exponent xs[1]
+#if defined(__CUDACC__) && defined(EIGEN_NO_MALLOC)
+    DYNET_RUNTIME_ERR("CUDA memory allocation in Pow");  // this branch is rejected when compiled by CUDA with EIGEN_NO_MALLOC defined
+#endif
+    // y = a^x
+    // dy/dx = a^x * log(a)
+    dEdxi.t<0>().device(*dev.edevice) += (fx.tvec() * xs[0]->tvec().log() * dEdf.tvec()).sum();  // fx already holds a^x; the full reduction is accumulated into the order-0 gradient
+  }
+}
+DYNET_NODE_INST_DEV_IMPL(Pow)
+
+}
diff --git a/dynet/nodes-arith-scalar.cc b/dynet/nodes-arith-scalar.cc
new file mode 100644
index 000000000..baca3fe0a
--- /dev/null
+++ b/dynet/nodes-arith-scalar.cc
@@ -0,0 +1,173 @@
+#include "dynet/nodes.h"
+
+#include "dynet/nodes-macros.h"
+
+using namespace std;
+
+namespace dynet {
+
+// ************* ScalarAdd *************
+
+#ifndef __CUDACC__
+
+string ScalarAdd::as_string(const vector<string>& arg_names) const {
+  ostringstream s;
+  s << arg_names[0] << " + " << arg_names[1];
+  return s.str();
+}
+
+Dim ScalarAdd::dim_forward(const vector<Dim>& xs) const {
+  DYNET_ARG_CHECK(xs.size() == 2, "Failed input count check in ScalarAdd")
+  Dim d = xs[0].truncate();
+  DYNET_ARG_CHECK(xs[1].batch_size() == 1,
+                          "Mismatched input dimensions in ScalarAdd: " << xs);
+  d.bd = max(xs[1].bd, d.bd);
+  return d;
+}
+
+#endif
+
+template<class MyDevice>
+void ScalarAdd::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  DYNET_ASSERT(xs.size() == 2, "Failed dimension check in ScalarAdd::forward (+)");
+  Eigen::array<int, 2> bcast_0 = {1, (int) (fx.d.bd == xs[0]->d.bd ? 1 : fx.d.bd)};
+  Eigen::array<int, 2> bcast_1 = {(int) fx.d.batch_size(), (int) (fx.d.bd == xs[1]->d.bd ? 1 : fx.d.bd)};
+  fx.tbvec().device(*dev.edevice) = xs[0]->tbvec().broadcast(bcast_0) + xs[1]->tbvec().broadcast(bcast_1);
+}
+
+template<class MyDevice>
+void ScalarAdd::backward_dev_impl(const MyDevice & dev,
+                                  const vector<const Tensor*>& xs,
+                                  const Tensor& fx,
+                                  const Tensor& dEdf,
+                                  unsigned i,
+                                  Tensor& dEdxi) const {
+  DYNET_ASSERT(i < 2, "Failed dimension check in ScalarAdd::backward (+)");
+  Eigen::array<int, 1> red_axis_0 = {0}, red_axis_1 = {1};
+  Eigen::array<int, 2> red_axes_01 = {0, 1};
+  if (i == 0) {
+    if (xs[0]->d.bd == 1)
+      dEdxi.tvec().device(*dev.edevice) += dEdf.tbvec().sum(red_axis_1);
+    else
+      dEdxi.tbvec().device(*dev.edevice) += dEdf.tbvec();
+  } else {
+    if (xs[1]->d.bd == 1)
+      dEdxi.t<0>().device(*dev.edevice) += dEdf.tbvec().sum(red_axes_01);
+    else
+      dEdxi.tb<0>().device(*dev.edevice) += dEdf.tbvec().sum(red_axis_0);
+  }
+}
+DYNET_NODE_INST_DEV_IMPL(ScalarAdd)
+
+// ************* ScalarMultiply *************
+
+#ifndef __CUDACC__
+
+string ScalarMultiply::as_string(const vector<string>& arg_names) const {
+  ostringstream s;
+  s << arg_names[0] << " \\cdot " << arg_names[1];
+  return s.str();
+}
+
+Dim ScalarMultiply::dim_forward(const vector<Dim>& xs) const {
+  DYNET_ARG_CHECK(xs.size() == 2, "Failed input count check in ScalarMultiply")
+  Dim d = xs[1];
+  DYNET_ARG_CHECK(xs[0].batch_size() == 1,
+                          "Mismatched input dimensions in ScalarMultiply: " << xs);
+  d.bd = max(xs[0].bd, d.bd);
+  return d;
+}
+
+#endif
+
+template<class MyDevice>
+void ScalarMultiply::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  DYNET_ASSERT(xs.size() == 2, "Failed dimension check in ScalarMultiply::forward (cmult)");
+
+  Eigen::array<int, 2> bcast_0 = {(int) fx.d.batch_size(), (int) (fx.d.bd == xs[0]->d.bd ? 1 : fx.d.bd)};
+  Eigen::array<int, 2> bcast_1 = {1, (int) (fx.d.bd == xs[1]->d.bd ? 1 : fx.d.bd)};
+  fx.tbvec().device(*dev.edevice) = xs[0]->tbvec().broadcast(bcast_0) * xs[1]->tbvec().broadcast(bcast_1);
+}
+
+template<class MyDevice>
+void ScalarMultiply::backward_dev_impl(const MyDevice & dev,
+                                       const vector<const Tensor*>& xs,
+                                       const Tensor& fx,
+                                       const Tensor& dEdf,
+                                       unsigned i,
+                                       Tensor& dEdxi) const {
+  DYNET_ASSERT(i < 2, "Failed dimension check in ScalarMultiply::backward (cmult)");
+  Eigen::array<int, 2> bcast_0 = {(int) fx.d.batch_size(), (int)( fx.d.bd == xs[0]->d.bd ? 1 : fx.d.bd)};
+  Eigen::array<int, 2> bcast_1 = {1, (int)(fx.d.bd == xs[1]->d.bd ? 1 : fx.d.bd)};
+  Eigen::array<int, 1> red_axis_0 = {0}, red_axis_1 = {1};
+  Eigen::array<int, 2> red_axes_01 = {0, 1};
+  if (i == 0) {
+    if (xs[0]->d.bd == 1)
+      dEdxi.t<0>().device(*dev.edevice) += (dEdf.tbvec() * xs[1]->tbvec().broadcast(bcast_1)).sum(red_axes_01);
+    else
+      dEdxi.tb<0>().device(*dev.edevice) += (dEdf.tbvec() * xs[1]->tbvec().broadcast(bcast_1)).sum(red_axis_0);
+  } else {
+    if (xs[1]->d.bd == 1)
+      dEdxi.tvec().device(*dev.edevice) += (dEdf.tbvec() * xs[0]->tbvec().broadcast(bcast_0)).sum(red_axis_1);
+    else
+      dEdxi.tbvec().device(*dev.edevice) += dEdf.tbvec() * xs[0]->tbvec().broadcast(bcast_0);
+  }
+}
+DYNET_NODE_INST_DEV_IMPL(ScalarMultiply)
+
+// ************* ScalarQuotient *************
+
+#ifndef __CUDACC__
+
+string ScalarQuotient::as_string(const vector<string>& arg_names) const {
+  ostringstream s;
+  s << arg_names[0] << " / " << arg_names[1];
+  return s.str();
+}
+
+Dim ScalarQuotient::dim_forward(const vector<Dim>& xs) const {
+  DYNET_ARG_CHECK(xs.size() == 2, "Failed input count check in ScalarQuotient")
+  Dim d = xs[0].truncate();
+  DYNET_ARG_CHECK(xs[1].batch_size() == 1,
+                          "Mismatched input dimensions in ScalarQuotient: " << xs);
+  d.bd = max(xs[1].bd, d.bd);
+  return d;
+}
+
+#endif
+
+template<class MyDevice>
+void ScalarQuotient::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  DYNET_ASSERT(xs.size() == 2, "Failed dimension check in ScalarQuotient::forward (cdiv)");
+  Eigen::array<int, 2> bcast_0 = {1, (int) (fx.d.bd == xs[0]->d.bd ? 1 : fx.d.bd)};
+  Eigen::array<int, 2> bcast_1 = {(int) fx.d.batch_size(), (int) (fx.d.bd == xs[1]->d.bd ? 1 : fx.d.bd)};
+  fx.tbvec().device(*dev.edevice) = xs[0]->tbvec().broadcast(bcast_0) / xs[1]->tbvec().broadcast(bcast_1);
+}
+
+template<class MyDevice>
+void ScalarQuotient::backward_dev_impl(const MyDevice & dev,
+                                       const vector<const Tensor*>& xs,
+                                       const Tensor& fx,
+                                       const Tensor& dEdf,
+                                       unsigned i,
+                                       Tensor& dEdxi) const {
+  DYNET_ASSERT(i < 2, "Failed dimension check in ScalarQuotient::backward (cdiv)");
+  Eigen::array<int, 2> bcast = {(int)fx.d.batch_size(), (int)(fx.d.bd == xs[1]->d.bd ? 1 : fx.d.bd)};
+  Eigen::array<int, 2> bcast2 = {1, (int)(fx.d.bd == xs[0]->d.bd ? 1 : fx.d.bd)};
+  Eigen::array<int, 1> red_axis_0 = {0}, red_axis_1 = {1};
+  Eigen::array<int, 2> red_axes_01 = {0, 1};
+  if (i == 0) {
+    if (xs[0]->d.bd == 1)
+      dEdxi.tvec().device(*dev.edevice) += (dEdf.tbvec() / xs[1]->tbvec().broadcast(bcast)).sum(red_axis_1);
+    else
+      dEdxi.tbvec().device(*dev.edevice) += dEdf.tbvec() / xs[1]->tbvec().broadcast(bcast);
+  } else {
+    if (xs[1]->d.bd == 1)
+      dEdxi.t<0>().device(*dev.edevice) += - (dEdf.tbvec() * xs[0]->tbvec().broadcast(bcast2)).sum(red_axes_01) / xs[1]->t<0>().square();
+    else
+      dEdxi.tb<0>().device(*dev.edevice) += - (dEdf.tbvec() * xs[0]->tbvec().broadcast(bcast2)).sum(red_axis_0) / xs[1]->tb<0>().square();
+  }
+}
+DYNET_NODE_INST_DEV_IMPL(ScalarQuotient)
+
+}
diff --git a/dynet/nodes-arith-sum.cc b/dynet/nodes-arith-sum.cc
new file mode 100644
index 000000000..d3b96f24f
--- /dev/null
+++ b/dynet/nodes-arith-sum.cc
@@ -0,0 +1,348 @@
+#include "dynet/nodes.h"
+
+#include "dynet/nodes-macros.h"
+
+using namespace std;
+
+namespace dynet {
+
+// ************* Sum *************
+
+#ifndef __CUDACC__
+
+string Sum::as_string(const vector<string>& arg_names) const {
+  // Render the operands joined by " + "; expects at least one argument name.
+  ostringstream out;
+  out << arg_names[0];
+  for (size_t k = 1; k < arg_names.size(); ++k) out << " + " << arg_names[k];
+  return out.str();
+}
+
+Dim Sum::dim_forward(const vector<Dim>& xs) const { // result: the shape of xs[0] with the largest batch size found among the inputs
+  Dim d = xs[0].truncate();
+  unsigned int batch = d.bd;
+  for (unsigned i = 1; i < xs.size(); ++i) { // every other input must match xs[0] after truncate() and single_batch()
+    DYNET_ARG_CHECK(d.single_batch() == xs[i].truncate().single_batch(),
+                            "Mismatched input dimensions in Sum: " << xs);
+    batch = max(xs[i].bd, batch);
+  }
+  d = xs[0]; d.bd = batch; // the truncated copy was only for the comparison; return xs[0] as given, with the widened batch
+  return d;
+}
+
+int Sum::autobatch_sig(const ComputationGraph &cg, SigMap &sm) const { // builds this node's autobatch signature and returns sm's index for it
+  Sig s(nt::sum);
+  s.add_node(args.size());
+  // Two cases:
+  // If unbatched, it's just an elementwise addition
+  // TODO: This will be more efficient if we identify arguments that are used
+  //       multiple times (e.g. bias vectors)
+  if(dim.bd == 1) {
+    s.add_int(-2); // fixed marker: unbatched sums with the same argument count share one signature
+  // Otherwise, make sure the dimensions match and that batched nodes don't intersect
+  } else {
+    s.add_dim(dim);
+    for(auto ai : args) {
+      s.add_int(cg.nodes[ai]->dim.bd == 1 ? ai : -1); // unbatched arguments contribute their node id, batched ones -1
+    }
+  }
+  return sm.get_idx(s);
+}
+
+std::vector<int> Sum::autobatch_concat(const ComputationGraph & cg) const { // returns one flag per argument, 1 by default
+  vector<int> ret(args.size(), 1);
+  // When this node's output is batched, keep the flag at 1 only for arguments that are batched too; unbatched arguments get 0
+  if(dim.bd != 1)
+    for(size_t i = 0; i < args.size(); ++i)
+      ret[i] = cg.nodes[args[i]]->dim.bd == 1 ? 0 : 1;
+  return ret;
+}
+
+#endif
+
+template<class MyDevice>
+void Sum::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const { // fx = xs[0] + xs[1] + ...; inputs whose batch size differs from fx's are repeated over the batch
+  const unsigned num_args = xs.size();
+  if (num_args == 1) // fast paths: one to four operands with equal batch sizes are written in a single expression
+    fx.tvec().device(*dev.edevice) = xs[0]->tvec();
+  else if (num_args == 2 && xs[0]->d.bd == xs[1]->d.bd)
+    fx.tvec().device(*dev.edevice) = xs[0]->tvec() + xs[1]->tvec();
+  else if (num_args == 3 && xs[0]->d.bd == xs[1]->d.bd && xs[1]->d.bd == xs[2]->d.bd)
+    fx.tvec().device(*dev.edevice) = xs[0]->tvec() + xs[1]->tvec() + xs[2]->tvec();
+  else if (num_args == 4 && xs[0]->d.bd == xs[1]->d.bd && xs[1]->d.bd == xs[2]->d.bd && xs[2]->d.bd == xs[3]->d.bd)
+    fx.tvec().device(*dev.edevice) = xs[0]->tvec() + xs[1]->tvec() + xs[2]->tvec() + xs[3]->tvec();
+  else {
+    bool allSameBatchSize = std::all_of(xs.begin(), xs.end(), [&](const Tensor* x) { return x->d.bd == xs[0]->d.bd;});
+    if (allSameBatchSize) {
+      // Since they are all the same batch size, we can easily unroll the addition (results in lower GPU latency by merging multiple adds together in one CUDA call):
+      DYNET_ASSERT(num_args > 4, "Bad loop unrolling in Sum::forward");        // If it was <=4, we would have handled it in the special cases above
+      fx.tvec().device(*dev.edevice) = xs[0]->tvec() + xs[1]->tvec() + xs[2]->tvec() + xs[3]->tvec();
+
+      const unsigned remainder = (num_args - 4 ) % 4; // operands left after the first four that do not fill a whole group of four
+      switch (remainder) {
+        case 0: break;
+        case 1: fx.tvec().device(*dev.edevice) += xs[4]->tvec(); break;
+        case 2: fx.tvec().device(*dev.edevice) += xs[4]->tvec() + xs[5]->tvec(); break;
+        case 3: fx.tvec().device(*dev.edevice) += xs[4]->tvec() + xs[5]->tvec() + xs[6]->tvec(); break;
+      }
+      for (unsigned i = 4 + remainder; i < num_args; i += 4) // what remains is a multiple of four, so i + 3 stays in range
+        fx.tvec().device(*dev.edevice) += xs[i]->tvec() + xs[i + 1]->tvec() + xs[i + 2]->tvec() + xs[i + 3]->tvec();
+    }
+    else {
+      // Not all the same batch size, so need to broadcast in the cases where they differ
+      TensorTools::zero(fx); // every operand below is accumulated with +=
+#ifdef __CUDACC__
+      Eigen::array<int, 2> bcast({ 1, (int)fx.d.bd }); // repeats an input fx.d.bd times along the batch axis of tbvec()
+#endif
+      for (unsigned i = 0; i < num_args; ++i) {
+        if (xs[i]->d.bd == fx.d.bd) {
+          fx.tvec().device(*dev.edevice) += xs[i]->tvec();
+        }
+        else {
+#ifdef __CUDACC__
+          fx.tbvec().device(*dev.edevice) += xs[i]->tbvec().broadcast(bcast);
+#else
+          for (unsigned b = 0; b < fx.d.bd; ++b) // non-CUDA build: add the input to each batch slice of fx in turn
+            fx.tbvec().chip<1>(b).device(*dev.edevice) += xs[i]->tvec();
+#endif
+        }
+      }
+    }
+  }
+}
+
+template<class MyDevice>
+void Sum::backward_dev_impl(const MyDevice & dev, // accumulates dEdf into dEdxi; the gradient of a sum w.r.t. each operand is dEdf itself
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  if(dEdxi.d.bd == fx.d.bd) { // same batch size as the output: add element for element
+    dEdxi.tvec().device(*dev.edevice) += dEdf.tvec();
+  } else { // operand had a different batch size: sum dEdf over the batch axis before adding
+    Eigen::array<int, 1> red_axis = {1};
+    dEdxi.tvec().device(*dev.edevice) += dEdf.tbvec().sum(red_axis);
+  }
+}
+DYNET_NODE_INST_DEV_IMPL(Sum)
+
+// ************* SumElements *************
+
+#ifndef __CUDACC__
+
+string SumElements::as_string(const vector<string>& arg_names) const {
+  // Printable form of the node: sum_elems( <argument name> )
+  const string& operand = arg_names[0];
+  return "sum_elems( " + operand + " )";
+}
+
+Dim SumElements::dim_forward(const vector<Dim>& xs) const { // output is a single value per batch item of the one input
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in SumElements")
+  return Dim({1}, xs[0].bd);
+}
+
+#endif
+
+template<class MyDevice>
+void SumElements::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const { // fx = sum of all elements of xs[0], computed separately for each batch item
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed dimension check in SumElements::forward");
+  Eigen::array<int, 1> red_axis; red_axis[0] = 0; // reduce axis 0 of tbvec(), leaving the batch axis
+  fx.tb<0>().device(*dev.edevice) = xs[0]->tbvec().sum(red_axis);
+}
+
+template<class MyDevice>
+void SumElements::backward_dev_impl(const MyDevice & dev, // adds each batch item's dEdf value to every element of that batch item in dEdxi
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  DYNET_ARG_CHECK(i == 0, "Failed dimension check in SumElements::backward");
+  Eigen::array<int, 2> bcast = {(int)xs[0]->d.batch_size(), 1}; // repeat along the element axis only; the batch axis is left as is
+  dEdxi.tbvec().device(*dev.edevice) += dEdf.tbvec().broadcast(bcast);
+}
+DYNET_NODE_INST_DEV_IMPL(SumElements)
+
+// ************* SumDimension *************
+
+#ifndef __CUDACC__
+
+string SumDimension::as_string(const vector<string>& arg_names) const {
+  ostringstream s; // renders "sum_dim(matrix=<argument name>,<dimension>)"
+  s << "sum_dim(matrix=" << arg_names[0] << ',' << dimension << ')'; // close with ')' to match the opening '(' (was '}')
+  return s.str();
+}
+
+Dim SumDimension::dim_forward(const vector<Dim>& xs) const { // output shape: the input shape with `dimension` removed
+  DYNET_ASSERT(xs.size() == 1, "Failed input count check in SumDimension");
+  Dim ret(xs[0]);
+  ret.delete_dim(dimension); // NOTE(review): no check here that dimension is within the input's order; confirm callers guarantee it
+  return ret;
+}
+
+#endif
+
+template<class MyDevice>
+void SumDimension::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const { // fx = xs[0] summed along `dimension`
+  DYNET_ASSERT(xs.size() == 1, "Failed input count check in SumDimension");
+  Eigen::array<int, 1> reduction_axis = {(int)dimension};
+  fx.t<1>().device(*dev.edevice) = xs[0]->t<2>().sum(reduction_axis); // NOTE(review): uses the t<2>() / t<1>() views, i.e. a rank-2 input and rank-1 output; confirm other shapes are not expected
+}
+
+template<class MyDevice>
+void SumDimension::backward_dev_impl(const MyDevice & dev, // adds dEdf to dEdxi, repeated along the dimension that the forward pass summed away
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  // TODO: limit to 3-dimensional tensor is arbitrary
+  Eigen::array<int, 4> bcast = {1,1,1,1}; bcast[dimension] = dEdxi.d[dimension]; // repeat only along the reduced dimension
+  Eigen::array<int, 4> morph = {(int)dEdxi.d[0],(int)dEdxi.d[1],(int)dEdxi.d[2],(int)dEdxi.d.bd}; morph[dimension] = 1; // shape of dEdxi with the reduced dimension set to 1
+  dEdxi.tb<3>().device(*dev.edevice) += dEdf.tb<3>().reshape(morph).broadcast(bcast);
+}
+DYNET_NODE_INST_DEV_IMPL(SumDimension)
+
+// ************* SumBatches *************
+
+#ifndef __CUDACC__
+
+string SumBatches::as_string(const vector<string>& arg_names) const {
+  // Printable form of the node: sum_batches( <argument name> )
+  const string& operand = arg_names[0];
+  return "sum_batches( " + operand + " )";
+}
+
+Dim SumBatches::dim_forward(const vector<Dim>& xs) const { // output shape is the single_batch() form of the one input
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in SumBatches")
+  return xs[0].single_batch();
+}
+
+#endif
+
+template<class MyDevice>
+void SumBatches::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const { // fx = sum of the batch items of xs[0]
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed dimension check in SumBatches::forward");
+  unsigned num_args = xs[0]->d.bd; // number of batch items to add up; only read by the non-CUDA branch
+#ifdef __CUDACC__
+  Eigen::array<int, 1> red_axis; red_axis[0] = 2; // axis 2 of tb<2>() is the batch axis
+  fx.t<2>().device(*dev.edevice) = xs[0]->tb<2>().sum(red_axis);
+#else
+  // TODO: Is this CPU version really good? Overhead can probably be reduced.
+  auto res = *fx; // NOTE(review): presumably a writable view onto fx's storage rather than a copy; confirm
+  const unsigned remainder = num_args % 4;
+  switch (remainder) { // initialise res from the first (num_args % 4) batch items, or zero when there are none
+    case 0: res.setZero(); break;
+    case 1: res = xs[0]->batch_matrix(0); break;
+    case 2: res = xs[0]->batch_matrix(0) + xs[0]->batch_matrix(1); break;
+    case 3: res = xs[0]->batch_matrix(0) + xs[0]->batch_matrix(1) + xs[0]->batch_matrix(2); break;
+  }
+  for (unsigned i = remainder; i < num_args; i += 4) // the remaining items are a multiple of four, so i + 3 stays in range
+    res += xs[0]->batch_matrix(i) + xs[0]->batch_matrix(i+1) + xs[0]->batch_matrix(i+2) + xs[0]->batch_matrix(i+3);
+#endif
+}
+
+template<class MyDevice>
+void SumBatches::backward_dev_impl(const MyDevice & dev, // adds the single-batch gradient dEdf to every batch item of dEdxi
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  DYNET_ARG_CHECK(i == 0, "Failed dimension check in SumBatches::backward");
+#ifdef __CUDACC__
+  Eigen::array<int, 3> bcast({1, 1, (int)dEdxi.d.bd}); // repeat over the input's batch size; fx has the single_batch() shape, so fx.d.bd would not expand dEdf
+  dEdxi.tb<2>().device(*dev.edevice) += dEdf.tb<2>().broadcast(bcast);
+#else
+  for (unsigned b = 0; b < dEdxi.d.bd; ++b) // named b so the argument index i is not shadowed
+    dEdxi.batch_matrix(b) += *dEdf;
+#endif
+}
+DYNET_NODE_INST_DEV_IMPL(SumBatches)
+
+// ************* AddVectorToAllColumns *************
+
+#ifndef __CUDACC__
+
+string AddVectorToAllColumns::as_string(const vector<string>& arg_names) const {
+  // Printable form of the node: colwise_add(<matrix name>, <vector name>)
+  const string& matrix_name = arg_names[0];
+  return "colwise_add(" + matrix_name + ", " + arg_names[1] + ')';
+}
+
+Dim AddVectorToAllColumns::dim_forward(const vector<Dim>& xs) const { // output has the matrix's rows and columns and the larger of the two batch sizes
+  DYNET_ARG_CHECK(xs.size() == 2 && // xs[0] must be 2-dimensional; xs[1] a vector (or single-column matrix) with the same row count
+                          xs[0].rows() == xs[1].rows() &&
+                          xs[0].ndims() == 2 &&
+                          (xs[1].ndims() == 1 || (xs[1].ndims() == 2 && xs[1].cols() == 1)),
+                          "Bad input dimensions in AddVectorToAllColumns: " << xs);
+  return Dim({xs[0][0], xs[0][1]}, max(xs[0].bd,xs[1].bd));
+}
+
+#endif
+
+template<class MyDevice>
+void AddVectorToAllColumns::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const { // fx = matrix xs[0] with vector xs[1] added to each of its columns
+  // Broadcasting is slow on CPU, so split codepaths
+#ifdef __CUDACC__
+  if(xs[0]->d.bd >= xs[1]->d.bd) {
+    Eigen::array<int, 3> bcasts = {1, (int)xs[0]->d[1], (int)(xs[0]->d.bd/xs[1]->d.bd)}; // repeat the vector over the columns and, if needed, over the batch
+    fx.tb<2>().device(*dev.edevice) = xs[0]->tb<2>() + xs[1]->tb<2>().broadcast(bcasts);
+  } else {
+    DYNET_ASSERT(xs[0]->d.bd == 1,
+                 "Bad dimensions in AddVectorToAllColumns::forward: " << xs[0]->d << ", " << xs[1]->d);
+    Eigen::array<int, 3> bcasts0 = {1, 1, (int)xs[1]->d.bd}; // repeat the unbatched matrix over the vector's batch size
+    Eigen::array<int, 3> bcasts1 = {1, (int)xs[0]->d[1], 1}; // repeat the vector over the matrix's columns
+    fx.tb<2>().device(*dev.edevice) = xs[0]->tb<2>().broadcast(bcasts0) + xs[1]->tb<2>().broadcast(bcasts1);
+  }
+#else
+  // First, add the matrix
+  if(xs[0]->d.bd == fx.d.bd)
+    fx.tvec().device(*dev.edevice) = xs[0]->tvec();
+  else
+    for(size_t b = 0; b < fx.d.bd; ++b) // matrix has a different batch size: copy it into every batch slice of fx
+      fx.tbvec().chip<1>(b).device(*dev.edevice) = xs[0]->tvec();
+  // Second, add the columns
+  if(xs[1]->d.bd == fx.d.bd) {
+    for(size_t i = 0; i < xs[0]->d[1]; ++i) // one pass per column, covering all batch items at once
+      fx.tb<2>().chip<1>(i).device(*dev.edevice) += xs[1]->tb<1>();
+  } else {
+    for(size_t b = 0; b < fx.d.bd; ++b)
+      for(size_t i = 0; i < fx.d[1]; ++i) // vector has a different batch size: add it to each column of each batch item
+        fx.tb<2>().chip<2>(b).chip<1>(i).device(*dev.edevice) += xs[1]->t<1>();
+  }
+#endif
+}
+
+template<class MyDevice>
+void AddVectorToAllColumns::backward_dev_impl(const MyDevice & dev, // accumulates dEdf into dEdxi for the matrix (i == 0) or the vector (i == 1)
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  DYNET_ASSERT(i < 2, "Failed dimension check in AddVectorToAllColumns::backward");
+  // TODO: profile on CPU and see whether the chip version is better
+  if (i == 0) { // x
+    if(dEdf.d.bd == dEdxi.d.bd) {
+      dEdxi.tvec().device(*dev.edevice) += dEdf.tvec();
+    } else {
+      Eigen::array<int, 1> red_axis = {2}; // matrix batch size differs from the output's: sum the gradient over the batch axis
+      dEdxi.t<2>().device(*dev.edevice) += dEdf.tb<2>().sum(red_axis);
+    }
+  } else { // bias
+    if(dEdf.d.bd == dEdxi.d.bd) {
+      Eigen::array<int, 1> red_axis = {1}; // sum over columns, keeping one vector per batch item
+      dEdxi.tb<1>().device(*dev.edevice) += dEdf.tb<2>().sum(red_axis);
+    } else {
+      DYNET_ASSERT(dEdxi.d.bd == 1,
+                   "Bad dimensions in AddVectorToAllColumns::backward: " << xs[0]->d << ", " << xs[1]->d);
+      Eigen::array<int, 2> red_axis = {1,2}; // unbatched vector: sum over columns and over the batch
+      dEdxi.t<1>().device(*dev.edevice) += dEdf.tb<2>().sum(red_axis);
+    }
+  }
+}
+DYNET_NODE_INST_DEV_IMPL(AddVectorToAllColumns)
+
+}
diff --git a/dynet/nodes-unary-arith.cc b/dynet/nodes-arith-unary.cc
similarity index 87%
rename from dynet/nodes-unary-arith.cc
rename to dynet/nodes-arith-unary.cc
index 4bf6a41f6..4d779279e 100644
--- a/dynet/nodes-unary-arith.cc
+++ b/dynet/nodes-arith-unary.cc
@@ -240,4 +240,37 @@ void Abs::backward_dev_impl(const MyDevice & dev,
 }
 DYNET_NODE_INST_DEV_IMPL(Abs)
 
+// ************* LogGamma *************
+
+#ifndef __CUDACC__
+
+string LogGamma::as_string(const vector<string>& arg_names) const {
+  // Printable form of the node: lgamma(<argument name>)
+  const string& operand = arg_names[0];
+  return "lgamma(" + operand + ')';
+}
+
+Dim LogGamma::dim_forward(const vector<Dim>& xs) const { // output has the same shape as the single input
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in LogGamma")
+  return xs[0];
+}
+
+#endif
+
+template<class MyDevice>
+void LogGamma::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const { // fx = lgamma(xs[0]), applied to each element
+  fx.tvec().device(*dev.edevice) = xs[0]->tvec().lgamma();
+}
+
+template<class MyDevice>
+void LogGamma::backward_dev_impl(const MyDevice & dev, // accumulates digamma(xs[0]) * dEdf into dEdxi, element by element
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  dEdxi.tvec().device(*dev.edevice) += xs[0]->tvec().digamma() * dEdf.tvec(); // digamma is the derivative of lgamma
+}
+DYNET_NODE_INST_DEV_IMPL(LogGamma)
+
 }
diff --git a/dynet/nodes-common.cc b/dynet/nodes-common.cc
deleted file mode 100644
index 9a6ab7eb4..000000000
--- a/dynet/nodes-common.cc
+++ /dev/null
@@ -1,996 +0,0 @@
-#include "dynet/nodes.h"
-
-#include <limits>
-#include <cmath>
-#include <sstream>
-
-#include "dynet/nodes-macros.h"
-#include "dynet/globals.h"
-
-using namespace std;
-
-namespace dynet {
-
-string AddVectorToAllColumns::as_string(const vector<string>& arg_names) const {
-  ostringstream os;
-  os << "colwise_add(" << arg_names[0] << ", " << arg_names[1] << ')';
-  return os.str();
-}
-
-Dim AddVectorToAllColumns::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 2 &&
-                          xs[0].rows() == xs[1].rows() &&
-                          xs[0].ndims() == 2 &&
-                          (xs[1].ndims() == 1 || (xs[1].ndims() == 2 && xs[1].cols() == 1)),
-                          "Bad input dimensions in AddVectorToAllColumns: " << xs);
-  return Dim({xs[0][0], xs[0][1]}, max(xs[0].bd,xs[1].bd));
-}
-
-string SparsemaxLoss::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "sparsemax(" << arg_names[0] << ", q)";
-  return s.str();
-}
-
-Dim SparsemaxLoss::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 1 && LooksLikeVector(xs[0]), "Bad input dimensions in SparsemaxLoss: " << xs);
-  return Dim({1});
-}
-
-string Sparsemax::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "sparsemax(" << arg_names[0] << ")";
-  return s.str();
-}
-
-Dim Sparsemax::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 1 && LooksLikeVector(xs[0]), "Bad input dimensions in Sparsemax: " << xs);
-  return xs[0];
-}
-
-string MatrixInverse::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "inverse(" << arg_names[0] << ")";
-  return s.str();
-}
-
-Dim MatrixInverse::dim_forward(const vector<Dim>& xs) const {
-  return xs[0];
-}
-
-string LogDet::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "logdet(" << arg_names[0] << ")";
-  return s.str();
-}
-
-Dim LogDet::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs[0].ndims() <= 2 && (xs[0].rows() == xs[0].cols()), "Bad arguments in LogDet: " << xs);
-  return Dim({1});
-}
-
-string SelectRows::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "select_rows(" << arg_names[0] << ", {rsize=" << prows->size() << "})";
-  return s.str();
-}
-
-Dim SelectRows::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Bad arguments in SelectRows: " << xs);
-  unsigned nrows = prows->size();
-  Dim ret(xs[0]);
-  ret.d[0] = nrows;
-  return ret;
-}
-
-string SelectCols::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "select_cols(" << arg_names[0] << ", {csize=" << pcols->size() << "})";
-  return s.str();
-}
-
-Dim SelectCols::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 1 && xs[0].ndims() == 2, "Bad arguments in SelectCols: " << xs);
-  unsigned ncols = pcols->size();
-  return Dim({xs[0].rows(), ncols});
-}
-
-string Min::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "min{" << arg_names[0] << ", " << arg_names[1] << "}";
-  return s.str();
-}
-
-Dim Min::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 2 && xs[0] == xs[1], "Bad arguments in Min: " << xs);
-  return xs[0].bd >= xs[1].bd ? xs[0] : xs[1];
-}
-
-string Max::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "max{" << arg_names[0] << ", " << arg_names[1] << "}";
-  return s.str();
-}
-
-Dim Max::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 2 && xs[0] == xs[1], "Bad arguments in Max: " << xs);
-  return xs[0].bd >= xs[1].bd ? xs[0] : xs[1];
-}
-
-string TraceOfProduct::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "Tr(" << arg_names[0] << " * " << arg_names[1] << "^T)";
-  return s.str();
-}
-
-Dim TraceOfProduct::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 2 && xs[0] == xs[1], "Bad arguments in TraceOfProduct: " << xs);
-  return Dim({1}, max(xs[0].bd, xs[1].bd));
-}
-
-string ConstScalarMultiply::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << arg_names[0] << " * " << alpha;
-  return s.str();
-}
-
-Dim ConstScalarMultiply::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "ConstScalarMultiply expects one argument: " << xs);
-  return xs[0];
-}
-
-string Transpose::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "transpose("<< arg_names[0] << ", ";
-  for(size_t i = 0; i < dims.size(); ++i)
-    s << (i == 0?'{':',') << dims[i];
-  s << "})";
-  return s.str();
-}
-
-Dim Transpose::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Bad arguments to Transpose: " << xs);
-  DYNET_ARG_CHECK(xs[0].nd == dims.size() || xs[0].num_nonone_dims() == 1, "Dimensions passed to transpose (" << dims.size() << ") must be equal to dimensions in input tensor (" << xs[0].nd << ')');
-  Dim ret(xs[0]);
-  ret.nd = dims.size();
-  for(size_t i = 0; i < dims.size(); ++i)
-    ret.d[i] = xs[0][dims[i]];
-  return ret;
-}
-
-string Reshape::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "reshape(" << arg_names[0] << " --> " << to << ')';
-  return s.str();
-}
-
-Dim Reshape::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in Reshape")
-  if(to.size() == xs[0].size()) {
-    return to;
-  } else {
-    DYNET_ARG_CHECK(to.batch_elems() == 1 && to.batch_size() == xs[0].batch_size(),
-                    "Bad arguments to Reshape: " << to << ", " << xs[0]);
-    Dim ret(to);
-    ret.bd = xs[0].batch_elems();
-    return ret;
-  }
-}
-
-string KMHNGram::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "kmh-ngram(" << arg_names[0] << ')';
-  return s.str();
-}
-
-Dim KMHNGram::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs[0].ndims() == 2, "Bad input dimensions in KMHNGram: " << xs);
-  const unsigned new_cols = xs[0].cols() - n + 1;
-  DYNET_ARG_CHECK(new_cols >= 1, "Bad input dimensions in KMHNGram: " << xs);
-  return Dim({xs[0][0], new_cols});
-}
-
-string GaussianNoise::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << arg_names[0] << " + N(0," << stddev << ')';
-  return s.str();
-}
-
-Dim GaussianNoise::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in GaussianNoise")
-  return xs[0];
-}
-
-string Dropout::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "dropout(" << arg_names[0] << ",p=" << p << ')';
-  return s.str();
-}
-
-Dim Dropout::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in Dropout")
-  return xs[0];
-}
-
-string DropoutBatch::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "dropout_batch(" << arg_names[0] << ",p=" << p << ')';
-  return s.str();
-}
-
-Dim DropoutBatch::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in DropoutBatch")
-  return xs[0];
-}
-
-string DropoutDim::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "dropout_dim(" << arg_names[0] << ",p=" << p << ')';
-  return s.str();
-}
-
-Dim DropoutDim::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in DropoutDim")
-  DYNET_ARG_CHECK(xs[0].nd < 4, "DropoutDim only supports tensor up to order 3 + batch dimension, got tensor of order"<<xs[0].nd)
-  DYNET_ARG_CHECK(xs[0].nd > dimension, "In DropoutDim : tried to drop along dimension "<<dimension<<" on tensor of order"<<xs[0].nd)
-  return xs[0];
-}
-
-string BlockDropout::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "block_dropout(" << arg_names[0] << ",dropout_probability=" << dropout_probability << ')';
-  return s.str();
-}
-
-Dim BlockDropout::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in BlockDropout")
-  return xs[0];
-}
-
-string ConstantPlusX::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << c << " + " << arg_names[0];
-  return s.str();
-}
-
-Dim ConstantPlusX::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in ConstantPlusX")
-  return xs[0];
-}
-
-string ConstantMinusX::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << c << " - " << arg_names[0];
-  return s.str();
-}
-
-Dim ConstantMinusX::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in ConstantMinusX")
-  return xs[0];
-}
-
-string LogSumExp::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "log(exp " << arg_names[0];
-  for (unsigned i = 1; i < arg_names.size(); ++i)
-    s << " + exp " << arg_names[i];
-  s << ")";
-  return s.str();
-}
-
-Dim LogSumExp::dim_forward(const vector<Dim>& xs) const {
-  Dim d = xs[0].truncate();
-  for (unsigned i = 1; i < xs.size(); ++i) {
-    DYNET_ARG_CHECK(d.single_batch() == xs[i].truncate().single_batch(),
-                            "Mismatched input dimensions in LogSumExp: " << xs);
-    d.bd = max(xs[i].bd, d.bd);
-  }
-  return d;
-}
-string Sum::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << arg_names[0];
-  for (unsigned i = 1; i < arg_names.size(); ++i)
-    s << " + " << arg_names[i];
-  return s.str();
-}
-
-int Sum::autobatch_sig(const ComputationGraph &cg, SigMap &sm) const {
-  Sig s(nt::sum);
-  s.add_node(args.size());
-  // Two cases:
-  // If unbatched, it's just an elementwise addition
-  // TODO: This will be more efficient if we identify arguments that are used
-  //       multiple times (e.g. bias vectors)
-  if(dim.bd == 1) {
-    s.add_int(-2);
-  // Otherwise, make sure the dimensions match and that batched nodes don't intersect
-  } else {
-    s.add_dim(dim);
-    for(auto ai : args) {
-      s.add_int(cg.nodes[ai]->dim.bd == 1 ? ai : -1);
-    }
-  }
-  return sm.get_idx(s);
-}
-
-std::vector<int> Sum::autobatch_concat(const ComputationGraph & cg) const {
-  vector<int> ret(args.size(), 1);
-  // If batched, true if multiple batched input as well
-  if(dim.bd != 1)
-    for(size_t i = 0; i < args.size(); ++i)
-      ret[i] = cg.nodes[args[i]]->dim.bd == 1 ? 0 : 1;
-  return ret;
-}
-
-
-Dim Sum::dim_forward(const vector<Dim>& xs) const {
-  Dim d = xs[0].truncate();
-  unsigned int batch = d.bd;
-  for (unsigned i = 1; i < xs.size(); ++i) {
-    DYNET_ARG_CHECK(d.single_batch() == xs[i].truncate().single_batch(),
-                            "Mismatched input dimensions in Sum: " << xs);
-    batch = max(xs[i].bd, batch);
-  }
-  d = xs[0]; d.bd = batch;
-  return d;
-}
-
-string SumElements::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "sum_elems( " << arg_names[0] << " )";
-  return s.str();
-}
-
-Dim SumElements::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in SumElements")
-  return Dim({1}, xs[0].bd);
-}
-
-string SumBatches::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "sum_batches( " << arg_names[0] << " )";
-  return s.str();
-}
-
-Dim SumBatches::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in SumBatches")
-  return xs[0].single_batch();
-}
-
-string MomentElements::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "moment_elems( expression=" << arg_names[0] << ", order=" << order << " )";
-  return s.str();
-}
-
-Dim MomentElements::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in MomentElements")
-  DYNET_ARG_CHECK(order>= 1, "Order of moment should be >=1 in MomentElements (recieved "<<order<<")")
-  return Dim({1}, xs[0].bd);
-}
-
-string MomentBatches::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "moment_batches( expression=" << arg_names[0] << ", order=" << order << " )";
-  return s.str();
-}
-
-Dim MomentBatches::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in MomentBatches")
-  DYNET_ARG_CHECK(order>= 1, "Order of moment should be >=1 in MomentBatches (recieved "<<order<<")")
-  return xs[0].single_batch();
-}
-
-string StdElements::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "std_elems( expression=" << arg_names[0] << " )";
-  return s.str();
-}
-
-Dim StdElements::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in StdElements")
-  return Dim({1}, xs[0].bd);
-}
-
-string StdBatches::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "std_batches( expression=" << arg_names[0] << " )";
-  return s.str();
-}
-
-Dim StdBatches::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in StdBatches")
- 
-  return xs[0].single_batch();
-}
-
-string StdDimension::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "moment_dim(expression=" << arg_names[0] << ',' << dimension <<'}';
-  return s.str();
-}
-
-Dim StdDimension::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ASSERT(xs.size() == 1, "Failed input count check in StdDimension");
-  DYNET_ARG_CHECK(xs[0].nd <= 3, "StdDimension implemented up to tensors of order 3 (with minibatch) for now")
-  DYNET_ARG_CHECK(dimension < xs[0].nd, "dimension " << dimension << " is out of bounds of tensor of order " << xs[0].nd << " in StdDimension" )
-  Dim ret(xs[0]);
-  ret.delete_dim(dimension);
-  return ret;
-}
-
-string MomentDimension::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "moment_dim(expression=" << arg_names[0] << ',' << dimension << ", order="<<order<<'}';
-  return s.str();
-}
-
-Dim MomentDimension::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ASSERT(xs.size() == 1, "Failed input count check in MomentDimension");
-  DYNET_ARG_CHECK(xs[0].nd <= 3, "MomentDimension implemented up to tensors of order 3 (with minibatch) for now")
-  DYNET_ARG_CHECK(dimension < xs[0].nd, "dimension " << dimension << " is out of bounds of tensor of order " << xs[0].nd << " in MomentDimension" )
-  DYNET_ARG_CHECK(order>= 1, "Order of moment should be >=1 in MomentDimension (recieved "<<order<<")")
-  Dim ret(xs[0]);
-  ret.delete_dim(dimension);
-  return ret;
-}
-
-string Average::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "average(" << arg_names[0];
-  for (unsigned i = 1; i < arg_names.size(); ++i)
-    s << ", " << arg_names[i];
-  s << ")";
-  return s.str();
-}
-
-Dim Average::dim_forward(const vector<Dim>& xs) const {
-  Dim d(xs[0]);
-  for (unsigned i = 1; i < xs.size(); ++i) {
-    DYNET_ARG_CHECK(xs[0].single_batch() == xs[i].single_batch(),
-                            "Mismatched input dimensions in Average: " << xs);
-    d.bd = max(xs[i].bd, d.bd);
-  }
-  return d;
-}
-
-string Erf::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "erf(" << arg_names[0] << ')';
-  return s.str();
-}
-
-Dim Erf::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in Erf")
-  return xs[0];
-}
-
-string Tanh::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "tanh(" << arg_names[0] << ')';
-  return s.str();
-}
-
-Dim Tanh::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in Tanh")
-  return xs[0];
-}
-
-string LogGamma::as_string(const vector<string>& arg_names) const {
-  ostringstream os;
-  os << "lgamma(" << arg_names[0] << ')';
-  return os.str();
-}
-
-Dim LogGamma::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in LogGamma")
-  return xs[0];
-}
-
-string Concatenate::as_string(const vector<string>& arg_names) const {
-  ostringstream os;
-  os << "concat({" << arg_names[0];
-  for (unsigned i = 1; i < arg_names.size(); ++i) {
-    os << ',' << arg_names[i];
-  }
-  os << "}, " << dimension << ')';
-  return os.str();
-}
-
-Dim Concatenate::dim_forward(const vector<Dim>& xs) const {
-  unsigned new_rows = 0;
-  Dim dr = xs[0];
-  for (auto c : xs) {
-    if(dr.nd < c.nd) dr.resize(c.nd);
-    if(c.nd < dr.nd) c.resize(dr.nd);
-    new_rows += c[dimension];
-    dr.set(dimension, c[dimension]);
-    DYNET_ARG_CHECK(dr.single_batch() == c.single_batch(),
-                            "Bad input dimensions in Concatenate: " << xs);
-    dr.bd = max(dr.bd, c.bd);
-  }
-  dr.nd = max(xs[0].nd, dimension+1);
-  dr.set(dimension, new_rows);
-  return dr;
-}
-
-int Concatenate::autobatch_sig(const ComputationGraph &cg, SigMap &sm) const {
-  Sig s(nt::concat);
-  for (auto arg:args) s.add_dim(cg.nodes[arg]->dim);
-  return sm.get_idx(s);
-}
-
-
-string ConcatenateToBatch::as_string(const vector<string>& arg_names) const {
-  ostringstream os;
-  os << "concat_batch_elems(" << arg_names[0];
-  for (unsigned i = 1; i < arg_names.size(); ++i) {
-    os << ',' << arg_names[i];
-  }
-  os << ')';
-  return os.str();
-}
-
-Dim ConcatenateToBatch::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ASSERT(xs.size() > 0, "Failed input count check in ConcatenateToBatch")
-  Dim d(xs[0]);
-  for (unsigned i = 1; i < xs.size(); ++i) {
-    DYNET_ARG_CHECK(xs[0].single_batch() == xs[i].single_batch(),
-                            "Mismatched input dimensions in ConcatenateToBatch: " << xs);
-    d.bd += xs[i].bd;
-  }
-  return d;
-}
-
-string PairwiseRankLoss::as_string(const vector<string>& arg_names) const {
-  ostringstream os;
-  os << "max(0, " << margin << " - " << arg_names[0] << " + " << arg_names[1] << ')';
-  return os.str();
-}
-
-Dim PairwiseRankLoss::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 2 &&
-                          xs[0] == xs[1] &&
-                          xs[0].rows() == 1 &&
-                          (xs[0].ndims() == 1 || xs[0].ndims() == 2),
-                          "Bad input dimensions in PairwiseRankLoss: " << xs);
-  return xs[0].bd >= xs[1].bd ? xs[0] : xs[1];
-}
-
-string Identity::as_string(const vector<string>& arg_names) const {
-  return arg_names[0];
-}
-
-Dim Identity::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in Identity")
-  return xs[0];
-}
-
-string NoBackprop::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "nobackprop(" << arg_names[0] << ')';
-  return s.str();
-}
-
-Dim NoBackprop::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in NoBackprop")
-  return xs[0];
-}
-
-string FlipGradient::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "flip_gradient(" << arg_names[0] << ')';
-  return s.str();
-}
-
-Dim FlipGradient::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in FlipGradient");
-  return xs[0];
-}  
-  
-string Softmax::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "softmax(" << arg_names[0] << ')';
-  return s.str();
-}
-
-Dim Softmax::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in Softmax");
-  DYNET_ARG_CHECK(xs[0].nd <= 2, "Bad input dimensions in Softmax, must be 2 or fewer: " << xs);
-  return xs[0];
-}
-
-int Softmax::autobatch_sig(const ComputationGraph & cg, SigMap &sm) const {
-  Sig s(nt::softmax);
-  s.add_dim(dim);
-  return sm.get_idx(s);
-}
-std::vector<int> Softmax::autobatch_concat(const ComputationGraph & cg) const {
-  return vector<int>(1, 1);
-}
-
-string SoftSign::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "softsign(" << arg_names[0] << ')';
-  return s.str();
-}
-
-Dim SoftSign::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in SoftSign");
-  DYNET_ARG_CHECK(LooksLikeVector(xs[0]), "Bad input dimensions in SoftSign: " << xs);
-  return xs[0];
-}
-
-string LogSoftmax::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "log_softmax(" << arg_names[0] << ')';
-  return s.str();
-}
-
-Dim LogSoftmax::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in LogSoftmax")
-  DYNET_ARG_CHECK(xs[0].nd <= 2, "Bad input dimensions in LogSoftmax, must be 2 or fewer: " << xs);
-  return xs[0];
-}
-
-string RestrictedLogSoftmax::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "r_log_softmax(" << arg_names[0] << ')';
-  return s.str();
-}
-
-Dim RestrictedLogSoftmax::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in RestrictedLogSoftmax")
-  DYNET_ARG_CHECK(LooksLikeVector(xs[0]), "Bad input dimensions in RestrictedLogSoftmax: " << xs);
-  return xs[0];
-}
-
-string PickElement::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "pick(" << arg_names[0] << ',';
-  if(pval) { 
-    s << *pval;
-  } else {
-    DYNET_ASSERT(pvals, "Have neither index nor index vector in PickElement");
-    s << '[';
-    if(pvals->size()) {
-      s << (*pvals)[0];
-      for(size_t i = 1; i < pvals->size(); ++i)
-        s << ',' << (*pvals)[i];
-    }
-    s << "]";
-  }
-  s << ", " << dimension << ")";
-  return s.str();
-}
-
-Dim PickElement::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in PickElement");
-  DYNET_ARG_CHECK(dimension < xs[0].nd,
-                          "Tried to PickElement on dimension " << dimension << " bigger than input " << xs[0]);
-  DYNET_ARG_CHECK(xs[0].nd < 4,
-                          "PickElement not currently supported for tensors of 4 or more dimensions.");
-  
-  Dim ret(xs[0]);
-  if (pvals){
-    DYNET_ARG_CHECK(xs[0].bd == 1 || xs[0].bd == pvals->size(),
-                          "Number of elements in the passed-in index vector (" <<  pvals->size() << ")"
-                            " did not match number of elements in mini-batch elements in expression (of dimension " << xs[0].bd << ") in PickElement");
-    ret.bd = pvals->size();
-  }
-
-  ret.delete_dim(dimension);
-  return ret;
-}
-
-// x_1 is a vector
-// y = (x_1)[start:end]
-string PickRange::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "slice(" << arg_names[0] << ',' << start << ':' << end << ", dim=" << dim << ')';
-  return s.str();
-}
-
-Dim PickRange::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in PickRange");
-  DYNET_ARG_CHECK(dim < xs[0].nd && start < end && xs[0][dim] >= end,
-                          "Bad input dimensions or range in PickRange: " << xs << " range(" << start << ", " << end << ") with dim=" << dim);
-  Dim ret = xs[0]; ret.d[dim] = end-start;
-  return ret;
-}
-
-int PickRange::autobatch_sig(const ComputationGraph & cg, SigMap &sm) const {
-  Sig s(nt::pickrange);
-  const Dim &in_dim = cg.nodes[args[0]]->dim;
-  s.add_dim(in_dim);
-  s.add_node(start);
-  s.add_node(end);
-  return sm.get_idx(s);
-}
-
-string PickBatchElements::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "pick_batch_elems(" << arg_names[0] << ',';
-  if (pval) {
-    s << *pval;
-  } else {
-    DYNET_ASSERT(pvals, "Have neither index nor index vector in PickBatchElements");
-    s << '[';
-    if (pvals->size()) {
-      s << (*pvals)[0];
-      for (size_t i = 1; i < pvals->size(); ++i)
-        s << ',' << (*pvals)[i];
-    }
-    s << "]";
-  }
-  s << ")";
-  return s.str();
-}
-
-Dim PickBatchElements::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in PickBatchElements")
-  DYNET_ARG_CHECK(xs[0].nd < 4, "PickElement not currently supported for tensors of 4 or more dimensions.");
-  Dim ret(xs[0]);
-  if (pval) {
-    // set batch size to one.
-    ret.bd = 1;
-  } else {
-    DYNET_ASSERT(pvals, "Have neither index nor index vector in PickBatchElements");
-    ret.bd = pvals->size();
-  }
-  return ret;
-}
-
-string CwiseMultiply::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << arg_names[0] << " \\cdot " << arg_names[1];
-  return s.str();
-}
-
-Dim CwiseMultiply::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 2, "Failed input count check in CwiseMultiply")
-  Dim d = xs[0].truncate();
-  DYNET_ARG_CHECK(d.single_batch() == xs[1].truncate().single_batch(),
-                          "Mismatched input dimensions in CwiseMultiply: " << xs);
-  d.bd = max(xs[1].bd, d.bd);
-  return d;
-}
-
-int CwiseMultiply::autobatch_sig(const ComputationGraph & cg, SigMap &sm) const {
-  // TODO: This does not handle the case where dimensions differ
-  Sig s(nt::cmult);
-  return cg.nodes[args[0]]->dim == cg.nodes[args[1]]->dim ? sm.get_idx(s) : 0;
-}
-
-std::vector<int> CwiseMultiply::autobatch_concat(const ComputationGraph & cg) const {
-  return vector<int>(2, 1);
-}
-
-string ScalarAdd::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << arg_names[0] << " + " << arg_names[1];
-  return s.str();
-}
-
-Dim ScalarAdd::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 2, "Failed input count check in ScalarAdd")
-  Dim d = xs[0].truncate();
-  DYNET_ARG_CHECK(xs[1].batch_size() == 1,
-                          "Mismatched input dimensions in ScalarAdd: " << xs);
-  d.bd = max(xs[1].bd, d.bd);
-  return d;
-}
-
-string ScalarMultiply::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << arg_names[0] << " \\cdot " << arg_names[1];
-  return s.str();
-}
-
-Dim ScalarMultiply::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 2, "Failed input count check in ScalarMultiply")
-  Dim d = xs[1];
-  DYNET_ARG_CHECK(xs[0].batch_size() == 1,
-                          "Mismatched input dimensions in ScalarMultiply: " << xs);
-  d.bd = max(xs[0].bd, d.bd);
-  return d;
-}
-
-string ScalarQuotient::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << arg_names[0] << " / " << arg_names[1];
-  return s.str();
-}
-
-Dim ScalarQuotient::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 2, "Failed input count check in ScalarQuotient")
-  Dim d = xs[0].truncate();
-  DYNET_ARG_CHECK(xs[1].batch_size() == 1,
-                          "Mismatched input dimensions in ScalarQuotient: " << xs);
-  d.bd = max(xs[1].bd, d.bd);
-  return d;
-}
-
-
-string Pow::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << arg_names[0] << " ** " << arg_names[1];
-  return s.str();
-}
-
-Dim Pow::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 2, "Failed input count check in Pow")
-  Dim d = xs[0].truncate();
-  DYNET_ARG_CHECK(xs[1].truncate().single_batch().size() == 1, "Bad input dimensions in Pow: " << xs);
-  return d;
-}
-
-string CwiseQuotient::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << arg_names[0] << " / " << arg_names[1];
-  return s.str();
-}
-
-Dim CwiseQuotient::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 2, "Failed input count check in CwiseQuotient")
-  Dim d = xs[0].truncate();
-  DYNET_ARG_CHECK(d.single_batch() == xs[1].truncate().single_batch(), "Bad input dimensions in CwiseQuotient: " << xs);
-  d.bd = max(xs[1].bd, d.bd);
-  return d;
-}
-
-string Rectify::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "ReLU(" << arg_names[0] << ')';
-  return s.str();
-}
-
-Dim Rectify::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in Rectify");
-  return xs[0];
-}
-
-string ExponentialLinearUnit::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "ELU(" << arg_names[0] << ", lambda=" << lambda << ", alpha=" << alpha << ')';
-  return s.str();
-}
-
-Dim ExponentialLinearUnit::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in ExponentialLinearUnit");
-  return xs[0];
-}
-
-string PoissonRegressionLoss::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "-log Poisson(" << pty << "; lambda=\\exp" << arg_names[0] << ')';
-  return s.str();
-}
-
-Dim PoissonRegressionLoss::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 1 && xs[0].size() == 1, "Bad input dimensions in PoissonRegressionLoss: " << xs);
-  return xs[0];
-}
-
-string LogisticSigmoid::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "\\sigma(" << arg_names[0] << ')';
-  return s.str();
-}
-
-Dim LogisticSigmoid::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ASSERT(xs.size() == 1, "Failed input count check in LogisticSigmoid")
-  return xs[0];
-}
-
-string BinaryLogLoss::as_string(const vector<string>& arg_names) const {
-  ostringstream os;
-  os << "binary_log_loss(" << arg_names[0] << ", " << arg_names[1] << ')';
-  return os.str();
-}
-
-Dim BinaryLogLoss::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 2, "Failed input count check in BinaryLogLoss")
-  DYNET_ARG_CHECK(xs[0].rows() == 2 || xs[0].ndims() == 1, "Bad input dimensions in BinaryLogLoss: " << xs);
-  DYNET_ARG_CHECK(xs[1].rows() == 2 || xs[1].ndims() == 1, "Bad input dimensions in BinaryLogLoss: " << xs);
-  return Dim({1}, max(xs[0].bd, xs[1].bd));
-}
-
-string Zeroes::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "zeroes(" << dim << ')';
-  return s.str();
-}
-
-Dim Zeroes::dim_forward(const vector<Dim>& xs) const {
-  return dim;
-}
-
-string RandomNormal::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "random_normal(" << dim << ')';
-  return s.str();
-}
-
-Dim RandomNormal::dim_forward(const vector<Dim>& xs) const {
-  return dim;
-}
-
-string RandomBernoulli::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "random_bernoulli(" << dim << ", " << p << ')';
-  return s.str();
-}
-
-Dim RandomBernoulli::dim_forward(const vector<Dim>& xs) const {
-  return dim;
-}
-
-string RandomUniform::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "random_uniform(" << dim << ", " << left << ", " << right << ')';
-  return s.str();
-}
-
-Dim RandomUniform::dim_forward(const vector<Dim>& xs) const {
-  return dim;
-}
-
-string RandomGumbel::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "random_gumbel(" << dim << ", " << mu << ", " << beta << ')';
-  return s.str();
-}
-
-Dim RandomGumbel::dim_forward(const vector<Dim>& xs) const {
-  return dim;
-}
-
-string MaxDimension::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "max_dim(" << arg_names[0] << ", reduced_dim=" << reduced_dim << ')';
-  return s.str();
-}
-
-Dim MaxDimension::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in MaxDimension");
-  DYNET_ARG_CHECK(reduced_dim < xs[0].nd,
-                          "Tried to MaxDimension on dimension " << reduced_dim << " bigger than input " << xs[0]);
-  DYNET_ARG_CHECK(xs[0].nd < 4,
-                          "MaxDimension not currently supported for tensors of 4 or more dimensions.");
-  Dim ret(xs[0]);
-  ret.delete_dim(reduced_dim);
-  return ret;
-}
-
-string MinDimension::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "min_dim(" << arg_names[0] << ", reduced_dim=" << reduced_dim << ')';
-  return s.str();
-}
-
-Dim MinDimension::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in MinDimension");
-  DYNET_ARG_CHECK(reduced_dim < xs[0].nd,
-                          "Tried to MinDimension on dimension " << reduced_dim << " bigger than input " << xs[0]);
-  DYNET_ARG_CHECK(xs[0].nd < 4,
-                          "MinDimension not currently supported for tensors of 4 or more dimensions.");
-  Dim ret(xs[0]);
-  ret.delete_dim(reduced_dim);
-  return ret;
-}
-
-string WeightNormalization::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "weight_norm(" << arg_names[0] << ", " << arg_names[1] << ')';
-  return s.str();
-}
-
-Dim WeightNormalization::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(xs.size() == 2, "Failed input count check in WeightNormalization");
-  DYNET_ARG_CHECK(1 == xs[1].size()," Size of gain parameter in WeightNormalization should be 1, received " << xs[1].size());
-  return xs[0];
-}
-
-} // namespace dynet
diff --git a/dynet/nodes-concat.cc b/dynet/nodes-concat.cc
new file mode 100644
index 000000000..f601ef67d
--- /dev/null
+++ b/dynet/nodes-concat.cc
@@ -0,0 +1,148 @@
+#include "dynet/nodes.h"
+
+#include "dynet/nodes-macros.h"
+#include "dynet/functors.h"
+
+using namespace std;
+
+namespace dynet {
+
+// ************* Concatenate *************
+
+#ifndef __CUDACC__
+
+// Renders the node as "concat({a,b,...}, <dimension>)" from the argument names.
+string Concatenate::as_string(const vector<string>& arg_names) const {
+  ostringstream out;
+  out << "concat({" << arg_names[0];
+  for (auto it = arg_names.begin() + 1; it != arg_names.end(); ++it)
+    out << ',' << *it;
+  out << "}, " << dimension << ')';
+  return out.str();
+}
+
+Dim Concatenate::dim_forward(const vector<Dim>& xs) const { // output dim: the inputs joined along `dimension`
+  unsigned new_rows = 0; // running total of the inputs' sizes along `dimension`
+  Dim dr = xs[0]; // NOTE(review): xs is indexed without an input-count check
+  for (auto c : xs) { // c is a by-value copy, so resizing it below leaves xs untouched
+    if(dr.nd < c.nd) dr.resize(c.nd); // bring dr and c to the same number of dimensions
+    if(c.nd < dr.nd) c.resize(dr.nd);
+    new_rows += c[dimension];
+    dr.set(dimension, c[dimension]); // make dr agree with c along `dimension`, so the check below can only fail on the other axes
+    DYNET_ARG_CHECK(dr.single_batch() == c.single_batch(),
+                            "Bad input dimensions in Concatenate: " << xs);
+    dr.bd = max(dr.bd, c.bd); // result batch size is the largest input batch size
+  }
+  dr.nd = max(xs[0].nd, dimension+1); // final rank: the first input's rank, or just enough to contain `dimension`
+  dr.set(dimension, new_rows); // concatenated axis gets the summed size
+  return dr;
+}
+
+int Concatenate::autobatch_sig(const ComputationGraph &cg, SigMap &sm) const {
+  Sig sig(nt::concat);  // tagged as a concat node
+  for (const auto input : args) sig.add_dim(cg.nodes[input]->dim);  // one add_dim per argument, in argument order
+  return sm.get_idx(sig);
+}
+
+#endif
+
+template<class MyDevice> // device-generic forward pass: copies each input into its slice of fx along `dimension`
+void Concatenate::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  unsigned curr_row = 0; // offset along `dimension` where the next input starts
+  src_indices.resize(xs.size()); // stores each input's start offset; backward_dev_impl reads it back
+  Eigen::DSizes<ptrdiff_t, 5> indices(0,0,0,0,0); // slice origin: four tensor axes plus the batch axis
+  Eigen::DSizes<ptrdiff_t, 5> sizes(fx.d[0], fx.d[1], fx.d[2], fx.d[3],static_cast<ptrdiff_t>(fx.d.bd)); // slice extent starts as all of fx
+  for (unsigned i = 0; i < xs.size(); ++i) {
+    indices[dimension] = src_indices[i] = curr_row;
+    const unsigned row_size = xs[i]->d[dimension];
+    sizes[dimension] = row_size; // narrow the slice to this input's extent along `dimension`
+    if(fx.d.bd == xs[i]->d.bd) { // batch sizes agree: copy straight into the slice
+      fx.tb<4>().slice(indices, sizes).device(*dev.edevice) = xs[i]->tb<4>();
+    } else { // batch sizes differ: repeat the input fx.d.bd times along axis 4 (presumably the input has batch size 1 -- TODO confirm)
+      Eigen::array<ptrdiff_t, 5> bcast; bcast[0] = bcast[1] = bcast[2] = bcast[3] = 1; bcast[4] = fx.d.bd;
+      fx.tb<4>().slice(indices, sizes).device(*dev.edevice) = xs[i]->tb<4>().broadcast(bcast);
+    }
+    curr_row += row_size;
+  }
+}
+
+template<class MyDevice> // device-generic backward pass: adds input i's slice of dEdf into dEdxi
+void Concatenate::backward_dev_impl(const MyDevice & dev,
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  DYNET_ASSERT(i < src_indices.size(), "Failed boundary check in Concatenate::backward: " << i << " >= " << src_indices.size());
+  Eigen::DSizes<ptrdiff_t, 5> indices(0,0,0,0,0); indices[dimension] = src_indices[i]; // slice starts at the offset recorded by forward_dev_impl
+  Eigen::DSizes<ptrdiff_t, 5> sizes(static_cast<ptrdiff_t>(dEdxi.d[0]), // slice extent: input i's own shape ...
+                                    static_cast<ptrdiff_t>(dEdxi.d[1]),
+                                    static_cast<ptrdiff_t>(dEdxi.d[2]),
+                                    static_cast<ptrdiff_t>(dEdxi.d[3]),
+                                    static_cast<ptrdiff_t>(fx.d.bd)); // ... across the output's full batch size
+  if(dEdxi.d.bd == dEdf.d.bd) { // same batch size: accumulate the slice directly
+    dEdxi.tb<4>().device(*dev.edevice) += dEdf.tb<4>().slice(indices, sizes);
+  } else { // batch sizes differ: sum the slice over axis 4 (batch) before accumulating
+    Eigen::array<int, 1> red_axis; red_axis[0] = 4;
+    dEdxi.t<4>().device(*dev.edevice) += dEdf.tb<4>().slice(indices, sizes).sum(red_axis);
+  }
+}
+DYNET_NODE_INST_DEV_IMPL(Concatenate)
+
+// ************* ConcatenateToBatch *************
+
+#ifndef __CUDACC__
+
+// Renders the node as "concat_batch_elems(a,b,...)" from the argument names.
+string ConcatenateToBatch::as_string(const vector<string>& arg_names) const {
+  ostringstream out;
+  out << "concat_batch_elems(" << arg_names[0];
+  for (auto it = arg_names.begin() + 1; it != arg_names.end(); ++it)
+    out << ',' << *it;
+  out << ')';
+  return out.str();
+}
+
+Dim ConcatenateToBatch::dim_forward(const vector<Dim>& xs) const { // output dim: the shared per-element shape with the batch sizes summed
+  DYNET_ARG_CHECK(xs.size() > 0, "Failed input count check in ConcatenateToBatch"); // argument check (not an assert) so an empty input list is rejected before xs[0] is read
+  Dim d(xs[0]);
+  for (unsigned i = 1; i < xs.size(); ++i) {
+    DYNET_ARG_CHECK(xs[0].single_batch() == xs[i].single_batch(), // every input must match the first one's non-batch shape
+                            "Mismatched input dimensions in ConcatenateToBatch: " << xs);
+    d.bd += xs[i].bd; // batch sizes accumulate; d.bd started as xs[0].bd
+  }
+  return d;
+}
+
+#endif
+
+template<class MyDevice> // device-generic forward pass: lays the inputs side by side along the batch axis of fx
+void ConcatenateToBatch::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const { 
+  unsigned curr_e = 0; // batch offset where the next input starts
+  src_element_indices.resize(xs.size()); // stores each input's batch offset; backward_dev_impl reads it back
+  Eigen::DSizes<ptrdiff_t, 2> indices(0,0);
+  Eigen::DSizes<ptrdiff_t, 2> sizes(static_cast<ptrdiff_t>(fx.d.batch_size()), 0); // axis 0 extent is fx.d.batch_size(); axis 1 is set per input below
+  for (unsigned i = 0; i < xs.size(); ++i) {
+    indices[1] = src_element_indices[i] = curr_e;
+    sizes[1] = xs[i]->d.bd; // this input occupies xs[i]->d.bd consecutive batch columns
+    fx.tbvec().slice(indices, sizes).device(*dev.edevice) = xs[i]->tbvec();
+    curr_e += xs[i]->d.bd;
+  }
+  // curr_e now equals the sum of the inputs' batch sizes
+}
+
+template<class MyDevice> // device-generic backward pass: adds input i's batch columns of dEdf into dEdxi
+void ConcatenateToBatch::backward_dev_impl(const MyDevice & dev,
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  DYNET_ASSERT(i < src_element_indices.size(), "Failed boundary check in ConcatenateToBatch::backward: " << i << " >= " << src_element_indices.size());
+  Eigen::DSizes<ptrdiff_t, 2> indices(0, static_cast<ptrdiff_t>(src_element_indices[i])); // start at the batch offset recorded by forward_dev_impl
+  Eigen::DSizes<ptrdiff_t, 2> sizes(static_cast<ptrdiff_t>(fx.d.batch_size()), static_cast<ptrdiff_t>(xs[i]->d.bd)); // fx.d.batch_size() rows by input i's batch count
+  dEdxi.tbvec().device(*dev.edevice) += dEdf.tbvec().slice(indices, sizes); // accumulates with += rather than overwriting dEdxi
+}
+DYNET_NODE_INST_DEV_IMPL(ConcatenateToBatch)
+
+}
diff --git a/dynet/nodes-const.cc b/dynet/nodes-const.cc
new file mode 100644
index 000000000..3f99a84d4
--- /dev/null
+++ b/dynet/nodes-const.cc
@@ -0,0 +1,42 @@
+#include "dynet/nodes.h"
+
+#include "dynet/nodes-macros.h"
+
+using namespace std;
+
+namespace dynet {
+
+// ************* Zeroes *************
+
+#ifndef __CUDACC__
+
+string Zeroes::as_string(const vector<string>& arg_names) const {  // arg_names is not read; output is "zeroes(<dim>)"
+  ostringstream rendered;
+  rendered << "zeroes(" << dim << ')';
+  return rendered.str();
+}
+
+Dim Zeroes::dim_forward(const vector<Dim>& xs) const { // xs is not read
+  return dim; // the output shape is the node's own `dim` member
+}
+
+#endif
+
+template<class MyDevice> // device-generic forward pass; `dev` is not referenced in the body
+void Zeroes::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  DYNET_ASSERT(xs.size() == 0, "Failed dimension check in Zeroes::forward"); // the node expects no inputs
+  TensorTools::zero(fx); // the output is produced by TensorTools::zero alone
+}
+
+template<class MyDevice> // backward pass stub: every parameter is unused
+void Zeroes::backward_dev_impl(const MyDevice & dev,
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  DYNET_RUNTIME_ERR("Called backward() on an arity 0 node"); // the node has no inputs to receive a gradient, so any call here is reported as an error
+}
+DYNET_NODE_INST_DEV_IMPL(Zeroes)
+
+}
diff --git a/dynet/nodes-conv.cc b/dynet/nodes-conv.cc
index 8bec337ee..d8c56ee1b 100644
--- a/dynet/nodes-conv.cc
+++ b/dynet/nodes-conv.cc
@@ -20,77 +20,9 @@ using namespace std;
 
 namespace dynet {
 
-#ifndef __CUDACC__
-
-string AverageColumns::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "average_cols(matrix=" << arg_names[0] << ')';
-  return s.str();
-}
-
-Dim AverageColumns::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ASSERT(xs.size() == 1 || xs.size() == 2, "Failed input count check in AverageColumns");
-  int bd = (xs.size() == 1 ? xs[0].bd : max(xs[0].bd, xs[1].bd));
-  return Dim({xs[0].rows()}, bd);
-}
-
-string FoldRows::as_string(const vector<string>& arg_names) const {
-  ostringstream os;
-  os << "fold_rows(" << arg_names[0] << ", nrows=" << nrows << ')';
-  return os.str();
-}
+// ************* Filter1DNarrow *************
 
-Dim FoldRows::dim_forward(const vector<Dim>& xs) const {
-  unsigned orows = xs[0].rows() / nrows;
-  if ((orows * nrows != xs[0].rows()) || xs.size() != 1 || xs[0].ndims() > 2) {
-    ostringstream s; s << "Bad input dimensions in FoldRows: " << xs;
-    throw std::invalid_argument(s.str());
-  }
-  return Dim({orows, xs[0].cols()});
-}
-
-/* Deprecated
-string Conv1DNarrow::as_string(const vector<string>& arg_names) const {
-  ostringstream os;
-  os << "conv1d_narrow(" << arg_names[0] << ", f=" << arg_names[1] << ')';
-  return os.str();
-}
-
-Dim Conv1DNarrow::dim_forward(const vector<Dim>& xs) const {
-  if (xs.size() != 2) {
-    ostringstream s; s << "Conv1DNarrow requires two inputs: " << xs;
-    throw std::invalid_argument(s.str());
-  }
-  int ocols = xs[0].cols() - xs[1].cols() + 1;
-  if (xs[0].ndims() != 2 || xs[1].ndims() != 2 ||
-      xs[0].rows() != xs[1].rows() ||
-      ocols < 1) {
-    ostringstream s; s << "Bad input dimensions in Conv1DNarrow: " << xs;
-    throw std::invalid_argument(s.str());
-  }
-  return Dim({xs[0].rows(), (unsigned)ocols});
-}
-
-string Conv1DWide::as_string(const vector<string>& arg_names) const {
-  ostringstream os;
-  os << "conv1d_wide(" << arg_names[0] << ", f=" << arg_names[1] << ')';
-  return os.str();
-}
-
-Dim Conv1DWide::dim_forward(const vector<Dim>& xs) const {
-  if (xs.size() != 2) {
-    ostringstream s; s << "Conv1DWide requires two inputs: " << xs;
-    throw std::invalid_argument(s.str());
-  }
-  unsigned ocols = xs[0].cols() + xs[1].cols() - 1;
-  if (xs[0].ndims() != 2 || xs[1].ndims() != 2 ||
-      xs[0].rows() != xs[1].rows()) {
-    ostringstream s; s << "Bad input dimensions in Conv1DWide: " << xs;
-    throw std::invalid_argument(s.str());
-  }
-  return Dim({xs[0].rows(), ocols});
-}
-*/
+#ifndef __CUDACC__
 
 string Filter1DNarrow::as_string(const vector<string>& arg_names) const {
   ostringstream os;
@@ -114,143 +46,7 @@ Dim Filter1DNarrow::dim_forward(const vector<Dim>& xs) const {
   return Dim({fids, (unsigned)ocols});
 }
 
-string KMaxPooling::as_string(const vector<string>& arg_names) const {
-  ostringstream os;
-  os << "kmaxpool(" << arg_names[0] << ", k=" << k << ", d=" << pooled_dim << ')';
-  return os.str();
-}
-
-Dim KMaxPooling::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ARG_CHECK(pooled_dim < xs[0].nd,
-                          "Tried to MaxDimension on dimension " << pooled_dim << " bigger than input " << xs[0]);
-  DYNET_ARG_CHECK(xs[0].nd < 4,
-                          "MaxDimension not currently supported for tensors of 4 or more dimensions.");
-  DYNET_ARG_CHECK(k >= 1, "Bad bad k in KMaxPooling: " << k);
-  DYNET_ARG_CHECK(k <= xs[0][pooled_dim], 
-                          "Bad k in KMaxPooling: k = " << k << " bigger than the size of pooled dimension " 
-                          << pooled_dim << " with size = " << xs[0][pooled_dim]);
-  Dim ret(xs[0]);
-  ret.set(pooled_dim, k);
-  return ret;
-}
-
-size_t KMaxPooling::aux_storage_size() const {
-  // map of where the entries in f(x) go to entries in x
-  return sizeof(Eigen::DenseIndex) * dim.size();
-}
-
-string SumDimension::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "sum_dim(matrix=" << arg_names[0] << ',' << dimension << '}';
-  return s.str();
-}
-
-Dim SumDimension::dim_forward(const vector<Dim>& xs) const {
-  DYNET_ASSERT(xs.size() == 1, "Failed input count check in SumDimension");
-  Dim ret(xs[0]);
-  ret.delete_dim(dimension);
-  return ret;
-}
-#endif
-
-template<class MyDevice>
-void AverageColumns::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  DYNET_ASSERT(xs.size() == 1, "Failed input count check in AverageColumns");
-  unsigned cols = xs[0]->d.cols();
-#ifdef __CUDACC__
-  // The reduction used on CPU is better, but not implemented in GPU
-  fx.t<1>().device(*dev.edevice) = xs[0]->t<2>().chip<1>(0);
-  for(unsigned i = 1; i < cols; ++i)
-    fx.t<1>().device(*dev.edevice) += xs[0]->t<2>().chip<1>(i);
-  fx.t<1>().device(*dev.edevice) = fx.t<1>() / (float)cols;
-#else
-  const Eigen::array<Eigen::DenseIndex, 1> reduction_axis = {1};
-  fx.t<1>().device(*dev.edevice) = xs[0]->t<2>().sum(reduction_axis) / (float)cols;
 #endif
-}
-
-template<class MyDevice>
-void AverageColumns::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  const Eigen::array<Eigen::DenseIndex, 2> broadcasts = {1, xs[0]->d[1]};
-  dEdxi.t<2>().device(*dev.edevice) += (dEdf.t<2>() / (float)xs[0]->d[1]).broadcast(broadcasts);
-}
-DYNET_NODE_INST_DEV_IMPL(AverageColumns)
-
-/* Deprecated
-template<class MyDevice>
-void Conv1DNarrow::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  const unsigned ycols = dim.cols();
-  const unsigned fcols = xs[1]->d.cols();
-  for (unsigned j = 0; j < ycols; ++j) {
-    fx.t<2>().chip<1>(j).device(*dev.edevice) = xs[0]->t<2>().chip<1>(j) * xs[1]->t<2>().chip<1>(0);
-    for (unsigned k = 1; k < fcols; ++k)
-      fx.t<2>().chip<1>(j).device(*dev.edevice) += xs[0]->t<2>().chip<1>(j+k) * xs[1]->t<2>().chip<1>(k);
-  }
-  // TODO: This following version without chip is better, but for some reason dimensions don't match.
-  // Eigen::array<ptrdiff_t, 1> dims; dims[0] = 1;
-  // fx.t<2>().device(*dev.edevice) = xs[0]->t<2>().convolve(xs[1]->t<2>(), dims);
-}
-
-template<class MyDevice>
-void Conv1DNarrow::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  DYNET_ASSERT(i < 2, "Failed input count check in Conv1DNarrow");
-  const unsigned ycols = dim.cols();
-  const unsigned fcols = xs[1]->d.cols();
-  // TODO: Can this be done with a kernel and without using chip?
-  if (i == 0) { // derivative wrt input x
-    for (unsigned j = 0; j < ycols; ++j)
-      for (unsigned k = 0; k < fcols; ++k)
-        dEdxi.t<2>().chip<1>(j+k).device(*dev.edevice) += xs[1]->t<2>().chip<1>(k) * dEdf.t<2>().chip<1>(j);
-  } else { // derivative wrt filter f
-    for (unsigned j = 0; j < ycols; ++j)
-      for (unsigned k = 0; k < fcols; ++k)
-        dEdxi.t<2>().chip<1>(k).device(*dev.edevice) += xs[0]->t<2>().chip<1>(j+k) * dEdf.t<2>().chip<1>(j);
-  }
-}
-DYNET_NODE_INST_DEV_IMPL(Conv1DNarrow)
-
-template<class MyDevice>
-void Conv1DWide::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  TensorTools::zero(fx);
-  const unsigned xcols = xs[0]->d.cols();
-  const unsigned fcols = xs[1]->d.cols();
-  for (unsigned j = 0; j < xcols; ++j)
-    for (unsigned k = 0; k < fcols; ++k)
-      fx.t<2>().chip<1>(j+k).device(*dev.edevice) += xs[1]->t<2>().chip<1>(k) * xs[0]->t<2>().chip<1>(j);
-}
-
-
-template<class MyDevice>
-void Conv1DWide::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  const unsigned xcols = xs[0]->d.cols();
-  const unsigned fcols = xs[1]->d.cols();
-  if (i == 0) { // derivative wrt input x
-    for (unsigned j = 0; j < xcols; ++j)
-      for (unsigned k = 0; k < fcols; ++k)
-        dEdxi.t<2>().chip<1>(j).device(*dev.edevice) += xs[1]->t<2>().chip<1>(k) * dEdf.t<2>().chip<1>(j + k);
-  } else { // derivative wrt filter f
-    for (unsigned j = 0; j < xcols; ++j)
-      for (unsigned k = 0; k < fcols; ++k)
-        dEdxi.t<2>().chip<1>(k).device(*dev.edevice) += xs[0]->t<2>().chip<1>(j) * dEdf.t<2>().chip<1>(j + k);
-  }
-}
-DYNET_NODE_INST_DEV_IMPL(Conv1DWide)
-*/
 
 template<class MyDevice>
 void Filter1DNarrow::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
@@ -313,6 +109,26 @@ void Filter1DNarrow::backward_dev_impl(const MyDevice & dev,
 }
 DYNET_NODE_INST_DEV_IMPL(Filter1DNarrow)
 
+// ************* FoldRows *************
+
+#ifndef __CUDACC__
+
+string FoldRows::as_string(const vector<string>& arg_names) const {
+  ostringstream os;
+  os << "fold_rows(" << arg_names[0] << ", nrows=" << nrows << ')';
+  return os.str();
+}
+
+Dim FoldRows::dim_forward(const vector<Dim>& xs) const {
+  const unsigned orows = (xs.size() == 1 && nrows != 0) ? xs[0].rows() / nrows : 0;  // validate before indexing xs[0] or dividing by nrows
+  if (xs.size() != 1 || nrows == 0 || xs[0].ndims() > 2 || orows * nrows != xs[0].rows()) {  // rows must be an exact multiple of nrows
+    ostringstream s; s << "Bad input dimensions in FoldRows: " << xs;
+    throw std::invalid_argument(s.str());
+  }
+  return Dim({orows, xs[0].cols()});
+}
+
+#endif
 
 template<class MyDevice>
 void FoldRows::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
@@ -343,6 +159,37 @@ void FoldRows::backward_dev_impl(const MyDevice & dev,
 }
 DYNET_NODE_INST_DEV_IMPL(FoldRows)
 
+// ************* KMaxPooling *************
+
+#ifndef __CUDACC__
+
+string KMaxPooling::as_string(const vector<string>& arg_names) const {
+  ostringstream os;
+  os << "kmaxpool(" << arg_names[0] << ", k=" << k << ", d=" << pooled_dim << ')';
+  return os.str();
+}
+
+Dim KMaxPooling::dim_forward(const vector<Dim>& xs) const {
+  DYNET_ARG_CHECK(pooled_dim < xs[0].nd,
+                          "Tried to KMaxPooling on dimension " << pooled_dim << " bigger than input " << xs[0]);
+  DYNET_ARG_CHECK(xs[0].nd < 4,
+                          "KMaxPooling not currently supported for tensors of 4 or more dimensions.");
+  DYNET_ARG_CHECK(k >= 1, "Bad k in KMaxPooling: " << k);
+  DYNET_ARG_CHECK(k <= xs[0][pooled_dim],
+                          "Bad k in KMaxPooling: k = " << k << " bigger than the size of pooled dimension "
+                          << pooled_dim << " with size = " << xs[0][pooled_dim]);
+  Dim ret(xs[0]);  // output has the input's shape, except that pooled_dim is set to k
+  ret.set(pooled_dim, k);
+  return ret;
+}
+
+size_t KMaxPooling::aux_storage_size() const {
+  // map of where the entries in f(x) go to entries in x
+  return sizeof(Eigen::DenseIndex) * dim.size();
+}
+
+#endif
+
 template<class MyDevice>
 void KMaxPooling::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
 #ifdef __CUDACC__
@@ -429,25 +276,59 @@ void KMaxPooling::backward_dev_impl(const MyDevice & dev,
 }
 DYNET_NODE_INST_DEV_IMPL(KMaxPooling)
 
+// ************* KMHNgram *************
+
+#ifndef __CUDACC__
+
+string KMHNGram::as_string(const vector<string>& arg_names) const {
+  ostringstream s;
+  s << "kmh-ngram(" << arg_names[0] << ')';
+  return s.str();
+}
+
+Dim KMHNGram::dim_forward(const vector<Dim>& xs) const {
+  DYNET_ARG_CHECK(xs[0].ndims() == 2, "Bad input dimensions in KMHNGram: " << xs);
+  DYNET_ARG_CHECK(xs[0].cols() >= n, "Bad input dimensions in KMHNGram: " << xs);  // check first: the unsigned subtraction below would wrap around
+  const unsigned new_cols = xs[0].cols() - n + 1;  // one output column per window of n consecutive input columns
+  return Dim({xs[0][0], new_cols});
+}
+
+#endif
+
 template<class MyDevice>
-void SumDimension::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  DYNET_ASSERT(xs.size() == 1, "Failed input count check in SumDimension");
-  Eigen::array<int, 1> reduction_axis = {(int)dimension};
-  fx.t<1>().device(*dev.edevice) = xs[0]->t<2>().sum(reduction_axis);
+void KMHNGram::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+#ifdef __CUDACC__
+  DYNET_RUNTIME_ERR("KMHNGram not implemented for CUDA");
+#else
+  auto x = **xs[0];
+  const int new_cols = x.cols() - n + 1;
+  DYNET_ASSERT(new_cols > 0, "Failed dimension check in KMHNGram");
+  auto res = *fx;
+  res.setZero();
+  for (int j = 0; j < new_cols; ++j) {
+    auto c_j = res.col(j);
+    for (unsigned k = 0; k < n; ++k)
+      c_j += x.col(j + k);
+  }
+#endif
 }
 
 template<class MyDevice>
-void SumDimension::backward_dev_impl(const MyDevice & dev,
+void KMHNGram::backward_dev_impl(const MyDevice & dev,
                              const vector<const Tensor*>& xs,
                              const Tensor& fx,
                              const Tensor& dEdf,
                              unsigned i,
                              Tensor& dEdxi) const {
-  // TODO: limit to 3-dimensional tensor is arbitrary
-  Eigen::array<int, 4> bcast = {1,1,1,1}; bcast[dimension] = dEdxi.d[dimension];
-  Eigen::array<int, 4> morph = {(int)dEdxi.d[0],(int)dEdxi.d[1],(int)dEdxi.d[2],(int)dEdxi.d.bd}; morph[dimension] = 1;
-  dEdxi.tb<3>().device(*dev.edevice) += dEdf.tb<3>().reshape(morph).broadcast(bcast);
+#ifdef __CUDACC__
+  DYNET_RUNTIME_ERR("KMHNGram not implemented for CUDA");
+#else
+  const int c = dEdf.d.cols();
+  for (int j = 0; j < c; ++j)
+    for (unsigned k = 0; k < n; ++k)
+      (*dEdxi).col(j+k) += (*dEdf).col(j);
+#endif
 }
-DYNET_NODE_INST_DEV_IMPL(SumDimension)
+DYNET_NODE_INST_DEV_IMPL(KMHNGram)
 
 } // namespace dynet
diff --git a/dynet/nodes-conv.h b/dynet/nodes-conv.h
index 9465b16be..a4fd2ca02 100644
--- a/dynet/nodes-conv.h
+++ b/dynet/nodes-conv.h
@@ -11,31 +11,6 @@
 
 namespace dynet {
 
-// with a single argument x \in R^{n x m}
-// y_i = \sum_j x_i,j / m
-struct AverageColumns : public Node {
-  template <typename T> explicit AverageColumns(const T& a) : Node(a) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-};
-
-/* Deprecated
-// y = x_1 *conv x_2
-// x_1 \in R^{d x s} (input)
-// x_2 \in R^{d x m} (filter)
-struct Conv1DNarrow : public Node {
-  explicit Conv1DNarrow(const std::initializer_list<VariableIndex>& a) : Node(a) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-};
-
-// y = x_1 *conv x_2
-// x_1 \in R^{d x s} (input)
-// x_2 \in R^{d x m} (filter)
-struct Conv1DWide : public Node {
-  explicit Conv1DWide(const std::initializer_list<VariableIndex>& a) : Node(a) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-};
-*/
-
 // y = x_1 *filter x_2
 // x_1 \in R^{d x s} (input)
 // x_2 \in R^{d x m} (filter)
@@ -64,13 +39,6 @@ struct KMaxPooling : public Node {
   unsigned second_dim;
 };
 
-// sum along a single dimension
-struct SumDimension : public Node {
-  template <typename T> explicit SumDimension(const T& a, unsigned d) : Node(a), dimension(d) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  unsigned dimension;
-};
-
 // conv2d 
 // y = x_1 *conv2d x_2
 // x_1 \in R^{H x W x Ci x N} (input)
@@ -119,6 +87,13 @@ struct MaxPooling2D: public Node {
 #endif
 };
 
+// y_i = \sum_{j=1}^n x_1:{i-1+j}
+struct KMHNGram : public Node {
+  explicit KMHNGram(const std::initializer_list<VariableIndex>& a, unsigned n) : Node(a), n(n) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  unsigned n;  // width, n=2 for Karl's paper
+};
+
 
 } // namespace dynet
 
diff --git a/dynet/nodes-dropout.cc b/dynet/nodes-dropout.cc
new file mode 100644
index 000000000..7addca034
--- /dev/null
+++ b/dynet/nodes-dropout.cc
@@ -0,0 +1,187 @@
+#include "dynet/nodes.h"
+
+#include "dynet/nodes-macros.h"
+
+using namespace std;
+
+namespace dynet {
+
+// ************* Dropout *************
+
+#ifndef __CUDACC__
+
+string Dropout::as_string(const vector<string>& arg_names) const {
+  ostringstream s;
+  s << "dropout(" << arg_names[0] << ",p=" << p << ')';
+  return s.str();
+}
+
+Dim Dropout::dim_forward(const vector<Dim>& xs) const {
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in Dropout")
+  return xs[0];
+}
+
+size_t Dropout::aux_storage_size() const {
+  return dim.size() * sizeof(float);
+}
+
+#endif
+
+template<class MyDevice>
+void Dropout::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  Tensor m(dim, (float*)aux_mem, fx.device, DeviceMempool::FXS);
+  TensorTools::randomize_bernoulli(m, (1.f-p), 1.f / (1.f-p));
+  fx.tvec().device(*dev.edevice) = xs[0]->tvec() * m.tvec();
+}
+
+template<class MyDevice>
+void Dropout::backward_dev_impl(const MyDevice & dev,
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  Tensor m(dim, (float*)aux_mem, fx.device, DeviceMempool::FXS);
+  dEdxi.tvec().device(*dev.edevice) += dEdf.tvec() * m.tvec();
+}
+DYNET_NODE_INST_DEV_IMPL(Dropout)
+
+// ************* DropoutDim *************
+
+#ifndef __CUDACC__
+
+string DropoutDim::as_string(const vector<string>& arg_names) const {
+  ostringstream s;
+  s << "dropout_dim(" << arg_names[0] << ",p=" << p << ')';
+  return s.str();
+}
+
+Dim DropoutDim::dim_forward(const vector<Dim>& xs) const {
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in DropoutDim")
+  DYNET_ARG_CHECK(xs[0].nd < 4, "DropoutDim only supports tensor up to order 3 + batch dimension, got tensor of order"<<xs[0].nd)
+  DYNET_ARG_CHECK(xs[0].nd > dimension, "In DropoutDim : tried to drop along dimension "<<dimension<<" on tensor of order"<<xs[0].nd)
+  return xs[0];
+}
+
+size_t DropoutDim::aux_storage_size() const {
+  return (dim.size() / dim[dimension]) * sizeof(float);
+}
+
+#endif
+
+template<class MyDevice>
+void DropoutDim::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  Dim mask_dim(dim);
+  mask_dim.d[dimension]=1;
+  Tensor m(mask_dim, (float*)aux_mem, fx.device, DeviceMempool::FXS);
+  TensorTools::randomize_bernoulli(m, (1.f-p), 1.f / (1.f-p));
+  Eigen::array<ptrdiff_t, 4> bcast = {1, 1, 1, 1}; bcast[dimension] = xs[0]->d[dimension];
+  fx.tb<3>().device(*dev.edevice) = xs[0]->tb<3>() * m.tb<3>().broadcast(bcast);
+}
+
+template<class MyDevice>
+void DropoutDim::backward_dev_impl(const MyDevice & dev,
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  Dim mask_dim(dim);
+  mask_dim.d[dimension]=1;
+  Tensor m(mask_dim, (float*)aux_mem, fx.device, DeviceMempool::FXS);
+  Eigen::array<ptrdiff_t, 4> bcast = {1, 1, 1, 1}; bcast[dimension] = dEdf.d[dimension];
+  dEdxi.tb<3>().device(*dev.edevice) += dEdf.tb<3>() * m.tb<3>().broadcast(bcast);
+}
+DYNET_NODE_INST_DEV_IMPL(DropoutDim)
+
+// ************* DropoutBatch *************
+
+#ifndef __CUDACC__
+
+string DropoutBatch::as_string(const vector<string>& arg_names) const {
+  ostringstream s;
+  s << "dropout_batch(" << arg_names[0] << ",p=" << p << ')';
+  return s.str();
+}
+
+Dim DropoutBatch::dim_forward(const vector<Dim>& xs) const {
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in DropoutBatch")
+  return xs[0];
+}
+
+size_t DropoutBatch::aux_storage_size() const {
+  return dim.batch_elems() * sizeof(float);
+}
+
+#endif
+
+template<class MyDevice>
+void DropoutBatch::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  Dim mask_dim({1},xs[0]->d.batch_elems());
+  Tensor m(mask_dim, (float*)aux_mem, fx.device, DeviceMempool::FXS);
+  TensorTools::randomize_bernoulli(m, (1.f-p), 1.f / (1.f-p));
+  Eigen::array<ptrdiff_t, 2> bcast = {xs[0]->d.batch_size(), 1};
+  fx.tbvec().device(*dev.edevice) = xs[0]->tbvec() * m.tbvec().broadcast(bcast);
+}
+
+template<class MyDevice>
+void DropoutBatch::backward_dev_impl(const MyDevice & dev,
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  Dim mask_dim({1},xs[0]->d.batch_elems());
+  Tensor m(mask_dim, (float*)aux_mem, fx.device, DeviceMempool::FXS);
+  Eigen::array<ptrdiff_t, 2> bcast = {xs[0]->d.batch_size(), 1};
+  dEdxi.tbvec().device(*dev.edevice) += dEdf.tbvec() * m.tbvec().broadcast(bcast);
+}
+DYNET_NODE_INST_DEV_IMPL(DropoutBatch)
+
+// ************* BlockDropout *************
+
+#ifndef __CUDACC__
+
+string BlockDropout::as_string(const vector<string>& arg_names) const {
+  ostringstream s;
+  s << "block_dropout(" << arg_names[0] << ",dropout_probability=" << dropout_probability << ')';
+  return s.str();
+}
+
+Dim BlockDropout::dim_forward(const vector<Dim>& xs) const {
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in BlockDropout")
+  return xs[0];
+}
+
+size_t BlockDropout::aux_storage_size() const {
+  // we just need to remember whether this entire block is turned on (1.0) or off (0.0)
+  return 1 * sizeof(float);
+}
+
+#endif
+
+template<class MyDevice>
+void BlockDropout::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  // validate before sampling: std::bernoulli_distribution requires its probability to lie in [0, 1]
+  if (dropout_probability > 1.0 || dropout_probability < 0.0)
+    DYNET_INVALID_ARG("Dropout probability must be in the range [0, 1]");
+  bernoulli_distribution distribution(1.0 - dropout_probability);
+  float block_multiplier = distribution(*rndeng)? 1.0 : 0.0;
+  block_multiplier = dropout_probability == 1.0? 0.0 : block_multiplier / (1.0 - dropout_probability);  // rescale kept blocks; p == 1 avoids dividing by zero
+  *(static_cast<float*>(aux_mem)) = block_multiplier;
+  fx.tvec().device(*dev.edevice) = xs[0]->tvec() * block_multiplier;
+}
+
+template<class MyDevice>
+void BlockDropout::backward_dev_impl(const MyDevice & dev,
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  float block_multiplier = *(static_cast<float*>(aux_mem));
+  dEdxi.tvec().device(*dev.edevice) += dEdf.tvec() * block_multiplier;
+}
+DYNET_NODE_INST_DEV_IMPL(BlockDropout)
+
+}
diff --git a/dynet/nodes-flow.cc b/dynet/nodes-flow.cc
new file mode 100644
index 000000000..95d0b10ae
--- /dev/null
+++ b/dynet/nodes-flow.cc
@@ -0,0 +1,151 @@
+#include "dynet/nodes.h"
+
+#include "dynet/nodes-macros.h"
+
+using namespace std;
+
+namespace dynet {
+
+// ************* Reshape *************
+
+#ifndef __CUDACC__
+
+string Reshape::as_string(const vector<string>& arg_names) const {
+  ostringstream s;
+  s << "reshape(" << arg_names[0] << " --> " << to << ')';
+  return s.str();
+}
+
+Dim Reshape::dim_forward(const vector<Dim>& xs) const {
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in Reshape")
+  if(to.size() == xs[0].size()) {
+    return to;
+  } else {
+    DYNET_ARG_CHECK(to.batch_elems() == 1 && to.batch_size() == xs[0].batch_size(),
+                    "Bad arguments to Reshape: " << to << ", " << xs[0]);
+    Dim ret(to);
+    ret.bd = xs[0].batch_elems();
+    return ret;
+  }
+}
+
+#endif
+
+template<class MyDevice>
+void Reshape::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  // copy the input values element-wise into fx; only the shape differs,
+  // and the output dimensions are computed by dim_forward
+  fx.tvec().device(*dev.edevice) = xs[0]->tvec();
+}
+
+template<class MyDevice>
+void Reshape::backward_dev_impl(const MyDevice & dev,
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  const Tensor reshaped(dEdxi.d, dEdf.v, dEdxi.device, dEdf.mem_pool);
+  dEdxi.tvec().device(*dev.edevice) += reshaped.tvec();
+}
+DYNET_NODE_INST_DEV_IMPL(Reshape)
+
+// ************* Identity *************
+
+#ifndef __CUDACC__
+
+string Identity::as_string(const vector<string>& arg_names) const {
+  return arg_names[0];
+}
+
+Dim Identity::dim_forward(const vector<Dim>& xs) const {
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in Identity")
+  return xs[0];
+}
+
+#endif
+
+template<class MyDevice>
+void Identity::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  fx.tvec().device(*dev.edevice) = xs[0]->tvec();
+}
+
+template<class MyDevice>
+void Identity::backward_dev_impl(const MyDevice & dev,
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  dEdxi.tvec().device(*dev.edevice) += dEdf.tvec();
+}
+DYNET_NODE_INST_DEV_IMPL(Identity)
+
+// ************* NoBackprop *************
+
+#ifndef __CUDACC__
+
+string NoBackprop::as_string(const vector<string>& arg_names) const {
+  ostringstream s;
+  s << "nobackprop(" << arg_names[0] << ')';
+  return s.str();
+}
+
+Dim NoBackprop::dim_forward(const vector<Dim>& xs) const {
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in NoBackprop")
+  return xs[0];
+}
+
+#endif
+
+template<class MyDevice>
+void NoBackprop::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  fx.tvec().device(*dev.edevice) = xs[0]->tvec();
+}
+
+template<class MyDevice>
+void NoBackprop::backward_dev_impl(const MyDevice & dev,
+                                   const vector<const Tensor*>& xs,
+                                   const Tensor& fx,
+                                   const Tensor& dEdf,
+                                   unsigned i,
+                                   Tensor& dEdxi) const {
+  // no op
+}
+DYNET_NODE_INST_DEV_IMPL(NoBackprop)
+
+// ************* FlipGradient *************
+
+#ifndef __CUDACC__
+
+string FlipGradient::as_string(const vector<string>& arg_names) const {
+  ostringstream s;
+  s << "flip_gradient(" << arg_names[0] << ')';
+  return s.str();
+}
+
+Dim FlipGradient::dim_forward(const vector<Dim>& xs) const {
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in FlipGradient");
+  return xs[0];
+}
+
+#endif
+
+template<class MyDevice>
+void FlipGradient::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  fx.tvec().device(*dev.edevice) = xs[0]->tvec();
+}
+
+template<class MyDevice>
+void FlipGradient::backward_dev_impl(const MyDevice & dev,
+                                   const vector<const Tensor*>& xs,
+                                   const Tensor& fx,
+                                   const Tensor& dEdf,
+                                   unsigned i,
+                                   Tensor& dEdxi) const {
+  // takes negative on backprop
+  dEdxi.tvec().device(*dev.edevice) -= dEdf.tvec();
+}
+DYNET_NODE_INST_DEV_IMPL(FlipGradient)
+
+}
diff --git a/dynet/nodes-linalg.cc b/dynet/nodes-linalg.cc
new file mode 100644
index 000000000..a1c17eaec
--- /dev/null
+++ b/dynet/nodes-linalg.cc
@@ -0,0 +1,224 @@
+#include "dynet/nodes.h"
+
+#include "dynet/nodes-macros.h"
+
+using namespace std;
+
+namespace dynet {
+
+// ************* Transpose *************
+
+#ifndef __CUDACC__
+
+string Transpose::as_string(const vector<string>& arg_names) const {
+  ostringstream s;
+  s << "transpose("<< arg_names[0] << ", ";
+  for(size_t i = 0; i < dims.size(); ++i)
+    s << (i == 0?'{':',') << dims[i];
+  s << "})";
+  return s.str();
+}
+
+Dim Transpose::dim_forward(const vector<Dim>& xs) const {
+  DYNET_ARG_CHECK(xs.size() == 1, "Bad arguments to Transpose: " << xs);
+  DYNET_ARG_CHECK(xs[0].nd == dims.size() || xs[0].num_nonone_dims() == 1, "Dimensions passed to transpose (" << dims.size() << ") must be equal to dimensions in input tensor (" << xs[0].nd << ')');
+  Dim ret(xs[0]);
+  ret.nd = dims.size();
+  for(size_t i = 0; i < dims.size(); ++i)
+    ret.d[i] = xs[0][dims[i]];
+  return ret;
+}
+
+#endif
+
+template<class MyDevice>
+void Transpose::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  if (dim.num_nonone_dims() <= 1) {
+    fx.tvec().device(*dev.edevice) = xs[0]->tvec();
+  } else {
+    Eigen::array<ptrdiff_t, 5> order;
+    for(size_t i = 0; i < 5; ++i)
+      order[i] = (i >= dims.size() ? i : dims[i]);
+    fx.tb<4>().device(*dev.edevice) = xs[0]->tb<4>().shuffle(order);
+  }
+}
+
+template<class MyDevice>
+void Transpose::backward_dev_impl(const MyDevice & dev,
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  Eigen::array<ptrdiff_t, 5> order;
+  for(size_t i = 0; i < 5; ++i)
+    order[(i >= dims.size() ? i : dims[i])] = i;
+  dEdxi.tb<4>().device(*dev.edevice) += dEdf.tb<4>().shuffle(order);
+}
+DYNET_NODE_INST_DEV_IMPL(Transpose)
+
+// ************* MatrixInverse *************
+
+#ifndef __CUDACC__
+
+string MatrixInverse::as_string(const vector<string>& arg_names) const {
+  ostringstream s;
+  s << "inverse(" << arg_names[0] << ")";
+  return s.str();
+}
+
+Dim MatrixInverse::dim_forward(const vector<Dim>& xs) const {
+  return xs[0];
+}
+
+#endif
+
+template<class MyDevice>
+void MatrixInverse::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  DYNET_ASSERT(xs.size() == 1, "Failed dimension check in MatrixInverse::forward");
+#ifdef __CUDACC__
+  DYNET_RUNTIME_ERR("MatrixInverse not yet implemented for CUDA");
+#else
+  auto x = **xs[0];
+  auto y = *fx;
+  y = x.inverse();
+#endif
+  // TODO: Change into tensors after resolving test errors
+  // fx.t<2>().device(*dev.edevice) = xs[0]->t<2>().inverse();
+}
+
+template<class MyDevice>
+void MatrixInverse::backward_dev_impl(const MyDevice & dev,
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  DYNET_ASSERT(xs.size() == 1, "Failed dimension check in MatrixInverse::backward");
+#ifdef __CUDACC__
+  DYNET_RUNTIME_ERR("MatrixInverse not yet implemented for CUDA");
+#else
+  auto d = *dEdf;
+  auto y = *fx;
+  (*dEdxi) -= y * d * y;
+#endif
+}
+DYNET_NODE_INST_DEV_IMPL(MatrixInverse)
+
+// ************* LogDet *************
+
+#ifndef __CUDACC__
+
+string LogDet::as_string(const vector<string>& arg_names) const {
+  ostringstream s;
+  s << "logdet(" << arg_names[0] << ")";
+  return s.str();
+}
+
+Dim LogDet::dim_forward(const vector<Dim>& xs) const {
+  DYNET_ARG_CHECK(xs[0].ndims() <= 2 && (xs[0].rows() == xs[0].cols()), "Bad arguments in LogDet: " << xs);
+  return Dim({1});
+}
+
+// set use_cholesky if M is symmetric - it's faster and more stable
+// for dep parsing it won't be
+template <typename MatrixType>
+inline typename MatrixType::Scalar logdet(const MatrixType& M, bool use_cholesky = false) {
+  using namespace Eigen;
+  using std::log;
+  typedef typename MatrixType::Scalar Scalar;
+  Scalar ld = 0;
+  if (use_cholesky) {
+    LLT<Matrix<Scalar,Dynamic,Dynamic>> chol(M);
+    auto& U = chol.matrixL();
+    for (unsigned i = 0; i < M.rows(); ++i)
+      ld += log(U(i,i));
+    ld *= 2;
+  } else {
+    PartialPivLU<Matrix<Scalar,Dynamic,Dynamic>> lu(M);
+    auto& LU = lu.matrixLU();
+    Scalar c = lu.permutationP().determinant(); // -1 or 1
+    for (unsigned i = 0; i < LU.rows(); ++i) {
+      const auto& lii = LU(i,i);
+      if (lii < Scalar(0)) c *= -1;
+      ld += log(abs(lii));
+    }
+    ld += log(c);
+  }
+  return ld;
+}
+
+#endif
+
+template<class MyDevice>
+void LogDet::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+#ifdef __CUDACC__
+  DYNET_RUNTIME_ERR("LogDet not implemented for CUDA");
+#else
+  fx.v[0] = logdet(**xs[0], false);
+#endif
+}
+
+template<class MyDevice>
+void LogDet::backward_dev_impl(const MyDevice & dev,
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+#ifdef __CUDACC__
+  DYNET_RUNTIME_ERR("LogDet not implemented for CUDA");
+#else
+  auto trans = (**xs[0]).transpose();
+  (*dEdxi) += (dEdf.v[0]) * trans.inverse();  // accumulate dEdf times the inverse of the transposed input
+#endif
+}
+DYNET_NODE_INST_DEV_IMPL(LogDet)
+
+// ************* TraceOfProduct *************
+
+#ifndef __CUDACC__
+
+string TraceOfProduct::as_string(const vector<string>& arg_names) const {
+  ostringstream s;
+  s << "Tr(" << arg_names[0] << " * " << arg_names[1] << "^T)";
+  return s.str();
+}
+
+Dim TraceOfProduct::dim_forward(const vector<Dim>& xs) const {
+  DYNET_ARG_CHECK(xs.size() == 2 && xs[0] == xs[1], "Bad arguments in TraceOfProduct: " << xs);
+  return Dim({1}, max(xs[0].bd, xs[1].bd));
+}
+
+#endif
+
+template<class MyDevice>
+void TraceOfProduct::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+#ifdef __CUDACC__
+  DYNET_RUNTIME_ERR("TraceOfProduct not yet implemented for CUDA");
+#else
+  auto x1 = **xs[0];
+  auto x2 = **xs[1];
+  fx.v[0] = (x1 * x2.transpose()).trace();
+#endif
+}
+
+template<class MyDevice>
+void TraceOfProduct::backward_dev_impl(const MyDevice & dev,
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  DYNET_ARG_CHECK(i < 2, "Failed dimension check in TraceOfProduct::backward");
+#ifdef __CUDACC__
+  DYNET_RUNTIME_ERR("TraceOfProduct not yet implemented for CUDA");
+#else
+  const float d = dEdf.v[0];
+  auto xother = **xs[1 - i];  // the gradient for argument i is dEdf times the other argument
+  *dEdxi += d * xother;
+#endif
+}
+DYNET_NODE_INST_DEV_IMPL(TraceOfProduct)
+
+}
diff --git a/dynet/nodes-logsumexp.cc b/dynet/nodes-logsumexp.cc
new file mode 100644
index 000000000..71fb3e0b8
--- /dev/null
+++ b/dynet/nodes-logsumexp.cc
@@ -0,0 +1,115 @@
+#include "dynet/nodes.h"
+
+#include "dynet/nodes-macros.h"
+
+using namespace std;
+
+namespace dynet {
+
+// ************* LogSumExp *************
+
+#define MAX_LOG_SUM_EXP 65536
+
+#ifndef __CUDACC__
+
+// template <class T>
+// EIGEN_STRONG_INLINE real logsumexp(const T& x, const vector<unsigned>& denom) {
+//   real m = x(denom[0],0);
+//   for (auto i : denom) {
+//     real r = x(i,0);
+//     if (r > m) m = r;
+//   }
+//   real z = 0;
+//   for (auto i : denom)
+//     z += expf(x(i,0) - m);
+//   return m + logf(z);
+// }
+
+string LogSumExp::as_string(const vector<string>& arg_names) const {
+  ostringstream s;
+  s << "log(exp " << arg_names[0];
+  for (unsigned i = 1; i < arg_names.size(); ++i)
+    s << " + exp " << arg_names[i];
+  s << ")";
+  return s.str();
+}
+
+Dim LogSumExp::dim_forward(const vector<Dim>& xs) const {
+  Dim d = xs[0].truncate();
+  for (unsigned i = 1; i < xs.size(); ++i) {
+    DYNET_ARG_CHECK(d.single_batch() == xs[i].truncate().single_batch(),
+                            "Mismatched input dimensions in LogSumExp: " << xs);
+    d.bd = max(xs[i].bd, d.bd);
+  }
+  return d;
+}
+
+// forward stores the element-wise max of the inputs in aux_mem as a tensor with
+// the output's dimensions, so reserve exactly one float per output element
+size_t LogSumExp::aux_storage_size() const {
+  return dim.size() * sizeof(float);
+}
+
+#endif
+
+template<class MyDevice>
+void LogSumExp::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  if (xs.size() == 1) {
+    fx.tvec().device(*dev.edevice) = xs[0]->tvec();
+  } else {
+    // TODO: Ideally we wouldn't need to allocate this memory permanently.
+    //       We need a good method for allocating "scratch" memory that is only used temporarily.
+    Tensor ms(fx.d, static_cast<float*>(aux_mem), fx.device, DeviceMempool::FXS);
+    Eigen::array<ptrdiff_t, 2> bcast = {1,fx.d.bd};
+    // Calculate the max
+    if(ms.d.bd == xs[0]->d.bd)
+      ms.tvec().device(*dev.edevice) = xs[0]->tvec();
+    else
+      ms.tbvec().device(*dev.edevice) = xs[0]->tbvec().broadcast(bcast); 
+    for (size_t i = 1; i < xs.size(); ++i) {
+      if(ms.d.bd == xs[i]->d.bd)
+        ms.tvec().device(*dev.edevice) = ms.tvec().cwiseMax(xs[i]->tvec());
+      else
+        ms.tbvec().device(*dev.edevice) = ms.tbvec().cwiseMax(xs[i]->tbvec().broadcast(bcast)); 
+    }
+    // sumexp
+    if(ms.d.bd == xs[0]->d.bd)
+      fx.tvec().device(*dev.edevice) = (xs[0]->tvec() - ms.tvec()).exp();
+    else
+      fx.tbvec().device(*dev.edevice) = (xs[0]->tbvec().broadcast(bcast) - ms.tbvec()).exp();
+    for (size_t i = 1; i < xs.size(); ++i) {
+      if(ms.d.bd == xs[i]->d.bd)
+        fx.tvec().device(*dev.edevice) += (xs[i]->tvec() - ms.tvec()).exp();
+      else
+        fx.tbvec().device(*dev.edevice) += (xs[i]->tbvec().broadcast(bcast) - ms.tbvec()).exp();
+    }
+    // log and add max
+    fx.tvec().device(*dev.edevice) = fx.tvec().log() + ms.tvec();
+  }
+}
+
+template<class MyDevice>
+void LogSumExp::backward_dev_impl(const MyDevice & dev,
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  if (xs.size() == 1) {
+    dEdxi.tvec().device(*dev.edevice) += dEdf.tvec();
+  } else {
+    // df/dx_i = 1/{sum_j exp(x_j)} * exp(x_i)}
+    //         = 1/{exp f(x)} * exp(x_i)
+    //         = exp(x_i - f(x))
+    if(fx.d.bd == xs[i]->d.bd) {
+      dEdxi.tvec().device(*dev.edevice) += (xs[i]->tvec() - fx.tvec()).exp() * dEdf.tvec();
+    } else {
+      Eigen::array<ptrdiff_t, 2> bcast = {1,fx.d.bd};
+      Eigen::array<int, 1> red_axis = {1};
+      dEdxi.tvec().device(*dev.edevice) += ((xs[i]->tbvec().broadcast(bcast) - fx.tbvec()).exp() * dEdf.tbvec()).sum(red_axis);
+    }
+  }
+}
+DYNET_NODE_INST_DEV_IMPL(LogSumExp)
+
+}
diff --git a/dynet/nodes-losses.cc b/dynet/nodes-losses.cc
new file mode 100644
index 000000000..a12db7bc8
--- /dev/null
+++ b/dynet/nodes-losses.cc
@@ -0,0 +1,123 @@
+#include "dynet/nodes.h"
+
+#include "dynet/nodes-macros.h"
+#include "dynet/functors.h"
+
+using namespace std;
+
+namespace dynet {
+
+// ************* PairwiseRankLoss *************
+
+#ifndef __CUDACC__  // as_string/dim_forward below are excluded when compiling with CUDA
+
+string PairwiseRankLoss::as_string(const vector<string>& arg_names) const {
+  ostringstream os;  // renders "max(0, margin - a + b)"
+  os << "max(0, " << margin << " - " << arg_names[0] << " + " << arg_names[1] << ')';
+  return os.str();
+}
+
+Dim PairwiseRankLoss::dim_forward(const vector<Dim>& xs) const {
+  DYNET_ARG_CHECK(xs.size() == 2 &&
+                          xs[0] == xs[1] &&
+                          xs[0].rows() == 1 &&
+                          (xs[0].ndims() == 1 || xs[0].ndims() == 2),
+                          "Bad input dimensions in PairwiseRankLoss: " << xs);
+  return xs[0].bd >= xs[1].bd ? xs[0] : xs[1];  // output takes the Dim of the input with the larger batch size
+}
+
+#endif  // guard closes here, as for the other nodes, so the templates below are also compiled under CUDA
+
+template<class MyDevice>
+void PairwiseRankLoss::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  fx.tvec().device(*dev.edevice) = xs[0]->tvec().binaryExpr(xs[1]->tvec(), FPairwiseRankLoss(margin));  // element-wise loss over the flattened inputs
+}
+
+template<class MyDevice>
+void PairwiseRankLoss::backward_dev_impl(const MyDevice & dev,
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  if (i == 0) {  // first input enters the loss with a minus sign (see as_string), so its gradient is subtracted
+    dEdxi.tvec().device(*dev.edevice) -= fx.tvec().binaryExpr(dEdf.tvec(), FRectifyBackward());
+  } else {  // second input enters with a plus sign
+    dEdxi.tvec().device(*dev.edevice) += fx.tvec().binaryExpr(dEdf.tvec(), FRectifyBackward());
+  }
+}
+DYNET_NODE_INST_DEV_IMPL(PairwiseRankLoss)
+
+// ************* BinaryLogLoss *************
+
+#ifndef __CUDACC__
+
+string BinaryLogLoss::as_string(const vector<string>& arg_names) const {
+  ostringstream out;  // renders "binary_log_loss(a, b)"
+  out << "binary_log_loss(" << arg_names[0] << ", " << arg_names[1] << ')';
+  return out.str();
+}
+
+Dim BinaryLogLoss::dim_forward(const vector<Dim>& xs) const {  // validates the two inputs and returns a one-element Dim
+  DYNET_ARG_CHECK(xs.size() == 2, "Failed input count check in BinaryLogLoss")
+  DYNET_ARG_CHECK(xs[0].rows() == 2 || xs[0].ndims() == 1, "Bad input dimensions in BinaryLogLoss: " << xs);  // NOTE(review): the rows() == 2 alternative looks unusual; confirm intent
+  DYNET_ARG_CHECK(xs[1].rows() == 2 || xs[1].ndims() == 1, "Bad input dimensions in BinaryLogLoss: " << xs);
+  return Dim({1}, max(xs[0].bd, xs[1].bd));  // batch size is the larger of the two inputs' batch sizes
+}
+
+#endif
+
+template<class MyDevice>
+void BinaryLogLoss::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  fx.t<0>().device(*dev.edevice) = xs[0]->tvec().binaryExpr(xs[1]->tvec(), FBinaryLogLoss()).sum();  // element-wise FBinaryLogLoss over the flattened inputs, summed into one scalar
+}
+
+template<class MyDevice>
+void BinaryLogLoss::backward_dev_impl(const MyDevice & dev,
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  dEdxi.tvec().device(*dev.edevice) += xs[i]->tvec().binaryExpr(xs[1-i]->tvec(), FBinaryLogLossBackward(as_scalar(dEdf)));  // input i paired element-wise with the other input (1-i); dEdf is read as a scalar
+}
+DYNET_NODE_INST_DEV_IMPL(BinaryLogLoss)
+
+// ************* PoissonRegressionLoss *************
+
+#ifndef __CUDACC__
+
+string PoissonRegressionLoss::as_string(const vector<string>& arg_names) const {
+  ostringstream out;  // renders the negative log Poisson description of this node
+  out << "-log Poisson(" << pty << "; lambda=\\exp" << arg_names[0] << ')';
+  return out.str();
+}
+
+Dim PoissonRegressionLoss::dim_forward(const vector<Dim>& xs) const {  // accepts exactly one input of total size 1
+  DYNET_ARG_CHECK(xs.size() == 1 && xs[0].size() == 1, "Bad input dimensions in PoissonRegressionLoss: " << xs);
+  return xs[0];  // output has the same Dim as the input
+}
+
+#endif
+
+template<class MyDevice>
+void PoissonRegressionLoss::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  const real y = *pty;  // target value read through the stored pointer
+  const auto z = std::lgamma(y + 1);  // log(y!)
+  // loss = exp(x) + lgamma(y + 1) - x * y, i.e. -log Poisson(y; lambda = exp(x))
+  fx.t<0>().device(*dev.edevice) = xs[0]->t<0>().exp() + z - xs[0]->t<0>() * y;
+}
+
+template<class MyDevice>
+void PoissonRegressionLoss::backward_dev_impl(const MyDevice & dev,
+                            const vector<const Tensor*>& xs,
+                            const Tensor& fx,
+                            const Tensor& dEdf,
+                            unsigned i,
+                            Tensor& dEdxi) const {
+  const real y = *pty;  // target value read through the stored pointer
+  dEdxi.t<0>().device(*dev.edevice) += dEdf.t<0>() * (xs[0]->t<0>().exp() - y);  // chain rule: dE/dx = dE/df * (exp(x) - y); the upstream gradient was previously dropped
+}
+DYNET_NODE_INST_DEV_IMPL(PoissonRegressionLoss)
+
+}
diff --git a/dynet/nodes-minmax.cc b/dynet/nodes-minmax.cc
new file mode 100644
index 000000000..6b5d45f87
--- /dev/null
+++ b/dynet/nodes-minmax.cc
@@ -0,0 +1,252 @@
+#include "dynet/nodes.h"
+
+#include "dynet/nodes-macros.h"
+#include "dynet/functors.h"
+
+using namespace std;
+
+namespace dynet {
+
+// ************* Min *************
+
+#ifndef __CUDACC__
+
+string Min::as_string(const vector<string>& arg_names) const {
+  ostringstream out;  // renders "min{a, b}"
+  out << "min{" << arg_names[0] << ", " << arg_names[1] << "}";
+  return out.str();
+}
+
+Dim Min::dim_forward(const vector<Dim>& xs) const {  // requires exactly two inputs whose Dims compare equal
+  DYNET_ARG_CHECK(xs.size() == 2 && xs[0] == xs[1], "Bad arguments in Min: " << xs);
+  return xs[0].bd >= xs[1].bd ? xs[0] : xs[1];  // output takes the Dim of the input with the larger batch size
+}
+
+size_t Min::aux_storage_size() const {
+  return sizeof(float) * dim.size();  // one float per element of dim; forward stores its comparison mask there
+}
+
+#endif
+
+template<class MyDevice>
+void Min::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  Tensor t(fx.d, static_cast<float*>(aux_mem), fx.device, DeviceMempool::FXS);  // view over aux_mem with the output's Dim
+  t.tvec().device(*dev.edevice) = (xs[0]->tvec() < xs[1]->tvec()).cast<float>();  // mask: 1 where the first input is strictly smaller, else 0 (read again in backward)
+  fx.tvec().device(*dev.edevice) = xs[0]->tvec().cwiseMin(xs[1]->tvec());  // element-wise minimum
+}
+
+template<class MyDevice>
+void Min::backward_dev_impl(const MyDevice & dev,
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  DYNET_ASSERT(i < 2, "Failed dimension check in Min::backward");
+  const Tensor t(dEdxi.d, static_cast<float*>(aux_mem), fx.device, DeviceMempool::FXS);  // mask written by forward_dev_impl
+  if (i == 0) {  // first input receives the gradient where the mask is 1
+    dEdxi.tvec().device(*dev.edevice) += t.tvec() * dEdf.tvec();
+  } else {  // second input: FMaxBackwardInv combines mask and gradient; presumably (1 - mask) * dEdf, confirm in functors.h
+    dEdxi.tvec().device(*dev.edevice) += t.tvec().binaryExpr(dEdf.tvec(), FMaxBackwardInv());
+  }
+}
+DYNET_NODE_INST_DEV_IMPL(Min)
+
+// ************* Max *************
+
+#ifndef __CUDACC__
+
+string Max::as_string(const vector<string>& arg_names) const {
+  ostringstream out;  // renders "max{a, b}"
+  out << "max{" << arg_names[0] << ", " << arg_names[1] << "}";
+  return out.str();
+}
+
+Dim Max::dim_forward(const vector<Dim>& xs) const {  // requires exactly two inputs whose Dims compare equal
+  DYNET_ARG_CHECK(xs.size() == 2 && xs[0] == xs[1], "Bad arguments in Max: " << xs);
+  return xs[0].bd >= xs[1].bd ? xs[0] : xs[1];  // output takes the Dim of the input with the larger batch size
+}
+
+size_t Max::aux_storage_size() const {
+  return sizeof(float) * dim.size();  // one float per element of dim; forward stores its comparison mask there
+}
+
+#endif
+
+template<class MyDevice>
+void Max::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  Tensor t(fx.d, static_cast<float*>(aux_mem), fx.device, DeviceMempool::FXS);  // view over aux_mem with the output's Dim
+  t.tvec().device(*dev.edevice) = (xs[0]->tvec() > xs[1]->tvec()).cast<float>();  // mask: 1 where the first input is strictly larger, else 0 (read again in backward)
+  fx.tvec().device(*dev.edevice) = xs[0]->tvec().cwiseMax(xs[1]->tvec());  // element-wise maximum
+}
+
+template<class MyDevice>
+void Max::backward_dev_impl(const MyDevice & dev,
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  DYNET_ASSERT(i < 2, "Failed dimension check in Max::backward");
+  const Tensor t(dEdxi.d, static_cast<float*>(aux_mem), fx.device, DeviceMempool::FXS);  // mask written by forward_dev_impl
+  if (i == 0) {  // first input receives the gradient where the mask is 1
+    dEdxi.tvec().device(*dev.edevice) += t.tvec() * dEdf.tvec();
+  } else {  // second input: FMaxBackwardInv combines mask and gradient; presumably (1 - mask) * dEdf, confirm in functors.h
+    dEdxi.tvec().device(*dev.edevice) += t.tvec().binaryExpr(dEdf.tvec(), FMaxBackwardInv());
+  }
+}
+DYNET_NODE_INST_DEV_IMPL(Max)
+
+// ************* MinDimension *************
+
+#ifndef __CUDACC__
+
+string MinDimension::as_string(const vector<string>& arg_names) const {
+  ostringstream out;  // renders "min_dim(a, reduced_dim=k)"
+  out << "min_dim(" << arg_names[0] << ", reduced_dim=" << reduced_dim << ')';
+  return out.str();
+}
+
+Dim MinDimension::dim_forward(const vector<Dim>& xs) const {  // output Dim is the input Dim with reduced_dim removed
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in MinDimension");
+  DYNET_ARG_CHECK(reduced_dim < xs[0].nd,
+                          "Tried to MinDimension on dimension " << reduced_dim << " bigger than input " << xs[0]);
+  DYNET_ARG_CHECK(xs[0].nd < 4,
+                          "MinDimension not currently supported for tensors of 4 or more dimensions.");
+  Dim ret(xs[0]);  // copy of the input Dim
+  ret.delete_dim(reduced_dim);
+  return ret;
+}
+
+size_t MinDimension::aux_storage_size() const {
+  return dim.size() * sizeof(Eigen::DenseIndex);  // one index per element of dim; forward stores the argmin positions there
+}
+
+#endif
+
+template<class MyDevice>
+void MinDimension::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  Eigen::DenseIndex* minmap = static_cast<Eigen::DenseIndex*>(aux_mem);  // aux_mem holds the argmin indices
+  const unsigned batch_size = dim.batch_elems();
+  const unsigned first_dim_size = dim[0];
+  const unsigned second_dim_size = dim[1];
+  Eigen::TensorMap<Eigen::Tensor<Eigen::DenseIndex, 3>> locs(minmap, first_dim_size, second_dim_size, batch_size);  // (dim[0], dim[1], batch) view over the index buffer
+  const Eigen::array<Eigen::DenseIndex, 1> reduction_axis = {reduced_dim};
+  locs.device(*dev.edevice) = xs[0]->tb<3>().argmin(reduced_dim);  // position of the minimum along reduced_dim, kept for backward
+  fx.tb<2>().device(*dev.edevice) = xs[0]->tb<3>().minimum(reduction_axis);  // the minimum values themselves
+}
+
+template<class MyDevice>
+void MinDimension::backward_dev_impl(const MyDevice & dev,
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  DYNET_ARG_CHECK(i == 0, "Failed dimension check in MinDimension::backward");
+#ifdef __CUDACC__
+  vector<Eigen::DenseIndex> indices(dim.size());  // host-side copy of the argmin indices
+  Eigen::DenseIndex* minmap = &indices[0];
+  CUDA_CHECK(cudaMemcpy((void*)minmap, aux_mem, sizeof(Eigen::DenseIndex) * dim.size(), cudaMemcpyDeviceToHost));
+#else
+  Eigen::DenseIndex* minmap = static_cast<Eigen::DenseIndex*>(aux_mem);  // indices written by forward_dev_impl
+#endif
+  const unsigned batch_size = dim.batch_elems();
+  const unsigned first_dim_size = dim[0];
+  const unsigned second_dim_size = dim[1];
+  Eigen::TensorMap<Eigen::Tensor<Eigen::DenseIndex, 3>> locs(minmap, first_dim_size, second_dim_size, batch_size);
+  for(unsigned b = 0; b < batch_size; ++b){  // add each output gradient to the input position that held the minimum
+    for(unsigned j = 0; j < second_dim_size; ++j){
+      for(unsigned i = 0; i < first_dim_size; ++i){  // NOTE: this loop index shadows the parameter i
+        if (reduced_dim > second_dim)  // in every branch the chips run from the highest dimension index to the lowest
+          dEdxi.tb<3>().chip<3>(b).chip(locs(i, j, b), reduced_dim).chip(j, second_dim).chip(i, first_dim).device(*dev.edevice) 
+            += dEdf.tb<2>().chip<2>(b).chip<1>(j).chip<0>(i);
+        else if (reduced_dim > first_dim)
+          dEdxi.tb<3>().chip<3>(b).chip(j, second_dim).chip(locs(i, j, b), reduced_dim).chip(i, first_dim).device(*dev.edevice) 
+            += dEdf.tb<2>().chip<2>(b).chip<1>(j).chip<0>(i);
+        else
+          dEdxi.tb<3>().chip<3>(b).chip(j, second_dim).chip(i, first_dim).chip(locs(i, j, b), reduced_dim).device(*dev.edevice) 
+            += dEdf.tb<2>().chip<2>(b).chip<1>(j).chip<0>(i);
+      }
+    }
+  }
+}
+DYNET_NODE_INST_DEV_IMPL(MinDimension)
+
+// ************* MaxDimension *************
+
+#ifndef __CUDACC__
+
+string MaxDimension::as_string(const vector<string>& arg_names) const {
+  ostringstream out;  // renders "max_dim(a, reduced_dim=k)"
+  out << "max_dim(" << arg_names[0] << ", reduced_dim=" << reduced_dim << ')';
+  return out.str();
+}
+
+Dim MaxDimension::dim_forward(const vector<Dim>& xs) const {  // output Dim is the input Dim with reduced_dim removed
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in MaxDimension");
+  DYNET_ARG_CHECK(reduced_dim < xs[0].nd,
+                          "Tried to MaxDimension on dimension " << reduced_dim << " bigger than input " << xs[0]);
+  DYNET_ARG_CHECK(xs[0].nd < 4,
+                          "MaxDimension not currently supported for tensors of 4 or more dimensions.");
+  Dim ret(xs[0]);  // copy of the input Dim
+  ret.delete_dim(reduced_dim);
+  return ret;
+}
+
+size_t MaxDimension::aux_storage_size() const {
+  return dim.size() * sizeof(Eigen::DenseIndex);  // one index per element of dim; forward stores the argmax positions there
+}
+
+#endif
+
+template<class MyDevice>
+void MaxDimension::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  Eigen::DenseIndex* maxmap = static_cast<Eigen::DenseIndex*>(aux_mem);  // aux_mem holds the argmax indices
+  const unsigned batch_size = dim.batch_elems();
+  const unsigned first_dim_size = dim[0];
+  const unsigned second_dim_size = dim[1];
+  Eigen::TensorMap<Eigen::Tensor<Eigen::DenseIndex, 3>> locs(maxmap, first_dim_size, second_dim_size, batch_size);  // (dim[0], dim[1], batch) view over the index buffer
+  const Eigen::array<Eigen::DenseIndex, 1> reduction_axis = {reduced_dim};
+  locs.device(*dev.edevice) = xs[0]->tb<3>().argmax(reduced_dim);  // position of the maximum along reduced_dim, kept for backward
+  fx.tb<2>().device(*dev.edevice) = xs[0]->tb<3>().maximum(reduction_axis);  // the maximum values themselves
+}
+
+template<class MyDevice>
+void MaxDimension::backward_dev_impl(const MyDevice & dev,
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  DYNET_ARG_CHECK(i == 0, "Failed dimension check in MaxDimension::backward");
+#ifdef __CUDACC__
+  vector<Eigen::DenseIndex> indices(dim.size());  // host-side copy of the argmax indices
+  Eigen::DenseIndex* maxmap = &indices[0];
+  CUDA_CHECK(cudaMemcpy((void*)maxmap, aux_mem, sizeof(Eigen::DenseIndex) * dim.size(), cudaMemcpyDeviceToHost));
+#else
+  Eigen::DenseIndex* maxmap = static_cast<Eigen::DenseIndex*>(aux_mem);  // indices written by forward_dev_impl
+#endif
+  const unsigned batch_size = dim.batch_elems();
+  const unsigned first_dim_size = dim[0];
+  const unsigned second_dim_size = dim[1];
+  Eigen::TensorMap<Eigen::Tensor<Eigen::DenseIndex, 3>> locs(maxmap, first_dim_size, second_dim_size, batch_size);
+  for(unsigned b = 0; b < batch_size; ++b){  // add each output gradient to the input position that held the maximum
+    for(unsigned j = 0; j < second_dim_size; ++j){
+      for(unsigned i = 0; i < first_dim_size; ++i){  // NOTE: this loop index shadows the parameter i
+        if (reduced_dim > second_dim)  // in every branch the chips run from the highest dimension index to the lowest
+          dEdxi.tb<3>().chip<3>(b).chip(locs(i, j, b), reduced_dim).chip(j, second_dim).chip(i, first_dim).device(*dev.edevice) 
+            += dEdf.tb<2>().chip<2>(b).chip<1>(j).chip<0>(i);
+        else if (reduced_dim > first_dim)
+          dEdxi.tb<3>().chip<3>(b).chip(j, second_dim).chip(locs(i, j, b), reduced_dim).chip(i, first_dim).device(*dev.edevice) 
+            += dEdf.tb<2>().chip<2>(b).chip<1>(j).chip<0>(i);
+        else
+          dEdxi.tb<3>().chip<3>(b).chip(j, second_dim).chip(i, first_dim).chip(locs(i, j, b), reduced_dim).device(*dev.edevice) 
+            += dEdf.tb<2>().chip<2>(b).chip<1>(j).chip<0>(i);
+      }
+    }
+  }
+}
+DYNET_NODE_INST_DEV_IMPL(MaxDimension)
+
+}
diff --git a/dynet/nodes-moments.cc b/dynet/nodes-moments.cc
new file mode 100644
index 000000000..b4d618165
--- /dev/null
+++ b/dynet/nodes-moments.cc
@@ -0,0 +1,440 @@
+#include "dynet/nodes.h"
+
+#include "dynet/nodes-macros.h"
+#include "dynet/functors.h"
+
+using namespace std;
+
+namespace dynet {
+
+// ************* Average *************
+
+#ifndef __CUDACC__
+
+string Average::as_string(const vector<string>& arg_names) const {
+  ostringstream out;  // renders "average(a, b, ...)" from the argument names
+  out << "average(" << arg_names[0];
+  for (unsigned k = 1; k < arg_names.size(); ++k)
+    out << ", " << arg_names[k];
+  out << ")";
+  return out.str();
+}
+
+Dim Average::dim_forward(const vector<Dim>& xs) const {  // all inputs must share the same per-batch shape
+  Dim d(xs[0]);  // start from the first input's Dim
+  for (unsigned i = 1; i < xs.size(); ++i) {
+    DYNET_ARG_CHECK(xs[0].single_batch() == xs[i].single_batch(),
+                            "Mismatched input dimensions in Average: " << xs);
+    d.bd = max(xs[i].bd, d.bd);  // output batch size is the largest batch size among the inputs
+  }
+  return d;
+}
+
+#endif
+
+template<class MyDevice>
+void Average::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  const unsigned num_args = xs.size();
+  if (num_args == 1) {  // a single input is its own average
+    fx.tvec().device(*dev.edevice) = xs[0]->tvec();
+    return;
+  }
+  if (num_args == 2 && xs[0]->d.bd == xs[1]->d.bd)  // special cases: 2 to 4 inputs with equal batch sizes are summed in one expression
+    fx.tvec().device(*dev.edevice) = xs[0]->tvec() + xs[1]->tvec();
+  else if (num_args == 3 && xs[0]->d.bd == xs[1]->d.bd && xs[1]->d.bd == xs[2]->d.bd)
+    fx.tvec().device(*dev.edevice) = xs[0]->tvec() + xs[1]->tvec() + xs[2]->tvec();
+  else if (num_args == 4 && xs[0]->d.bd == xs[1]->d.bd && xs[1]->d.bd == xs[2]->d.bd && xs[2]->d.bd == xs[3]->d.bd)
+    fx.tvec().device(*dev.edevice) = xs[0]->tvec() + xs[1]->tvec() + xs[2]->tvec() + xs[3]->tvec();
+  else {
+    bool allSameBatchSize = std::all_of(xs.begin(), xs.end(), [&](const Tensor* x) { return x->d.bd == xs[0]->d.bd;});
+    if (allSameBatchSize) {
+      // Since they are all the same batch size, we can easily unroll the addition (results in lower GPU latency by merging multiple adds together in one CUDA call):
+      DYNET_ASSERT(num_args > 4, "Bad loop unrolling in Average::forward");        // If it was <=4, we would have handled it in the special cases above
+      fx.tvec().device(*dev.edevice) = xs[0]->tvec() + xs[1]->tvec() + xs[2]->tvec() + xs[3]->tvec();
+
+      const unsigned remainder = (num_args - 4 ) % 4;  // inputs left over once the rest is grouped in fours
+      switch (remainder) {
+        case 0: break;
+        case 1: fx.tvec().device(*dev.edevice) += xs[4]->tvec(); break;
+        case 2: fx.tvec().device(*dev.edevice) += xs[4]->tvec() + xs[5]->tvec(); break;
+        case 3: fx.tvec().device(*dev.edevice) += xs[4]->tvec() + xs[5]->tvec() + xs[6]->tvec(); break;
+      }
+      for (unsigned i = 4 + remainder; i < num_args; i += 4)  // remaining inputs, four per accumulation
+        fx.tvec().device(*dev.edevice) += xs[i]->tvec() + xs[i + 1]->tvec() + xs[i + 2]->tvec() + xs[i + 3]->tvec();
+    }
+    else {
+      // Not all the same batch size, so need to broadcast in the cases where they differ
+      TensorTools::zero(fx);
+#ifdef __CUDACC__
+      Eigen::array<int, 2> bcast({ 1, (int)fx.d.bd });
+#endif
+      for (unsigned i = 0; i < num_args; ++i) {
+        if (xs[i]->d.bd == fx.d.bd) {  // same batch size as the output: plain accumulation
+          fx.tvec().device(*dev.edevice) += xs[i]->tvec();
+        }
+        else {
+#ifdef __CUDACC__
+          fx.tbvec().device(*dev.edevice) += xs[i]->tbvec().broadcast(bcast);
+#else
+          for (unsigned b = 0; b < fx.d.bd; ++b)  // CPU path: add the input to every batch slice of the output
+            fx.tbvec().chip<1>(b).device(*dev.edevice) += xs[i]->tvec();
+#endif
+        }
+      }
+    }
+  }
+  fx.tvec().device(*dev.edevice) = fx.tvec() / (float)xs.size();  // turn the accumulated sum into the mean
+}
+
+template<class MyDevice>
+void Average::backward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, const Tensor& fx,
+                             const Tensor& dEdf, unsigned i, Tensor& dEdxi) const {
+  const Eigen::array<int, 1> batch_axis = {1};  // axis 1 of tbvec() indexes the batch
+  if (dEdxi.d.bd == dEdf.d.bd)  // same batch size: each input receives 1/N of the output gradient
+    dEdxi.tvec().device(*dev.edevice) += (dEdf.tvec() / (float)xs.size());
+  else  // input was broadcast over the batch in forward, so its gradient is the sum over batches
+    dEdxi.tvec().device(*dev.edevice) += (dEdf.tbvec().sum(batch_axis) / (float)xs.size());
+}
+DYNET_NODE_INST_DEV_IMPL(Average)
+
+// ************* AverageColumns *************
+
+#ifndef __CUDACC__
+
+string AverageColumns::as_string(const vector<string>& arg_names) const {
+  ostringstream out;  // renders "average_cols(matrix=a)"
+  out << "average_cols(matrix=" << arg_names[0] << ')';
+  return out.str();
+}
+
+Dim AverageColumns::dim_forward(const vector<Dim>& xs) const {  // output is a vector with as many rows as the first input
+  DYNET_ASSERT(xs.size() == 1 || xs.size() == 2, "Failed input count check in AverageColumns");  // NOTE(review): two inputs are accepted here, but forward_dev_impl asserts exactly one
+  int bd = (xs.size() == 1 ? xs[0].bd : max(xs[0].bd, xs[1].bd));
+  return Dim({xs[0].rows()}, bd);
+}
+
+#endif
+
+template<class MyDevice>
+void AverageColumns::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  DYNET_ASSERT(xs.size() == 1, "Failed input count check in AverageColumns");
+  unsigned cols = xs[0]->d.cols();  // number of columns being averaged
+#ifdef __CUDACC__
+  // The reduction used on CPU is better, but not implemented in GPU
+  fx.t<1>().device(*dev.edevice) = xs[0]->t<2>().chip<1>(0);  // start from column 0, then add the remaining columns one by one
+  for(unsigned i = 1; i < cols; ++i)
+    fx.t<1>().device(*dev.edevice) += xs[0]->t<2>().chip<1>(i);
+  fx.t<1>().device(*dev.edevice) = fx.t<1>() / (float)cols;
+#else
+  const Eigen::array<Eigen::DenseIndex, 1> reduction_axis = {1};  // axis 1 is the column axis of t<2>()
+  fx.t<1>().device(*dev.edevice) = xs[0]->t<2>().sum(reduction_axis) / (float)cols;
+#endif
+}
+
+template<class MyDevice>
+void AverageColumns::backward_dev_impl(const MyDevice & dev,
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  const Eigen::array<Eigen::DenseIndex, 2> broadcasts = {1, xs[0]->d[1]};  // repeat the gradient once per input column
+  dEdxi.t<2>().device(*dev.edevice) += (dEdf.t<2>() / (float)xs[0]->d[1]).broadcast(broadcasts);  // every column receives dEdf divided by the column count
+}
+DYNET_NODE_INST_DEV_IMPL(AverageColumns)
+
+// ************* MomentElements *************
+
+#ifndef __CUDACC__
+
+string MomentElements::as_string(const vector<string>& arg_names) const {
+  ostringstream out;  // renders "moment_elems( expression=a, order=k )"
+  out << "moment_elems( expression=" << arg_names[0] << ", order=" << order << " )";
+  return out.str();
+}
+
+Dim MomentElements::dim_forward(const vector<Dim>& xs) const {  // one input; order must be at least 1
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in MomentElements")
+  DYNET_ARG_CHECK(order>= 1, "Order of moment should be >=1 in MomentElements (recieved "<<order<<")")
+  return Dim({1}, xs[0].bd);  // one value per batch element
+}
+
+#endif
+
+template<class MyDevice>
+void MomentElements::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed dimension check in MomentElements::forward");
+  Eigen::array<int, 1> red_axis; red_axis[0] = 0;  // reduce over axis 0 of tbvec(), leaving one value per batch
+  if(order == 1)  // mean of x
+    fx.tb<0>().device(*dev.edevice) = xs[0]->tbvec().sum(red_axis) / (float) xs[0]->d.batch_size();
+  else if (order == 2)  // mean of x^2
+    fx.tb<0>().device(*dev.edevice) = xs[0]->tbvec().square().sum(red_axis) / (float) xs[0]->d.batch_size();
+  else  // mean of x^order
+    fx.tb<0>().device(*dev.edevice) = xs[0]->tbvec().pow(order).sum(red_axis) / (float) xs[0]->d.batch_size();
+}
+
+template<class MyDevice>
+void MomentElements::backward_dev_impl(const MyDevice & dev,
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  DYNET_ARG_CHECK(i == 0, "Failed dimension check in MomentElements::backward");
+  Eigen::array<int, 2> bcast = {(int)xs[0]->d.batch_size(), 1};  // repeat each batch's gradient over all of its elements
+  if (order == 1)  // d/dx mean(x) = 1/n
+    dEdxi.tbvec().device(*dev.edevice) += dEdf.tbvec().broadcast(bcast) / (float) xs[0]->d.batch_size();
+  else if (order == 2)  // d/dx mean(x^2) = 2x/n
+    dEdxi.tbvec().device(*dev.edevice) += (dEdf.tbvec().broadcast(bcast) * xs[0]->tbvec()) * ( 2.f / (float) xs[0]->d.batch_size());
+  else if (order == 3)  // d/dx mean(x^3) = 3x^2/n
+    dEdxi.tbvec().device(*dev.edevice) += (dEdf.tbvec().broadcast(bcast) * xs[0]->tbvec().square()) * ( 3.f / (float) xs[0]->d.batch_size());
+  else  // general case: order * x^(order-1) / n
+    dEdxi.tbvec().device(*dev.edevice) += (dEdf.tbvec().broadcast(bcast) * xs[0]->tbvec().pow(order - 1)) * ( (float) order / (float) xs[0]->d.batch_size());
+}
+DYNET_NODE_INST_DEV_IMPL(MomentElements)
+
+// ************* MomentDimension *************
+
+#ifndef __CUDACC__
+
+string MomentDimension::as_string(const vector<string>& arg_names) const {
+  ostringstream s;  // renders "moment_dim(expression=a,d, order=k)"
+  s << "moment_dim(expression=" << arg_names[0] << ',' << dimension << ", order="<<order<<')';  // closes with ')' to match the opening '(' (was '}')
+  return s.str();
+}
+
+Dim MomentDimension::dim_forward(const vector<Dim>& xs) const {  // output Dim is the input Dim with `dimension` removed
+  DYNET_ASSERT(xs.size() == 1, "Failed input count check in MomentDimension");
+  DYNET_ARG_CHECK(xs[0].nd <= 3, "MomentDimension implemented up to tensors of order 3 (with minibatch) for now")
+  DYNET_ARG_CHECK(dimension < xs[0].nd, "dimension " << dimension << " is out of bounds of tensor of order " << xs[0].nd << " in MomentDimension" )
+  DYNET_ARG_CHECK(order>= 1, "Order of moment should be >=1 in MomentDimension (recieved "<<order<<")")
+  Dim ret(xs[0]);  // copy of the input Dim
+  ret.delete_dim(dimension);
+  return ret;
+}
+
+#endif
+
+template<class MyDevice>
+void MomentDimension::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  DYNET_ASSERT(xs.size() == 1, "Failed input count check in MomentDimension");
+  Eigen::array<int, 1> reduction_axis = {(int)dimension};
+  float n = (float) xs[0]->d[dimension];  // length of the reduced dimension
+  if(order == 1)  // mean of x along `dimension`
+    fx.tb<2>().device(*dev.edevice) = xs[0]->tb<3>().sum(reduction_axis) / n;
+  else if (order == 2)  // mean of x^2
+    fx.tb<2>().device(*dev.edevice) = xs[0]->tb<3>().square().sum(reduction_axis) / n;
+  else  // mean of x^order
+    fx.tb<2>().device(*dev.edevice) = xs[0]->tb<3>().pow(order).sum(reduction_axis) / n;
+}
+
+template<class MyDevice>
+void MomentDimension::backward_dev_impl(const MyDevice & dev,
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  DYNET_ARG_CHECK(i == 0, "Failed dimension check in MomentDimension::backward");
+  Eigen::array<int, 4> bcast = {1,1,1,1}; bcast[dimension] = xs[0]->d[dimension];  // repeat only along the reduced dimension
+  Eigen::array<int, 4> morph = {(int)xs[0]->d[0],(int)xs[0]->d[1],(int)xs[0]->d[2],(int)xs[0]->d.bd}; morph[dimension] = 1;  // input shape with the reduced dimension set to 1
+  float n = (float) xs[0]->d[dimension];  // length of the reduced dimension
+  if (order == 1)  // d/dx mean(x) = 1/n
+    dEdxi.tb<3>().device(*dev.edevice) += dEdf.tb<2>().reshape(morph).broadcast(bcast) / n;
+  else if (order == 2)  // d/dx mean(x^2) = 2x/n
+    dEdxi.tb<3>().device(*dev.edevice) += (dEdf.tb<2>().reshape(morph).broadcast(bcast) * xs[0]->tb<3>()) * ( 2.f / n);
+  else if (order == 3)  // d/dx mean(x^3) = 3x^2/n
+    dEdxi.tb<3>().device(*dev.edevice) += (dEdf.tb<2>().reshape(morph).broadcast(bcast) * xs[0]->tb<3>().square()) * ( 3.f / n);
+  else  // general case: order * x^(order-1) / n
+    dEdxi.tb<3>().device(*dev.edevice) += (dEdf.tb<2>().reshape(morph).broadcast(bcast) * xs[0]->tb<3>().pow(order - 1)) * ( (float) order / n);
+}
+DYNET_NODE_INST_DEV_IMPL(MomentDimension)
+
+// ************* MomentBatches *************
+
+#ifndef __CUDACC__
+
+string MomentBatches::as_string(const vector<string>& arg_names) const {
+  ostringstream out;  // renders "moment_batches( expression=a, order=k )"
+  out << "moment_batches( expression=" << arg_names[0] << ", order=" << order << " )";
+  return out.str();
+}
+
+Dim MomentBatches::dim_forward(const vector<Dim>& xs) const {  // one input; order must be at least 1
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in MomentBatches")
+  DYNET_ARG_CHECK(order>= 1, "Order of moment should be >=1 in MomentBatches (recieved "<<order<<")")
+  return xs[0].single_batch();  // the batch dimension is reduced away; the per-batch shape is kept
+}
+
+#endif
+
+template<class MyDevice>
+void MomentBatches::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed dimension check in MomentBatches::forward");
+  Eigen::array<int, 1> red_axis; red_axis[0] = 1;  // reduce over axis 1 of tbvec(), the batch axis
+  if(order == 1)  // mean of x over the batch
+    fx.tvec().device(*dev.edevice) = xs[0]->tbvec().sum(red_axis) / (float) xs[0]->d.bd;
+  else if (order == 2)  // mean of x^2 over the batch
+    fx.tvec().device(*dev.edevice) = xs[0]->tbvec().square().sum(red_axis) / (float) xs[0]->d.bd;
+  else  // mean of x^order over the batch
+    fx.tvec().device(*dev.edevice) = xs[0]->tbvec().pow(order).sum(red_axis) / (float) xs[0]->d.bd;
+}
+
+template<class MyDevice>
+void MomentBatches::backward_dev_impl(const MyDevice & dev,
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  DYNET_ARG_CHECK(i == 0, "Failed dimension check in MomentBatches::backward");
+  Eigen::array<int, 2> bcast = {1, (int)xs[0]->d.bd};  // repeat the gradient across every batch element
+  if (order == 1)  // d/dx mean(x) = 1/bd
+    dEdxi.tbvec().device(*dev.edevice) += dEdf.tbvec().broadcast(bcast) / (float) xs[0]->d.bd;
+  else if (order == 2)  // d/dx mean(x^2) = 2x/bd
+    dEdxi.tbvec().device(*dev.edevice) += (dEdf.tbvec().broadcast(bcast) * xs[0]->tbvec()) * ( 2.f / (float) xs[0]->d.bd);
+  else if (order == 3)  // d/dx mean(x^3) = 3x^2/bd
+    dEdxi.tbvec().device(*dev.edevice) += (dEdf.tbvec().broadcast(bcast) * xs[0]->tbvec().square()) * ( 3.f / (float) xs[0]->d.bd);
+  else  // general case: order * x^(order-1) / bd
+    dEdxi.tbvec().device(*dev.edevice) += (dEdf.tbvec().broadcast(bcast) * xs[0]->tbvec().pow(order - 1)) * ( (float) order / (float) xs[0]->d.bd);
+}
+DYNET_NODE_INST_DEV_IMPL(MomentBatches)
+
+// ************* StdElements *************
+
+#ifndef __CUDACC__
+
+string StdElements::as_string(const vector<string>& arg_names) const {
+  ostringstream out;  // renders "std_elems( expression=a )"
+  out << "std_elems( expression=" << arg_names[0] << " )";
+  return out.str();
+}
+
+Dim StdElements::dim_forward(const vector<Dim>& xs) const {  // takes exactly one input
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in StdElements")
+  return Dim({1}, xs[0].bd);  // one value per batch element
+}
+
+#endif
+
+template<class MyDevice>
+void StdElements::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  DYNET_ASSERT(xs.size() == 1, "Failed dimension check in StdElements::forward");
+  Eigen::array<ptrdiff_t, 1> red_axis = {0};  // reduce over the element axis of tbvec()
+  Eigen::array<ptrdiff_t, 2> bcast = {xs[0]->d.batch_size(), 1};  // repeat the per-batch mean over all elements
+  Eigen::array<ptrdiff_t, 2> newaxis = {1, xs[0]->d.bd};  // give the reduced mean a leading axis of length 1 again
+  float n = (float) xs[0]->d.batch_size();  // number of elements per batch
+  fx.tb<0>().device(*dev.edevice) = ((xs[0]->tbvec() - (xs[0]->tbvec().sum(red_axis).reshape(newaxis) / n).broadcast(bcast)).square().sum(red_axis) / n).sqrt();  // sqrt(mean((x - mean(x))^2)), dividing by n
+}
+
+template<class MyDevice>
+void StdElements::backward_dev_impl(const MyDevice & dev,
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  DYNET_ASSERT(i < 1, "Failed dimension check in StdElements::backward");
+  Eigen::array<ptrdiff_t, 2> bcast = {xs[0]->d.batch_size(), 1};  // repeat per-batch values over all elements
+  Eigen::array<ptrdiff_t, 2> newaxis = {1, xs[0]->d.bd};
+  Eigen::array<ptrdiff_t, 1> red_axis = {0};
+  float n = (float) xs[0]->d.batch_size();  // number of elements per batch
+  dEdxi.tbvec().device(*dev.edevice) +=  (2 / n) * (xs[0]->tbvec() - (xs[0]->tbvec().sum(red_axis).reshape(newaxis) / n).broadcast(bcast)) * (fx.tbvec().binaryExpr(dEdf.tbvec(), FSqrtBackward())).broadcast(bcast);  // (2/n)*(x - mean) times FSqrtBackward(fx, dEdf); presumably dEdf / (2*fx), confirm in functors.h
+
+}
+DYNET_NODE_INST_DEV_IMPL(StdElements)
+
+// ************* StdDimension *************
+
+#ifndef __CUDACC__
+
+string StdDimension::as_string(const vector<string>& arg_names) const {
+  ostringstream s;  // renders "std_dim(expression=a,d)"
+  s << "std_dim(expression=" << arg_names[0] << ',' << dimension <<')';  // was labelled "moment_dim(" and closed with '}' (copy-paste from MomentDimension)
+  return s.str();
+}
+
+Dim StdDimension::dim_forward(const vector<Dim>& xs) const {  // output Dim is the input Dim with `dimension` removed
+  DYNET_ASSERT(xs.size() == 1, "Failed input count check in StdDimension");
+  DYNET_ARG_CHECK(xs[0].nd <= 3, "StdDimension implemented up to tensors of order 3 (with minibatch) for now")
+  DYNET_ARG_CHECK(dimension < xs[0].nd, "dimension " << dimension << " is out of bounds of tensor of order " << xs[0].nd << " in StdDimension" )
+  Dim ret(xs[0]);  // copy of the input Dim
+  ret.delete_dim(dimension);
+  return ret;
+}
+
+#endif
+
+template<class MyDevice>
+void StdDimension::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  DYNET_ASSERT(xs.size() == 1, "Failed input count check in StdDimension");
+  Eigen::array<int, 1> red_axis = {(int)dimension};
+  Eigen::array<int, 4> morph = {(int)xs[0]->d[0],(int)xs[0]->d[1],(int)xs[0]->d[2],(int)xs[0]->d.bd}; morph[dimension] = 1;  // input shape with the reduced dimension set to 1
+  Eigen::array<int, 4> bcast = {1,1,1,1}; bcast[dimension] = xs[0]->d[dimension];  // repeat only along the reduced dimension
+  float n = (float) xs[0]->d[dimension];  // length of the reduced dimension
+  fx.tb<2>().device(*dev.edevice) = ((xs[0]->tb<3>() - (xs[0]->tb<3>().sum(red_axis).reshape(morph) / n).broadcast(bcast)).square().sum(red_axis) / n).sqrt();  // sqrt(mean((x - mean(x))^2)) along `dimension`
+}
+
+template<class MyDevice>
+void StdDimension::backward_dev_impl(const MyDevice & dev,
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  DYNET_ARG_CHECK(i == 0, "Failed dimension check in StdDimension::backward");
+  Eigen::array<int, 1> red_axis = {(int)dimension};
+  Eigen::array<int, 4> bcast = {1,1,1,1}; bcast[dimension] = xs[0]->d[dimension];  // repeat only along the reduced dimension
+  Eigen::array<int, 4> morph = {(int)xs[0]->d[0],(int)xs[0]->d[1],(int)xs[0]->d[2],(int)xs[0]->d.bd}; morph[dimension] = 1;  // input shape with the reduced dimension set to 1
+  float n = (float) xs[0]->d[dimension];  // length of the reduced dimension
+  dEdxi.tb<3>().device(*dev.edevice) +=  (2 / n) * (xs[0]->tb<3>() - (xs[0]->tb<3>().sum(red_axis).reshape(morph) / n).broadcast(bcast)) * (fx.tb<2>().binaryExpr(dEdf.tb<2>(), FSqrtBackward())).reshape(morph).broadcast(bcast);  // (2/n)*(x - mean) times FSqrtBackward(fx, dEdf); presumably dEdf / (2*fx), confirm in functors.h
+
+}
+DYNET_NODE_INST_DEV_IMPL(StdDimension)
+
+// ************* StdBatches *************
+
+#ifndef __CUDACC__
+
+string StdBatches::as_string(const vector<string>& arg_names) const {
+  ostringstream out;  // renders "std_batches( expression=a )"
+  out << "std_batches( expression=" << arg_names[0] << " )";
+  return out.str();
+}
+
+Dim StdBatches::dim_forward(const vector<Dim>& xs) const {  // takes exactly one input
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in StdBatches")
+ 
+  return xs[0].single_batch();  // the batch dimension is reduced away; the per-batch shape is kept
+}
+
+#endif
+
+template<class MyDevice>
+void StdBatches::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  DYNET_ASSERT(xs.size() == 1, "Failed dimension check in StdBatches::forward");
+  Eigen::array<ptrdiff_t, 1> red_axis = {1};  // reduce over axis 1 of tbvec(), the batch axis
+  Eigen::array<ptrdiff_t, 2> newaxis = {xs[0]->d.batch_size(), 1};  // give the reduced mean a trailing axis of length 1 again
+  Eigen::array<ptrdiff_t, 2> bcast = {1, xs[0]->d.bd};  // repeat the mean across every batch element
+  float n = (float)xs[0]->d.bd;  // number of batch elements
+  fx.tvec().device(*dev.edevice) = ((xs[0]->tbvec() - (xs[0]->tbvec().sum(red_axis).reshape(newaxis) / n).broadcast(bcast)).square().sum(red_axis) / n).sqrt();  // tvec() (as in MomentBatches::forward) covers every output element, whatever the input's rank; was t<1>()
+}
+
+template<class MyDevice>
+void StdBatches::backward_dev_impl(const MyDevice & dev,
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  DYNET_ASSERT(i < 1, "Failed dimension check in StdBatches::backward");
+  Eigen::array<ptrdiff_t, 1> red_axis = {1};  // the batch axis of tbvec()
+  Eigen::array<ptrdiff_t, 2> bcast = {1, xs[0]->d.bd};  // repeat per-element values across every batch element
+  Eigen::array<ptrdiff_t, 2> newaxis = {xs[0]->d.batch_size(), 1};
+  float n = (float)xs[0]->d.bd;  // number of batch elements
+  dEdxi.tbvec().device(*dev.edevice) +=  (2 / n) * (xs[0]->tbvec() - (xs[0]->tbvec().sum(red_axis).reshape(newaxis) / n).broadcast(bcast)) * (fx.tbvec().binaryExpr(dEdf.tbvec(), FSqrtBackward())).broadcast(bcast);  // (2/n)*(x - mean) times FSqrtBackward(fx, dEdf); presumably dEdf / (2*fx), confirm in functors.h
+
+}
+DYNET_NODE_INST_DEV_IMPL(StdBatches)
+
+}
diff --git a/dynet/nodes-normalization.cc b/dynet/nodes-normalization.cc
new file mode 100644
index 000000000..d4faacb9d
--- /dev/null
+++ b/dynet/nodes-normalization.cc
@@ -0,0 +1,54 @@
+#include "dynet/nodes.h"
+
+#include "dynet/nodes-macros.h"
+
+using namespace std;
+
+namespace dynet {
+
+// ************* WeightNormalization *************
+
+#ifndef __CUDACC__
+
+// Human-readable label for this node: "weight_norm(<first arg>, <second arg>)".
+string WeightNormalization::as_string(const vector<string>& arg_names) const {
+  const string label = "weight_norm(" + arg_names[0] + ", " + arg_names[1] + ')';
+  return label;
+}
+
+Dim WeightNormalization::dim_forward(const vector<Dim>& xs) const {
+  DYNET_ARG_CHECK(xs.size() == 2, "Failed input count check in WeightNormalization");  // two inputs; the second is the gain
+  DYNET_ARG_CHECK(xs[1].size() == 1," Size of gain parameter in WeightNormalization should be 1, received " << xs[1].size());
+  return xs[0];  // the output keeps the shape of the first input
+}
+
+#endif
+
+template<class MyDevice>
+void WeightNormalization::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {  // fx = xs[0] / sqrt(sum(xs[0]^2)) * scalar(xs[1])
+  DYNET_ASSERT(xs.size() == 2, "Failed dimension check in WeightNormalization::forward");
+  Eigen::array<ptrdiff_t, 1> red_axis = {0};  // the only axis of tvec(), so the sum runs over every element
+  Eigen::array<ptrdiff_t, 1> bcast = {xs[0]->d.size()};  // expand the norm back to one entry per element
+  Eigen::array<ptrdiff_t, 1> morph = {1};  // view the reduced norm as a length-1 vector so it can be broadcast
+  fx.tvec().device(*dev.edevice) = (xs[0]->tvec() / xs[0]->tvec().square().sum(red_axis).sqrt().reshape(morph).broadcast(bcast)) * as_scalar(*xs[1]);
+}
+
+template<class MyDevice>
+void WeightNormalization::backward_dev_impl(const MyDevice & dev,
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  Eigen::array<ptrdiff_t, 1> red_axis = {0};  // the only axis of tvec(), so sums run over every element
+  Eigen::array<ptrdiff_t, 1> bcast = {xs[0]->d.size()};  // expand a reduced value to one entry per element
+  Eigen::array<ptrdiff_t, 1> morph = {1};  // view a reduced value as a length-1 vector so it can be broadcast
+  if (i==0){  // gradient for xs[0]: dEdf * g / |x|  -  fx * (dEdf . x) / |x|^2, with g = scalar(xs[1])
+    dEdxi.tvec().device(*dev.edevice) += (dEdf.tvec() / xs[0]->tvec().square().sum(red_axis).sqrt().reshape(morph).broadcast(bcast)) * as_scalar(*xs[1]) - fx.tvec() * (((dEdf.tvec() * xs[0]->tvec()).sum(red_axis)) / xs[0]->tvec().square().sum(red_axis)).reshape(morph).broadcast(bcast);
+  }else{  // gradient for the scalar xs[1]: (dEdf . x) / |x|
+    dEdxi.t<0>().device(*dev.edevice) += ((dEdf.tvec() * xs[0]->tvec()).sum(red_axis)) /  xs[0]->tvec().square().sum(red_axis).sqrt();
+  }
+}
+DYNET_NODE_INST_DEV_IMPL(WeightNormalization)
+
+}
diff --git a/dynet/nodes-random.cc b/dynet/nodes-random.cc
new file mode 100644
index 000000000..9e221112b
--- /dev/null
+++ b/dynet/nodes-random.cc
@@ -0,0 +1,184 @@
+#include "dynet/nodes.h"
+
+#include "dynet/nodes-macros.h"
+
+using namespace std;
+
+namespace dynet {
+
+// ************* GaussianNoise *************
+
+#ifndef __CUDACC__
+
+string GaussianNoise::as_string(const vector<string>& arg_names) const {
+  ostringstream label;  // rendered as "<arg> + N(0,<stddev>)"
+  label << arg_names[0] << " + N(0," << stddev << ')';
+  return label.str();
+}
+
+Dim GaussianNoise::dim_forward(const vector<Dim>& xs) const {
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in GaussianNoise")
+  return xs[0];  // the output has the same shape as the single input
+}
+
+size_t GaussianNoise::aux_storage_size() const {
+  return sizeof(float) * dim.size();  // one float per element of dim; presumably the buffer forward reads through aux_mem
+}
+
+#endif
+
+template<class MyDevice>
+void GaussianNoise::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  Tensor noise(dim, (float*)aux_mem, fx.device, DeviceMempool::FXS);  // scratch tensor laid over aux_mem
+  TensorTools::randomize_normal(noise, 0, stddev);
+  fx.tvec().device(*dev.edevice) = xs[0]->tvec() + noise.tvec();  // fx = x + noise
+}
+
+// Forward computes x + noise, so the derivative with respect to x is the
+// identity: the incoming gradient dEdf is accumulated into dEdxi unchanged.
+// xs, fx and i are not consulted.
+template<class MyDevice>
+void GaussianNoise::backward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs,
+                                      const Tensor& fx, const Tensor& dEdf,
+                                      unsigned i, Tensor& dEdxi) const {
+  dEdxi.tvec().device(*dev.edevice) += dEdf.tvec();
+}
+DYNET_NODE_INST_DEV_IMPL(GaussianNoise)
+
+// ************* RandomNormal *************
+
+#ifndef __CUDACC__
+
+string RandomNormal::as_string(const vector<string>& arg_names) const {
+  ostringstream label;  // rendered as "random_normal(<dim>)"; arg_names is not used
+  label << "random_normal(" << dim << ')';
+  return label.str();
+}
+
+Dim RandomNormal::dim_forward(const vector<Dim>& xs) const {
+  return dim;  // xs is not consulted; the node's stored dim is returned as-is
+}
+
+#endif
+
+template<class MyDevice>
+void RandomNormal::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  DYNET_ASSERT(xs.size() == 0, "Failed dimension check in RandomNormal::forward");  // this node takes no inputs
+  TensorTools::randomize_normal(fx);  // fills fx in place, relying on randomize_normal's default parameters
+}
+
+// Forward asserts that this node has no inputs, so there is no argument to
+// differentiate with respect to. Reaching this function is reported through
+// DYNET_RUNTIME_ERR; none of the parameters are used.
+template<class MyDevice>
+void RandomNormal::backward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs,
+                                     const Tensor& fx, const Tensor& dEdf,
+                                     unsigned i, Tensor& dEdxi) const {
+  DYNET_RUNTIME_ERR("Called backward() on an arity 0 node");
+}
+DYNET_NODE_INST_DEV_IMPL(RandomNormal)
+
+// ************* RandomBernoulli *************
+
+#ifndef __CUDACC__
+
+string RandomBernoulli::as_string(const vector<string>& arg_names) const {
+  ostringstream label;  // rendered as "random_bernoulli(<dim>, <p>)"; arg_names is not used
+  label << "random_bernoulli(" << dim << ", " << p << ')';
+  return label.str();
+}
+
+Dim RandomBernoulli::dim_forward(const vector<Dim>& xs) const {
+  return dim;  // xs is not consulted; the node's stored dim is returned as-is
+}
+
+#endif
+
+template<class MyDevice>
+void RandomBernoulli::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  DYNET_ASSERT(xs.size() == 0, "Failed dimension check in RandomBernoulli::forward");  // this node takes no inputs
+  TensorTools::randomize_bernoulli(fx, p, scale);  // fills fx in place; p and scale are passed through unchanged
+}
+
+// Forward asserts that this node has no inputs, so there is no argument to
+// differentiate with respect to. Reaching this function is reported through
+// DYNET_RUNTIME_ERR; none of the parameters are used.
+template<class MyDevice>
+void RandomBernoulli::backward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs,
+                                        const Tensor& fx, const Tensor& dEdf,
+                                        unsigned i, Tensor& dEdxi) const {
+  DYNET_RUNTIME_ERR("Called backward() on an arity 0 node");
+}
+DYNET_NODE_INST_DEV_IMPL(RandomBernoulli)
+
+// ************* RandomUniform *************
+
+#ifndef __CUDACC__
+
+string RandomUniform::as_string(const vector<string>& arg_names) const {
+  ostringstream label;  // rendered as "random_uniform(<dim>, <left>, <right>)"; arg_names is not used
+  label << "random_uniform(" << dim << ", " << left << ", " << right << ')';
+  return label.str();
+}
+
+Dim RandomUniform::dim_forward(const vector<Dim>& xs) const {
+  return dim;  // xs is not consulted; the node's stored dim is returned as-is
+}
+
+#endif
+
+template<class MyDevice>
+void RandomUniform::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  DYNET_ASSERT(xs.size() == 0, "Failed dimension check in RandomUniform::forward");  // this node takes no inputs
+  TensorTools::randomize_uniform(fx, left, right);  // fills fx in place; left and right are passed through as the bounds
+}
+
+// Forward asserts that this node has no inputs, so there is no argument to
+// differentiate with respect to. Reaching this function is reported through
+// DYNET_RUNTIME_ERR; none of the parameters are used.
+template<class MyDevice>
+void RandomUniform::backward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs,
+                                      const Tensor& fx, const Tensor& dEdf,
+                                      unsigned i, Tensor& dEdxi) const {
+  DYNET_RUNTIME_ERR("Called backward() on an arity 0 node");
+}
+DYNET_NODE_INST_DEV_IMPL(RandomUniform)
+
+// ************* RandomGumbel *************
+
+#ifndef __CUDACC__
+
+string RandomGumbel::as_string(const vector<string>& arg_names) const {
+  ostringstream label;  // rendered as "random_gumbel(<dim>, <mu>, <beta>)"; arg_names is not used
+  label << "random_gumbel(" << dim << ", " << mu << ", " << beta << ')';
+  return label.str();
+}
+
+Dim RandomGumbel::dim_forward(const vector<Dim>& xs) const {
+  return dim;  // xs is not consulted; the node's stored dim is returned as-is
+}
+
+#endif
+
+template<class MyDevice>
+void RandomGumbel::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {  // fx = -log(-log(u)), u filled by randomize_uniform(fx, 0, 1)
+  DYNET_ASSERT(xs.size() == 0, "Failed dimension check in RandomGumbel::forward");
+  DYNET_ARG_CHECK(mu == 0.0 && beta == 1.0, "RandomGumbel only supports Gumbel(0,1) at the moment (pull requests welcome)");
+  TensorTools::randomize_uniform(fx, 0, 1);  // fx first holds the uniform draws, then is overwritten in place below
+  float eps = 1e-20;  // floor applied before each log so neither log receives a value below eps
+  fx.tvec().device(*dev.edevice) = -(-fx.tvec().cwiseMax(eps).log()).cwiseMax(eps).log();
+}
+
+// Forward asserts that this node has no inputs, so there is no argument to
+// differentiate with respect to. Reaching this function is reported through
+// DYNET_RUNTIME_ERR; none of the parameters are used.
+template<class MyDevice>
+void RandomGumbel::backward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs,
+                                     const Tensor& fx, const Tensor& dEdf,
+                                     unsigned i, Tensor& dEdxi) const {
+  DYNET_RUNTIME_ERR("Called backward() on an arity 0 node");
+}
+DYNET_NODE_INST_DEV_IMPL(RandomGumbel)
+
+
+}
diff --git a/dynet/nodes-select.cc b/dynet/nodes-select.cc
new file mode 100644
index 000000000..f8760b4c7
--- /dev/null
+++ b/dynet/nodes-select.cc
@@ -0,0 +1,333 @@
+#include "dynet/nodes.h"
+
+#include "dynet/nodes-macros.h"
+
+using namespace std;
+
+namespace dynet {
+
+// ************* SelectRows *************
+
+#ifndef __CUDACC__
+
+string SelectRows::as_string(const vector<string>& arg_names) const {
+  ostringstream label;  // rendered as "select_rows(<arg>, {rsize=<number of row indices>})"
+  label << "select_rows(" << arg_names[0] << ", {rsize=" << prows->size() << "})";
+  return label.str();
+}
+
+Dim SelectRows::dim_forward(const vector<Dim>& xs) const {
+  DYNET_ARG_CHECK(xs.size() == 1, "Bad arguments in SelectRows: " << xs);
+  // Same shape as the input, except that dimension 0 becomes the number of row indices.
+  Dim out(xs[0]);
+  out.d[0] = prows->size();
+  return out;
+}
+
+#endif
+
+template<class MyDevice>
+void SelectRows::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed dimension check in SelectRows::forward");
+  auto& picked = *prows;  // picked[k] is the input row copied into output row k
+  for (unsigned out_row = 0; out_row < picked.size(); ++out_row) {
+    DYNET_ARG_CHECK(picked[out_row] < xs[0]->d.rows(),
+                            "Out-of-bounds index " << picked[out_row] << " in SelectRows over expression of dimensions " << xs[0]->d);
+    fx.t<4>().chip<0>(out_row).device(*dev.edevice) = xs[0]->t<4>().chip<0>(picked[out_row]);
+  }
+}
+
+// Scatters the output gradient back onto the selected rows: row k of dEdf is
+// added to row (*prows)[k] of dEdxi.
+template<class MyDevice>
+void SelectRows::backward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs,
+                                   const Tensor& fx, const Tensor& dEdf,
+                                   unsigned i, Tensor& dEdxi) const {
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed dimension check in SelectRows::backward");
+  auto& picked = *prows;
+  // The counter is deliberately not called i, because i is already a parameter.
+  for (unsigned out_row = 0; out_row < picked.size(); ++out_row)
+    dEdxi.t<4>().chip<0>(picked[out_row]).device(*dev.edevice) += dEdf.t<4>().chip<0>(out_row);
+}
+DYNET_NODE_INST_DEV_IMPL(SelectRows)
+
+// ************* SelectCols *************
+
+#ifndef __CUDACC__
+
+string SelectCols::as_string(const vector<string>& arg_names) const {
+  ostringstream label;  // rendered as "select_cols(<arg>, {csize=<number of column indices>})"
+  label << "select_cols(" << arg_names[0] << ", {csize=" << pcols->size() << "})";
+  return label.str();
+}
+
+Dim SelectCols::dim_forward(const vector<Dim>& xs) const {
+  DYNET_ARG_CHECK(xs.size() == 1 && xs[0].ndims() == 2, "Bad arguments in SelectCols: " << xs);
+  unsigned selected = pcols->size();  // one output column per column index
+  return Dim({xs[0].rows(), selected});  // only these two extents are handed to Dim
+}
+
+#endif
+
+template<class MyDevice>
+void SelectCols::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed dimension check in SelectCols::forward");
+  auto& picked = *pcols;  // picked[k] is the input column copied into output column k
+  for (unsigned out_col = 0; out_col < picked.size(); ++out_col) {
+    DYNET_ARG_CHECK(picked[out_col] < xs[0]->d.cols(),
+                            "Out-of-bounds index " << picked[out_col] << " in SelectCols over expression of dimensions " << xs[0]->d);
+    fx.t<2>().chip<1>(out_col).device(*dev.edevice) = xs[0]->t<2>().chip<1>(picked[out_col]);
+  }
+}
+
+// Scatters the output gradient back onto the selected columns: column k of
+// dEdf is added to column (*pcols)[k] of dEdxi.
+template<class MyDevice>
+void SelectCols::backward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs,
+                                   const Tensor& fx, const Tensor& dEdf,
+                                   unsigned i, Tensor& dEdxi) const {
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed dimension check in SelectCols::backward");
+  auto& picked = *pcols;
+  // The counter is deliberately not called i, because i is already a parameter.
+  for (unsigned out_col = 0; out_col < picked.size(); ++out_col)
+    dEdxi.t<2>().chip<1>(picked[out_col]).device(*dev.edevice) += dEdf.t<2>().chip<1>(out_col);
+}
+DYNET_NODE_INST_DEV_IMPL(SelectCols)
+
+// ************* PickElement *************
+
+#ifndef __CUDACC__
+
+// Label: "pick(<arg>,<index>, <dimension>)", where <index> is *pval or the bracketed list "[i0,i1,...]" from *pvals.
+string PickElement::as_string(const vector<string>& arg_names) const {
+  ostringstream out;
+  out << "pick(" << arg_names[0] << ',';
+  if(!pval) {
+    DYNET_ASSERT(pvals, "Have neither index nor index vector in PickElement");
+    out << '[';
+    for(size_t k = 0; k < pvals->size(); ++k) {
+      if(k > 0) out << ',';
+      out << (*pvals)[k];
+    }
+    out << "]";
+  } else {
+    out << *pval;
+  }
+  out << ", " << dimension << ")";
+  return out.str();
+}
+
+Dim PickElement::dim_forward(const vector<Dim>& xs) const {
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in PickElement");
+  DYNET_ARG_CHECK(dimension < xs[0].nd,
+                          "Tried to PickElement on dimension " << dimension << " bigger than input " << xs[0]);
+  DYNET_ARG_CHECK(xs[0].nd < 4,
+                          "PickElement not currently supported for tensors of 4 or more dimensions.");
+  // Start from the input shape; an index vector sets the batch size to its length.
+  Dim out(xs[0]);
+  if (pvals){
+    DYNET_ARG_CHECK(xs[0].bd == 1 || xs[0].bd == pvals->size(),
+                          "Number of elements in the passed-in index vector (" <<  pvals->size() << ")"
+                            " did not match number of elements in mini-batch elements in expression (of dimension " << xs[0].bd << ") in PickElement");
+    out.bd = pvals->size();
+  }
+  // delete_dim(dimension): per its name, the picked axis is dropped from the result.
+  out.delete_dim(dimension);
+  return out;
+}
+
+#endif
+
+// Forward pass: picks index *pval (or, for output batch element b, index
+// (*pvals)[b]) along axis `dimension` of xs[0] and writes the slice to fx.
+template<class MyDevice>
+void PickElement::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  if(pval) {  // one index, applied to the whole (batched) input at once
+    DYNET_ARG_CHECK(*pval < xs[0]->d[dimension], 
+                            "PickElement::forward_impl requested element " << *pval << " from a dimension of length " << xs[0]->d[dimension]);
+    // TODO: This limit of up to 4 is somewhat arbitrary. We need to decide how to handle
+    //       things with "maximum tensor size".
+    fx.tb<3>().device(*dev.edevice) = xs[0]->tb<4>().chip(*pval, dimension); 
+  } else {  // one index per output batch element
+    DYNET_ASSERT(pvals != nullptr, "Neither single nor vector of elements available in PickElement::forward");
+    DYNET_ARG_CHECK(pvals->size() == fx.d.batch_elems(),
+                            "In PickElement::forward, number of elements in the passed-in index vector (" <<  pvals->size() << ")"
+                            " did not match number of elements in mini-batch elements in expression (of dimension" << fx.d << ")");
+    for(unsigned b = 0; b < pvals->size(); ++b) {
+      DYNET_ARG_CHECK((*pvals)[b] < xs[0]->d[dimension], 
+                              "PickElement::forward_impl requested element " << (*pvals)[b] << " from a dimension of length " << xs[0]->d[dimension]);
+      if(xs[0]->d.bd == 1){  // input batch size 1: every output batch element reads the same input tensor
+        fx.tb<2>().chip<2>(b).device(*dev.edevice) = xs[0]->t<3>().chip((*pvals)[b], dimension); 
+      }else{  // otherwise output batch element b reads input batch element b
+        fx.tb<2>().chip<2>(b).device(*dev.edevice) = xs[0]->tb<3>().chip<3>(b).chip((*pvals)[b], dimension); 
+      }
+    }
+  }
+}
+
+// Backward pass: the derivative is 1 at the picked element and 0 elsewhere,
+// so dEdf is accumulated into the picked slice of dEdxi along `dimension`.
+// With an index vector, batch element b of dEdf goes to index (*pvals)[b];
+// when the input has batch size 1 they all accumulate into that one tensor.
+template<class MyDevice>
+void PickElement::backward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs,
+                                    const Tensor& fx, const Tensor& dEdf,
+                                    unsigned i, Tensor& dEdxi) const {
+  DYNET_ARG_CHECK(i == 0, "Failed dimension check in PickElement::backward");
+  if(pval) {
+    dEdxi.tb<3>().chip(*pval, dimension).device(*dev.edevice) += dEdf.tb<2>();
+  } else {
+    DYNET_ASSERT(pvals, "Neither single nor vector of elements available in PickElement::backward");
+    for(unsigned b = 0; b < pvals->size(); ++b){
+      if(xs[0]->d.bd == 1){
+        dEdxi.t<3>().chip((*pvals)[b], dimension).device(*dev.edevice) += dEdf.tb<2>().chip<2>(b);
+      }else{
+        dEdxi.tb<3>().chip<3>(b).chip((*pvals)[b], dimension).device(*dev.edevice) += dEdf.tb<2>().chip<2>(b);
+      }
+    }
+  }
+}
+DYNET_NODE_INST_DEV_IMPL(PickElement)
+
+// ************* PickRange *************
+
+#ifndef __CUDACC__
+
+// Label for the slice start:end of the input along `dim`, rendered as
+// "slice(<arg>,<start>:<end>, dim=<dim>)".
+string PickRange::as_string(const vector<string>& arg_names) const {
+  ostringstream label;
+  label << "slice(" << arg_names[0] << ',' << start << ':' << end << ", dim=" << dim << ')';
+  return label.str();
+}
+
+Dim PickRange::dim_forward(const vector<Dim>& xs) const {  // requires a non-empty range (start < end) that ends within axis `dim` of the single input
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in PickRange");
+  DYNET_ARG_CHECK(dim < xs[0].nd && start < end && xs[0][dim] >= end,
+                          "Bad input dimensions or range in PickRange: " << xs << " range(" << start << ", " << end << ") with dim=" << dim);
+  Dim ret = xs[0]; ret.d[dim] = end-start;  // same shape as the input, with axis `dim` shrunk to the range length
+  return ret;
+}
+
+int PickRange::autobatch_sig(const ComputationGraph & cg, SigMap &sm) const {  // signature = node type + input shape + range + sliced axis
+  Sig s(nt::pickrange);
+  s.add_dim(cg.nodes[args[0]]->dim);  // shape of the input node
+  s.add_node(start);
+  s.add_node(end);
+  s.add_node(dim);  // the sliced axis is part of the identity too: equal shape and range along different axes must not share a signature
+  return sm.get_idx(s);
+}
+
+#endif
+
+// Forward pass: y = x[start:end] along axis `dim`.
+// Copies the slice from index start (inclusive) to index end (exclusive) of
+// xs[0] into fx; every other axis, including the batch axis, starts at offset 0.
+template<class MyDevice>
+void PickRange::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  Eigen::DSizes<ptrdiff_t, 5> indices(0,0,0,0,0);  // slice origin: zero everywhere ...
+  indices[dim] = start;  // ... except along the sliced axis
+  Eigen::DSizes<ptrdiff_t, 5> sizes(static_cast<ptrdiff_t>(fx.d[0]), 
+                                    static_cast<ptrdiff_t>(fx.d[1]),
+                                    static_cast<ptrdiff_t>(fx.d[2]),
+                                    static_cast<ptrdiff_t>(fx.d[3]),
+                                    static_cast<ptrdiff_t>(fx.d.bd));  // slice extent: the output's own extents plus its batch size
+  sizes[dim] = end-start;  // extent along the sliced axis
+  fx.tb<4>().device(*dev.edevice) = xs[0]->tb<4>().slice(indices, sizes);
+}
+
+// Backward pass: the derivative is 0 outside the slice range, so dEdf is added only to the sliced region of dEdxi.
+template<class MyDevice>
+void PickRange::backward_dev_impl(const MyDevice & dev,
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  Eigen::DSizes<ptrdiff_t, 5> indices(0,0,0,0,0);  // slice origin: zero everywhere ...
+  indices[dim] = start;  // ... except along the sliced axis
+  Eigen::DSizes<ptrdiff_t, 5> sizes(static_cast<ptrdiff_t>(fx.d[0]), 
+                                    static_cast<ptrdiff_t>(fx.d[1]),
+                                    static_cast<ptrdiff_t>(fx.d[2]),
+                                    static_cast<ptrdiff_t>(fx.d[3]),
+                                    static_cast<ptrdiff_t>(fx.d.bd));  // slice extent: the output's own extents plus its batch size
+  sizes[dim] = end-start;  // extent along the sliced axis
+  dEdxi.tb<4>().slice(indices, sizes).device(*dev.edevice) += dEdf.tb<4>();
+}
+DYNET_NODE_INST_DEV_IMPL(PickRange)
+
+// ************* PickBatchElements *************
+
+#ifndef __CUDACC__
+
+// Label: "pick_batch_elems(<arg>,<index>)", where <index> is *pval or the bracketed list "[i0,i1,...]" from *pvals.
+string PickBatchElements::as_string(const vector<string>& arg_names) const {
+  ostringstream out;
+  out << "pick_batch_elems(" << arg_names[0] << ',';
+  if (!pval) {
+    DYNET_ASSERT(pvals, "Have neither index nor index vector in PickBatchElements");
+    out << '[';
+    for (size_t k = 0; k < pvals->size(); ++k) {
+      if (k > 0) out << ',';
+      out << (*pvals)[k];
+    }
+    out << "]";
+  } else {
+    out << *pval;
+  }
+  out << ")";
+  return out.str();
+}
+
+Dim PickBatchElements::dim_forward(const vector<Dim>& xs) const {
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in PickBatchElements")
+  DYNET_ARG_CHECK(xs[0].nd < 4, "PickBatchElements not currently supported for tensors of 4 or more dimensions.");
+  // The per-element dimensions are copied from the input; only the batch size depends on the selection.
+  Dim ret(xs[0]);
+  if (pval) {
+    ret.bd = 1;  // a single index selects exactly one batch element
+  } else {
+    DYNET_ASSERT(pvals, "Have neither index nor index vector in PickBatchElements");
+    ret.bd = pvals->size();  // one output batch element per requested index
+  }
+  return ret;
+}
+
+#endif
+
+// Forward pass: copies the selected batch element(s) of xs[0] into fx; every index is range-checked against the input batch size first.
+template<class MyDevice>
+void PickBatchElements::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  if (pval) {
+    DYNET_ARG_CHECK(*pval < xs[0]->d.bd, "PickBatchElements::forward_impl requested element " << *pval << " from a batch size of " << xs[0]->d.bd);
+    fx.tvec().device(*dev.edevice) = xs[0]->tbvec().chip<1>(*pval);
+  } else {
+    DYNET_ASSERT(pvals != nullptr, "Neither single nor vector of elements available in PickBatchElements::forward");
+    DYNET_ARG_CHECK(pvals->size() == fx.d.batch_elems(), "In PickBatchElements::forward, number of elements in the passed-in index vector (" << pvals->size() << ") "
+                            "did not match number of elements in mini-batch elements in expression (of dimension" << fx.d << ")");
+    for (unsigned b = 0; b < pvals->size(); ++b) {
+      DYNET_ARG_CHECK((*pvals)[b] < xs[0]->d.bd, "PickBatchElements::forward_impl requested element " << (*pvals)[b] << " from a batch size of " << xs[0]->d.bd);
+      fx.tbvec().chip<1>(b).device(*dev.edevice) = xs[0]->tbvec().chip<1>((*pvals)[b]);
+    }
+  }
+}
+
+template<class MyDevice>
+void PickBatchElements::backward_dev_impl(const MyDevice & dev,
+                                  const vector<const Tensor*>& xs,
+                                  const Tensor& fx,
+                                  const Tensor& dEdf,
+                                  unsigned i,
+                                  Tensor& dEdxi) const {
+  DYNET_ASSERT(i == 0, "Failed dimension check in PickBatchElements::backward");  // only argument 0 exists
+  if (pval) {  // single index: the whole of dEdf is added to that one batch element of dEdxi
+    dEdxi.tbvec().chip<1>(*pval).device(*dev.edevice) += dEdf.tvec();
+  } else {  // index vector: batch element b of dEdf is added to batch element (*pvals)[b] of dEdxi
+    DYNET_ASSERT(pvals, "Neither single nor vector of elements available in PickBatchElements::backward");
+    for (unsigned b = 0; b < pvals->size(); ++b)
+      dEdxi.tbvec().chip<1>((*pvals)[b]).device(*dev.edevice) += dEdf.tbvec().chip<1>(b);
+  }
+}
+DYNET_NODE_INST_DEV_IMPL(PickBatchElements)
+
+}
diff --git a/dynet/nodes-softmaxes.cc b/dynet/nodes-softmaxes.cc
new file mode 100644
index 000000000..e8b672757
--- /dev/null
+++ b/dynet/nodes-softmaxes.cc
@@ -0,0 +1,362 @@
+#include "dynet/nodes.h"
+
+#include "dynet/nodes-macros.h"
+#include "dynet/functors.h"
+
+using namespace std;
+
+namespace dynet {
+
+// ************* Softmax *************
+
+#ifndef __CUDACC__
+
+// Human-readable label for this node: "softmax(<arg>)".
+string Softmax::as_string(const vector<string>& arg_names) const {
+  const string label = "softmax(" + arg_names[0] + ')';
+  return label;
+}
+
+Dim Softmax::dim_forward(const vector<Dim>& xs) const {
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in Softmax");
+  DYNET_ARG_CHECK(xs[0].nd <= 2, "Bad input dimensions in Softmax, must be 2 or fewer: " << xs);  // at most a matrix per batch element
+  return xs[0];  // the output has the same shape as the input
+}
+
+int Softmax::autobatch_sig(const ComputationGraph & cg, SigMap &sm) const {
+  Sig s(nt::softmax);  // signature = node type + this node's dim; cg is not consulted
+  s.add_dim(dim);
+  return sm.get_idx(s);  // the index sm reports for this signature
+}
+
+std::vector<int> Softmax::autobatch_concat(const ComputationGraph & cg) const {
+  return vector<int>{1};  // a single entry with value 1; presumably one flag per argument — confirm against Node::autobatch_concat
+}
+
+size_t Softmax::aux_storage_size() const {
+  return 2 * dim.size() / dim.rows() * sizeof(float);  // two floats per (column, batch) entry: forward lays tensors z and m of that size over aux_mem
+}
+
+#endif
+
+template<class MyDevice>
+void Softmax::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {  // fx = exp(x - z), with z filled in by TensorTools::logsumexp_dev
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed dimension check in Softmax::forward");
+  Tensor z(Dim({xs[0]->d.cols()},fx.d.bd), (float*)aux_mem, fx.device, DeviceMempool::FXS);  // one entry per (column, batch element), at the start of aux_mem
+  Tensor m(Dim({xs[0]->d.cols()},fx.d.bd), (float*)aux_mem + z.d.size(), fx.device, DeviceMempool::FXS);  // same shape, placed directly after z
+  TensorTools::logsumexp_dev(dev, *xs[0], m, z);  // presumably z = per-column log-sum-exp with m as scratch — confirm in TensorTools
+  // TODO? Is this broadcast efficient on CPU?
+  Eigen::array<int, 3> bcasts = {(int)xs[0]->d.rows(), 1, 1};  // repeat z once per row
+  Eigen::array<int, 3> morph = {1, (int)z.d[0], (int)z.d.bd};  // view z as 1 x cols x batch
+  fx.tb<2>().device(*dev.edevice) = (xs[0]->tb<2>() - z.tvec().reshape(morph).broadcast(bcasts)).exp();
+}
+
+template<class MyDevice>
+void Softmax::backward_dev_impl(const MyDevice & dev,
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  Tensor z(Dim({fx.d.cols()},fx.d.bd), (float*)aux_mem, fx.device, DeviceMempool::FXS);  // scratch over aux_mem, one entry per (column, batch element)
+  // TODO? Is this broadcast efficient on CPU?
+  Eigen::array<int, 1> red_axis = {0};  // sum over the row axis
+  z.tb<1>().device(*dev.edevice) = (fx.tb<2>() * dEdf.tb<2>()).sum(red_axis);  // z = sum over rows of fx * dEdf
+  Eigen::array<int, 3> bcast = {(int)xs[0]->d.rows(), 1, 1};  // repeat z once per row
+  Eigen::array<int, 3> morph = {1, (int)z.d[0], (int)z.d.bd};  // view z as 1 x cols x batch
+  dEdxi.tb<2>().device(*dev.edevice) += (dEdf.tb<2>() - z.tvec().reshape(morph).broadcast(bcast)) * fx.tb<2>();  // dEdxi += (dEdf - z) * fx
+}
+DYNET_NODE_INST_DEV_IMPL(Softmax)
+
+// ************* LogSoftmax *************
+
+#ifndef __CUDACC__
+
+// Human-readable label for this node: "log_softmax(<arg>)".
+string LogSoftmax::as_string(const vector<string>& arg_names) const {
+  const string label = "log_softmax(" + arg_names[0] + ')';
+  return label;
+}
+
+Dim LogSoftmax::dim_forward(const vector<Dim>& xs) const {
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in LogSoftmax")
+  DYNET_ARG_CHECK(xs[0].nd <= 2, "Bad input dimensions in LogSoftmax, must be 2 or fewer: " << xs);  // at most a matrix per batch element
+  return xs[0];  // the output has the same shape as the input
+}
+
+size_t LogSoftmax::aux_storage_size() const {
+  return 2 * dim.size() / dim.rows() * sizeof(float);  // two floats per (column, batch) entry: forward lays tensors z and m of that size over aux_mem
+}
+
+#endif
+
+template<class MyDevice>
+void LogSoftmax::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {  // fx = x - z, with z filled in by TensorTools::logsumexp_dev
+  DYNET_ASSERT(xs.size() == 1, "Failed dimension check in LogSoftmax::forward");
+  Tensor z(Dim({xs[0]->d.cols()},fx.d.bd), (float*)aux_mem, fx.device, DeviceMempool::FXS);  // one entry per (column, batch element), at the start of aux_mem
+  Tensor m(Dim({xs[0]->d.cols()},fx.d.bd), (float*)aux_mem + z.d.size(), fx.device, DeviceMempool::FXS);  // same shape, placed directly after z
+  TensorTools::logsumexp_dev(dev, *xs[0], m, z);  // presumably z = per-column log-sum-exp with m as scratch — confirm in TensorTools
+  if(fx.d.size() == fx.d.rows()) {  // total size equals the row count: a single column in a single batch element
+#ifdef __CUDACC__
+    Eigen::array<int, 1> bcast;
+    bcast[0] = xs[0]->d[0];  // CUDA build: z is broadcast to the input length rather than read as a scalar
+    fx.t<1>().device(*dev.edevice) = xs[0]->t<1>() - z.t<1>().broadcast(bcast);
+#else
+    fx.t<1>().device(*dev.edevice) = xs[0]->t<1>() - as_scalar(z);  // non-CUDA build: z is read as a scalar via as_scalar
+#endif
+  } else {
+    // TODO? Is this broadcast efficient on CPU?
+    Eigen::array<int, 3> bcasts = {(int)xs[0]->d.rows(), 1, 1};  // repeat z once per row
+    Eigen::array<int, 3> morph = {1, (int)z.d[0], (int)z.d.bd};  // view z as 1 x cols x batch
+    fx.tb<2>().device(*dev.edevice) = xs[0]->tb<2>() - z.tvec().reshape(morph).broadcast(bcasts);
+  }
+}
+
+template<class MyDevice>
+void LogSoftmax::backward_dev_impl(const MyDevice & dev,
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  Tensor z(Dim({xs[0]->d.cols()},fx.d.bd), (float*)aux_mem, fx.device, DeviceMempool::FXS);  // scratch over aux_mem, one entry per (column, batch element)
+  // TODO? Is this broadcast efficient on CPU?
+  Eigen::array<int, 1> red_axis; red_axis[0] = 0;  // sum over the row axis
+  z.tb<1>().device(*dev.edevice) = dEdf.tb<2>().sum(red_axis);  // z = sum over rows of dEdf
+  Eigen::array<int, 3> bcast = {(int)fx.d.rows(), 1, 1};  // repeat z once per row
+  Eigen::array<int, 3> morph = {1, (int)z.d[0], (int)z.d.bd};  // view z as 1 x cols x batch
+  dEdxi.tb<2>().device(*dev.edevice) += fx.tb<2>().exp() * -z.tvec().reshape(morph).broadcast(bcast) + dEdf.tb<2>();  // dEdxi += dEdf - exp(fx) * z
+}
+DYNET_NODE_INST_DEV_IMPL(LogSoftmax)
+
+// ************* RestrictedLogSoftmax *************
+
+#ifndef __CUDACC__
+
+// Human-readable label for this node: "r_log_softmax(<arg>)".
+string RestrictedLogSoftmax::as_string(const vector<string>& arg_names) const {
+  const string label = "r_log_softmax(" + arg_names[0] + ')';
+  return label;
+}
+
+Dim RestrictedLogSoftmax::dim_forward(const vector<Dim>& xs) const {
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in RestrictedLogSoftmax")
+  DYNET_ARG_CHECK(LooksLikeVector(xs[0]), "Bad input dimensions in RestrictedLogSoftmax: " << xs);  // the input must pass LooksLikeVector
+  return xs[0];  // the output has the same shape as the input
+}
+
+// log(sum over i in denom of exp(x(i,0))), with the maximum subtracted before exponentiating; denom must be non-empty (denom[0] seeds the maximum).
+template <class T>
+EIGEN_STRONG_INLINE real logsumexp(const T& x, const vector<unsigned>& denom) {
+  real peak = x(denom[0],0);
+  for (auto row : denom) {
+    real candidate = x(row,0);
+    if (candidate > peak) peak = candidate;
+  }
+  real total = 0;
+  for (auto row : denom) total += expf(x(row,0) - peak);
+  return peak + logf(total);
+}
+
+#endif
+
+template<class MyDevice>
+void RestrictedLogSoftmax::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  DYNET_ASSERT(xs.size() == 1, "Failed dimension check in RestrictedLogSoftmax");
+#ifdef __CUDACC__
+  DYNET_RUNTIME_ERR("RestrictedLogSoftmax not yet implemented for CUDA (contributions welcome!)");
+#else
+  // Log-softmax normalised over the rows listed in denom only; every other row of fx is set to -infinity.
+  // TODO create auxiliary mask with -infty's and do usual LogSoftmax stuff
+  if(denom.size() == 0)
+    DYNET_INVALID_ARG("Number of elements in denominator of RestrictedLogSoftmax::forward must be non-zero");
+  auto x = **xs[0];
+  if(xs[0]->d.cols() != 1)
+    DYNET_RUNTIME_ERR("RestrictedLogSoftmax currently only supports single column expressions (contributions expanding support to multiple columns welcome!)");
+  const real logz = logsumexp(x, denom);
+  TensorTools::constant(fx, -numeric_limits<real>::infinity());
+  for (auto i : denom)
+    (*fx)(i,0) = x(i,0) - logz;
+  if (denom.size() == 1) (*fx)(denom.front(), 0) = 0;  // a lone candidate gets log-probability exactly 0
+#endif
+}
+
+// Backward pass over the rows listed in denom only: with z the sum of dEdf
+// over those rows, each listed row receives dEdf(row) - exp(fx(row)) * z.
+// Rows outside denom are left untouched. Not available in CUDA builds.
+template<class MyDevice>
+void RestrictedLogSoftmax::backward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs,
+                                             const Tensor& fx, const Tensor& dEdf,
+                                             unsigned i, Tensor& dEdxi) const {
+  DYNET_ASSERT(i == 0, "Failed dimension check in RestrictedLogSoftmax");
+#ifdef __CUDACC__
+  DYNET_RUNTIME_ERR("RestrictedLogSoftmax not yet implemented for CUDA (contributions welcome!)");
+#else
+  float grad_sum = 0;
+  for (auto row : denom)
+    grad_sum += (*dEdf)(row, 0);
+  for (auto row : denom)
+    (*dEdxi)(row, 0) += (*dEdf)(row, 0) - expf((*fx)(row, 0)) * grad_sum;
+#endif
+}
+DYNET_NODE_INST_DEV_IMPL(RestrictedLogSoftmax)
+
+// ************* Sparsemax *************
+
+#define MAX_SPARSEMAX_LOSS_ROWS 65536  // row limit for SparsemaxLoss: sizes its aux storage and is enforced in its forward pass
+
+#ifndef __CUDACC__
+
+// Human-readable label for this node: "sparsemax(<arg>)".
+string Sparsemax::as_string(const vector<string>& arg_names) const {
+  const string label = "sparsemax(" + arg_names[0] + ")";
+  return label;
+}
+
+Dim Sparsemax::dim_forward(const vector<Dim>& xs) const {
+  DYNET_ARG_CHECK(xs.size() == 1 && LooksLikeVector(xs[0]), "Bad input dimensions in Sparsemax: " << xs);  // exactly one input, and it must pass LooksLikeVector
+  return xs[0];  // the output has the same shape as the input
+}
+
+size_t Sparsemax::aux_storage_size() const {
+  return (dim.size() + 1) * sizeof(float);  // one slot per element plus one: forward stores a count in slot 0 followed by up to `rows` indices
+}
+
+#endif
+
+template<class MyDevice>
+void Sparsemax::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {  // fx = max(x - tau, 0); single-column, non-CUDA only
+  if (xs[0]->d.cols() == 1) {
+#ifdef __CUDACC__
+    DYNET_RUNTIME_ERR("Sparsemax not implemented for CUDA");
+#else
+    const unsigned rows = xs[0]->d.rows();
+    float *zs = static_cast<float*>(aux_mem);  // aux_mem first serves as a float buffer for the sorted inputs
+    std::partial_sort_copy(xs[0]->v, xs[0]->v+rows, zs, zs + rows, std::greater<float>());  // zs = all inputs in descending order
+    float sum = 0, maxsum = 0;
+    unsigned k = 0;
+    for (k = 0; k < rows; ++k) {  // stop at the first k with 1 + (k+1)*zs[k] <= running sum
+      sum += zs[k];
+      float t = 1 + (k + 1) * zs[k];
+      if (t <= sum) break;
+      maxsum = sum;  // sum of the entries that passed the test so far
+    }
+    float tau = (maxsum - 1) / k;  // threshold subtracted from every input below
+    auto y = *fx;  // read only after fx is assigned on the next line; presumably a view onto fx's storage — confirm Tensor::operator*
+    fx.tvec() = (xs[0]->tvec() - tau).cwiseMax(0.f);
+    int c = 1;
+    int *cc = static_cast<int*>(aux_mem);  // aux_mem is now reused as ints, overwriting the sorted values
+    for (unsigned i = 0; i < rows; ++i)
+      if (y(i,0) > 0.f) cc[c++] = i;  // cc[1..] = indices of the strictly positive outputs
+    cc[0] = c - 1;  // cc[0] = how many indices were stored; read back by backward
+#endif
+  } else {
+    DYNET_RUNTIME_ERR("Sparsemax not yet implemented for multiple columns");
+  }
+}
+
+// Backward pass. aux_mem is read as ints: entry 0 is the number of support
+// indices and the indices themselves follow. Each support row receives its
+// own dEdf entry minus the mean of dEdf over the support; other rows get nothing.
+template<class MyDevice>
+void Sparsemax::backward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs,
+                                  const Tensor& fx, const Tensor& dEdf,
+                                  unsigned i, Tensor& dEdxi) const {
+#ifdef __CUDACC__
+  DYNET_RUNTIME_ERR("Sparsemax not implemented for CUDA");
+#else
+  const int ssize = static_cast<int*>(aux_mem)[0];
+  int *support = static_cast<int*>(aux_mem) + 1;
+  float dhat = 0;
+  auto& grad = *dEdf;
+  for (int k = 0; k < ssize; ++k)
+    dhat += grad(support[k], 0);
+  dhat /= ssize;
+  for (int k = 0; k < ssize; ++k)
+    (*dEdxi)(support[k], 0) += grad(support[k], 0) - dhat;
+#endif
+}
+DYNET_NODE_INST_DEV_IMPL(Sparsemax)
+
+// ************* SparsemaxLoss *************
+
+#ifndef __CUDACC__
+
+// Label "sparsemax_loss(<arg>, q)"; the distinct name keeps it apart from the plain Sparsemax node's "sparsemax(<arg>)" label.
+string SparsemaxLoss::as_string(const vector<string>& arg_names) const {
+  const string label = "sparsemax_loss(" + arg_names[0] + ", q)";
+  return label;
+}
+
+Dim SparsemaxLoss::dim_forward(const vector<Dim>& xs) const {
+  DYNET_ARG_CHECK(xs.size() == 1 && LooksLikeVector(xs[0]), "Bad input dimensions in SparsemaxLoss: " << xs);  // exactly one input, and it must pass LooksLikeVector
+  return Dim({1});  // the result is a single value
+}
+
+size_t SparsemaxLoss::aux_storage_size() const {
+  // The buffer holds the sparsemax of the input, one float per row. A fixed upper
+  const unsigned rows = MAX_SPARSEMAX_LOSS_ROWS;  // bound is used here; the code's own note says this should be xs[0]->d.rows()
+  return rows * sizeof(float);
+}
+
+#endif
+
+template<class MyDevice>
+void SparsemaxLoss::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {  // single-column, non-CUDA only
+  if (xs[0]->d.cols() == 1) {
+#ifdef __CUDACC__
+    DYNET_RUNTIME_ERR("SparsemaxLoss not implemented for CUDA");
+#else
+    const int rows = xs[0]->d.rows();
+    if (rows > MAX_SPARSEMAX_LOSS_ROWS)  // aux storage is sized for at most this many rows
+      DYNET_RUNTIME_ERR("MAX_SPARSEMAX_LOSS_ROWS is not sufficient. Recompile with larger value.");
+    const unsigned qsupport_size = pq->size();  // number of target indices held in pq
+    const float qprop = 1.f / qsupport_size;  // weight given to each target index
+    // Threshold search: sort descending, then stop at the first k with 1 + (k+1)*zs[k] <= running sum.
+    float *zs = static_cast<float*>(aux_mem);
+    std::partial_sort_copy(xs[0]->v, xs[0]->v+rows, zs, zs + rows, std::greater<float>());
+    float sum = 0, maxsum = 0;
+    int k = 0;
+    for (k = 0; k < rows; ++k) {
+      sum += zs[k];
+      float t = 1 + (k + 1) * zs[k];
+      if (t <= sum) break;
+      maxsum = sum;  // sum of the entries that passed the test so far
+    }
+    float tau = (maxsum - 1) / k;  // threshold subtracted from every input below
+    Tensor tsm(xs[0]->d, (float*)aux_mem, xs[0]->device, DeviceMempool::FXS);  // laid over aux_mem, replacing the sorted values; backward reads it again
+    tsm.t<1>() = (xs[0]->t<1>() - tau).cwiseMax(0.f);  // tsm = max(x - tau, 0)
+    fx.t<0>() = ( (tsm.t<1>() != 0.f).cast<float>() * (xs[0]->t<1>().square() - (tau * tau)) ).sum();  // sum of x^2 - tau^2 over rows where tsm is non-zero
+    fx.t<0>() = ( fx.t<0>() + qprop * qprop * qsupport_size ) / 2.f;
+    for (unsigned i = 0; i < qsupport_size; ++i)
+      fx.t<0>() = fx.t<0>() - xs[0]->t<1>().chip<0>((*pq)[i]) * qprop;  // subtract qprop * x at each target index
+    fx.t<0>() = fx.t<0>().cwiseMax(0.f);  // the result is clamped at zero
+#endif
+  } else {
+    DYNET_RUNTIME_ERR("SparsemaxLoss not yet implemented for multiple columns");
+  }
+}
+
+// Backward pass. aux_mem is viewed as a tensor shaped like the input (the
+// buffer forward filled with the sparsemax). dEdxi receives that tensor times
+// dEdf.v[0], minus dEdf.v[0] / pq->size() at each index listed in pq.
+template<class MyDevice>
+void SparsemaxLoss::backward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs,
+                                      const Tensor& fx, const Tensor& dEdf,
+                                      unsigned i, Tensor& dEdxi) const {
+#ifdef __CUDACC__
+  DYNET_RUNTIME_ERR("SparsemaxLoss not implemented for CUDA");
+#else
+  const float upstream = dEdf.v[0];
+  float* stored = static_cast<float*>(aux_mem);
+  float per_target = upstream / pq->size();
+  Tensor tsm(xs[0]->d, stored, xs[0]->device, DeviceMempool::FXS);
+  auto sm = *tsm;  // sparsemax(z)
+  *dEdxi += sm * upstream;
+  for (unsigned k = 0; k < pq->size(); ++k)
+    (*dEdxi)((*pq)[k], 0) -= per_target;
+#endif
+}
+DYNET_NODE_INST_DEV_IMPL(SparsemaxLoss)
+
+}
diff --git a/dynet/nodes-trig.cc b/dynet/nodes-trig.cc
new file mode 100644
index 000000000..c5965879b
--- /dev/null
+++ b/dynet/nodes-trig.cc
@@ -0,0 +1,43 @@
+#include "dynet/nodes.h"
+
+#include "dynet/nodes-macros.h"
+#include "dynet/simd-functors.h"
+
+using namespace std;
+
+namespace dynet {
+
+// ************* Tanh *************
+
+#ifndef __CUDACC__
+
+// Renders this node as "tanh(<first argument name>)".
+string Tanh::as_string(const vector<string>& arg_names) const {
+  const string& operand = arg_names[0];
+  return "tanh(" + operand + ')';
+}
+
+Dim Tanh::dim_forward(const vector<Dim>& xs) const {  // output Dim for the given input Dims
+  DYNET_ARG_CHECK(xs.size() == 1, "Failed input count check in Tanh")  // Tanh takes exactly one input
+  return xs[0];  // output has the same Dim as the input
+}
+
+#endif
+
+template<class MyDevice>  // Forward pass: writes tanh of xs[0] into fx
+void Tanh::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
+  fx.tvec().device(*dev.edevice) = xs[0]->tvec().tanh();  // assigns (not accumulates); evaluated on dev.edevice
+}
+
+template<class MyDevice>  // Backward pass: accumulates the gradient w.r.t. the input into dEdxi
+void Tanh::backward_dev_impl(const MyDevice & dev,
+                             const vector<const Tensor*>& xs,
+                             const Tensor& fx,
+                             const Tensor& dEdf,
+                             unsigned i,
+                             Tensor& dEdxi) const {
+  dEdxi.tvec().device(*dev.edevice) += fx.tvec().binaryExpr(dEdf.tvec(), scalar_tanh_backward_op<float>());  // += accumulates; combines forward output fx with dEdf (presumably (1 - fx^2) * dEdf -- confirm in simd-functors.h)
+}
+DYNET_NODE_INST_DEV_IMPL(Tanh)
+
+}
diff --git a/dynet/nodes.cc b/dynet/nodes.cc
deleted file mode 100644
index 4b4a06a6c..000000000
--- a/dynet/nodes.cc
+++ /dev/null
@@ -1,2200 +0,0 @@
-#include "dynet/nodes.h"
-
-#include <limits>
-#include <cmath>
-#include <stdexcept>
-
-#include "dynet/simd-functors.h"
-#include "dynet/functors.h"
-#include "dynet/nodes-macros.h"
-#include "dynet/globals.h"
-
-#ifdef __CUDACC__
-#include "dynet/cuda.h"
-#include "dynet/gpu-ops.h"
-#endif
-
-using namespace std;
-
-inline string print_vec(const std::vector<float> & vec) {
-  string sep = "[";
-  ostringstream oss;
-  for(auto f : vec) {
-    oss << sep << f; sep = ",";
-  }
-  oss << "]";
-  return oss.str();
-}
-
-// notes on implementing differentiable components
-// 1) fx can be understood as a pointer to the (preallocated) location for the result
-//    of forward to be stored
-// 2) fx is not initialized, so after calling forward fx must point to the correct answer
-// 3) fx can be repointed to an input, if forward(x) evaluates to x (e.g., in reshaping)
-// 4) dEdxi MUST **ACCUMULATE** a result since multiple calls to forward may depend on
-//    the same x_i. Even, e.g., Identity must be implemented as
-//    dEdx1 += dEdf. THIS IS EXTREMELY IMPORTANT
-// 5) scalars results of forward are placed in fx.v[0]
-// 6) DYNET manages its own memory, not Eigen, and it is configured with the
-//    EIGEN_NO_MALLOC option. If you get an error about Eigen attempting to allocate
-//    memory, it is (probably) because of an implicit creation of a temporary variable.
-//    To tell Eigen this is not necessary, the noalias() method is available. If you really
-//    do need a temporary variable, its capacity must be requested by Node::aux_storage_size
-//
-// notes on debugging problems with differentiable components
-// 1) fx is uninitialized when forward is called- are you relying on it being 0?
-// 2) dEdxi must accummulate (see point 4 above!)
-//
-
-namespace dynet {
-
-// ======= Shared definitions
-#define MAX_LOG_SUM_EXP 65536
-#define MAX_SPARSEMAX_LOSS_ROWS 65536
-
-// ======= Functions to be compiled on only CPU
-#ifndef __CUDACC__
-
-// set use_cholesky if M is symmetric - it's faster and more stable
-// for dep paring it won't be
-template <typename MatrixType>
-inline typename MatrixType::Scalar logdet(const MatrixType& M, bool use_cholesky = false) {
-  using namespace Eigen;
-  using std::log;
-  typedef typename MatrixType::Scalar Scalar;
-  Scalar ld = 0;
-  if (use_cholesky) {
-    LLT<Matrix<Scalar,Dynamic,Dynamic>> chol(M);
-    auto& U = chol.matrixL();
-    for (unsigned i = 0; i < M.rows(); ++i)
-      ld += log(U(i,i));
-    ld *= 2;
-  } else {
-    PartialPivLU<Matrix<Scalar,Dynamic,Dynamic>> lu(M);
-    auto& LU = lu.matrixLU();
-    Scalar c = lu.permutationP().determinant(); // -1 or 1
-    for (unsigned i = 0; i < LU.rows(); ++i) {
-      const auto& lii = LU(i,i);
-      if (lii < Scalar(0)) c *= -1;
-      ld += log(abs(lii));
-    }
-    ld += log(c);
-  }
-  return ld;
-}
-
-template <class T>
-EIGEN_STRONG_INLINE real logsumexp(const T& x, const vector<unsigned>& denom) {
-  real m = x(denom[0],0);
-  for (auto i : denom) {
-    real r = x(i,0);
-    if (r > m) m = r;
-  }
-  real z = 0;
-  for (auto i : denom)
-    z += expf(x(i,0) - m);
-  return m + logf(z);
-}
-
-// ===== Auxiliary functions
-
-size_t BlockDropout::aux_storage_size() const {
-  // we just need to remember whether this entire block is turned on (1.0) or off (0.0)
-  return 1 * sizeof(float);
-}
-
-size_t Dropout::aux_storage_size() const {
-  return dim.size() * sizeof(float);
-}
-
-size_t DropoutDim::aux_storage_size() const {
-  return (dim.size() / dim[dimension]) * sizeof(float);
-}
-
-size_t DropoutBatch::aux_storage_size() const {
-  return dim.batch_elems() * sizeof(float);
-}
-
-size_t GaussianNoise::aux_storage_size() const {
-  return dim.size() * sizeof(float);
-}
-
-size_t LogSoftmax::aux_storage_size() const {
-  return 2 * dim.size() / dim.rows() * sizeof(float);
-}
-
-// this i need to do something better, but this is a work-around
-// if this is too small, just make it bigger
-size_t LogSumExp::aux_storage_size() const {
-  return (MAX_LOG_SUM_EXP + 1) * sizeof(float);
-}
-
-size_t Max::aux_storage_size() const {
-  return dim.size() * sizeof(float);
-}
-
-size_t Min::aux_storage_size() const {
-  return dim.size() * sizeof(float);
-}
-
-size_t Softmax::aux_storage_size() const {
-  return 2 * dim.size() / dim.rows() * sizeof(float);
-}
-
-size_t Sparsemax::aux_storage_size() const {
-  return (dim.size() + 1) * sizeof(float);
-}
-
-size_t SparsemaxLoss::aux_storage_size() const {
-  // first dim.size dimensions is the sparsemax
-  const unsigned rows = MAX_SPARSEMAX_LOSS_ROWS;  // this should be xs[0]->d.rows()
-  return rows * sizeof(float);
-}
-
-size_t MaxDimension::aux_storage_size() const {
-  return sizeof(Eigen::DenseIndex) * dim.size();
-}
-
-size_t MinDimension::aux_storage_size() const {
-  return sizeof(Eigen::DenseIndex) * dim.size();
-}
-
-#endif // Finish CPU only functions
-
-// ===== Functions to be compiled on both CPU and GPU
-
-template<class MyDevice>
-void AddVectorToAllColumns::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  // Broadcasting is slow on CPU, so split codepaths
-#ifdef __CUDACC__
-  if(xs[0]->d.bd >= xs[1]->d.bd) {
-    Eigen::array<int, 3> bcasts = {1, (int)xs[0]->d[1], (int)(xs[0]->d.bd/xs[1]->d.bd)};
-    fx.tb<2>().device(*dev.edevice) = xs[0]->tb<2>() + xs[1]->tb<2>().broadcast(bcasts);
-  } else {
-    DYNET_ASSERT(xs[0]->d.bd == 1,
-                 "Bad dimensions in AddVectorToAllColumns::forward: " << xs[0]->d << ", " << xs[1]->d);
-    Eigen::array<int, 3> bcasts0 = {1, 1, (int)xs[1]->d.bd};
-    Eigen::array<int, 3> bcasts1 = {1, (int)xs[0]->d[1], 1};
-    fx.tb<2>().device(*dev.edevice) = xs[0]->tb<2>().broadcast(bcasts0) + xs[1]->tb<2>().broadcast(bcasts1);
-  }
-#else
-  // First, add the matrix
-  if(xs[0]->d.bd == fx.d.bd)
-    fx.tvec().device(*dev.edevice) = xs[0]->tvec();
-  else
-    for(size_t b = 0; b < fx.d.bd; ++b)
-      fx.tbvec().chip<1>(b).device(*dev.edevice) = xs[0]->tvec();
-  // Second, add the columns
-  if(xs[1]->d.bd == fx.d.bd) {
-    for(size_t i = 0; i < xs[0]->d[1]; ++i) 
-      fx.tb<2>().chip<1>(i).device(*dev.edevice) += xs[1]->tb<1>();
-  } else {
-    for(size_t b = 0; b < fx.d.bd; ++b)
-      for(size_t i = 0; i < fx.d[1]; ++i) 
-        fx.tb<2>().chip<2>(b).chip<1>(i).device(*dev.edevice) += xs[1]->t<1>();
-  }
-#endif
-}
-
-template<class MyDevice>
-void AddVectorToAllColumns::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  DYNET_ASSERT(i < 2, "Failed dimension check in AddVetorToAllColumns::backward");
-  // TODO: profile on CPU and see whether the chip version is better
-  if (i == 0) { // x
-    if(dEdf.d.bd == dEdxi.d.bd) {
-      dEdxi.tvec().device(*dev.edevice) += dEdf.tvec();
-    } else {
-      Eigen::array<int, 1> red_axis = {2};
-      dEdxi.t<2>().device(*dev.edevice) += dEdf.tb<2>().sum(red_axis);
-    }
-  } else { // bias
-    if(dEdf.d.bd == dEdxi.d.bd) {
-      Eigen::array<int, 1> red_axis = {1};
-      dEdxi.tb<1>().device(*dev.edevice) += dEdf.tb<2>().sum(red_axis);
-    } else {
-      DYNET_ASSERT(dEdxi.d.bd == 1,
-                   "Bad dimensions in AddVectorToAllColumns::backward: " << xs[0]->d << ", " << xs[1]->d);
-      Eigen::array<int, 2> red_axis = {1,2};
-      dEdxi.t<1>().device(*dev.edevice) += dEdf.tb<2>().sum(red_axis);
-    }
-  }
-}  
-DYNET_NODE_INST_DEV_IMPL(AddVectorToAllColumns)
-
-template<class MyDevice>
-void Average::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  const unsigned num_args = xs.size();
-  if (num_args == 1) {
-    fx.tvec().device(*dev.edevice) = xs[0]->tvec();
-    return;
-  }
-  if (num_args == 2 && xs[0]->d.bd == xs[1]->d.bd)
-    fx.tvec().device(*dev.edevice) = xs[0]->tvec() + xs[1]->tvec();
-  else if (num_args == 3 && xs[0]->d.bd == xs[1]->d.bd && xs[1]->d.bd == xs[2]->d.bd)
-    fx.tvec().device(*dev.edevice) = xs[0]->tvec() + xs[1]->tvec() + xs[2]->tvec();
-  else if (num_args == 4 && xs[0]->d.bd == xs[1]->d.bd && xs[1]->d.bd == xs[2]->d.bd && xs[2]->d.bd == xs[3]->d.bd)
-    fx.tvec().device(*dev.edevice) = xs[0]->tvec() + xs[1]->tvec() + xs[2]->tvec() + xs[3]->tvec();
-  else {
-    bool allSameBatchSize = std::all_of(xs.begin(), xs.end(), [&](const Tensor* x) { return x->d.bd == xs[0]->d.bd;});
-    if (allSameBatchSize) {
-      // Since they are all the same batch size, we can easily unroll the addition (results in lower GPU latency by merging multiple adds together in one CUDA call):
-      DYNET_ASSERT(num_args > 4, "Bad loop unrolling in Sum::forward");        // If it was <=4, we would have handled it in the special cases above
-      fx.tvec().device(*dev.edevice) = xs[0]->tvec() + xs[1]->tvec() + xs[2]->tvec() + xs[3]->tvec();
-
-      const unsigned remainder = (num_args - 4 ) % 4;
-      switch (remainder) {
-        case 0: break;
-        case 1: fx.tvec().device(*dev.edevice) += xs[4]->tvec(); break;
-        case 2: fx.tvec().device(*dev.edevice) += xs[4]->tvec() + xs[5]->tvec(); break;
-        case 3: fx.tvec().device(*dev.edevice) += xs[4]->tvec() + xs[5]->tvec() + xs[6]->tvec(); break;
-      }
-      for (unsigned i = 4 + remainder; i < num_args; i += 4)
-        fx.tvec().device(*dev.edevice) += xs[i]->tvec() + xs[i + 1]->tvec() + xs[i + 2]->tvec() + xs[i + 3]->tvec();
-    }
-    else {
-      // Not all the same batch size, so need to broadcast in the cases where they differ
-      TensorTools::zero(fx);
-#ifdef __CUDACC__
-      Eigen::array<int, 2> bcast({ 1, (int)fx.d.bd });
-#endif
-      for (unsigned i = 0; i < num_args; ++i) {
-        if (xs[i]->d.bd == fx.d.bd) {
-          fx.tvec().device(*dev.edevice) += xs[i]->tvec();
-        }
-        else {
-#ifdef __CUDACC__
-          fx.tbvec().device(*dev.edevice) += xs[i]->tbvec().broadcast(bcast);
-#else
-          for (unsigned b = 0; b < fx.d.bd; ++b)
-            fx.tbvec().chip<1>(b).device(*dev.edevice) += xs[i]->tvec();
-#endif
-        }
-      }
-    }
-  }
-  fx.tvec().device(*dev.edevice) = fx.tvec() / (float)xs.size();
-}
-
-template<class MyDevice>
-void Average::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  dEdxi.tvec().device(*dev.edevice) += (dEdf.tvec() / (float)xs.size());
-}
-DYNET_NODE_INST_DEV_IMPL(Average)
-
-template<class MyDevice>
-void Concatenate::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  unsigned curr_row = 0;
-  src_indices.resize(xs.size());
-  Eigen::DSizes<ptrdiff_t, 5> indices(0,0,0,0,0);
-  Eigen::DSizes<ptrdiff_t, 5> sizes(fx.d[0], fx.d[1], fx.d[2], fx.d[3],static_cast<ptrdiff_t>(fx.d.bd));
-  for (unsigned i = 0; i < xs.size(); ++i) {
-    indices[dimension] = src_indices[i] = curr_row;
-    const unsigned row_size = xs[i]->d[dimension];
-    sizes[dimension] = row_size;
-    if(fx.d.bd == xs[i]->d.bd) {
-      fx.tb<4>().slice(indices, sizes).device(*dev.edevice) = xs[i]->tb<4>();
-    } else {
-      Eigen::array<ptrdiff_t, 5> bcast; bcast[0] = bcast[1] = bcast[2] = bcast[3] = 1; bcast[4] = fx.d.bd;
-      fx.tb<4>().slice(indices, sizes).device(*dev.edevice) = xs[i]->tb<4>().broadcast(bcast);
-    }
-    curr_row += row_size;
-  }
-}
-
-template<class MyDevice>
-void Concatenate::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  DYNET_ASSERT(i < src_indices.size(), "Failed boundary check in Concatenate::backward: " << i << " >= " << src_indices.size());
-  Eigen::DSizes<ptrdiff_t, 5> indices(0,0,0,0,0); indices[dimension] = src_indices[i];
-  Eigen::DSizes<ptrdiff_t, 5> sizes(static_cast<ptrdiff_t>(dEdxi.d[0]),
-                                    static_cast<ptrdiff_t>(dEdxi.d[1]),
-                                    static_cast<ptrdiff_t>(dEdxi.d[2]),
-                                    static_cast<ptrdiff_t>(dEdxi.d[3]),
-                                    static_cast<ptrdiff_t>(fx.d.bd));
-  if(dEdxi.d.bd == dEdf.d.bd) {
-    dEdxi.tb<4>().device(*dev.edevice) += dEdf.tb<4>().slice(indices, sizes);
-  } else {
-    Eigen::array<int, 1> red_axis; red_axis[0] = 4;
-    dEdxi.t<4>().device(*dev.edevice) += dEdf.tb<4>().slice(indices, sizes).sum(red_axis);
-  }
-}
-DYNET_NODE_INST_DEV_IMPL(Concatenate)
-
-template<class MyDevice>
-void ConcatenateToBatch::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const { 
-  unsigned curr_e = 0;
-  src_element_indices.resize(xs.size());
-  Eigen::DSizes<ptrdiff_t, 2> indices(0,0);
-  Eigen::DSizes<ptrdiff_t, 2> sizes(static_cast<ptrdiff_t>(fx.d.batch_size()), 0);
-  for (unsigned i = 0; i < xs.size(); ++i) {
-    indices[1] = src_element_indices[i] = curr_e;
-    sizes[1] = xs[i]->d.bd;
-    fx.tbvec().slice(indices, sizes).device(*dev.edevice) = xs[i]->tbvec();
-    curr_e += xs[i]->d.bd;
-  }
-  
-}
-
-template<class MyDevice>
-void ConcatenateToBatch::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  DYNET_ASSERT(i < src_element_indices.size(), "Failed boundary check in ConcatenateToBatch::backward: " << i << " >= " << src_element_indices.size());
-  Eigen::DSizes<ptrdiff_t, 2> indices(0, static_cast<ptrdiff_t>(src_element_indices[i]));
-  Eigen::DSizes<ptrdiff_t, 2> sizes(static_cast<ptrdiff_t>(fx.d.batch_size()), static_cast<ptrdiff_t>(xs[i]->d.bd));
-  dEdxi.tbvec().device(*dev.edevice) += dEdf.tbvec().slice(indices, sizes);
-}
-DYNET_NODE_INST_DEV_IMPL(ConcatenateToBatch)
-
-template<class MyDevice>
-void BinaryLogLoss::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  fx.t<0>().device(*dev.edevice) = xs[0]->tvec().binaryExpr(xs[1]->tvec(), FBinaryLogLoss()).sum();
-}
-
-template<class MyDevice>
-void BinaryLogLoss::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  dEdxi.tvec().device(*dev.edevice) += xs[i]->tvec().binaryExpr(xs[1-i]->tvec(), FBinaryLogLossBackward(as_scalar(dEdf)));
-}
-DYNET_NODE_INST_DEV_IMPL(BinaryLogLoss)
-
-template<class MyDevice>
-void BlockDropout::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  bernoulli_distribution distribution(1.0 - dropout_probability);
-  float block_multiplier = distribution(*rndeng)? 1.0 : 0.0;
-  block_multiplier = 
-    dropout_probability == 1.0? 0.0 : block_multiplier / (1.0 - dropout_probability);
-  if (dropout_probability > 1.0 || dropout_probability < 0.0)
-    DYNET_INVALID_ARG("Dropout probability must be in the range [0, 1]");
-  *(static_cast<float*>(aux_mem)) = block_multiplier;
-  fx.tvec().device(*dev.edevice) = xs[0]->tvec() * block_multiplier;
-}
-
-template<class MyDevice>
-void BlockDropout::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  float block_multiplier = *(static_cast<float*>(aux_mem));
-  dEdxi.tvec().device(*dev.edevice) += dEdf.tvec() * block_multiplier;
-}
-DYNET_NODE_INST_DEV_IMPL(BlockDropout)
-
-template<class MyDevice>
-void ConstantMinusX::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  fx.tvec().device(*dev.edevice) = xs[0]->tvec().unaryExpr(const_minus_op<float>(c));
-}
-
-template<class MyDevice>
-void ConstantMinusX::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  dEdxi.tvec().device(*dev.edevice) -= dEdf.tvec();
-}
-DYNET_NODE_INST_DEV_IMPL(ConstantMinusX)
-
-template<class MyDevice>
-void ConstantPlusX::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  fx.tvec().device(*dev.edevice) = xs[0]->tvec().unaryExpr(const_add_op<float>(c));
-}
-
-template<class MyDevice>
-void ConstantPlusX::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  dEdxi.tvec().device(*dev.edevice) += dEdf.tvec();
-}
-DYNET_NODE_INST_DEV_IMPL(ConstantPlusX)
-
-template<class MyDevice>
-void ConstScalarMultiply::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  fx.tvec().device(*dev.edevice) = xs[0]->tvec() * alpha;
-}
-
-template<class MyDevice>
-void ConstScalarMultiply::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  DYNET_ASSERT(i == 0, "Failed dimension check in ConstScalarMultiply");
-  dEdxi.tvec().device(*dev.edevice) += dEdf.tvec() * alpha;
-}
-DYNET_NODE_INST_DEV_IMPL(ConstScalarMultiply)
-
-template<class MyDevice>
-void CwiseQuotient::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  DYNET_ASSERT(xs.size() == 2, "Failed dimension check in CwiseQuotient::forward (cdiv)");
-  if(xs[0]->d.bd == xs[1]->d.bd) {
-    fx.tvec().device(*dev.edevice) = xs[0]->tvec() / xs[1]->tvec();
-  } else if(xs[0]->d.bd == 1) {
-    Eigen::array<int, 2> bcast; bcast[0] = 1; bcast[1] = fx.d.bd;
-    fx.tb<1>().device(*dev.edevice) = xs[0]->tb<1>().broadcast(bcast) / xs[1]->tb<1>();
-  } else {
-    Eigen::array<int, 2> bcast; bcast[0] = 1; bcast[1] = fx.d.bd;
-    fx.tb<1>().device(*dev.edevice) = xs[0]->tb<1>() / xs[1]->tb<1>().broadcast(bcast);
-  }
-}
-
-template<class MyDevice>
-void CwiseQuotient::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  DYNET_ASSERT(i < 2, "Failed dimension check in CwiseQuotient::backward (cdiv)");
-  if (i == 0) {
-    if(xs[0]->d.bd == xs[1]->d.bd) {
-      dEdxi.tvec().device(*dev.edevice) += dEdf.tvec() / xs[1]->tvec();
-    } else if(xs[1]->d.bd == 1) {
-      Eigen::array<int, 2> bcast; bcast[0] = 1; bcast[1] = fx.d.bd;
-      dEdxi.tb<1>().device(*dev.edevice) += dEdf.tb<1>() / xs[1]->tb<1>().broadcast(bcast);
-    } else {
-      Eigen::array<int, 1> red_axis; red_axis[0] = 1;
-      dEdxi.t<1>().device(*dev.edevice) += (dEdf.tb<1>() / xs[1]->tb<1>()).sum(red_axis);
-    }
-  } else { // i = 1
-    if(xs[0]->d.bd == xs[1]->d.bd) {
-      dEdxi.tvec().device(*dev.edevice) -= dEdf.tvec() / xs[1]->tvec().square() * xs[0]->tvec();
-    } else if(xs[1]->d.bd == 1) {
-      Eigen::array<int, 2> bcast; bcast[0] = 1; bcast[1] = fx.d.bd;
-      Eigen::array<int, 1> red_axis; red_axis[0] = 1;
-      dEdxi.t<1>().device(*dev.edevice) -= (dEdf.tb<1>() / xs[1]->tb<1>().square().broadcast(bcast) * xs[0]->tb<1>()).sum(red_axis);
-    } else {
-      Eigen::array<int, 2> bcast; bcast[0] = 1; bcast[1] = fx.d.bd;
-      dEdxi.tb<1>().device(*dev.edevice) -= dEdf.tb<1>() / xs[1]->tb<1>().square() * xs[0]->tb<1>().broadcast(bcast);
-    }
-  }
-}
-DYNET_NODE_INST_DEV_IMPL(CwiseQuotient)
-
-template<class MyDevice>
-void CwiseMultiply::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  DYNET_ASSERT(xs.size() == 2, "Failed dimension check in CwiseMultiply::forward (cmult)");
-  if(xs[0]->d.bd == xs[1]->d.bd) {
-    fx.tvec().device(*dev.edevice) = xs[0]->tvec() * xs[1]->tvec();
-  } else {
-    Eigen::array<int, 2> bcast; bcast[0] = 1; bcast[1] = fx.d.bd;
-    if(xs[0]->d.bd == 1)
-      fx.tbvec().device(*dev.edevice) = xs[0]->tbvec().broadcast(bcast) * xs[1]->tbvec();
-    else
-      fx.tbvec().device(*dev.edevice) = xs[0]->tbvec() * xs[1]->tbvec().broadcast(bcast);
-  }
-}
-
-template<class MyDevice>
-void CwiseMultiply::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  DYNET_ASSERT(i < 2, "Failed dimension check in CwiseMultiply::backward (cmult)");
-  if(xs[0]->d.bd == xs[1]->d.bd) {
-    dEdxi.tvec().device(*dev.edevice) += dEdf.tvec() * xs[1-i]->tvec();
-  } else if(xs[1-i]->d.bd == 1) {
-    Eigen::array<int, 2> bcast; bcast[0] = 1; bcast[1] = fx.d.bd;
-    dEdxi.tbvec().device(*dev.edevice) += dEdf.tbvec() * xs[1-i]->tbvec().broadcast(bcast);
-  } else {
-    Eigen::array<int, 1> red_axis; red_axis[0] = 1;
-    dEdxi.tvec().device(*dev.edevice) += (dEdf.tbvec() * xs[1-i]->tbvec()).sum(red_axis);
-  }
-}
-DYNET_NODE_INST_DEV_IMPL(CwiseMultiply)
-
-template<class MyDevice>
-void ScalarAdd::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  DYNET_ASSERT(xs.size() == 2, "Failed dimension check in ScalarAdd::forward (+)");
-  Eigen::array<int, 2> bcast_0 = {1, (int) (fx.d.bd == xs[0]->d.bd ? 1 : fx.d.bd)};
-  Eigen::array<int, 2> bcast_1 = {(int) fx.d.batch_size(), (int) (fx.d.bd == xs[1]->d.bd ? 1 : fx.d.bd)};
-  fx.tbvec().device(*dev.edevice) = xs[0]->tbvec().broadcast(bcast_0) + xs[1]->tbvec().broadcast(bcast_1);
-}
-
-template<class MyDevice>
-void ScalarAdd::backward_dev_impl(const MyDevice & dev,
-                                  const vector<const Tensor*>& xs,
-                                  const Tensor& fx,
-                                  const Tensor& dEdf,
-                                  unsigned i,
-                                  Tensor& dEdxi) const {
-  DYNET_ASSERT(i < 2, "Failed dimension check in ScalarAdd::backward (+)");
-  Eigen::array<int, 1> red_axis_0 = {0}, red_axis_1 = {1};
-  Eigen::array<int, 2> red_axes_01 = {0, 1};
-  if (i == 0) {
-    if (xs[0]->d.bd == 1)
-      dEdxi.tvec().device(*dev.edevice) += dEdf.tbvec().sum(red_axis_1);
-    else
-      dEdxi.tbvec().device(*dev.edevice) += dEdf.tbvec();
-  } else {
-    if (xs[1]->d.bd == 1)
-      dEdxi.t<0>().device(*dev.edevice) += dEdf.tbvec().sum(red_axes_01);
-    else
-      dEdxi.tb<0>().device(*dev.edevice) += dEdf.tbvec().sum(red_axis_0);
-  }
-}
-DYNET_NODE_INST_DEV_IMPL(ScalarAdd)
-
-template<class MyDevice>
-void ScalarMultiply::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  DYNET_ASSERT(xs.size() == 2, "Failed dimension check in ScalarMultiply::forward (cmult)");
-
-  Eigen::array<int, 2> bcast_0 = {(int) fx.d.batch_size(), (int) (fx.d.bd == xs[0]->d.bd ? 1 : fx.d.bd)};
-  Eigen::array<int, 2> bcast_1 = {1, (int) (fx.d.bd == xs[1]->d.bd ? 1 : fx.d.bd)};
-  fx.tbvec().device(*dev.edevice) = xs[0]->tbvec().broadcast(bcast_0) * xs[1]->tbvec().broadcast(bcast_1);
-}
-
-template<class MyDevice>
-void ScalarMultiply::backward_dev_impl(const MyDevice & dev,
-                                       const vector<const Tensor*>& xs,
-                                       const Tensor& fx,
-                                       const Tensor& dEdf,
-                                       unsigned i,
-                                       Tensor& dEdxi) const {
-  DYNET_ASSERT(i < 2, "Failed dimension check in ScalarMultiply::backward (cmult)");
-  Eigen::array<int, 2> bcast_0 = {(int) fx.d.batch_size(), (int)( fx.d.bd == xs[0]->d.bd ? 1 : fx.d.bd)};
-  Eigen::array<int, 2> bcast_1 = {1, (int)(fx.d.bd == xs[1]->d.bd ? 1 : fx.d.bd)};
-  Eigen::array<int, 1> red_axis_0 = {0}, red_axis_1 = {1};
-  Eigen::array<int, 2> red_axes_01 = {0, 1};
-  if (i == 0) {
-    if (xs[0]->d.bd == 1)
-      dEdxi.t<0>().device(*dev.edevice) += (dEdf.tbvec() * xs[1]->tbvec().broadcast(bcast_1)).sum(red_axes_01);
-    else
-      dEdxi.tb<0>().device(*dev.edevice) += (dEdf.tbvec() * xs[1]->tbvec().broadcast(bcast_1)).sum(red_axis_0);
-  } else {
-    if (xs[1]->d.bd == 1)
-      dEdxi.tvec().device(*dev.edevice) += (dEdf.tbvec() * xs[0]->tbvec().broadcast(bcast_0)).sum(red_axis_1);
-    else
-      dEdxi.tbvec().device(*dev.edevice) += dEdf.tbvec() * xs[0]->tbvec().broadcast(bcast_0);
-  }
-}
-DYNET_NODE_INST_DEV_IMPL(ScalarMultiply)
-
-template<class MyDevice>
-void ScalarQuotient::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  DYNET_ASSERT(xs.size() == 2, "Failed dimension check in ScalarQuotient::forward (cdiv)");
-  Eigen::array<int, 2> bcast_0 = {1, (int) (fx.d.bd == xs[0]->d.bd ? 1 : fx.d.bd)};
-  Eigen::array<int, 2> bcast_1 = {(int) fx.d.batch_size(), (int) (fx.d.bd == xs[1]->d.bd ? 1 : fx.d.bd)};
-  fx.tbvec().device(*dev.edevice) = xs[0]->tbvec().broadcast(bcast_0) / xs[1]->tbvec().broadcast(bcast_1);
-}
-
-template<class MyDevice>
-void ScalarQuotient::backward_dev_impl(const MyDevice & dev,
-                                       const vector<const Tensor*>& xs,
-                                       const Tensor& fx,
-                                       const Tensor& dEdf,
-                                       unsigned i,
-                                       Tensor& dEdxi) const {
-  DYNET_ASSERT(i < 2, "Failed dimension check in ScalarQuotient::backward (cdiv)");
-  Eigen::array<int, 2> bcast = {(int)fx.d.batch_size(), (int)(fx.d.bd == xs[1]->d.bd ? 1 : fx.d.bd)};
-  Eigen::array<int, 2> bcast2 = {1, (int)(fx.d.bd == xs[0]->d.bd ? 1 : fx.d.bd)};
-  Eigen::array<int, 1> red_axis_0 = {0}, red_axis_1 = {1};
-  Eigen::array<int, 2> red_axes_01 = {0, 1};
-  if (i == 0) {
-    if (xs[0]->d.bd == 1)
-      dEdxi.tvec().device(*dev.edevice) += (dEdf.tbvec() / xs[1]->tbvec().broadcast(bcast)).sum(red_axis_1);
-    else
-      dEdxi.tbvec().device(*dev.edevice) += dEdf.tbvec() / xs[1]->tbvec().broadcast(bcast);
-  } else {
-    if (xs[1]->d.bd == 1)
-      dEdxi.t<0>().device(*dev.edevice) += - (dEdf.tbvec() * xs[0]->tbvec().broadcast(bcast2)).sum(red_axes_01) / xs[1]->t<0>().square();
-    else
-      dEdxi.tb<0>().device(*dev.edevice) += - (dEdf.tbvec() * xs[0]->tbvec().broadcast(bcast2)).sum(red_axis_0) / xs[1]->tb<0>().square();
-  }
-}
-DYNET_NODE_INST_DEV_IMPL(ScalarQuotient)
-
-template<class MyDevice>
-void Dropout::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  Tensor m(dim, (float*)aux_mem, fx.device, DeviceMempool::FXS);
-  TensorTools::randomize_bernoulli(m, (1.f-p), 1.f / (1.f-p));
-  fx.tvec().device(*dev.edevice) = xs[0]->tvec() * m.tvec();
-}
-
-template<class MyDevice>
-void Dropout::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  Tensor m(dim, (float*)aux_mem, fx.device, DeviceMempool::FXS);
-  dEdxi.tvec().device(*dev.edevice) += dEdf.tvec() * m.tvec();
-}
-DYNET_NODE_INST_DEV_IMPL(Dropout)
-
-
-template<class MyDevice>
-void DropoutBatch::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  Dim mask_dim({1},xs[0]->d.batch_elems());
-  Tensor m(mask_dim, (float*)aux_mem, fx.device, DeviceMempool::FXS);
-  TensorTools::randomize_bernoulli(m, (1.f-p), 1.f / (1.f-p));
-  Eigen::array<ptrdiff_t, 2> bcast = {xs[0]->d.batch_size(), 1};
-  fx.tbvec().device(*dev.edevice) = xs[0]->tbvec() * m.tbvec().broadcast(bcast);
-}
-
-template<class MyDevice>
-void DropoutBatch::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  Dim mask_dim({1},xs[0]->d.batch_elems());
-  Tensor m(mask_dim, (float*)aux_mem, fx.device, DeviceMempool::FXS);
-  Eigen::array<ptrdiff_t, 2> bcast = {xs[0]->d.batch_size(), 1};
-  dEdxi.tbvec().device(*dev.edevice) += dEdf.tbvec() * m.tbvec().broadcast(bcast);
-}
-DYNET_NODE_INST_DEV_IMPL(DropoutBatch)
-
-
-template<class MyDevice>
-void DropoutDim::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  Dim mask_dim(dim);
-  mask_dim.d[dimension]=1;
-  Tensor m(mask_dim, (float*)aux_mem, fx.device, DeviceMempool::FXS);
-  TensorTools::randomize_bernoulli(m, (1.f-p), 1.f / (1.f-p));
-  Eigen::array<ptrdiff_t, 4> bcast = {1, 1, 1, 1}; bcast[dimension] = xs[0]->d[dimension];
-  fx.tb<3>().device(*dev.edevice) = xs[0]->tb<3>() * m.tb<3>().broadcast(bcast);
-}
-
-template<class MyDevice>
-void DropoutDim::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  Dim mask_dim(dim);
-  mask_dim.d[dimension]=1;
-  Tensor m(mask_dim, (float*)aux_mem, fx.device, DeviceMempool::FXS);
-  Eigen::array<ptrdiff_t, 4> bcast = {1, 1, 1, 1}; bcast[dimension] = dEdf.d[dimension];
-  dEdxi.tb<3>().device(*dev.edevice) += dEdf.tb<3>() * m.tb<3>().broadcast(bcast);
-}
-DYNET_NODE_INST_DEV_IMPL(DropoutDim)
-
-template<class MyDevice>
-void Erf::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  fx.tvec().device(*dev.edevice) = xs[0]->tvec().erf();
-}
-
-template<class MyDevice>
-void Erf::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  dEdxi.tvec().device(*dev.edevice) += xs[0]->tvec().binaryExpr(dEdf.tvec(), scalar_erf_backward_op<float>());
-}
-DYNET_NODE_INST_DEV_IMPL(Erf)
-
-template<class MyDevice>
-void GaussianNoise::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  Tensor m(dim, (float*)aux_mem, fx.device, DeviceMempool::FXS);
-  TensorTools::randomize_normal(m, 0, stddev);
-  fx.tvec().device(*dev.edevice) = xs[0]->tvec() + m.tvec();
-}
-
-template<class MyDevice>
-void GaussianNoise::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  dEdxi.tvec().device(*dev.edevice) += dEdf.tvec();
-}
-DYNET_NODE_INST_DEV_IMPL(GaussianNoise)
-
-template<class MyDevice>
-void Identity::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  fx.tvec().device(*dev.edevice) = xs[0]->tvec();
-}
-
-template<class MyDevice>
-void Identity::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  dEdxi.tvec().device(*dev.edevice) += dEdf.tvec();
-}
-DYNET_NODE_INST_DEV_IMPL(Identity)
-
-template<class MyDevice>
-void KMHNGram::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-#ifdef __CUDACC__
-  DYNET_RUNTIME_ERR("KMHNGram not implemented for CUDA");
-#else
-  auto x = **xs[0];
-  const int new_cols = x.cols() - n + 1;
-  DYNET_ASSERT(new_cols > 0, "Failed dimension check in KMHNGram");
-  auto res = *fx;
-  res.setZero();
-  for (int j = 0; j < new_cols; ++j) {
-    auto c_j = res.col(j);
-    for (unsigned k = 0; k < n; ++k)
-      c_j += x.col(j + k);
-  }
-#endif
-}
-
-template<class MyDevice>
-void KMHNGram::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-#ifdef __CUDACC__
-  DYNET_RUNTIME_ERR("KMHNGram not implemented for CUDA");
-#else
-  const int c = dEdf.d.cols();
-  for (int j = 0; j < c; ++j)
-    for (unsigned k = 0; k < n; ++k)
-      (*dEdxi).col(j+k) += (*dEdf).col(j);
-#endif
-}
-DYNET_NODE_INST_DEV_IMPL(KMHNGram)
-
-template<class MyDevice>
-void LogDet::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-#ifdef __CUDACC__
-  DYNET_RUNTIME_ERR("LogDet not implemented for CUDA");
-#else
-  fx.v[0] = logdet(**xs[0], false);
-#endif
-}
-
-template<class MyDevice>
-void LogDet::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-#ifdef __CUDACC__
-  DYNET_RUNTIME_ERR("KMHNGram not implemented for CUDA");
-#else
-  auto trans = (**xs[0]).transpose();
-  (*dEdxi) += (dEdf.v[0]) * trans.inverse();
-#endif
-}
-DYNET_NODE_INST_DEV_IMPL(LogDet)
-
-template<class MyDevice>
-void LogGamma::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  fx.tvec().device(*dev.edevice) = xs[0]->tvec().lgamma();
-}
-
-template<class MyDevice>
-void LogGamma::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  dEdxi.tvec().device(*dev.edevice) += xs[0]->tvec().digamma() * dEdf.tvec();
-}
-DYNET_NODE_INST_DEV_IMPL(LogGamma)
-
-template<class MyDevice>
-void LogisticSigmoid::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  DYNET_ASSERT(xs.size() == 1, "Failed dimension check in LogisticSigmoid::forward");
-  fx.tvec().device(*dev.edevice) = xs[0]->tvec().unaryExpr(scalar_logistic_sigmoid_op<float>());
-}
-
-template<class MyDevice>
-void LogisticSigmoid::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  dEdxi.tvec().device(*dev.edevice) += fx.tvec().binaryExpr(dEdf.tvec(), scalar_logistic_sigmoid_backward_op<float>());
-}
-DYNET_NODE_INST_DEV_IMPL(LogisticSigmoid)
-
-template<class MyDevice>
-void LogSoftmax::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  DYNET_ASSERT(xs.size() == 1, "Failed dimension check in LogSoftmax::forward");
-  Tensor z(Dim({xs[0]->d.cols()},fx.d.bd), (float*)aux_mem, fx.device, DeviceMempool::FXS);
-  Tensor m(Dim({xs[0]->d.cols()},fx.d.bd), (float*)aux_mem + z.d.size(), fx.device, DeviceMempool::FXS);
-  TensorTools::logsumexp_dev(dev, *xs[0], m, z);
-  if(fx.d.size() == fx.d.rows()) {
-#ifdef __CUDACC__
-    Eigen::array<int, 1> bcast;
-    bcast[0] = xs[0]->d[0];
-    fx.t<1>().device(*dev.edevice) = xs[0]->t<1>() - z.t<1>().broadcast(bcast);
-#else
-    fx.t<1>().device(*dev.edevice) = xs[0]->t<1>() - as_scalar(z);
-#endif
-  } else {
-    // TODO? Is this broadcast efficient on CPU?
-    Eigen::array<int, 3> bcasts = {(int)xs[0]->d.rows(), 1, 1};
-    Eigen::array<int, 3> morph = {1, (int)z.d[0], (int)z.d.bd};
-    fx.tb<2>().device(*dev.edevice) = xs[0]->tb<2>() - z.tvec().reshape(morph).broadcast(bcasts);
-  }
-}
-
-template<class MyDevice>
-void LogSoftmax::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  Tensor z(Dim({xs[0]->d.cols()},fx.d.bd), (float*)aux_mem, fx.device, DeviceMempool::FXS);
-  // TODO? Is this broadcast efficient on CPU?
-  Eigen::array<int, 1> red_axis; red_axis[0] = 0;
-  z.tb<1>().device(*dev.edevice) = dEdf.tb<2>().sum(red_axis);
-  Eigen::array<int, 3> bcast = {(int)fx.d.rows(), 1, 1};
-  Eigen::array<int, 3> morph = {1, (int)z.d[0], (int)z.d.bd};
-  dEdxi.tb<2>().device(*dev.edevice) += fx.tb<2>().exp() * -z.tvec().reshape(morph).broadcast(bcast) + dEdf.tb<2>();
-}
-DYNET_NODE_INST_DEV_IMPL(LogSoftmax)
-
-template<class MyDevice>
-void LogSumExp::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  if (xs.size() == 1) {
-    fx.tvec().device(*dev.edevice) = xs[0]->tvec();
-  } else {
-    // TODO: Ideally we wouldn't need to allocate this memory permanently.
-    //       We need a good method for allocating "scratch" memory that is only used temporarily.
-    Tensor ms(fx.d, static_cast<float*>(aux_mem), fx.device, DeviceMempool::FXS);
-    Eigen::array<ptrdiff_t, 2> bcast = {1,fx.d.bd};
-    // Calculate the max
-    if(ms.d.bd == xs[0]->d.bd)
-      ms.tvec().device(*dev.edevice) = xs[0]->tvec();
-    else
-      ms.tbvec().device(*dev.edevice) = xs[0]->tbvec().broadcast(bcast); 
-    for (size_t i = 1; i < xs.size(); ++i) {
-      if(ms.d.bd == xs[i]->d.bd)
-        ms.tvec().device(*dev.edevice) = ms.tvec().cwiseMax(xs[i]->tvec());
-      else
-        ms.tbvec().device(*dev.edevice) = ms.tbvec().cwiseMax(xs[i]->tbvec().broadcast(bcast)); 
-    }
-    // sumexp
-    if(ms.d.bd == xs[0]->d.bd)
-      fx.tvec().device(*dev.edevice) = (xs[0]->tvec() - ms.tvec()).exp();
-    else
-      fx.tbvec().device(*dev.edevice) = (xs[0]->tbvec().broadcast(bcast) - ms.tbvec()).exp();
-    for (size_t i = 1; i < xs.size(); ++i) {
-      if(ms.d.bd == xs[i]->d.bd)
-        fx.tvec().device(*dev.edevice) += (xs[i]->tvec() - ms.tvec()).exp();
-      else
-        fx.tbvec().device(*dev.edevice) += (xs[i]->tbvec().broadcast(bcast) - ms.tbvec()).exp();
-    }
-    // log and add max
-    fx.tvec().device(*dev.edevice) = fx.tvec().log() + ms.tvec();
-  }
-}
-
-template<class MyDevice>
-void LogSumExp::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  if (xs.size() == 1) {
-    dEdxi.tvec().device(*dev.edevice) += dEdf.tvec();
-  } else {
-    // df/dx_i = 1/{sum_j exp(x_j)} * exp(x_i)}
-    //         = 1/{exp f(x)} * exp(x_i)
-    //         = exp(x_i - f(x))
-    if(fx.d.bd == xs[i]->d.bd) {
-      dEdxi.tvec().device(*dev.edevice) += (xs[i]->tvec() - fx.tvec()).exp() * dEdf.tvec();
-    } else {
-      Eigen::array<ptrdiff_t, 2> bcast = {1,fx.d.bd};
-      Eigen::array<int, 1> red_axis = {1};
-      dEdxi.tvec().device(*dev.edevice) += ((xs[i]->tbvec().broadcast(bcast) - fx.tbvec()).exp() * dEdf.tbvec()).sum(red_axis);
-    }
-  }
-}
-DYNET_NODE_INST_DEV_IMPL(LogSumExp)
-
-template<class MyDevice>
-void MatrixInverse::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  DYNET_ASSERT(xs.size() == 1, "Failed dimension check in MatrixInverse::forward");
-#ifdef __CUDACC__
-  DYNET_RUNTIME_ERR("MatrixInverse not yet implemented for CUDA");
-#else
-  auto x = **xs[0];
-  auto y = *fx;
-  y = x.inverse();
-#endif
-  // TODO: Change into tensors after resolving test errors
-  // fx.t<2>().device(*dev.edevice) = xs[0]->t<2>().inverse();
-}
-
-template<class MyDevice>
-void MatrixInverse::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  DYNET_ASSERT(xs.size() == 1, "Failed dimension check in MatrixInverse::backward");
-#ifdef __CUDACC__
-  DYNET_RUNTIME_ERR("MatrixInverse not yet implemented for CUDA");
-#else
-  auto d = *dEdf;
-  auto y = *fx;
-  (*dEdxi) -= y * d * y;
-#endif
-}
-DYNET_NODE_INST_DEV_IMPL(MatrixInverse)
-
-template<class MyDevice>
-void Max::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  Tensor t(fx.d, static_cast<float*>(aux_mem), fx.device, DeviceMempool::FXS);
-  t.tvec().device(*dev.edevice) = (xs[0]->tvec() > xs[1]->tvec()).cast<float>();
-  fx.tvec().device(*dev.edevice) = xs[0]->tvec().cwiseMax(xs[1]->tvec());
-}
-
-template<class MyDevice>
-void Max::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  DYNET_ASSERT(i < 2, "Failed dimension check in Max::backward");
-  const Tensor t(dEdxi.d, static_cast<float*>(aux_mem), fx.device, DeviceMempool::FXS);
-  if (i == 0) {
-    dEdxi.tvec().device(*dev.edevice) += t.tvec() * dEdf.tvec();
-  } else {
-    dEdxi.tvec().device(*dev.edevice) += t.tvec().binaryExpr(dEdf.tvec(), FMaxBackwardInv());
-  }
-}
-DYNET_NODE_INST_DEV_IMPL(Max)
-
-template<class MyDevice>
-void NoBackprop::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  fx.tvec().device(*dev.edevice) = xs[0]->tvec();
-}
-
-template<class MyDevice>
-void NoBackprop::backward_dev_impl(const MyDevice & dev,
-                                   const vector<const Tensor*>& xs,
-                                   const Tensor& fx,
-                                   const Tensor& dEdf,
-                                   unsigned i,
-                                   Tensor& dEdxi) const {
-  // no op
-}
-DYNET_NODE_INST_DEV_IMPL(NoBackprop)
-
-template<class MyDevice>
-void FlipGradient::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  fx.tvec().device(*dev.edevice) = xs[0]->tvec();
-}
-
-template<class MyDevice>
-void FlipGradient::backward_dev_impl(const MyDevice & dev,
-                                   const vector<const Tensor*>& xs,
-                                   const Tensor& fx,
-                                   const Tensor& dEdf,
-                                   unsigned i,
-                                   Tensor& dEdxi) const {
-  // takes negative on backprop
-  dEdxi.tvec().device(*dev.edevice) -= dEdf.tvec();
-}
-DYNET_NODE_INST_DEV_IMPL(FlipGradient)  
-  
-template<class MyDevice>
-void MaxPooling1D::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  DYNET_RUNTIME_ERR("MaxPooling1D::forward_dev_impl not implemented yet");
-#if 0
-  DYNET_ASSERT(xs.size() == 1, "Failed dimension check in MaxPooling1D::forward");
-  const Tensor& x = *xs.front();
-  const unsigned x_rows = x.rows();
-  DYNET_ASSERT(x.cols() == 1, "Failed dimension check in MaxPooling1D::forward");
-  const unsigned fx_rows = x_rows / width;
-  ind.resize(fx_rows);
-  Tensor fx = Zero(Dim(fx_rows, 1));
-  for (unsigned i = 0; i < fx_rows; ++i) {
-    unsigned from = i * width;
-    unsigned to = from + width;
-    if (to > x_rows) to = x_rows;
-    real best = x(from, 0);
-    unsigned bestr = from;
-    for (unsigned r = from + 1; r < to; ++r) {
-      if (x(r, 0) > best) {
-        best = x(r,0);
-        bestr = r;
-      }
-    }
-    ind[i] = bestr;
-    fx(i, 0) = best;
-  }
-  return fx;
-#endif
-}
-
-template<class MyDevice>
-void MaxPooling1D::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  DYNET_RUNTIME_ERR("MaxPooling1D::backward_dev_impl not implemented yet");
-#if 0
-  const Tensor& x = *xs.front();
-  const unsigned x_rows = x.rows();
-  Tensor dEdx = Zero(Dim(x_rows, 1));
-  const unsigned fx_rows = x_rows / width;
-  DYNET_ASSERT(fx_rows == ind.size(), "Failed dimension check in MaxPooling1D::backward");
-  DYNET_ASSERT(fx_rows == dEdf.rows(), "Failed dimension check in MaxPooling1D::backward");
-  for (unsigned i = 0; i < fx_rows; ++i)
-    dEdx(ind[i], 0) = dEdf(i, 0);
-  return dEdx;
-#endif
-}
-DYNET_NODE_INST_DEV_IMPL(MaxPooling1D)
-
-template<class MyDevice>
-void Min::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  Tensor t(fx.d, static_cast<float*>(aux_mem), fx.device, DeviceMempool::FXS);
-  t.tvec().device(*dev.edevice) = (xs[0]->tvec() < xs[1]->tvec()).cast<float>();
-  fx.tvec().device(*dev.edevice) = xs[0]->tvec().cwiseMin(xs[1]->tvec());
-}
-
-template<class MyDevice>
-void Min::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  DYNET_ASSERT(i < 2, "Failed dimension check in Min::backward");
-  const Tensor t(dEdxi.d, static_cast<float*>(aux_mem), fx.device, DeviceMempool::FXS);
-  if (i == 0) {
-    dEdxi.tvec().device(*dev.edevice) += t.tvec() * dEdf.tvec();
-  } else {
-    dEdxi.tvec().device(*dev.edevice) += t.tvec().binaryExpr(dEdf.tvec(), FMaxBackwardInv());
-  }
-}
-DYNET_NODE_INST_DEV_IMPL(Min)
-
-template<class MyDevice>
-void PairwiseRankLoss::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  fx.tvec().device(*dev.edevice) = xs[0]->tvec().binaryExpr(xs[1]->tvec(), FPairwiseRankLoss(margin));
-}
-
-template<class MyDevice>
-void PairwiseRankLoss::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  if (i == 0) {
-    dEdxi.tvec().device(*dev.edevice) -= fx.tvec().binaryExpr(dEdf.tvec(), FRectifyBackward());
-  } else {
-    dEdxi.tvec().device(*dev.edevice) += fx.tvec().binaryExpr(dEdf.tvec(), FRectifyBackward());
-  }
-}
-DYNET_NODE_INST_DEV_IMPL(PairwiseRankLoss)
-
-// x_1 is a vector
-// y = (x_1)_{*pval}
-template<class MyDevice>
-void PickElement::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  if(pval) {
-    DYNET_ARG_CHECK(*pval < xs[0]->d[dimension], 
-                            "PickElement::forward_impl requested element " << *pval << " from a dimension of length " << xs[0]->d[dimension]);
-    // TODO: This limit of up to 4 is somewhat arbitrary. We need to decide how to handle
-    //       things with "maximum tensor size".
-    fx.tb<3>().device(*dev.edevice) = xs[0]->tb<4>().chip(*pval, dimension); 
-  } else {
-    DYNET_ASSERT(pvals != nullptr, "Neither single nor vector of elements available in PickElement::forward");
-    DYNET_ARG_CHECK(pvals->size() == fx.d.batch_elems(),
-                            "In PickElement::forward, number of elements in the passed-in index vector (" <<  pvals->size() << ")"
-                            " did not match number of elements in mini-batch elements in expression (of dimension" << fx.d << ")");
-    for(unsigned b = 0; b < pvals->size(); ++b) {
-      DYNET_ARG_CHECK((*pvals)[b] < xs[0]->d[dimension], 
-                              "PickElement::forward_impl requested element " << (*pvals)[b] << " from a dimension of length " << xs[0]->d[dimension]);
-      if(xs[0]->d.bd == 1){
-        fx.tb<2>().chip<2>(b).device(*dev.edevice) = xs[0]->t<3>().chip((*pvals)[b], dimension); 
-      }else{
-        fx.tb<2>().chip<2>(b).device(*dev.edevice) = xs[0]->tb<3>().chip<3>(b).chip((*pvals)[b], dimension); 
-      }
-    }
-  }
-}
-
-// derivative is 0 in all dimensions except 1 for the selected element
-template<class MyDevice>
-void PickElement::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  DYNET_ARG_CHECK(i == 0, "Failed dimension check in PickElement::backward");
-  if(pval) {
-    dEdxi.tb<3>().chip(*pval, dimension).device(*dev.edevice) += dEdf.tb<2>();
-  } else {
-    DYNET_ASSERT(pvals, "Neither single nor vector of elements available in PickElement::forward");
-    for(unsigned b = 0; b < pvals->size(); ++b){
-      if(xs[0]->d.bd == 1){
-        dEdxi.t<3>().chip((*pvals)[b], dimension).device(*dev.edevice) += dEdf.tb<2>().chip<2>(b);
-      }else{
-        dEdxi.tb<3>().chip<3>(b).chip((*pvals)[b], dimension).device(*dev.edevice) += dEdf.tb<2>().chip<2>(b);
-      }
-    }
-  }
-}
-DYNET_NODE_INST_DEV_IMPL(PickElement)
-
-// x_1 is a matrix
-// y = (x_1)[start:end]
-// slice of matrix from index start (inclusive) to index end (exclusive)
-template<class MyDevice>
-void PickRange::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  Eigen::DSizes<ptrdiff_t, 5> indices(0,0,0,0,0);
-  indices[dim] = start;
-  Eigen::DSizes<ptrdiff_t, 5> sizes(static_cast<ptrdiff_t>(fx.d[0]), 
-                                    static_cast<ptrdiff_t>(fx.d[1]),
-                                    static_cast<ptrdiff_t>(fx.d[2]),
-                                    static_cast<ptrdiff_t>(fx.d[3]),
-                                    static_cast<ptrdiff_t>(fx.d.bd));
-  sizes[dim] = end-start;
-  fx.tb<4>().device(*dev.edevice) = xs[0]->tb<4>().slice(indices, sizes);
-}
-
-// derivative is 0 in all dimensions except the slice range
-template<class MyDevice>
-void PickRange::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  Eigen::DSizes<ptrdiff_t, 5> indices(0,0,0,0,0);
-  indices[dim] = start;
-  Eigen::DSizes<ptrdiff_t, 5> sizes(static_cast<ptrdiff_t>(fx.d[0]), 
-                                    static_cast<ptrdiff_t>(fx.d[1]),
-                                    static_cast<ptrdiff_t>(fx.d[2]),
-                                    static_cast<ptrdiff_t>(fx.d[3]),
-                                    static_cast<ptrdiff_t>(fx.d.bd));
-  sizes[dim] = end-start;
-  dEdxi.tb<4>().slice(indices, sizes).device(*dev.edevice) += dEdf.tb<4>();
-}
-DYNET_NODE_INST_DEV_IMPL(PickRange)
-
-template<class MyDevice>
-void PickBatchElements::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  if (pval) {
-    fx.tvec().device(*dev.edevice) = xs[0]->tbvec().chip<1>(*pval);
-  } else {
-    DYNET_ASSERT(pvals != nullptr, "Neither single nor vector of elements available in PickBatchElements::forward");
-    DYNET_ARG_CHECK(pvals->size() == fx.d.batch_elems(), 
-                            "In PickBatchElements::forward, number of elements in the passed-in index vector (" << pvals->size() << ") "
-                            "did not match number of elements in mini-batch elements in expression (of dimension" << fx.d << ")");
-    for (unsigned b = 0; b < pvals->size(); ++b) {
-      DYNET_ARG_CHECK((*pvals)[b] < xs[0]->d.bd,
-                              "PickBatchElements::forward_impl requested element " << (*pvals)[b] << " from a batch size of " << xs[0]->d.bd);
-      fx.tbvec().chip<1>(b).device(*dev.edevice) = xs[0]->tbvec().chip<1>((*pvals)[b]);
-    }
-  }
-}
-
-template<class MyDevice>
-void PickBatchElements::backward_dev_impl(const MyDevice & dev,
-                                  const vector<const Tensor*>& xs,
-                                  const Tensor& fx,
-                                  const Tensor& dEdf,
-                                  unsigned i,
-                                  Tensor& dEdxi) const {
-  DYNET_ASSERT(i == 0, "Failed dimension check in PickBatchElements::backward");
-  if (pval) {
-    dEdxi.tbvec().chip<1>(*pval).device(*dev.edevice) += dEdf.tvec();
-  } else {
-    DYNET_ASSERT(pvals, "Neither single nor vector of elements available in PickBatchElements::backward");
-    for (unsigned b = 0; b < pvals->size(); ++b)
-      dEdxi.tbvec().chip<1>((*pvals)[b]).device(*dev.edevice) += dEdf.tbvec().chip<1>(b);
-  }
-}
-DYNET_NODE_INST_DEV_IMPL(PickBatchElements)
-
-template<class MyDevice>
-void PoissonRegressionLoss::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  const real y = *pty;
-  const auto z = std::lgamma(y + 1);
-  // const auto x = as_scalar(*xs[0]);
-  fx.t<0>().device(*dev.edevice) = xs[0]->t<0>().exp() + z - xs[0]->t<0>() * y;
-}
-
-template<class MyDevice>
-void PoissonRegressionLoss::backward_dev_impl(const MyDevice & dev,
-                            const vector<const Tensor*>& xs,
-                            const Tensor& fx,
-                            const Tensor& dEdf,
-                            unsigned i,
-                            Tensor& dEdxi) const {
-  const real y = *pty;
-  dEdxi.t<0>().device(*dev.edevice) += xs[0]->t<0>().exp() - y;
-}
-DYNET_NODE_INST_DEV_IMPL(PoissonRegressionLoss)
-
-template<class MyDevice>
-void Pow::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  DYNET_ARG_CHECK(xs.size() == 2, "Failed dimension check in Pow::forward");
-  fx.tvec().device(*dev.edevice) = xs[0]->tvec().pow(as_scalar(*xs[1]));
-}
-
-template<class MyDevice>
-void Pow::backward_dev_impl(const MyDevice & dev,
-                            const vector<const Tensor*>& xs,
-                            const Tensor& fx,
-                            const Tensor& dEdf,
-                            unsigned i,
-                            Tensor& dEdxi) const {
-  DYNET_ARG_CHECK(xs.size() == 2, "Failed dimension check in Pow::backward");
-  real x2 = as_scalar(*xs[1]);
-  if (i == 0) {
-    dEdxi.tvec().device(*dev.edevice) += xs[0]->tvec().pow(x2 - 1) * dEdf.tvec() * x2;
-  } else {
-#if defined(__CUDACC__) && defined(EIGEN_NO_MALLOC)
-    DYNET_RUNTIME_ERR("CUDA memory allocation in Pow");
-#endif
-    // y = a^x
-    // dy/dx = a^x * log(a)
-    dEdxi.t<0>().device(*dev.edevice) += (fx.tvec() * xs[0]->tvec().log() * dEdf.tvec()).sum();
-  }
-}
-DYNET_NODE_INST_DEV_IMPL(Pow)
-
-template<class MyDevice>
-void Rectify::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Failed dimension check in Rectify::forward");
-  fx.tvec().device(*dev.edevice) = xs[0]->tvec().cwiseMax(0.f);
-}
-
-template<class MyDevice>
-void Rectify::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  dEdxi.tvec().device(*dev.edevice) += fx.tvec().binaryExpr(dEdf.tvec(), FRectifyBackward());
-}
-DYNET_NODE_INST_DEV_IMPL(Rectify)
-
-template<class MyDevice>
-void ExponentialLinearUnit::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Failed dimension check in ExponentialLinearUnit::forward");
-  fx.tvec().device(*dev.edevice) = xs[0]->tvec().unaryExpr(FELUForward(alpha, lambda));;
-}
-
-template<class MyDevice>
-void ExponentialLinearUnit::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  dEdxi.tvec().device(*dev.edevice) += xs[0]->tvec().binaryExpr(dEdf.tvec(), FELUBackward(alpha, lambda));
-}
-DYNET_NODE_INST_DEV_IMPL(ExponentialLinearUnit)
-
-template<class MyDevice>
-void Reshape::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  // just point to the input memory and change dimensions
-  // dimensions are handled by forward_dim
-  fx.tvec().device(*dev.edevice) = xs[0]->tvec();
-}
-
-template<class MyDevice>
-void Reshape::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  const Tensor reshaped(dEdxi.d, dEdf.v, dEdxi.device, dEdf.mem_pool);
-  dEdxi.tvec().device(*dev.edevice) += reshaped.tvec();
-}
-DYNET_NODE_INST_DEV_IMPL(Reshape)
-
-template<class MyDevice>
-void RestrictedLogSoftmax::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  DYNET_ASSERT(xs.size() == 1, "Failed dimension check in RestrictedLogSoftmax");
-#ifdef __CUDACC__
-  DYNET_RUNTIME_ERR("RestrictedLogSoftmax not yet implemented for CUDA (contributions welcome!)");
-#else
-  // TODO create auxiliary mask with -infty's
-  // and do usual LogSoftmax stuff
-  if(denom.size() == 0)
-    DYNET_INVALID_ARG("Number of elements in denominator of RestrictedLogSoftmax::forward must be zero");
-  auto x = **xs[0];
-  if(denom.size() == 0)
-    DYNET_RUNTIME_ERR("RestrictedLogSoftmax currently only supports single column expressions (contributions expanding support to multiple columns welcome!)");
-  const real logz = logsumexp(x, denom);
-  TensorTools::constant(fx, -numeric_limits<real>::infinity());
-  for (auto i : denom)
-    (*fx)(i,0) = x(i,0) - logz;
-  if (denom.size() == 1) (*fx)(denom.front(), 0) = 0;
-#endif
-}
-
-template<class MyDevice>
-void RestrictedLogSoftmax::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  DYNET_ASSERT(i == 0, "Failed dimension check in RestrictedLogSoftmax");
-#ifdef __CUDACC__
-  DYNET_RUNTIME_ERR("RestrictedLogSoftmax not yet implemented for CUDA (contributions welcome!)");
-#else
-  float z = 0;
-  for (auto ind : denom)
-    z += (*dEdf)(ind, 0);
-  for (auto ind : denom)
-    (*dEdxi)(ind, 0) += (*dEdf)(ind, 0) - expf((*fx)(ind, 0)) * z;
-#endif
-}
-DYNET_NODE_INST_DEV_IMPL(RestrictedLogSoftmax)
-
-template<class MyDevice>
-void SelectCols::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Failed dimension check in SelectCols::forward");
-  auto& rm = *pcols;
-  for (unsigned i = 0; i < rm.size(); ++i) {
-    DYNET_ARG_CHECK(rm[i] < xs[0]->d.cols(),
-                            "Out-of-bounds index " << rm[i] << " in SelectCols over expression of dimensions " << xs[0]->d);
-    fx.t<2>().chip<1>(i).device(*dev.edevice) = xs[0]->t<2>().chip<1>(rm[i]);
-  }
-}
-
-template<class MyDevice>
-void SelectCols::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Failed dimension check in SelectCols::backward");
-  auto& rm = *pcols;
-  for (unsigned i = 0; i < rm.size(); ++i)
-    dEdxi.t<2>().chip<1>(rm[i]).device(*dev.edevice) += dEdf.t<2>().chip<1>(i);
-}
-DYNET_NODE_INST_DEV_IMPL(SelectCols)
-
-template<class MyDevice>
-void SelectRows::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Failed dimension check in SelectRows::forward");
-  auto& rm = *prows;
-  for (unsigned i = 0; i < rm.size(); ++i) {
-    DYNET_ARG_CHECK(rm[i] < xs[0]->d.rows(),
-                            "Out-of-bounds index " << rm[i] << " in SelectRows over expression of dimensions " << xs[0]->d);
-    fx.t<4>().chip<0>(i).device(*dev.edevice) = xs[0]->t<4>().chip<0>(rm[i]);
-  }
-}
-
-template<class MyDevice>
-void SelectRows::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Failed dimension check in SelectRows::backward");
-  auto& rm = *prows;
-  for (unsigned i = 0; i < rm.size(); ++i)
-    dEdxi.t<4>().chip<0>(rm[i]).device(*dev.edevice) += dEdf.t<4>().chip<0>(i);
-}
-DYNET_NODE_INST_DEV_IMPL(SelectRows)
-
-template<class MyDevice>
-void Softmax::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Failed dimension check in Softmax::forward");
-  Tensor z(Dim({xs[0]->d.cols()},fx.d.bd), (float*)aux_mem, fx.device, DeviceMempool::FXS);
-  Tensor m(Dim({xs[0]->d.cols()},fx.d.bd), (float*)aux_mem + z.d.size(), fx.device, DeviceMempool::FXS);
-  TensorTools::logsumexp_dev(dev, *xs[0], m, z);
-  // TODO? Is this broadcast efficient on CPU?
-  Eigen::array<int, 3> bcasts = {(int)xs[0]->d.rows(), 1, 1};
-  Eigen::array<int, 3> morph = {1, (int)z.d[0], (int)z.d.bd};
-  fx.tb<2>().device(*dev.edevice) = (xs[0]->tb<2>() - z.tvec().reshape(morph).broadcast(bcasts)).exp();
-}
-
-template<class MyDevice>
-void Softmax::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  Tensor z(Dim({fx.d.cols()},fx.d.bd), (float*)aux_mem, fx.device, DeviceMempool::FXS);
-  // TODO? Is this broadcast efficient on CPU?
-  Eigen::array<int, 1> red_axis = {0};
-  z.tb<1>().device(*dev.edevice) = (fx.tb<2>() * dEdf.tb<2>()).sum(red_axis);
-  Eigen::array<int, 3> bcast = {(int)xs[0]->d.rows(), 1, 1};
-  Eigen::array<int, 3> morph = {1, (int)z.d[0], (int)z.d.bd};
-  dEdxi.tb<2>().device(*dev.edevice) += (dEdf.tb<2>() - z.tvec().reshape(morph).broadcast(bcast)) * fx.tb<2>();
-}
-DYNET_NODE_INST_DEV_IMPL(Softmax)
-
-template<class MyDevice>
-void SoftSign::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Failed dimension check in SoftSign::forward");
-  fx.tvec().device(*dev.edevice) = xs[0]->tvec().unaryExpr(FSoftSign());
-}
-
-template<class MyDevice>
-void SoftSign::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  dEdxi.tvec().device(*dev.edevice) += fx.tvec().binaryExpr(dEdf.tvec(), FSoftSignBackward());
-}
-DYNET_NODE_INST_DEV_IMPL(SoftSign)
-
-template<class MyDevice>
-void Sparsemax::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  if (xs[0]->d.cols() == 1) {
-#ifdef __CUDACC__
-    DYNET_RUNTIME_ERR("Sparsemax not implemented for CUDA");
-#else
-    const unsigned rows = xs[0]->d.rows();
-    float *zs = static_cast<float*>(aux_mem);
-    std::partial_sort_copy(xs[0]->v, xs[0]->v+rows, zs, zs + rows, std::greater<float>());
-    float sum = 0, maxsum = 0;
-    unsigned k = 0;
-    for (k = 0; k < rows; ++k) {
-      sum += zs[k];
-      float t = 1 + (k + 1) * zs[k];
-      if (t <= sum) break;
-      maxsum = sum;
-    }
-    float tau = (maxsum - 1) / k;
-    auto y = *fx;
-    fx.tvec() = (xs[0]->tvec() - tau).cwiseMax(0.f);
-    int c = 1;
-    int *cc = static_cast<int*>(aux_mem);
-    for (unsigned i = 0; i < rows; ++i)
-      if (y(i,0) > 0.f) cc[c++] = i;
-    cc[0] = c - 1;
-#endif
-  } else {
-    DYNET_RUNTIME_ERR("Sparsemax not yet implemented for multiple columns");
-  }
-}
-
-template<class MyDevice>
-void Sparsemax::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-#ifdef __CUDACC__
-  DYNET_RUNTIME_ERR("Sparsemax not implemented for CUDA");
-#else
-  const int ssize = static_cast<int*>(aux_mem)[0];
-  int *support = static_cast<int*>(aux_mem) + 1;
-  float dhat = 0;
-  auto& d = *dEdf;
-  for (int i = 0; i < ssize; ++i)
-    dhat += d(support[i], 0);
-  dhat /= ssize;
-  for (int i = 0; i < ssize; ++i)
-    (*dEdxi)(support[i], 0) += d(support[i], 0) - dhat;
-#endif
-}
-DYNET_NODE_INST_DEV_IMPL(Sparsemax)
-
-template<class MyDevice>
-void SparsemaxLoss::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  if (xs[0]->d.cols() == 1) {
-#ifdef __CUDACC__
-    DYNET_RUNTIME_ERR("SparsemaxLoss not implemented for CUDA");
-#else
-    const int rows = xs[0]->d.rows();
-    if (rows > MAX_SPARSEMAX_LOSS_ROWS)
-      DYNET_RUNTIME_ERR("MAX_SPARSEMAX_LOSS_ROWS is not sufficient. Recompile with larger value.");
-    const unsigned qsupport_size = pq->size();
-    const float qprop = 1.f / qsupport_size;
-
-    float *zs = static_cast<float*>(aux_mem);
-    std::partial_sort_copy(xs[0]->v, xs[0]->v+rows, zs, zs + rows, std::greater<float>());
-    float sum = 0, maxsum = 0;
-    int k = 0;
-    for (k = 0; k < rows; ++k) {
-      sum += zs[k];
-      float t = 1 + (k + 1) * zs[k];
-      if (t <= sum) break;
-      maxsum = sum;
-    }
-    float tau = (maxsum - 1) / k;
-    Tensor tsm(xs[0]->d, (float*)aux_mem, xs[0]->device, DeviceMempool::FXS);
-    tsm.t<1>() = (xs[0]->t<1>() - tau).cwiseMax(0.f);
-    fx.t<0>() = ( (tsm.t<1>() != 0.f).cast<float>() * (xs[0]->t<1>().square() - (tau * tau)) ).sum();
-    fx.t<0>() = ( fx.t<0>() + qprop * qprop * qsupport_size ) / 2.f;
-    for (unsigned i = 0; i < qsupport_size; ++i)
-      fx.t<0>() = fx.t<0>() - xs[0]->t<1>().chip<0>((*pq)[i]) * qprop;
-    fx.t<0>() = fx.t<0>().cwiseMax(0.f);
-#endif
-  } else {
-    DYNET_RUNTIME_ERR("SparsemaxLoss not yet implemented for multiple columns");
-  }
-}
-
-template<class MyDevice>
-void SparsemaxLoss::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-#ifdef __CUDACC__
-  DYNET_RUNTIME_ERR("SparsemaxLoss not implemented for CUDA");
-#else
-  const float d = dEdf.v[0];
-  float* psm = static_cast<float*>(aux_mem);
-  float dqprop = d / pq->size();
-  Tensor tsm(xs[0]->d, psm, xs[0]->device, DeviceMempool::FXS);
-  auto sm = *tsm;  // sparsemax(z)
-  *dEdxi += sm * d;
-  for (unsigned i = 0; i < pq->size(); ++i)
-    (*dEdxi)((*pq)[i], 0) -= dqprop;
-#endif
-}
-DYNET_NODE_INST_DEV_IMPL(SparsemaxLoss)
-
-template<class MyDevice>
-void Sum::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  const unsigned num_args = xs.size();
-  if (num_args == 1) 
-    fx.tvec().device(*dev.edevice) = xs[0]->tvec();
-  else if (num_args == 2 && xs[0]->d.bd == xs[1]->d.bd)
-    fx.tvec().device(*dev.edevice) = xs[0]->tvec() + xs[1]->tvec();
-  else if (num_args == 3 && xs[0]->d.bd == xs[1]->d.bd && xs[1]->d.bd == xs[2]->d.bd)
-    fx.tvec().device(*dev.edevice) = xs[0]->tvec() + xs[1]->tvec() + xs[2]->tvec();
-  else if (num_args == 4 && xs[0]->d.bd == xs[1]->d.bd && xs[1]->d.bd == xs[2]->d.bd && xs[2]->d.bd == xs[3]->d.bd)
-    fx.tvec().device(*dev.edevice) = xs[0]->tvec() + xs[1]->tvec() + xs[2]->tvec() + xs[3]->tvec();
-  else {
-    bool allSameBatchSize = std::all_of(xs.begin(), xs.end(), [&](const Tensor* x) { return x->d.bd == xs[0]->d.bd;});
-    if (allSameBatchSize) {
-      // Since they are all the same batch size, we can easily unroll the addition (results in lower GPU latency by merging multiple adds together in one CUDA call):
-      DYNET_ASSERT(num_args > 4, "Bad loop unrolling in Sum::forward");        // If it was <=4, we would have handled it in the special cases above
-      fx.tvec().device(*dev.edevice) = xs[0]->tvec() + xs[1]->tvec() + xs[2]->tvec() + xs[3]->tvec();
-
-      const unsigned remainder = (num_args - 4 ) % 4;
-      switch (remainder) {
-        case 0: break;
-        case 1: fx.tvec().device(*dev.edevice) += xs[4]->tvec(); break;
-        case 2: fx.tvec().device(*dev.edevice) += xs[4]->tvec() + xs[5]->tvec(); break;
-        case 3: fx.tvec().device(*dev.edevice) += xs[4]->tvec() + xs[5]->tvec() + xs[6]->tvec(); break;
-      }
-      for (unsigned i = 4 + remainder; i < num_args; i += 4)
-        fx.tvec().device(*dev.edevice) += xs[i]->tvec() + xs[i + 1]->tvec() + xs[i + 2]->tvec() + xs[i + 3]->tvec();
-    }
-    else {
-      // Not all the same batch size, so need to broadcast in the cases where they differ
-      TensorTools::zero(fx);
-#ifdef __CUDACC__
-      Eigen::array<int, 2> bcast({ 1, (int)fx.d.bd });
-#endif
-      for (unsigned i = 0; i < num_args; ++i) {
-        if (xs[i]->d.bd == fx.d.bd) {
-          fx.tvec().device(*dev.edevice) += xs[i]->tvec();
-        }
-        else {
-#ifdef __CUDACC__
-          fx.tbvec().device(*dev.edevice) += xs[i]->tbvec().broadcast(bcast);
-#else
-          for (unsigned b = 0; b < fx.d.bd; ++b)
-            fx.tbvec().chip<1>(b).device(*dev.edevice) += xs[i]->tvec();
-#endif
-        }
-      }
-    }
-  }
-}
-
-template<class MyDevice>
-void Sum::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  if(dEdxi.d.bd == fx.d.bd) {
-    dEdxi.tvec().device(*dev.edevice) += dEdf.tvec();
-  } else {
-    Eigen::array<int, 1> red_axis = {1};
-    dEdxi.tvec().device(*dev.edevice) += dEdf.tbvec().sum(red_axis);
-  }
-}
-DYNET_NODE_INST_DEV_IMPL(Sum)
-
-template<class MyDevice>
-void SumElements::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Failed dimension check in SumElements::forward");
-  Eigen::array<int, 1> red_axis; red_axis[0] = 0;
-  fx.tb<0>().device(*dev.edevice) = xs[0]->tbvec().sum(red_axis);
-}
-
-template<class MyDevice>
-void SumElements::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  DYNET_ARG_CHECK(i == 0, "Failed dimension check in SumElements::backward");
-  Eigen::array<int, 2> bcast = {(int)xs[0]->d.batch_size(), 1};
-  dEdxi.tbvec().device(*dev.edevice) += dEdf.tbvec().broadcast(bcast);
-}
-DYNET_NODE_INST_DEV_IMPL(SumElements)
-
-template<class MyDevice>
-void MomentElements::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Failed dimension check in MomentElements::forward");
-  Eigen::array<int, 1> red_axis; red_axis[0] = 0;
-  if(order == 1)
-    fx.tb<0>().device(*dev.edevice) = xs[0]->tbvec().sum(red_axis) / (float) xs[0]->d.batch_size();
-  else if (order == 2)
-    fx.tb<0>().device(*dev.edevice) = xs[0]->tbvec().square().sum(red_axis) / (float) xs[0]->d.batch_size();
-  else
-    fx.tb<0>().device(*dev.edevice) = xs[0]->tbvec().pow(order).sum(red_axis) / (float) xs[0]->d.batch_size();
-}
-
-template<class MyDevice>
-void MomentElements::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  DYNET_ARG_CHECK(i == 0, "Failed dimension check in MomentElements::backward");
-  Eigen::array<int, 2> bcast = {(int)xs[0]->d.batch_size(), 1};
-  if (order == 1)
-    dEdxi.tbvec().device(*dev.edevice) += dEdf.tbvec().broadcast(bcast) / (float) xs[0]->d.batch_size();
-  else if (order == 2)
-    dEdxi.tbvec().device(*dev.edevice) += (dEdf.tbvec().broadcast(bcast) * xs[0]->tbvec()) * ( 2.f / (float) xs[0]->d.batch_size());
-  else if (order == 3)
-    dEdxi.tbvec().device(*dev.edevice) += (dEdf.tbvec().broadcast(bcast) * xs[0]->tbvec().square()) * ( 3.f / (float) xs[0]->d.batch_size());
-  else
-    dEdxi.tbvec().device(*dev.edevice) += (dEdf.tbvec().broadcast(bcast) * xs[0]->tbvec().pow(order - 1)) * ( (float) order / (float) xs[0]->d.batch_size());
-}
-DYNET_NODE_INST_DEV_IMPL(MomentElements)
-
-
-template<class MyDevice>
-void StdElements::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  DYNET_ASSERT(xs.size() == 1, "Failed dimension check in StdElements::forward");
-  Eigen::array<ptrdiff_t, 1> red_axis = {0};
-  Eigen::array<ptrdiff_t, 2> bcast = {xs[0]->d.batch_size(), 1};
-  Eigen::array<ptrdiff_t, 2> newaxis = {1, xs[0]->d.bd};
-  float n = (float) xs[0]->d.batch_size();
-  fx.tb<0>().device(*dev.edevice) = ((xs[0]->tbvec() - (xs[0]->tbvec().sum(red_axis).reshape(newaxis) / n).broadcast(bcast)).square().sum(red_axis) / n).sqrt();
-}
-
-template<class MyDevice>
-void StdElements::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  DYNET_ASSERT(i < 1, "Failed dimension check in StdElements::backward");
-  Eigen::array<ptrdiff_t, 2> bcast = {xs[0]->d.batch_size(), 1};
-  Eigen::array<ptrdiff_t, 2> newaxis = {1, xs[0]->d.bd};
-  Eigen::array<ptrdiff_t, 1> red_axis = {0};
-  float n = (float) xs[0]->d.batch_size();
-  dEdxi.tbvec().device(*dev.edevice) +=  (2 / n) * (xs[0]->tbvec() - (xs[0]->tbvec().sum(red_axis).reshape(newaxis) / n).broadcast(bcast)) * (fx.tbvec().binaryExpr(dEdf.tbvec(), FSqrtBackward())).broadcast(bcast);
-
-}
-DYNET_NODE_INST_DEV_IMPL(StdElements)
-
-template<class MyDevice>
-void MomentBatches::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Failed dimension check in MomentBatches::forward");
-  Eigen::array<int, 1> red_axis; red_axis[0] = 1;
-  if(order == 1)
-    fx.tvec().device(*dev.edevice) = xs[0]->tbvec().sum(red_axis) / (float) xs[0]->d.bd;
-  else if (order == 2)
-    fx.tvec().device(*dev.edevice) = xs[0]->tbvec().square().sum(red_axis) / (float) xs[0]->d.bd;
-  else
-    fx.tvec().device(*dev.edevice) = xs[0]->tbvec().pow(order).sum(red_axis) / (float) xs[0]->d.bd;
-}
-
-template<class MyDevice>
-void MomentBatches::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  DYNET_ARG_CHECK(i == 0, "Failed dimension check in MomentBatches::backward");
-  Eigen::array<int, 2> bcast = {1, (int)xs[0]->d.bd};
-  if (order == 1)
-    dEdxi.tbvec().device(*dev.edevice) += dEdf.tbvec().broadcast(bcast) / (float) xs[0]->d.bd;
-  else if (order == 2)
-    dEdxi.tbvec().device(*dev.edevice) += (dEdf.tbvec().broadcast(bcast) * xs[0]->tbvec()) * ( 2.f / (float) xs[0]->d.bd);
-  else if (order == 3)
-    dEdxi.tbvec().device(*dev.edevice) += (dEdf.tbvec().broadcast(bcast) * xs[0]->tbvec().square()) * ( 3.f / (float) xs[0]->d.bd);
-  else
-    dEdxi.tbvec().device(*dev.edevice) += (dEdf.tbvec().broadcast(bcast) * xs[0]->tbvec().pow(order - 1)) * ( (float) order / (float) xs[0]->d.bd);
-}
-DYNET_NODE_INST_DEV_IMPL(MomentBatches)
-
-template<class MyDevice>
-void MomentDimension::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  DYNET_ASSERT(xs.size() == 1, "Failed input count check in SumDimension");
-  Eigen::array<int, 1> reduction_axis = {(int)dimension};
-  float n = (float) xs[0]->d[dimension];
-  if(order == 1)
-    fx.tb<2>().device(*dev.edevice) = xs[0]->tb<3>().sum(reduction_axis) / n;
-  else if (order == 2)
-    fx.tb<2>().device(*dev.edevice) = xs[0]->tb<3>().square().sum(reduction_axis) / n;
-  else
-    fx.tb<2>().device(*dev.edevice) = xs[0]->tb<3>().pow(order).sum(reduction_axis) / n;
-}
-
-template<class MyDevice>
-void MomentDimension::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  DYNET_ARG_CHECK(i == 0, "Failed dimension check in MomentDimension::backward");
-  Eigen::array<int, 4> bcast = {1,1,1,1}; bcast[dimension] = xs[0]->d[dimension];
-  Eigen::array<int, 4> morph = {(int)xs[0]->d[0],(int)xs[0]->d[1],(int)xs[0]->d[2],(int)xs[0]->d.bd}; morph[dimension] = 1;
-  float n = (float) xs[0]->d[dimension];
-  if (order == 1)
-    dEdxi.tb<3>().device(*dev.edevice) += dEdf.tb<2>().reshape(morph).broadcast(bcast) / n;
-  else if (order == 2)
-    dEdxi.tb<3>().device(*dev.edevice) += (dEdf.tb<2>().reshape(morph).broadcast(bcast) * xs[0]->tb<3>()) * ( 2.f / n);
-  else if (order == 3)
-    dEdxi.tb<3>().device(*dev.edevice) += (dEdf.tb<2>().reshape(morph).broadcast(bcast) * xs[0]->tb<3>().square()) * ( 3.f / n);
-  else
-    dEdxi.tb<3>().device(*dev.edevice) += (dEdf.tb<2>().reshape(morph).broadcast(bcast) * xs[0]->tb<3>().pow(order - 1)) * ( (float) order / n);
-}
-DYNET_NODE_INST_DEV_IMPL(MomentDimension)
-
-template<class MyDevice>
-void StdDimension::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  DYNET_ASSERT(xs.size() == 1, "Failed input count check in SumDimension");
-  Eigen::array<int, 1> red_axis = {(int)dimension};
-  Eigen::array<int, 4> morph = {(int)xs[0]->d[0],(int)xs[0]->d[1],(int)xs[0]->d[2],(int)xs[0]->d.bd}; morph[dimension] = 1;
-  Eigen::array<int, 4> bcast = {1,1,1,1}; bcast[dimension] = xs[0]->d[dimension];
-  float n = (float) xs[0]->d[dimension];
-  fx.tb<2>().device(*dev.edevice) = ((xs[0]->tb<3>() - (xs[0]->tb<3>().sum(red_axis).reshape(morph) / n).broadcast(bcast)).square().sum(red_axis) / n).sqrt();
-}
-
-template<class MyDevice>
-void StdDimension::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  DYNET_ARG_CHECK(i == 0, "Failed dimension check in StdDimension::backward");
-  Eigen::array<int, 1> red_axis = {(int)dimension};
-  Eigen::array<int, 4> bcast = {1,1,1,1}; bcast[dimension] = xs[0]->d[dimension];
-  Eigen::array<int, 4> morph = {(int)xs[0]->d[0],(int)xs[0]->d[1],(int)xs[0]->d[2],(int)xs[0]->d.bd}; morph[dimension] = 1;
-  float n = (float) xs[0]->d[dimension];
-  dEdxi.tb<3>().device(*dev.edevice) +=  (2 / n) * (xs[0]->tb<3>() - (xs[0]->tb<3>().sum(red_axis).reshape(morph) / n).broadcast(bcast)) * (fx.tb<2>().binaryExpr(dEdf.tb<2>(), FSqrtBackward())).reshape(morph).broadcast(bcast);
-
-}
-DYNET_NODE_INST_DEV_IMPL(StdDimension)
-
-
-template<class MyDevice>
-void StdBatches::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  DYNET_ASSERT(xs.size() == 1, "Failed dimension check in StdBatches::forward");
-  Eigen::array<ptrdiff_t, 1> red_axis = {1};
-  Eigen::array<ptrdiff_t, 2> newaxis = {xs[0]->d.batch_size(), 1};
-  Eigen::array<ptrdiff_t, 2> bcast = {1, xs[0]->d.bd};
-  float n = (float)xs[0]->d.bd;
-  fx.t<1>().device(*dev.edevice) = ((xs[0]->tbvec() - (xs[0]->tbvec().sum(red_axis).reshape(newaxis) / n).broadcast(bcast)).square().sum(red_axis) / n).sqrt();
-}
-
-template<class MyDevice>
-void StdBatches::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  DYNET_ASSERT(i < 1, "Failed dimension check in StdBatches::backward");
-  Eigen::array<ptrdiff_t, 1> red_axis = {1};
-  Eigen::array<ptrdiff_t, 2> bcast = {1, xs[0]->d.bd};
-  Eigen::array<ptrdiff_t, 2> newaxis = {xs[0]->d.batch_size(), 1};
-  float n = (float)xs[0]->d.bd;
-  dEdxi.tbvec().device(*dev.edevice) +=  (2 / n) * (xs[0]->tbvec() - (xs[0]->tbvec().sum(red_axis).reshape(newaxis) / n).broadcast(bcast)) * (fx.tbvec().binaryExpr(dEdf.tbvec(), FSqrtBackward())).broadcast(bcast);
-
-}
-DYNET_NODE_INST_DEV_IMPL(StdBatches)
-
-
-template<class MyDevice>
-void SumBatches::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  DYNET_ARG_CHECK(xs.size() == 1, "Failed dimension check in SumBatches::forward");
-  unsigned num_args = xs[0]->d.bd;
-#ifdef __CUDACC__
-  Eigen::array<int, 1> red_axis; red_axis[0] = 2;
-  fx.t<2>().device(*dev.edevice) = xs[0]->tb<2>().sum(red_axis);
-#else
-  // TODO: Is this CPU version really good? Overhead can probably be reduced.
-  auto res = *fx;
-  const unsigned remainder = num_args % 4;
-  switch (remainder) {
-    case 0: res.setZero(); break;
-    case 1: res = xs[0]->batch_matrix(0); break;
-    case 2: res = xs[0]->batch_matrix(0) + xs[0]->batch_matrix(1); break;
-    case 3: res = xs[0]->batch_matrix(0) + xs[0]->batch_matrix(1) + xs[0]->batch_matrix(2); break;
-  }
-  for (unsigned i = remainder; i < num_args; i += 4)
-    res += xs[0]->batch_matrix(i) + xs[0]->batch_matrix(i+1) + xs[0]->batch_matrix(i+2) + xs[0]->batch_matrix(i+3);
-#endif
-}
-
-template<class MyDevice>
-void SumBatches::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  DYNET_ARG_CHECK(i == 0, "Failed dimension check in SumBatches::backward");
-#ifdef __CUDACC__
-  Eigen::array<int, 3> bcast({1, 1, (int)fx.d.bd});
-  dEdxi.tb<2>().device(*dev.edevice) += dEdf.tb<2>().broadcast(bcast);
-#else
-  for (unsigned i = 0; i < dEdxi.d.bd; ++i)
-    dEdxi.batch_matrix(i) += *dEdf;
-#endif
-}
-DYNET_NODE_INST_DEV_IMPL(SumBatches)
-
-template<class MyDevice>
-void TraceOfProduct::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-#ifdef __CUDACC__
-  DYNET_RUNTIME_ERR("TraceOfProduct not yet implemented for CUDA");
-#else
-  auto x1 = **xs[0];
-  auto x2 = **xs[1];
-  fx.v[0] = (x1 * x2.transpose()).trace();
-#endif
-}
-
-template<class MyDevice>
-void TraceOfProduct::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  DYNET_ARG_CHECK(i < 2, "Failed dimension check in TraceOfProduce::backward");
-#ifdef __CUDACC__
-  DYNET_RUNTIME_ERR("TraceOfProduct not yet implemented for CUDA");
-#else
-  const float d = dEdf.v[0];
-  auto xother = **xs[1 - i];
-  *dEdxi += d * xother;
-#endif
-}
-DYNET_NODE_INST_DEV_IMPL(TraceOfProduct)
-
-template<class MyDevice>
-void Tanh::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  fx.tvec().device(*dev.edevice) = xs[0]->tvec().tanh();
-}
-
-template<class MyDevice>
-void Tanh::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  dEdxi.tvec().device(*dev.edevice) += fx.tvec().binaryExpr(dEdf.tvec(), scalar_tanh_backward_op<float>());
-}
-DYNET_NODE_INST_DEV_IMPL(Tanh)
-
-template<class MyDevice>
-void Transpose::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  if (dim.num_nonone_dims() <= 1) {
-    fx.tvec().device(*dev.edevice) = xs[0]->tvec();
-  } else {
-    Eigen::array<ptrdiff_t, 5> order;
-    for(size_t i = 0; i < 5; ++i)
-      order[i] = (i >= dims.size() ? i : dims[i]);
-    fx.tb<4>().device(*dev.edevice) = xs[0]->tb<4>().shuffle(order);
-  }
-}
-
-template<class MyDevice>
-void Transpose::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  Eigen::array<ptrdiff_t, 5> order;
-  for(size_t i = 0; i < 5; ++i)
-    order[(i >= dims.size() ? i : dims[i])] = i;
-  dEdxi.tb<4>().device(*dev.edevice) += dEdf.tb<4>().shuffle(order);
-}
-DYNET_NODE_INST_DEV_IMPL(Transpose)
-
-template<class MyDevice>
-void Zeroes::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  DYNET_ASSERT(xs.size() == 0, "Failed dimension check in Zeroes::forward");
-  TensorTools::zero(fx);
-}
-
-template<class MyDevice>
-void Zeroes::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  DYNET_RUNTIME_ERR("Called backward() on an arity 0 node");
-}
-DYNET_NODE_INST_DEV_IMPL(Zeroes)
-
-template<class MyDevice>
-void RandomNormal::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  DYNET_ASSERT(xs.size() == 0, "Failed dimension check in RandomNormal::forward");
-  TensorTools::randomize_normal(fx);
-}
-
-template<class MyDevice>
-void RandomNormal::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  DYNET_RUNTIME_ERR("Called backward() on an arity 0 node");
-}
-DYNET_NODE_INST_DEV_IMPL(RandomNormal)
-
-template<class MyDevice>
-void RandomBernoulli::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  DYNET_ASSERT(xs.size() == 0, "Failed dimension check in RandomBernoulli::forward");
-  TensorTools::randomize_bernoulli(fx, p, scale);
-}
-
-template<class MyDevice>
-void RandomBernoulli::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  DYNET_RUNTIME_ERR("Called backward() on an arity 0 node");
-}
-DYNET_NODE_INST_DEV_IMPL(RandomBernoulli)
-
-template<class MyDevice>
-void RandomUniform::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  DYNET_ASSERT(xs.size() == 0, "Failed dimension check in RandomUniform::forward");
-  TensorTools::randomize_uniform(fx, left, right);
-}
-
-template<class MyDevice>
-void RandomUniform::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  DYNET_RUNTIME_ERR("Called backward() on an arity 0 node");
-}
-DYNET_NODE_INST_DEV_IMPL(RandomUniform)
-
-template<class MyDevice>
-void RandomGumbel::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  DYNET_ASSERT(xs.size() == 0, "Failed dimension check in RandomGumbel::forward");
-  DYNET_ARG_CHECK(mu == 0.0 && beta == 1.0, "RandomGumbel only supports Gumbel(0,1) at the moment (pull requests welcome)");
-  TensorTools::randomize_uniform(fx, 0, 1);
-  float eps = 1e-20;
-  fx.tvec().device(*dev.edevice) = -(-fx.tvec().cwiseMax(eps).log()).cwiseMax(eps).log();
-}
-
-template<class MyDevice>
-void RandomGumbel::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  DYNET_RUNTIME_ERR("Called backward() on an arity 0 node");
-}
-DYNET_NODE_INST_DEV_IMPL(RandomGumbel)
-
-template<class MyDevice>
-void MaxDimension::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  Eigen::DenseIndex* maxmap = static_cast<Eigen::DenseIndex*>(aux_mem);
-  const unsigned batch_size = dim.batch_elems();
-  const unsigned first_dim_size = dim[0];
-  const unsigned second_dim_size = dim[1];
-  Eigen::TensorMap<Eigen::Tensor<Eigen::DenseIndex, 3>> locs(maxmap, first_dim_size, second_dim_size, batch_size);
-  const Eigen::array<Eigen::DenseIndex, 1> reduction_axis = {reduced_dim};
-  locs.device(*dev.edevice) = xs[0]->tb<3>().argmax(reduced_dim);
-  fx.tb<2>().device(*dev.edevice) = xs[0]->tb<3>().maximum(reduction_axis);
-}
-
-template<class MyDevice>
-void MaxDimension::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  DYNET_ARG_CHECK(i == 0, "Failed dimension check in MaxDimension::backward");
-#ifdef __CUDACC__
-  vector<Eigen::DenseIndex> indices(dim.size());
-  Eigen::DenseIndex* maxmap = &indices[0];
-  CUDA_CHECK(cudaMemcpy((void*)maxmap, aux_mem, sizeof(Eigen::DenseIndex) * dim.size(), cudaMemcpyDeviceToHost));
-#else
-  Eigen::DenseIndex* maxmap = static_cast<Eigen::DenseIndex*>(aux_mem);
-#endif
-  const unsigned batch_size = dim.batch_elems();
-  const unsigned first_dim_size = dim[0];
-  const unsigned second_dim_size = dim[1];
-  Eigen::TensorMap<Eigen::Tensor<Eigen::DenseIndex, 3>> locs(maxmap, first_dim_size, second_dim_size, batch_size);
-  for(unsigned b = 0; b < batch_size; ++b){
-    for(unsigned j = 0; j < second_dim_size; ++j){
-      for(unsigned i = 0; i < first_dim_size; ++i){
-        if (reduced_dim > second_dim)
-          dEdxi.tb<3>().chip<3>(b).chip(locs(i, j, b), reduced_dim).chip(j, second_dim).chip(i, first_dim).device(*dev.edevice) 
-            += dEdf.tb<2>().chip<2>(b).chip<1>(j).chip<0>(i);
-        else if (reduced_dim > first_dim)
-          dEdxi.tb<3>().chip<3>(b).chip(j, second_dim).chip(locs(i, j, b), reduced_dim).chip(i, first_dim).device(*dev.edevice) 
-            += dEdf.tb<2>().chip<2>(b).chip<1>(j).chip<0>(i);
-        else
-          dEdxi.tb<3>().chip<3>(b).chip(j, second_dim).chip(i, first_dim).chip(locs(i, j, b), reduced_dim).device(*dev.edevice) 
-            += dEdf.tb<2>().chip<2>(b).chip<1>(j).chip<0>(i);
-      }
-    }
-  }
-}
-DYNET_NODE_INST_DEV_IMPL(MaxDimension)
-
-template<class MyDevice>
-void MinDimension::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  Eigen::DenseIndex* minmap = static_cast<Eigen::DenseIndex*>(aux_mem);
-  const unsigned batch_size = dim.batch_elems();
-  const unsigned first_dim_size = dim[0];
-  const unsigned second_dim_size = dim[1];
-  Eigen::TensorMap<Eigen::Tensor<Eigen::DenseIndex, 3>> locs(minmap, first_dim_size, second_dim_size, batch_size);
-  const Eigen::array<Eigen::DenseIndex, 1> reduction_axis = {reduced_dim};
-  locs.device(*dev.edevice) = xs[0]->tb<3>().argmin(reduced_dim);
-  fx.tb<2>().device(*dev.edevice) = xs[0]->tb<3>().minimum(reduction_axis);
-}
-
-template<class MyDevice>
-void MinDimension::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  DYNET_ARG_CHECK(i == 0, "Failed dimension check in MinDimension::backward");
-#ifdef __CUDACC__
-  vector<Eigen::DenseIndex> indices(dim.size());
-  Eigen::DenseIndex* minmap = &indices[0];
-  CUDA_CHECK(cudaMemcpy((void*)minmap, aux_mem, sizeof(Eigen::DenseIndex) * dim.size(), cudaMemcpyDeviceToHost));
-#else
-  Eigen::DenseIndex* minmap = static_cast<Eigen::DenseIndex*>(aux_mem);
-#endif
-  const unsigned batch_size = dim.batch_elems();
-  const unsigned first_dim_size = dim[0];
-  const unsigned second_dim_size = dim[1];
-  Eigen::TensorMap<Eigen::Tensor<Eigen::DenseIndex, 3>> locs(minmap, first_dim_size, second_dim_size, batch_size);
-  for(unsigned b = 0; b < batch_size; ++b){
-    for(unsigned j = 0; j < second_dim_size; ++j){
-      for(unsigned i = 0; i < first_dim_size; ++i){
-        if (reduced_dim > second_dim)
-          dEdxi.tb<3>().chip<3>(b).chip(locs(i, j, b), reduced_dim).chip(j, second_dim).chip(i, first_dim).device(*dev.edevice) 
-            += dEdf.tb<2>().chip<2>(b).chip<1>(j).chip<0>(i);
-        else if (reduced_dim > first_dim)
-          dEdxi.tb<3>().chip<3>(b).chip(j, second_dim).chip(locs(i, j, b), reduced_dim).chip(i, first_dim).device(*dev.edevice) 
-            += dEdf.tb<2>().chip<2>(b).chip<1>(j).chip<0>(i);
-        else
-          dEdxi.tb<3>().chip<3>(b).chip(j, second_dim).chip(i, first_dim).chip(locs(i, j, b), reduced_dim).device(*dev.edevice) 
-            += dEdf.tb<2>().chip<2>(b).chip<1>(j).chip<0>(i);
-      }
-    }
-  }
-}
-DYNET_NODE_INST_DEV_IMPL(MinDimension)
-
-template<class MyDevice>
-void WeightNormalization::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
-  DYNET_ASSERT(xs.size() == 2, "Failed dimension check in WeightNormalization::forward");
-  Eigen::array<ptrdiff_t, 1> red_axis = {0};
-  Eigen::array<ptrdiff_t, 1> bcast = {xs[0]->d.size()};
-  Eigen::array<ptrdiff_t, 1> morph = {1};
-  fx.tvec().device(*dev.edevice) = (xs[0]->tvec() / xs[0]->tvec().square().sum(red_axis).sqrt().reshape(morph).broadcast(bcast)) * as_scalar(*xs[1]);
-}
-
-template<class MyDevice>
-void WeightNormalization::backward_dev_impl(const MyDevice & dev,
-                             const vector<const Tensor*>& xs,
-                             const Tensor& fx,
-                             const Tensor& dEdf,
-                             unsigned i,
-                             Tensor& dEdxi) const {
-  Eigen::array<ptrdiff_t, 1> red_axis = {0};
-  Eigen::array<ptrdiff_t, 1> bcast = {xs[0]->d.size()};
-  Eigen::array<ptrdiff_t, 1> morph = {1};
-  if (i==0){
-    dEdxi.tvec().device(*dev.edevice) += (dEdf.tvec() / xs[0]->tvec().square().sum(red_axis).sqrt().reshape(morph).broadcast(bcast)) * as_scalar(*xs[1]) - fx.tvec() * (((dEdf.tvec() * xs[0]->tvec()).sum(red_axis)) / xs[0]->tvec().square().sum(red_axis)).reshape(morph).broadcast(bcast);
-  }else{
-    dEdxi.t<0>().device(*dev.edevice) += ((dEdf.tvec() * xs[0]->tvec()).sum(red_axis)) /  xs[0]->tvec().square().sum(red_axis).sqrt();
-  }
-}
-DYNET_NODE_INST_DEV_IMPL(WeightNormalization)
-
-} // namespace dynet
diff --git a/dynet/nodes.h b/dynet/nodes.h
index b54663a0d..ef421418b 100644
--- a/dynet/nodes.h
+++ b/dynet/nodes.h
@@ -17,6 +17,20 @@ struct AddVectorToAllColumns : public Node {
   DYNET_NODE_DEFINE_DEV_IMPL()
 };
 
+// with a single argument x \in R^{n x m}
+// y_i = \sum_j x_i,j / m
+struct AverageColumns : public Node {
+  template <typename T> explicit AverageColumns(const T& a) : Node(a) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+};
+
+// sum along a single dimension
+struct SumDimension : public Node {
+  template <typename T> explicit SumDimension(const T& a, unsigned d) : Node(a), dimension(d) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  unsigned dimension;
+};
+
 // y = L_sparsemax(x_0; q)
 // where x_0 is a std::vector of "unnormalized" probabilities
 // q are the std::vector of labels
@@ -128,13 +142,6 @@ struct Reshape : public Node {
   Dim to;
 };
 
-// y_i = \sum_{j=1}^n x_1:{i-1+j}
-struct KMHNGram : public Node {
-  explicit KMHNGram(const std::initializer_list<VariableIndex>& a, unsigned n) : Node(a), n(n) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  unsigned n;  // width, n=2 for Karl's paper
-};
-
 // n_{i,j} ~ N(0,stddev)
 // y = x + n
 struct GaussianNoise : public Node {

From c1d0a56804af9f60c72e5a91f19421cfa4de9c89 Mon Sep 17 00:00:00 2001
From: Graham Neubig <neubig@gmail.com>
Date: Mon, 10 Jul 2017 10:41:13 -0400
Subject: [PATCH 2/3] Separated header files

Former-commit-id: c4da3b7712b218c62523ca0a7d6129c625501f5d
---
 dynet/dynet.cc                   |   1 -
 dynet/nodes-activations.cc       |   2 +-
 dynet/nodes-activations.h        |  57 ++
 dynet/nodes-affinetransform.cc   |   4 +-
 dynet/nodes-affinetransform.h    |  28 +
 dynet/nodes-arith-const.cc       |   2 +-
 dynet/nodes-arith-const.h        |  41 ++
 dynet/nodes-arith-cwise.cc       |   2 +-
 dynet/nodes-arith-cwise.h        |  34 ++
 dynet/nodes-arith-scalar.cc      |   2 +-
 dynet/nodes-arith-scalar.h       |  32 ++
 dynet/nodes-arith-sum.cc         |   2 +-
 dynet/nodes-arith-sum.h          |  57 ++
 dynet/nodes-arith-unary.cc       |   2 +-
 dynet/nodes-arith-unary.h        |  83 +++
 dynet/nodes-concat.cc            |   2 +-
 dynet/nodes-concat.h             |  39 ++
 dynet/nodes-const.cc             |   2 +-
 dynet/nodes-const.h              |  18 +
 dynet/nodes-contract.cc          |  63 ++-
 dynet/nodes-contract.h           |  11 +-
 dynet/nodes-conv.h               |  63 +--
 dynet/nodes-conv2d.cc            |   3 +-
 dynet/nodes-conv2d.h             |  34 ++
 dynet/nodes-dropout.cc           |   2 +-
 dynet/nodes-dropout.h            |  47 ++
 dynet/nodes-flow.cc              |   2 +-
 dynet/nodes-flow.h               |  46 ++
 dynet/nodes-hinge.cc             |   4 +-
 dynet/nodes-hinge.h              |  28 +
 dynet/nodes-linalg.cc            |   2 +-
 dynet/nodes-linalg.h             |  40 ++
 dynet/nodes-logsumexp.cc         |   2 +-
 dynet/nodes-logsumexp.h          |  20 +
 dynet/nodes-losses.cc            |   2 +-
 dynet/nodes-losses.h             |  47 ++
 dynet/nodes-matrixmultiply.cc    |   4 +-
 dynet/nodes-matrixmultiply.h     |  27 +
 dynet/nodes-maxpooling2d.cc      |   3 +-
 dynet/nodes-maxpooling2d.h       |  36 ++
 dynet/nodes-minmax.cc            |   2 +-
 dynet/nodes-minmax.h             |  53 ++
 dynet/nodes-moments.cc           |   2 +-
 dynet/nodes-moments.h            |  76 +++
 dynet/nodes-normalization.cc     |   2 +-
 dynet/nodes-normalization.h      |  18 +
 dynet/nodes-norms.cc             |   2 +-
 dynet/nodes-norms.h              |  25 +
 dynet/nodes-pickneglogsoftmax.cc |   4 +-
 dynet/nodes-pickneglogsoftmax.h  |  40 ++
 dynet/nodes-random.cc            |   2 +-
 dynet/nodes-random.h             |  59 ++
 dynet/nodes-select.cc            |   2 +-
 dynet/nodes-select.h             |  84 +++
 dynet/nodes-similarities.cc      |   2 +-
 dynet/nodes-similarities.h       |  47 ++
 dynet/nodes-softmaxes.cc         |   2 +-
 dynet/nodes-softmaxes.h          |  66 +++
 dynet/nodes-trig.cc              |   4 +-
 dynet/nodes-trig.h               |  20 +
 dynet/nodes.h                    | 899 ++-----------------------------
 61 files changed, 1316 insertions(+), 991 deletions(-)
 create mode 100644 dynet/nodes-activations.h
 create mode 100644 dynet/nodes-affinetransform.h
 create mode 100644 dynet/nodes-arith-const.h
 create mode 100644 dynet/nodes-arith-cwise.h
 create mode 100644 dynet/nodes-arith-scalar.h
 create mode 100644 dynet/nodes-arith-sum.h
 create mode 100644 dynet/nodes-arith-unary.h
 create mode 100644 dynet/nodes-concat.h
 create mode 100644 dynet/nodes-const.h
 create mode 100644 dynet/nodes-conv2d.h
 create mode 100644 dynet/nodes-dropout.h
 create mode 100644 dynet/nodes-flow.h
 create mode 100644 dynet/nodes-hinge.h
 create mode 100644 dynet/nodes-linalg.h
 create mode 100644 dynet/nodes-logsumexp.h
 create mode 100644 dynet/nodes-losses.h
 create mode 100644 dynet/nodes-matrixmultiply.h
 create mode 100644 dynet/nodes-maxpooling2d.h
 create mode 100644 dynet/nodes-minmax.h
 create mode 100644 dynet/nodes-moments.h
 create mode 100644 dynet/nodes-normalization.h
 create mode 100644 dynet/nodes-norms.h
 create mode 100644 dynet/nodes-pickneglogsoftmax.h
 create mode 100644 dynet/nodes-random.h
 create mode 100644 dynet/nodes-select.h
 create mode 100644 dynet/nodes-similarities.h
 create mode 100644 dynet/nodes-softmaxes.h
 create mode 100644 dynet/nodes-trig.h

diff --git a/dynet/dynet.cc b/dynet/dynet.cc
index eb01eb301..fc46cfbaa 100644
--- a/dynet/dynet.cc
+++ b/dynet/dynet.cc
@@ -1,7 +1,6 @@
 #include "dynet/dynet.h"
 
 #include "dynet/exec.h"
-#include "dynet/nodes.h"
 #include "dynet/param-nodes.h"
 #include "dynet/aligned-mem-pool.h"
 #include "dynet/dynet-helper.h"
diff --git a/dynet/nodes-activations.cc b/dynet/nodes-activations.cc
index 678626ba9..e1050230c 100644
--- a/dynet/nodes-activations.cc
+++ b/dynet/nodes-activations.cc
@@ -1,4 +1,4 @@
-#include "dynet/nodes.h"
+#include "dynet/nodes-activations.h"
 
 #include "dynet/nodes-macros.h"
 #include "dynet/functors.h"
diff --git a/dynet/nodes-activations.h b/dynet/nodes-activations.h
new file mode 100644
index 000000000..a5552d035
--- /dev/null
+++ b/dynet/nodes-activations.h
@@ -0,0 +1,57 @@
+#ifndef DYNET_NODES_ACTIVATIONS_H_
+#define DYNET_NODES_ACTIVATIONS_H_
+
+#include "dynet/dynet.h"
+#include "dynet/nodes-macros.h"
+
+namespace dynet {
+
+// y = max(0,x)
+struct Rectify : public Node {
+  explicit Rectify(const std::initializer_list<VariableIndex>& a) : Node(a) {}
+  virtual bool supports_multibatch() const override { return true; }
+  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override { Sig s(nt::rectify); return sm.get_idx(s); }
+  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override { return std::vector<int>(1, 1); }  
+  DYNET_NODE_DEFINE_DEV_IMPL()
+};
+
+// y = \sigma(x_1)
+struct LogisticSigmoid : public Node {
+  explicit LogisticSigmoid(const std::initializer_list<VariableIndex>& a) : Node(a) {}
+  virtual bool supports_multibatch() const override { return true; }
+  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override { Sig s(nt::logistic); return sm.get_idx(s); }
+  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override { return std::vector<int>(1, 1); }  
+  DYNET_NODE_DEFINE_DEV_IMPL()
+};
+
+// y = x / (1 + |x|)
+struct SoftSign : public Node {
+  explicit SoftSign(const std::initializer_list<VariableIndex>& a) : Node(a) {}
+  virtual bool supports_multibatch() const override { return true; }
+  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override { Sig s(nt::softsign); return sm.get_idx(s); }
+  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override { return std::vector<int>(1, 1); }  
+  DYNET_NODE_DEFINE_DEV_IMPL()
+};
+
+// y = erf x_1
+struct Erf : public Node {
+  explicit Erf(const std::initializer_list<VariableIndex>& a) : Node(a) {}
+  virtual bool supports_multibatch() const override { return true; }
+  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override { Sig s(nt::erf); return sm.get_idx(s); }
+  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override { return std::vector<int>(1, 1); }  
+  DYNET_NODE_DEFINE_DEV_IMPL()
+};
+
+// y = ELU(x), parameterized by lambda and alpha (both default to 1)
+struct ExponentialLinearUnit : public Node {
+  explicit ExponentialLinearUnit(const std::initializer_list<VariableIndex>& a, float lambda=1.f, float alpha=1.f) : Node(a), lambda(lambda), alpha(alpha) {}
+  virtual bool supports_multibatch() const override { return true; }
+  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override { Sig s(nt::rectify); s.add_float(lambda); s.add_float(alpha); return sm.get_idx(s); }  // lambda/alpha are folded in so the signature differs from plain Rectify and from ELUs with other parameters
+  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override { return std::vector<int>(1, 1); }  
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  float lambda, alpha;  // per-node parameters set by the constructor
+};
+
+} // namespace dynet
+
+#endif
diff --git a/dynet/nodes-affinetransform.cc b/dynet/nodes-affinetransform.cc
index a417a0651..596761248 100644
--- a/dynet/nodes-affinetransform.cc
+++ b/dynet/nodes-affinetransform.cc
@@ -1,4 +1,4 @@
-#include "dynet/nodes.h"
+#include "dynet/nodes-affinetransform.h"
 
 #include "dynet/nodes-macros.h"
 #include "dynet/cuda-matrix-multiply.h"
@@ -7,6 +7,8 @@ using namespace std;
 
 namespace dynet {
 
+// ************* AffineTransform *************
+
 #ifndef __CUDACC__
 
 string AffineTransform::as_string(const vector<string>& arg_names) const {
diff --git a/dynet/nodes-affinetransform.h b/dynet/nodes-affinetransform.h
new file mode 100644
index 000000000..4f67ebc05
--- /dev/null
+++ b/dynet/nodes-affinetransform.h
@@ -0,0 +1,28 @@
+#ifndef DYNET_NODES_AFFINETRANSFORM_H_
+#define DYNET_NODES_AFFINETRANSFORM_H_
+
+#include "dynet/dynet.h"
+#include "dynet/nodes-macros.h"
+
+namespace dynet {
+
+// y = x_1 + \sum_{i=2, 4 ...} A_i * x_{i+1}
+struct AffineTransform : public Node {
+  template <typename T> explicit AffineTransform(const T& a) : Node(a) {}
+  virtual bool supports_multibatch() const override { return true; }
+  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override;
+  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override;
+  virtual void autobatch_reshape(const ComputationGraph & cg,
+                                 const std::vector<VariableIndex> & batch_ids,
+                                 const std::vector<int> & concat,
+                                 std::vector<const Tensor*>& xs,
+                                 Tensor& fx) const override {
+    autobatch_reshape_concatonly(cg, batch_ids, concat, xs, fx);
+  }
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  mutable float* dEdf_mem;
+};
+
+} // namespace dynet
+
+#endif
diff --git a/dynet/nodes-arith-const.cc b/dynet/nodes-arith-const.cc
index ce7818e71..13b260301 100644
--- a/dynet/nodes-arith-const.cc
+++ b/dynet/nodes-arith-const.cc
@@ -1,4 +1,4 @@
-#include "dynet/nodes.h"
+#include "dynet/nodes-arith-const.h"
 
 #include "dynet/nodes-macros.h"
 #include "dynet/functors.h"
diff --git a/dynet/nodes-arith-const.h b/dynet/nodes-arith-const.h
new file mode 100644
index 000000000..86c736206
--- /dev/null
+++ b/dynet/nodes-arith-const.h
@@ -0,0 +1,41 @@
+#ifndef DYNET_NODES_ARITH_CONST_H_
+#define DYNET_NODES_ARITH_CONST_H_
+
+#include "dynet/dynet.h"
+#include "dynet/nodes-macros.h"
+
+namespace dynet {
+
+// y = c + x_1
+// (c is a scalar constant applied to every element, usually 1, but can be configured)
+struct ConstantPlusX : public Node {
+  explicit ConstantPlusX(const std::initializer_list<VariableIndex>& a, real o) : Node(a), c(o) {}
+  virtual bool supports_multibatch() const override { return true; }
+  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override { Sig s(nt::plus_const); s.add_float(c); return sm.get_idx(s); }
+  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override { return std::vector<int>(1, 1); }  
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  real c;
+};
+
+// y = c - x_1
+// (c is a scalar constant applied to every element, usually 1, but can be configured)
+struct ConstantMinusX : public Node {
+  explicit ConstantMinusX(const std::initializer_list<VariableIndex>& a, real o) : Node(a), c(o) {}
+  virtual bool supports_multibatch() const override { return true; }
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  real c;
+};
+
+// y = alpha * x_1
+struct ConstScalarMultiply : public Node {
+  explicit ConstScalarMultiply(const std::initializer_list<VariableIndex>& a, float alpha) : Node(a), alpha(alpha) {}
+  virtual bool supports_multibatch() const override { return true; }
+  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override { Sig s(nt::scalar_mult); s.add_float(alpha); return sm.get_idx(s); }
+  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override { return std::vector<int>(1, 1); }
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  float alpha;
+};
+
+} // namespace dynet
+
+#endif
diff --git a/dynet/nodes-arith-cwise.cc b/dynet/nodes-arith-cwise.cc
index 26ac47b0e..28eccf3cf 100644
--- a/dynet/nodes-arith-cwise.cc
+++ b/dynet/nodes-arith-cwise.cc
@@ -1,4 +1,4 @@
-#include "dynet/nodes.h"
+#include "dynet/nodes-arith-cwise.h"
 
 #include "dynet/nodes-macros.h"
 
diff --git a/dynet/nodes-arith-cwise.h b/dynet/nodes-arith-cwise.h
new file mode 100644
index 000000000..925332ee2
--- /dev/null
+++ b/dynet/nodes-arith-cwise.h
@@ -0,0 +1,34 @@
+#ifndef DYNET_NODES_ARITH_CWISE_H_
+#define DYNET_NODES_ARITH_CWISE_H_
+
+#include "dynet/dynet.h"
+#include "dynet/nodes-macros.h"
+
+namespace dynet {
+
+// y = x_1 \cdot x_2  (Hadamard product)
+struct CwiseMultiply : public Node {
+  explicit CwiseMultiply(const std::initializer_list<VariableIndex>& a) : Node(a) {}
+  virtual bool supports_multibatch() const override { return true; }
+  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override;
+  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override;
+  DYNET_NODE_DEFINE_DEV_IMPL()
+};
+
+// y = x_1 / x_2  (cwiseQuotient)
+struct CwiseQuotient : public Node {
+  explicit CwiseQuotient(const std::initializer_list<VariableIndex>& a) : Node(a) {}
+  virtual bool supports_multibatch() const override { return true; }
+  DYNET_NODE_DEFINE_DEV_IMPL()
+};
+
+// y = pow(x_1, x_2)
+// raises every element in x_1 to the power of the scalar x_2
+struct Pow : public Node {
+  explicit Pow(const std::initializer_list<VariableIndex>& a) : Node(a) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+};
+
+} // namespace dynet
+
+#endif
diff --git a/dynet/nodes-arith-scalar.cc b/dynet/nodes-arith-scalar.cc
index baca3fe0a..eafd19af5 100644
--- a/dynet/nodes-arith-scalar.cc
+++ b/dynet/nodes-arith-scalar.cc
@@ -1,4 +1,4 @@
-#include "dynet/nodes.h"
+#include "dynet/nodes-arith-scalar.h"
 
 #include "dynet/nodes-macros.h"
 
diff --git a/dynet/nodes-arith-scalar.h b/dynet/nodes-arith-scalar.h
new file mode 100644
index 000000000..f3d536895
--- /dev/null
+++ b/dynet/nodes-arith-scalar.h
@@ -0,0 +1,32 @@
+#ifndef DYNET_NODES_ARITH_SCALAR_H_
+#define DYNET_NODES_ARITH_SCALAR_H_
+
+#include "dynet/dynet.h"
+#include "dynet/nodes-macros.h"
+
+namespace dynet {
+
+// y = x_1 + x_2  (Addition where x_2 is a scalar)
+struct ScalarAdd : public Node {
+  explicit ScalarAdd(const std::initializer_list<VariableIndex>& a) : Node(a) {}
+  virtual bool supports_multibatch() const override { return true; }
+  DYNET_NODE_DEFINE_DEV_IMPL()
+};
+
+// y = x_1 \cdot x_2  (Hadamard product where x_1 is a scalar)
+struct ScalarMultiply : public Node {
+  explicit ScalarMultiply(const std::initializer_list<VariableIndex>& a) : Node(a) {}
+  virtual bool supports_multibatch() const override { return true; }
+  DYNET_NODE_DEFINE_DEV_IMPL()
+};
+
+// y = x_1 / x_2  (Elementwise division where x_2 is a scalar)
+struct ScalarQuotient : public Node {
+  explicit ScalarQuotient(const std::initializer_list<VariableIndex>& a) : Node(a) {}
+  virtual bool supports_multibatch() const override { return true; }
+  DYNET_NODE_DEFINE_DEV_IMPL()
+};
+
+} // namespace dynet
+
+#endif
diff --git a/dynet/nodes-arith-sum.cc b/dynet/nodes-arith-sum.cc
index d3b96f24f..f7622e99c 100644
--- a/dynet/nodes-arith-sum.cc
+++ b/dynet/nodes-arith-sum.cc
@@ -1,4 +1,4 @@
-#include "dynet/nodes.h"
+#include "dynet/nodes-arith-sum.h"
 
 #include "dynet/nodes-macros.h"
 
diff --git a/dynet/nodes-arith-sum.h b/dynet/nodes-arith-sum.h
new file mode 100644
index 000000000..4a419f545
--- /dev/null
+++ b/dynet/nodes-arith-sum.h
@@ -0,0 +1,57 @@
+#ifndef DYNET_NODES_ARITH_SUM_H_
+#define DYNET_NODES_ARITH_SUM_H_
+
+#include "dynet/dynet.h"
+#include "dynet/nodes-macros.h"
+
+namespace dynet {
+
+// y = \sum_i x_i
+struct Sum : public Node {
+  template <typename T> explicit Sum(const T& a) : Node(a) {}
+  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override;
+  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override;
+  virtual void autobatch_reshape(const ComputationGraph & cg,
+                                 const std::vector<VariableIndex> & batch_ids,
+                                 const std::vector<int> & concat,
+                                 std::vector<const Tensor*>& xs,
+                                 Tensor& fx) const override {
+    if(dim.bd != 1)
+      autobatch_reshape_concatonly(cg, batch_ids, concat, xs, fx);
+  }
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  virtual bool supports_multibatch() const override { return true; }
+};
+
+// y = \sum_i,j,... x[i,j,...]
+struct SumElements : public Node {
+  template <typename T> explicit SumElements(const T& a) : Node(a) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  virtual bool supports_multibatch() const override { return true; }
+};
+
+// sum along a single dimension
+struct SumDimension : public Node {
+  template <typename T> explicit SumDimension(const T& a, unsigned d) : Node(a), dimension(d) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  unsigned dimension;
+};
+
+// y = \sum_i x_i
+struct SumBatches : public Node {
+  template <typename T> explicit SumBatches(const T& a) : Node(a) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  virtual bool supports_multibatch() const override { return true; }
+};
+
+// M = x_0, v = x_1
+// y = M + v (broadcasting over columns)
+struct AddVectorToAllColumns : public Node {
+  explicit AddVectorToAllColumns(const std::initializer_list<VariableIndex>& a) : Node(a) {}
+  virtual bool supports_multibatch() const override { return true; }
+  DYNET_NODE_DEFINE_DEV_IMPL()
+};
+
+} // namespace dynet
+
+#endif
diff --git a/dynet/nodes-arith-unary.cc b/dynet/nodes-arith-unary.cc
index 4d779279e..c712ee3c2 100644
--- a/dynet/nodes-arith-unary.cc
+++ b/dynet/nodes-arith-unary.cc
@@ -1,4 +1,4 @@
-#include "dynet/nodes.h"
+#include "dynet/nodes-arith-unary.h"
 
 #include "dynet/nodes-macros.h"
 #include "dynet/functors.h"
diff --git a/dynet/nodes-arith-unary.h b/dynet/nodes-arith-unary.h
new file mode 100644
index 000000000..9200287e2
--- /dev/null
+++ b/dynet/nodes-arith-unary.h
@@ -0,0 +1,83 @@
+#ifndef DYNET_NODES_ARITH_UNARY_H_
+#define DYNET_NODES_ARITH_UNARY_H_
+
+#include "dynet/dynet.h"
+#include "dynet/nodes-macros.h"
+
+namespace dynet {
+
+// y = x_1 \odot x_1
+struct Square : public Node {
+  explicit Square(const std::initializer_list<VariableIndex>& a) : Node(a) {}
+  virtual bool supports_multibatch() const override { return true; }
+  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override { Sig s(nt::square); return sm.get_idx(s); }
+  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override { return std::vector<int>(1, 1); }  
+  DYNET_NODE_DEFINE_DEV_IMPL()
+};
+
+// y = x_1 \odot x_1 \odot x_1
+struct Cube : public Node {
+  explicit Cube(const std::initializer_list<VariableIndex>& a) : Node(a) {}
+  virtual bool supports_multibatch() const override { return true; }
+  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override { Sig s(nt::cube); return sm.get_idx(s); }
+  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override { return std::vector<int>(1, 1); }  
+  DYNET_NODE_DEFINE_DEV_IMPL()
+};
+
+// y = sqrt x_1
+struct Sqrt : public Node {
+  explicit Sqrt(const std::initializer_list<VariableIndex>& a) : Node(a) {}
+  virtual bool supports_multibatch() const override { return true; }
+  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override { Sig s(nt::sqrt); return sm.get_idx(s); }
+  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override { return std::vector<int>(1, 1); }  
+  DYNET_NODE_DEFINE_DEV_IMPL()
+};
+
+// y = exp x_1
+struct Exp : public Node {
+  explicit Exp(const std::initializer_list<VariableIndex>& a) : Node(a) {}
+  virtual bool supports_multibatch() const override { return true; }
+  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override { Sig s(nt::exp); return sm.get_idx(s); }
+  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override { return std::vector<int>(1, 1); }  
+  DYNET_NODE_DEFINE_DEV_IMPL()
+};
+
+// y = log x_1  (base e, i.e., natural log)
+struct Log : public Node {
+  explicit Log(const std::initializer_list<VariableIndex>& a) : Node(a) {}
+  virtual bool supports_multibatch() const override { return true; }
+  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override { Sig s(nt::log); return sm.get_idx(s); }
+  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override { return std::vector<int>(1, 1); }  
+  DYNET_NODE_DEFINE_DEV_IMPL()
+};
+
+// y = -x_1
+struct Negate : public Node {
+  explicit Negate(const std::initializer_list<VariableIndex>& a) : Node(a) {}
+  virtual bool supports_multibatch() const override { return true; } 
+  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override { Sig s(nt::negate); return sm.get_idx(s); }
+  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override { return std::vector<int>(1, 1); }  
+  DYNET_NODE_DEFINE_DEV_IMPL()
+};
+
+// y = abs x_1
+struct Abs : public Node {
+  explicit Abs(const std::initializer_list<VariableIndex>& a) : Node(a) {}
+  virtual bool supports_multibatch() const override { return true; }
+  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override { Sig s(nt::abs); return sm.get_idx(s); }
+  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override { return std::vector<int>(1, 1); }  
+  DYNET_NODE_DEFINE_DEV_IMPL()
+};
+
+// y = lgamma x_1
+struct LogGamma : public Node {
+  explicit LogGamma(const std::initializer_list<VariableIndex>& a) : Node(a) {}
+  virtual bool supports_multibatch() const override { return true; }
+  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override { Sig s(nt::loggamma); return sm.get_idx(s); }
+  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override { return std::vector<int>(1, 1); }  
+  DYNET_NODE_DEFINE_DEV_IMPL()
+};
+
+} // namespace dynet
+
+#endif
diff --git a/dynet/nodes-concat.cc b/dynet/nodes-concat.cc
index f601ef67d..fa96965b2 100644
--- a/dynet/nodes-concat.cc
+++ b/dynet/nodes-concat.cc
@@ -1,4 +1,4 @@
-#include "dynet/nodes.h"
+#include "dynet/nodes-concat.h"
 
 #include "dynet/nodes-macros.h"
 #include "dynet/functors.h"
diff --git a/dynet/nodes-concat.h b/dynet/nodes-concat.h
new file mode 100644
index 000000000..fabc35b08
--- /dev/null
+++ b/dynet/nodes-concat.h
@@ -0,0 +1,39 @@
+#ifndef DYNET_NODES_CONCAT_H_
+#define DYNET_NODES_CONCAT_H_
+
+#include "dynet/dynet.h"
+#include "dynet/nodes-macros.h"
+
+namespace dynet {
+
+// concatenate along a particular dimension
+struct Concatenate : public Node {
+  template <typename T> explicit Concatenate(const T& a, unsigned d) : Node(a), dimension(d) {}
+  virtual bool supports_multibatch() const override { return true; }
+  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override;
+  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override { return std::vector<int>(args.size(), 1); }  
+  virtual void autobatch_reshape(const ComputationGraph & cg,
+                                 const std::vector<VariableIndex> & batch_ids,
+                                 const std::vector<int> & concat,
+                                 std::vector<const Tensor*>& xs,
+                                 Tensor& fx) const override {
+    autobatch_reshape_concatonly(cg, batch_ids, concat, xs, fx);
+  }
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  // src_indices[i] says what row in fx the ith x vector was assigned to
+  // used to simplify backprop
+  mutable std::vector<unsigned> src_indices;
+  unsigned dimension;
+};
+
+// concatenate different batched expressions into one single batched tensor
+struct ConcatenateToBatch : public Node {
+  template <typename T> explicit ConcatenateToBatch(const T& a) : Node(a) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  virtual bool supports_multibatch() const override {return true;}
+  mutable std::vector<unsigned> src_element_indices;
+};
+
+} // namespace dynet
+
+#endif
diff --git a/dynet/nodes-const.cc b/dynet/nodes-const.cc
index 3f99a84d4..0f3ec4fc2 100644
--- a/dynet/nodes-const.cc
+++ b/dynet/nodes-const.cc
@@ -1,4 +1,4 @@
-#include "dynet/nodes.h"
+#include "dynet/nodes-const.h"
 
 #include "dynet/nodes-macros.h"
 
diff --git a/dynet/nodes-const.h b/dynet/nodes-const.h
new file mode 100644
index 000000000..6f21183e9
--- /dev/null
+++ b/dynet/nodes-const.h
@@ -0,0 +1,18 @@
+#ifndef DYNET_NODES_CONST_H_
+#define DYNET_NODES_CONST_H_
+
+#include "dynet/dynet.h"
+#include "dynet/nodes-macros.h"
+
+namespace dynet {
+
+// represents a simple vector of 0s
+struct Zeroes : public Node {
+  explicit Zeroes(const Dim& d) : dim(d) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  Dim dim;
+};
+
+} // namespace dynet
+
+#endif
diff --git a/dynet/nodes-contract.cc b/dynet/nodes-contract.cc
index 99719ccfa..aad22fa87 100644
--- a/dynet/nodes-contract.cc
+++ b/dynet/nodes-contract.cc
@@ -5,14 +5,11 @@
 #include <stdexcept>
 
 #include "dynet/nodes-macros.h"
-#include "dynet/nodes.h"
 
 // This file takes a long time to compile on GPU. Uncomment this line to skip it.
 #define DYNET_SKIP_CUDA_CONTRACTIONS
 
-
 #if defined(__CUDACC__) && !defined(DYNET_SKIP_CUDA_CONTRACTIONS)
-#include "dynet/nodes.cc"
 #include "dynet/cuda.h"
 #include "dynet/gpu-ops.h"
 #include "dynet/cuda-matrix-multiply.h"
@@ -23,6 +20,8 @@ using namespace std;
 
 namespace dynet {
 
+// ************* InnerProduct3D_1D *************
+
 #ifndef __CUDACC__
 
 string InnerProduct3D_1D::as_string(const vector<string>& arg_names) const {
@@ -50,32 +49,6 @@ Dim InnerProduct3D_1D::dim_forward(const vector<Dim>& xs) const {
   return d;
 }
 
-string InnerProduct3D_1D_1D::as_string(const vector<string>& arg_names) const {
-  ostringstream s;
-  s << "dotdot(" << arg_names[0] << "," << arg_names[1] << "," << arg_names[2] << ')';
-  if (arg_names.size() == 4) s << " + " << arg_names[3];
-  return s.str();
-}
-
-Dim InnerProduct3D_1D_1D::dim_forward(const vector<Dim>& xs) const {
-  if (xs.size() != 3 && xs.size() != 4)
-    throw std::invalid_argument("Expected three or four arguments in InnerProduct3D_1D");
-  if (xs[0].ndims() != 3 ||
-      !LooksLikeVector(xs[1]) ||
-      !LooksLikeVector(xs[2])) {
-    // TODO fix add check
-    ostringstream s; s << "Bad input dimensions in InnerProduct3D_1D_1D: " << xs;
-    throw std::invalid_argument(s.str());
-  }
-  Dim d({xs[0].size(0)}, max(max(xs[0].bd, xs[1].bd), xs[2].bd));
-  if (xs.size() == 4) d.bd = max(d.bd, xs[3].bd);
-  if (xs.size() == 4 && xs[3] != d) {
-    ostringstream s; s << "Bad input dimensions in InnerProduct3D_1D_1D: " << xs;
-    throw std::invalid_argument(s.str());
-  }
-  return d;
-}
-
 #endif
 
 //   Y_ij = A_ijk * B_k (+ C_ij)
@@ -273,6 +246,38 @@ void InnerProduct3D_1D::backward_dev_impl(const MyDevice & dev,
 }
 DYNET_NODE_INST_DEV_IMPL(InnerProduct3D_1D)
 
+// ************* InnerProduct3D_1D_1D *************
+
+#ifndef __CUDACC__
+
+string InnerProduct3D_1D_1D::as_string(const vector<string>& arg_names) const {
+  ostringstream s;
+  s << "dotdot(" << arg_names[0] << "," << arg_names[1] << "," << arg_names[2] << ')';
+  if (arg_names.size() == 4) s << " + " << arg_names[3];
+  return s.str();
+}
+
+Dim InnerProduct3D_1D_1D::dim_forward(const vector<Dim>& xs) const {  // inputs: A (3-D), B, C (vector-like), optional bias D
+  if (xs.size() != 3 && xs.size() != 4)
+    throw std::invalid_argument("Expected three or four arguments in InnerProduct3D_1D_1D");
+  if (xs[0].ndims() != 3 ||
+      !LooksLikeVector(xs[1]) ||
+      !LooksLikeVector(xs[2])) {
+    // TODO fix add check
+    ostringstream s; s << "Bad input dimensions in InnerProduct3D_1D_1D: " << xs;
+    throw std::invalid_argument(s.str());
+  }
+  Dim d({xs[0].size(0)}, max(max(xs[0].bd, xs[1].bd), xs[2].bd));  // vector of length A.size(0); batch size is the max over A, B, C
+  if (xs.size() == 4) d.bd = max(d.bd, xs[3].bd);  // the optional bias also contributes to the batch size
+  if (xs.size() == 4 && xs[3] != d) {  // the optional bias must equal the output Dim
+    ostringstream s; s << "Bad input dimensions in InnerProduct3D_1D_1D: " << xs;
+    throw std::invalid_argument(s.str());
+  }
+  return d;
+}
+
+#endif
+
 //   Y_ij = A_ijk * B_k * C_j (+ D_i)
 template<class MyDevice>
 void InnerProduct3D_1D_1D::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
diff --git a/dynet/nodes-contract.h b/dynet/nodes-contract.h
index ce56fc289..58e7637ad 100644
--- a/dynet/nodes-contract.h
+++ b/dynet/nodes-contract.h
@@ -2,20 +2,11 @@
 #define DYNET_NODES_CONTRACT_H_
 
 #include "dynet/dynet.h"
-#include "dynet/devices.h"
 #include "dynet/nodes-macros.h"
 
-// See nodes-macros.h for more details about DYNET_NODE_DEFINE_DEV_IMPL().
-
 namespace dynet {
 
-// Forward:
-//   Y_ij = A_ijk * B_k + C_ij
-//
-// Backward:
-//   (dE/dA)_ijk = (dE/dY)_ij * L_k
-//   (dE/dB)_k = (dE/dY)_ij * A_ijk
-//   (dE/dC)_ij = (dE/dY)_ij
+//   Y_ij = A_ijk * B_k (+ C_ij)
 struct InnerProduct3D_1D : public Node {
   InnerProduct3D_1D(const std::initializer_list<VariableIndex>& a) : Node(a) {}
   virtual bool supports_multibatch() const override { return true; }
diff --git a/dynet/nodes-conv.h b/dynet/nodes-conv.h
index a4fd2ca02..b9bd6ff31 100644
--- a/dynet/nodes-conv.h
+++ b/dynet/nodes-conv.h
@@ -3,11 +3,6 @@
 
 #include "dynet/dynet.h"
 #include "dynet/nodes-macros.h"
-#include "dynet/op-helper.h"
-
-#if HAVE_CUDNN
-#include "dynet/cudnn-ops.h"
-#endif
 
 namespace dynet {
 
@@ -39,54 +34,6 @@ struct KMaxPooling : public Node {
   unsigned second_dim;
 };
 
-// conv2d 
-// y = x_1 *conv2d x_2
-// x_1 \in R^{H x W x Ci x N} (input)
-// x_2 \in R^{H x W x Ci x Co} (filter)
-// stride[0] corresponds to H
-// stride[1] corresponds to W
-// is_valid: true for 'VALID' and false for 'SAME'
-struct Conv2D: public Node {
-  explicit Conv2D(const std::initializer_list<VariableIndex>& a, const std::vector<unsigned>& s,
-    const bool padding_type = true)
-      : Node(a), stride(s), is_valid(padding_type) {}
-  virtual bool supports_multibatch() const override { return true; }
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  size_t aux_storage_size() const override;
-  const std::vector<unsigned> stride;
-  const bool is_valid;
-
- private:
-#if HAVE_CUDNN
-  mutable CudnnConvOp* cudnn_conv_op_ = NULL;
-#endif
-};
-
-// maxpooling2d
-// y = x_1 * maxpooling2d
-// x_1 \in R^{H x W x Ci x N} (input)
-// ksize[0] corresponds to H
-// ksize[1] corresponds to W
-// stride[0] corresponds to H
-// stride[1] corresponds to W
-// is_valid: true for 'VALID' and false for 'SAME'
-struct MaxPooling2D: public Node {
-  explicit MaxPooling2D(const std::initializer_list<VariableIndex>& a, const std::vector<unsigned>& k, const std::vector<unsigned>& s,
-    const bool padding_type = true)
-      : Node(a), ksize(k), stride(s), is_valid(padding_type) {}
-  virtual bool supports_multibatch() const override { return true; }
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  size_t aux_storage_size() const override;
-  const std::vector<unsigned> ksize;
-  const std::vector<unsigned> stride;
-  const bool is_valid;
-
- private:
-#if HAVE_CUDNN
-  mutable CudnnMaxPooling2DOp* cudnn_maxpool_op_ = NULL;
-#endif
-};
-
 // y_i = \sum_{j=1}^n x_1:{i-1+j}
 struct KMHNGram : public Node {
   explicit KMHNGram(const std::initializer_list<VariableIndex>& a, unsigned n) : Node(a), n(n) {}
@@ -94,6 +41,16 @@ struct KMHNGram : public Node {
   unsigned n;  // width, n=2 for Karl's paper
 };
 
+// hyperparameter: width > 1
+// x_1 is a vector in R^n, which we write x
+// y is a vector in R^{n / width}
+// y_i = max{x_{i * width - width + 1}, ..., x_{i * width}}
+struct MaxPooling1D : public Node {
+  MaxPooling1D(const std::initializer_list<VariableIndex>& a, unsigned w) : Node(a), width(w) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  unsigned width;
+  mutable std::vector<unsigned> ind;
+};
 
 } // namespace dynet
 
diff --git a/dynet/nodes-conv2d.cc b/dynet/nodes-conv2d.cc
index 9918081ef..0a7cdedc9 100644
--- a/dynet/nodes-conv2d.cc
+++ b/dynet/nodes-conv2d.cc
@@ -1,4 +1,4 @@
-#include "dynet/nodes-conv.h"
+#include "dynet/nodes-conv2d.h"
 
 #include <algorithm>
 #include <sstream>
@@ -9,6 +9,7 @@
 
 #include "dynet/functors.h"
 #include "dynet/nodes-macros.h"
+#include "dynet/op-helper.h"
 #include "third_party/eigen_spatial_convolutions.h"
 #include "third_party/eigen_backward_spatial_convolutions.h"
 
diff --git a/dynet/nodes-conv2d.h b/dynet/nodes-conv2d.h
new file mode 100644
index 000000000..484c3222a
--- /dev/null
+++ b/dynet/nodes-conv2d.h
@@ -0,0 +1,34 @@
+#ifndef DYNET_NODES_CONV2D_H_
+#define DYNET_NODES_CONV2D_H_
+
+#include "dynet/dynet.h"
+#include "dynet/nodes-macros.h"
+
+namespace dynet {
+
+// conv2d 
+// y = x_1 *conv2d x_2
+// x_1 \in R^{H x W x Ci x N} (input)
+// x_2 \in R^{H x W x Ci x Co} (filter)
+// stride[0] corresponds to H
+// stride[1] corresponds to W
+// is_valid: true for 'VALID' and false for 'SAME'
+struct Conv2D: public Node {
+  explicit Conv2D(const std::initializer_list<VariableIndex>& a, const std::vector<unsigned>& s,
+    const bool padding_type = true)
+      : Node(a), stride(s), is_valid(padding_type) {}
+  virtual bool supports_multibatch() const override { return true; }
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  size_t aux_storage_size() const override;
+  const std::vector<unsigned> stride;
+  const bool is_valid;
+
+ private:
+#if HAVE_CUDNN
+  mutable CudnnConvOp* cudnn_conv_op_ = NULL;
+#endif
+};
+
+} // namespace dynet
+
+#endif
diff --git a/dynet/nodes-dropout.cc b/dynet/nodes-dropout.cc
index 7addca034..1d052f18b 100644
--- a/dynet/nodes-dropout.cc
+++ b/dynet/nodes-dropout.cc
@@ -1,4 +1,4 @@
-#include "dynet/nodes.h"
+#include "dynet/nodes-dropout.h"
 
 #include "dynet/nodes-macros.h"
 
diff --git a/dynet/nodes-dropout.h b/dynet/nodes-dropout.h
new file mode 100644
index 000000000..fc9913bd2
--- /dev/null
+++ b/dynet/nodes-dropout.h
@@ -0,0 +1,47 @@
+#ifndef DYNET_NODES_DROPOUT_H_
+#define DYNET_NODES_DROPOUT_H_
+
+#include "dynet/dynet.h"
+#include "dynet/nodes-macros.h"
+
+namespace dynet {
+
+// y = dropout(x,p) where p specifies the dropout probability
+struct Dropout : public Node {
+  explicit Dropout(const std::initializer_list<VariableIndex>& a, real p) : Node(a), p(p) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  size_t aux_storage_size() const override;
+  virtual bool supports_multibatch() const override { return true; }
+  real p;
+};
+
+// y = dropout_dim(x,d,p) where d selects the dimension and p specifies the dropout probability
+struct DropoutDim : public Node {
+  explicit DropoutDim(const std::initializer_list<VariableIndex>& a, unsigned d,real p) : Node(a), dimension(d), p(p) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  size_t aux_storage_size() const override;
+  virtual bool supports_multibatch() const override { return true; }
+  unsigned dimension;
+  real p;
+};
+
+// y = dropout_batch(x,p) where p specifies the probability of dropping out an entire batch element
+struct DropoutBatch : public Node {
+  explicit DropoutBatch(const std::initializer_list<VariableIndex>& a, real p) : Node(a), p(p) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  size_t aux_storage_size() const override;
+  virtual bool supports_multibatch() const override { return true; }
+  real p;
+};
+
+// y = block_dropout(x,p) where p specifies the probability for dropping-out the entire block
+struct BlockDropout : public Node {
+  explicit BlockDropout(const std::initializer_list<VariableIndex>& a, real p) : Node(a), dropout_probability(p) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  size_t aux_storage_size() const override;
+  real dropout_probability;
+};
+
+} // namespace dynet
+
+#endif
diff --git a/dynet/nodes-flow.cc b/dynet/nodes-flow.cc
index 95d0b10ae..4ec5507fe 100644
--- a/dynet/nodes-flow.cc
+++ b/dynet/nodes-flow.cc
@@ -1,4 +1,4 @@
-#include "dynet/nodes.h"
+#include "dynet/nodes-flow.h"
 
 #include "dynet/nodes-macros.h"
 
diff --git a/dynet/nodes-flow.h b/dynet/nodes-flow.h
new file mode 100644
index 000000000..189afba30
--- /dev/null
+++ b/dynet/nodes-flow.h
@@ -0,0 +1,46 @@
+#ifndef DYNET_NODES_FLOW_H_
+#define DYNET_NODES_FLOW_H_
+
+#include "dynet/dynet.h"
+#include "dynet/nodes-macros.h"
+
+namespace dynet {
+
+// y = reshape(x_1, --> to)
+struct Reshape : public Node {
+  explicit Reshape(const std::initializer_list<VariableIndex>& a, const Dim& to) : Node(a), to(to) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  virtual bool supports_multibatch() const override { return true; }
+  Dim to;
+};
+
+// y = x_1
+struct Identity : public Node {
+  explicit Identity(const std::initializer_list<VariableIndex>& a) : Node(a) {}
+  virtual bool supports_multibatch() const override { return true; }
+  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override { Sig s(nt::identity); return sm.get_idx(s); }
+  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override { return std::vector<int>(1, 1); }  
+  DYNET_NODE_DEFINE_DEV_IMPL()
+};
+
+// y = x_1, but dy/dx is set to 0
+struct NoBackprop : public Node {
+  explicit NoBackprop(const std::initializer_list<VariableIndex>& a) : Node(a) {}
+  virtual bool supports_multibatch() const override { return true; }
+  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override { Sig s(nt::nobackprop); return sm.get_idx(s); }
+  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override { return std::vector<int>(1, 1); }  
+  DYNET_NODE_DEFINE_DEV_IMPL()
+};
+
+// y = x_1, but the gradient dy/dx is negated
+struct FlipGradient : public Node {
+  explicit FlipGradient(const std::initializer_list<VariableIndex>& a) : Node(a) {}
+  virtual bool supports_multibatch() const override { return true; }
+  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override { Sig s(nt::flipgradient); return sm.get_idx(s); }
+  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override { return std::vector<int>(1, 1); }  
+  DYNET_NODE_DEFINE_DEV_IMPL()
+};  
+
+} // namespace dynet
+
+#endif
diff --git a/dynet/nodes-hinge.cc b/dynet/nodes-hinge.cc
index 50db9b3b3..f3af41cce 100644
--- a/dynet/nodes-hinge.cc
+++ b/dynet/nodes-hinge.cc
@@ -1,4 +1,4 @@
-#include "dynet/nodes.h"
+#include "dynet/nodes-hinge.h"
 
 #include "dynet/nodes-macros.h"
 
@@ -6,6 +6,8 @@ using namespace std;
 
 namespace dynet {
 
+// ************* Hinge *************
+
 #ifndef __CUDACC__
 
 string Hinge::as_string(const vector<string>& arg_names) const {
diff --git a/dynet/nodes-hinge.h b/dynet/nodes-hinge.h
new file mode 100644
index 000000000..7f68fa886
--- /dev/null
+++ b/dynet/nodes-hinge.h
@@ -0,0 +1,28 @@
+#ifndef DYNET_NODES_HINGE_H_
+#define DYNET_NODES_HINGE_H_
+
+#include "dynet/dynet.h"
+#include "dynet/nodes-macros.h"
+
+namespace dynet {
+
+// Let x be a vector-valued input, where x_i is the score of the ith element; then
+// y = \sum{i != element} max{0, margin - x_element + x_i}
+struct Hinge : public Node {
+  explicit Hinge(const std::initializer_list<VariableIndex>& a, unsigned e, real m = 1.0) : Node(a), element(e), pelement(&element), elements(), pelements(), margin(m) {}
+  explicit Hinge(const std::initializer_list<VariableIndex>& a, const unsigned* pe, real m = 1.0) : Node(a), element(), pelement(pe), elements(), pelements(), margin(m) {}
+  explicit Hinge(const std::initializer_list<VariableIndex>& a, const std::vector<unsigned>& e, real m = 1.0) : Node(a), element(), pelement(), elements(e), pelements(&elements), margin(m) {}
+  explicit Hinge(const std::initializer_list<VariableIndex>& a, const std::vector<unsigned>* pe, real m = 1.0) : Node(a), element(), pelement(), elements(), pelements(pe), margin(m) {}
+  virtual bool supports_multibatch() const override { return true; }
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  size_t aux_storage_size() const override;
+  unsigned element;                        // copy of the index when it is passed by value
+  const unsigned* pelement;                // points at `element` or a caller-owned index; null in the vector constructors
+  std::vector<unsigned> elements;          // copy of the index list when it is passed by value
+  const std::vector<unsigned>* pelements;  // points at `elements` or a caller-owned list; null in the scalar constructors
+  real margin;                             // margin used in the formula above (defaults to 1.0)
+};
+
+} // namespace dynet
+
+#endif
diff --git a/dynet/nodes-linalg.cc b/dynet/nodes-linalg.cc
index a1c17eaec..f9d3c772d 100644
--- a/dynet/nodes-linalg.cc
+++ b/dynet/nodes-linalg.cc
@@ -1,4 +1,4 @@
-#include "dynet/nodes.h"
+#include "dynet/nodes-linalg.h"
 
 #include "dynet/nodes-macros.h"
 
diff --git a/dynet/nodes-linalg.h b/dynet/nodes-linalg.h
new file mode 100644
index 000000000..1529cb26e
--- /dev/null
+++ b/dynet/nodes-linalg.h
@@ -0,0 +1,40 @@
+#ifndef DYNET_NODES_LINALG_H_
+#define DYNET_NODES_LINALG_H_
+
+#include "dynet/dynet.h"
+#include "dynet/nodes-macros.h"
+
+namespace dynet {
+
+// y = x_1^T
+// NOTE: if you have a column or row vector as input, runtime is constant
+// if you have a matrix as input, the runtime is O(mn) - try to avoid using this
+struct Transpose : public Node {
+  explicit Transpose(const std::initializer_list<VariableIndex>& a, const std::vector<unsigned> & dims) : Node(a), dims(dims) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  virtual bool supports_multibatch() const override { return true; }
+  std::vector<unsigned> dims;
+};
+
+// y = inv(x)
+// x = an invertible matrix
+struct MatrixInverse : public Node {
+  explicit MatrixInverse(const std::initializer_list<VariableIndex>& a) : Node(a) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+};
+
+// y = log det(x)
+struct LogDet : public Node {
+  template <typename T> explicit LogDet(const T& a) : Node(a) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+};
+
+// y = Tr(x_1 * x_2^T)
+struct TraceOfProduct : public Node {
+  explicit TraceOfProduct(const std::initializer_list<VariableIndex>& a) : Node(a) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+};
+
+} // namespace dynet
+
+#endif
diff --git a/dynet/nodes-logsumexp.cc b/dynet/nodes-logsumexp.cc
index 71fb3e0b8..a21075242 100644
--- a/dynet/nodes-logsumexp.cc
+++ b/dynet/nodes-logsumexp.cc
@@ -1,4 +1,4 @@
-#include "dynet/nodes.h"
+#include "dynet/nodes-logsumexp.h"
 
 #include "dynet/nodes-macros.h"
 
diff --git a/dynet/nodes-logsumexp.h b/dynet/nodes-logsumexp.h
new file mode 100644
index 000000000..39404a841
--- /dev/null
+++ b/dynet/nodes-logsumexp.h
@@ -0,0 +1,20 @@
+#ifndef DYNET_NODES_LOGSUMEXP_H_
+#define DYNET_NODES_LOGSUMEXP_H_
+
+#include "dynet/dynet.h"
+#include "dynet/nodes-macros.h"
+
+namespace dynet {
+
+// y = \log \sum_i \exp x_i
+// done in log space carefully to avoid over/underflow issues
+struct LogSumExp : public Node {
+  template <typename T> explicit LogSumExp(const T& a) : Node(a) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  virtual bool supports_multibatch() const override { return true; }
+  size_t aux_storage_size() const override;
+};
+
+} // namespace dynet
+
+#endif
diff --git a/dynet/nodes-losses.cc b/dynet/nodes-losses.cc
index a12db7bc8..c67a3da2d 100644
--- a/dynet/nodes-losses.cc
+++ b/dynet/nodes-losses.cc
@@ -1,4 +1,4 @@
-#include "dynet/nodes.h"
+#include "dynet/nodes-losses.h"
 
 #include "dynet/nodes-macros.h"
 #include "dynet/functors.h"
diff --git a/dynet/nodes-losses.h b/dynet/nodes-losses.h
new file mode 100644
index 000000000..8a5747b29
--- /dev/null
+++ b/dynet/nodes-losses.h
@@ -0,0 +1,47 @@
+#ifndef DYNET_NODES_LOSSES_H_
+#define DYNET_NODES_LOSSES_H_
+
+#include "dynet/dynet.h"
+#include "dynet/nodes-macros.h"
+
+namespace dynet {
+
+// x_1 is a scalar (or row vector)
+// x_2 is a scalar (or row vector)
+// y = max(0, margin - x_1 + x_2)
+struct PairwiseRankLoss : public Node {
+  explicit PairwiseRankLoss(const std::initializer_list<VariableIndex>& a, real m = 1.0) : Node(a), margin(m) {}
+  virtual bool supports_multibatch() const override { return true; }
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  real margin;
+};
+
+// you could do this with LogisticSigmoid, Softmax or a variety of other
+// functions, but this is often useful.
+// x_1 must be a vector with values between 0 and 1
+// target_y is an equivalently sized vector with values between 0 and 1
+// y = -(ty * log(x_1) + (1 - ty) * log(1 - x_1))
+struct BinaryLogLoss : public Node {
+  BinaryLogLoss(const std::initializer_list<VariableIndex>& a) : Node(a) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+};
+
+// this is used to implement poisson regression
+// x_1 = log predicted mean
+// ty = true y (this is not a VariableIndex since it has to be a nonnegative integer and
+//              is therefore nondifferentiable. There are various continuous extensions
+//              using the incomplete gamma function that could be used, but meh)
+// y = log Poisson(ty; \lambda = \exp x_1)
+//   = ty*x_1 - exp(x_1) - log(ty!)
+struct PoissonRegressionLoss : public Node {
+  explicit PoissonRegressionLoss(const std::initializer_list<VariableIndex>& a, unsigned true_y) : Node(a), ty(true_y), pty(&ty) {}
+  explicit PoissonRegressionLoss(const std::initializer_list<VariableIndex>& a, const unsigned* ptrue_y) : Node(a), ty(), pty(ptrue_y) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+ private:
+  unsigned ty;
+  const unsigned* pty;
+};
+
+} // namespace dynet
+
+#endif
diff --git a/dynet/nodes-matrixmultiply.cc b/dynet/nodes-matrixmultiply.cc
index 7c13dd753..b8bc5a5ac 100644
--- a/dynet/nodes-matrixmultiply.cc
+++ b/dynet/nodes-matrixmultiply.cc
@@ -1,4 +1,4 @@
-#include "dynet/nodes.h"
+#include "dynet/nodes-matrixmultiply.h"
 
 #include "dynet/nodes-macros.h"
 #include "dynet/cuda-matrix-multiply.h"
@@ -7,6 +7,8 @@ using namespace std;
 
 namespace dynet {
 
+// ************* MatrixMultiply *************
+
 #ifndef __CUDACC__
 
 string MatrixMultiply::as_string(const vector<string>& arg_names) const {
diff --git a/dynet/nodes-matrixmultiply.h b/dynet/nodes-matrixmultiply.h
new file mode 100644
index 000000000..ba78957b4
--- /dev/null
+++ b/dynet/nodes-matrixmultiply.h
@@ -0,0 +1,27 @@
+#ifndef DYNET_NODES_MATRIXMULTIPLY_H_
+#define DYNET_NODES_MATRIXMULTIPLY_H_
+
+#include "dynet/dynet.h"
+#include "dynet/nodes-macros.h"
+
+namespace dynet {
+
+// y = x_1 * x_2
+struct MatrixMultiply : public Node {
+  explicit MatrixMultiply(const std::initializer_list<VariableIndex>& a) : Node(a) {}
+  virtual bool supports_multibatch() const override { return true; }
+  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override;
+  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override;
+  virtual void autobatch_reshape(const ComputationGraph & cg,
+                                 const std::vector<VariableIndex> & batch_ids,
+                                 const std::vector<int> & concat,
+                                 std::vector<const Tensor*>& xs,
+                                 Tensor& fx) const override {
+    autobatch_reshape_concatonly(cg, batch_ids, concat, xs, fx);
+  }
+  DYNET_NODE_DEFINE_DEV_IMPL()
+};
+
+} // namespace dynet
+
+#endif
diff --git a/dynet/nodes-maxpooling2d.cc b/dynet/nodes-maxpooling2d.cc
index b679af7f3..7249c9578 100644
--- a/dynet/nodes-maxpooling2d.cc
+++ b/dynet/nodes-maxpooling2d.cc
@@ -1,4 +1,4 @@
-#include "dynet/nodes-conv.h"
+#include "dynet/nodes-maxpooling2d.h"
 
 #include <sstream>
 #include <limits>
@@ -8,6 +8,7 @@
 
 #include "dynet/functors.h"
 #include "dynet/nodes-macros.h"
+#include "dynet/op-helper.h"
 #include "third_party/eigen_pooling.h"
 
 #if HAVE_CUDA
diff --git a/dynet/nodes-maxpooling2d.h b/dynet/nodes-maxpooling2d.h
new file mode 100644
index 000000000..20bfff67d
--- /dev/null
+++ b/dynet/nodes-maxpooling2d.h
@@ -0,0 +1,36 @@
+#ifndef DYNET_NODES_MAXPOOLING2D_H_
+#define DYNET_NODES_MAXPOOLING2D_H_
+
+#include "dynet/dynet.h"
+#include "dynet/nodes-macros.h"
+
+namespace dynet {
+
+// maxpooling2d
+// y = x_1 * maxpooling2d
+// x_1 \in R^{H x W x Ci x N} (input)
+// ksize[0] corresponds to H
+// ksize[1] corresponds to W
+// stride[0] corresponds to H
+// stride[1] corresponds to W
+// is_valid: true for 'VALID' and false for 'SAME'
+struct MaxPooling2D: public Node {
+  explicit MaxPooling2D(const std::initializer_list<VariableIndex>& a, const std::vector<unsigned>& k, const std::vector<unsigned>& s,
+    const bool padding_type = true)
+      : Node(a), ksize(k), stride(s), is_valid(padding_type) {}
+  virtual bool supports_multibatch() const override { return true; }
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  size_t aux_storage_size() const override;
+  const std::vector<unsigned> ksize;
+  const std::vector<unsigned> stride;
+  const bool is_valid;
+
+ private:
+#if HAVE_CUDNN
+  mutable CudnnMaxPooling2DOp* cudnn_maxpool_op_ = NULL;
+#endif
+};
+
+} // namespace dynet
+
+#endif
diff --git a/dynet/nodes-minmax.cc b/dynet/nodes-minmax.cc
index 6b5d45f87..cc0913eab 100644
--- a/dynet/nodes-minmax.cc
+++ b/dynet/nodes-minmax.cc
@@ -1,4 +1,4 @@
-#include "dynet/nodes.h"
+#include "dynet/nodes-minmax.h"
 
 #include "dynet/nodes-macros.h"
 #include "dynet/functors.h"
diff --git a/dynet/nodes-minmax.h b/dynet/nodes-minmax.h
new file mode 100644
index 000000000..b1a500e55
--- /dev/null
+++ b/dynet/nodes-minmax.h
@@ -0,0 +1,53 @@
+#ifndef DYNET_NODES_MINMAX_H_
+#define DYNET_NODES_MINMAX_H_
+
+#include "dynet/dynet.h"
+#include "dynet/nodes-macros.h"
+
+namespace dynet {
+
+// y = min{x_1, x_2}
+struct Min : public Node {
+  explicit Min(const std::initializer_list<VariableIndex>& a) : Node(a) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  virtual bool supports_multibatch() const override { return true; }
+  size_t aux_storage_size() const override;
+};
+
+// y = max{x_1, x_2}
+struct Max : public Node {
+  template <typename T> explicit Max(const T& a) : Node(a) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  virtual bool supports_multibatch() const override { return true; }
+  size_t aux_storage_size() const override;
+};
+
+struct MinDimension : public Node {
+  explicit MinDimension(const std::initializer_list<VariableIndex>& a, unsigned dimension = 0) : Node(a), reduced_dim(dimension) {
+    first_dim = reduced_dim == 0 ? 1 : 0;
+    second_dim = first_dim + 1 == reduced_dim ? first_dim + 2 : first_dim + 1;
+  }
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  virtual bool supports_multibatch() const override { return true; }
+  size_t aux_storage_size() const override;
+  unsigned reduced_dim;
+  unsigned first_dim;
+  unsigned second_dim;
+};
+
+struct MaxDimension : public Node {
+  explicit MaxDimension(const std::initializer_list<VariableIndex>& a, unsigned dimension = 0) : Node(a), reduced_dim(dimension) {
+    first_dim = reduced_dim == 0 ? 1 : 0;
+    second_dim = first_dim + 1 == reduced_dim ? first_dim + 2 : first_dim + 1;
+  }
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  virtual bool supports_multibatch() const override { return true; }
+  size_t aux_storage_size() const override;
+  unsigned reduced_dim;
+  unsigned first_dim;
+  unsigned second_dim;
+};
+
+} // namespace dynet
+
+#endif
diff --git a/dynet/nodes-moments.cc b/dynet/nodes-moments.cc
index b4d618165..99a8cf52e 100644
--- a/dynet/nodes-moments.cc
+++ b/dynet/nodes-moments.cc
@@ -1,4 +1,4 @@
-#include "dynet/nodes.h"
+#include "dynet/nodes-moments.h"
 
 #include "dynet/nodes-macros.h"
 #include "dynet/functors.h"
diff --git a/dynet/nodes-moments.h b/dynet/nodes-moments.h
new file mode 100644
index 000000000..57f4df095
--- /dev/null
+++ b/dynet/nodes-moments.h
@@ -0,0 +1,76 @@
+#ifndef DYNET_NODES_MOMENTS_H_
+#define DYNET_NODES_MOMENTS_H_
+
+#include "dynet/dynet.h"
+#include "dynet/nodes-macros.h"
+
+namespace dynet {
+
+// y = ( \sum_i x_i ) / |x|
+struct Average : public Node {
+  template <typename T> explicit Average(const T& a) : Node(a) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  virtual bool supports_multibatch() const override { return true; }
+};
+
+// with a single argument x \in R^{n x m}
+// y_i = \sum_j x_i,j / m
+struct AverageColumns : public Node {
+  template <typename T> explicit AverageColumns(const T& a) : Node(a) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+};
+
+// y = moment of order `order` over all elements x[i,j,...]
+struct MomentElements : public Node {
+  template <typename T> explicit MomentElements(const T& a, unsigned o) : Node(a), order(o) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  virtual bool supports_multibatch() const override { return true; }
+private:
+  unsigned order;
+};
+
+// y = moment of order `order` over the batch elements x_i
+struct MomentBatches : public Node {
+  template <typename T> explicit MomentBatches(const T& a, unsigned o) : Node(a), order(o) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  virtual bool supports_multibatch() const override { return true; }
+private:
+  unsigned order;
+};
+
+// y = moment of order `order` along dimension `dimension`
+struct MomentDimension : public Node {
+  template <typename T> explicit MomentDimension(const T& a, unsigned d, unsigned o) : Node(a), dimension(d), order(o) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  virtual bool supports_multibatch() const override { return true; }
+private:
+  unsigned dimension;
+  unsigned order;
+};
+
+// y = standard deviation over all elements x[i,j,...]
+struct StdElements : public Node {
+  template <typename T> explicit StdElements(const T& a) : Node(a) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  virtual bool supports_multibatch() const override { return true; }
+};
+
+// y = standard deviation over the batch elements x_i
+struct StdBatches : public Node {
+  template <typename T> explicit StdBatches(const T& a) : Node(a) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  virtual bool supports_multibatch() const override { return true; }
+};
+
+// y = standard deviation along dimension `dimension`
+struct StdDimension : public Node {
+  template <typename T> explicit StdDimension(const T& a, unsigned d) : Node(a), dimension(d) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  virtual bool supports_multibatch() const override { return true; }
+private:
+  unsigned dimension;
+};
+
+} // namespace dynet
+
+#endif
diff --git a/dynet/nodes-normalization.cc b/dynet/nodes-normalization.cc
index d4faacb9d..c96f38e7d 100644
--- a/dynet/nodes-normalization.cc
+++ b/dynet/nodes-normalization.cc
@@ -1,4 +1,4 @@
-#include "dynet/nodes.h"
+#include "dynet/nodes-normalization.h"
 
 #include "dynet/nodes-macros.h"
 
diff --git a/dynet/nodes-normalization.h b/dynet/nodes-normalization.h
new file mode 100644
index 000000000..29e94e5c5
--- /dev/null
+++ b/dynet/nodes-normalization.h
@@ -0,0 +1,18 @@
+#ifndef DYNET_NODES_NORMALIZATION_H_
+#define DYNET_NODES_NORMALIZATION_H_
+
+#include "dynet/dynet.h"
+#include "dynet/nodes-macros.h"
+
+namespace dynet {
+
+// y = x_2 * x_1 / || x_1 ||
+struct WeightNormalization : public Node {
+  explicit WeightNormalization(const std::initializer_list<VariableIndex>& a) : Node(a) {}
+  virtual bool supports_multibatch() const override { return false; }
+  DYNET_NODE_DEFINE_DEV_IMPL()
+};
+
+} // namespace dynet
+
+#endif
diff --git a/dynet/nodes-norms.cc b/dynet/nodes-norms.cc
index 65d18e7b6..29cf8b8bf 100644
--- a/dynet/nodes-norms.cc
+++ b/dynet/nodes-norms.cc
@@ -1,4 +1,4 @@
-#include "dynet/nodes.h"
+#include "dynet/nodes-norms.h"
 
 #include "dynet/nodes-macros.h"
 #include "dynet/functors.h"
diff --git a/dynet/nodes-norms.h b/dynet/nodes-norms.h
new file mode 100644
index 000000000..f18ea33cc
--- /dev/null
+++ b/dynet/nodes-norms.h
@@ -0,0 +1,25 @@
+#ifndef DYNET_NODES_NORMS_H_
+#define DYNET_NODES_NORMS_H_
+
+#include "dynet/dynet.h"
+#include "dynet/nodes-macros.h"
+
+namespace dynet {
+
+// y = || x_1 ||^2
+struct SquaredNorm : public Node {
+  explicit SquaredNorm(const std::initializer_list<VariableIndex>& a) : Node(a) {}
+  virtual bool supports_multibatch() const override { return true; }
+  DYNET_NODE_DEFINE_DEV_IMPL()
+};
+
+// y = || x_1 ||
+struct L2Norm : public Node {
+  explicit L2Norm(const std::initializer_list<VariableIndex>& a) : Node(a) {}
+  virtual bool supports_multibatch() const override { return true; }
+  DYNET_NODE_DEFINE_DEV_IMPL()
+};
+
+} // namespace dynet
+
+#endif
diff --git a/dynet/nodes-pickneglogsoftmax.cc b/dynet/nodes-pickneglogsoftmax.cc
index 99e7abda9..a543e178d 100644
--- a/dynet/nodes-pickneglogsoftmax.cc
+++ b/dynet/nodes-pickneglogsoftmax.cc
@@ -1,4 +1,4 @@
-#include "dynet/nodes.h"
+#include "dynet/nodes-pickneglogsoftmax.h"
 
 #include "dynet/nodes-macros.h"
 
@@ -11,6 +11,8 @@ using namespace std;
 
 namespace dynet {
 
+// ************* PickNegLogSoftmax *************
+
 #ifndef __CUDACC__
 
 string PickNegLogSoftmax::as_string(const vector<string>& arg_names) const {
diff --git a/dynet/nodes-pickneglogsoftmax.h b/dynet/nodes-pickneglogsoftmax.h
new file mode 100644
index 000000000..ae8007b52
--- /dev/null
+++ b/dynet/nodes-pickneglogsoftmax.h
@@ -0,0 +1,40 @@
+#ifndef DYNET_NODES_PICKNEGLOGSOFTMAX_H_
+#define DYNET_NODES_PICKNEGLOGSOFTMAX_H_
+
+#include "dynet/dynet.h"
+#include "dynet/nodes-macros.h"
+
+namespace dynet {
+
+// z = \sum_j \exp (x_1)_j
+// y = \log z - (x_1)_element
+struct PickNegLogSoftmax : public Node {
+  explicit PickNegLogSoftmax(const std::initializer_list<VariableIndex>& a, unsigned v) : Node(a), val(v), pval(&val), vals(), pvals() {}
+  // use this constructor if you want to perform mini-batching
+  explicit PickNegLogSoftmax(const std::initializer_list<VariableIndex>& a, const std::vector<unsigned>& v) : Node(a), val(), pval(), vals(v), pvals(&vals) {}
+  // use these constructors if you want to change the value after the graph is constructed
+  explicit PickNegLogSoftmax(const std::initializer_list<VariableIndex>& a, const unsigned* pv) : Node(a), val(), pval(pv), vals(), pvals() {}
+  explicit PickNegLogSoftmax(const std::initializer_list<VariableIndex>& a, const std::vector<unsigned>* pv) : Node(a), val(), pval(), vals(), pvals(pv) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  virtual bool supports_multibatch() const override { return true; }
+  size_t aux_storage_size() const override;
+  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override;
+  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override;
+  virtual Node* autobatch_pseudo_node(const ComputationGraph & cg,
+                                      const std::vector<VariableIndex> & batch_ids) const override;
+  virtual void autobatch_reshape(const ComputationGraph & cg,
+                                 const std::vector<VariableIndex> & batch_ids,
+                                 const std::vector<int> & concat,
+                                 std::vector<const Tensor*>& xs,
+                                 Tensor& fx) const override {
+    autobatch_reshape_concatonly(cg, batch_ids, concat, xs, fx);
+  }
+  unsigned val;
+  const unsigned* pval;
+  std::vector<unsigned> vals;
+  const std::vector<unsigned>* pvals;
+};
+
+} // namespace dynet
+
+#endif
diff --git a/dynet/nodes-random.cc b/dynet/nodes-random.cc
index 9e221112b..56f5b289d 100644
--- a/dynet/nodes-random.cc
+++ b/dynet/nodes-random.cc
@@ -1,4 +1,4 @@
-#include "dynet/nodes.h"
+#include "dynet/nodes-random.h"
 
 #include "dynet/nodes-macros.h"
 
diff --git a/dynet/nodes-random.h b/dynet/nodes-random.h
new file mode 100644
index 000000000..7e9e43582
--- /dev/null
+++ b/dynet/nodes-random.h
@@ -0,0 +1,59 @@
+#ifndef DYNET_NODES_RANDOM_H_
+#define DYNET_NODES_RANDOM_H_
+
+#include "dynet/dynet.h"
+#include "dynet/nodes-macros.h"
+
+namespace dynet {
+
+// n_{i,j} ~ N(0,stddev)
+// y = x + n
+struct GaussianNoise : public Node {
+  explicit GaussianNoise(const std::initializer_list<VariableIndex>& a, real stddev) : Node(a), stddev(stddev) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  size_t aux_storage_size() const override;
+  virtual bool supports_multibatch() const override { return true; }
+  real stddev;
+};
+
+// draw random noise from Normal(0, 1)
+struct RandomNormal : public Node {
+  explicit RandomNormal(const Dim& d) : dim(d) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  Dim dim;
+};
+
+// draw from Bernoulli(p)
+struct RandomBernoulli : public Node {
+  explicit RandomBernoulli(const std::initializer_list<VariableIndex>& a, const Dim& d, real p, real scale = 1.0f) : dim(d), p(p), scale(scale) {
+    DYNET_ASSERT(a.size() == 0, "RandomBernoulli doesn't accept nodes as input");
+  }
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  Dim dim;
+  real p;
+  real scale;
+};
+
+// draw a random real from Uniform(left, right)
+struct RandomUniform : public Node {
+  explicit RandomUniform(const std::initializer_list<VariableIndex>& a, const Dim& d, real left, real right) : dim(d), left(left), right(right) {
+    DYNET_ASSERT(a.size() == 0, "RandomUniform doesn't accept nodes as input");
+  }
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  Dim dim;
+  real left, right;
+};
+
+// draw a random real from Gumbel(mu, beta)
+struct RandomGumbel : public Node {
+  explicit RandomGumbel(const std::initializer_list<VariableIndex>& a, const Dim& d, real mu, real beta) : dim(d), mu(mu), beta(beta) {
+    DYNET_ASSERT(a.size() == 0, "RandomGumbel doesn't accept nodes as input");
+  }
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  Dim dim;
+  real mu, beta;
+};
+
+} // namespace dynet
+
+#endif
diff --git a/dynet/nodes-select.cc b/dynet/nodes-select.cc
index f8760b4c7..359ac5532 100644
--- a/dynet/nodes-select.cc
+++ b/dynet/nodes-select.cc
@@ -1,4 +1,4 @@
-#include "dynet/nodes.h"
+#include "dynet/nodes-select.h"
 
 #include "dynet/nodes-macros.h"
 
diff --git a/dynet/nodes-select.h b/dynet/nodes-select.h
new file mode 100644
index 000000000..927c97d7a
--- /dev/null
+++ b/dynet/nodes-select.h
@@ -0,0 +1,84 @@
+#ifndef DYNET_NODES_SELECT_H_
+#define DYNET_NODES_SELECT_H_
+
+#include "dynet/dynet.h"
+#include "dynet/nodes-macros.h"
+
+namespace dynet {
+
+// y = select_rows(x, rows)
+// x = a matrix
+struct SelectRows : public Node {
+  explicit SelectRows(const std::initializer_list<VariableIndex>& a, const std::vector<unsigned>& r) : Node(a), rows(r), prows(&rows) {}
+  explicit SelectRows(const std::initializer_list<VariableIndex>& a, const std::vector<unsigned>* pr) : Node(a), prows(pr) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  std::vector<unsigned> rows;
+  const std::vector<unsigned>* prows;
+};
+
+// y = select_cols(x, cols)
+// x = a matrix
+struct SelectCols : public Node {
+  explicit SelectCols(const std::initializer_list<VariableIndex>& a, const std::vector<unsigned>& c) : Node(a), cols(c), pcols(&cols) {}
+  explicit SelectCols(const std::initializer_list<VariableIndex>& a, const std::vector<unsigned>* pc) : Node(a), pcols(pc) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  std::vector<unsigned> cols;
+  const std::vector<unsigned>* pcols;
+};
+
+// x_1 is a vector
+// y = (x_1)_{*pval}
+// this is used to implement cross-entropy training
+struct PickElement : public Node {
+  explicit PickElement(const std::initializer_list<VariableIndex>& a, unsigned v, unsigned d = 0) : Node(a), val(v), pval(&val), vals(), pvals(), dimension(d) {}
+  // use this constructor if you want to perform mini-batching
+  explicit PickElement(const std::initializer_list<VariableIndex>& a, const std::vector<unsigned>& v, unsigned d = 0) : Node(a), val(), pval(), vals(v), pvals(&vals), dimension(d) {}
+  // use these constructors if you want to change the value after the graph is constructed
+  explicit PickElement(const std::initializer_list<VariableIndex>& a, const unsigned* pv, unsigned d = 0) : Node(a), val(), pval(pv), vals(), pvals(), dimension(d) {}
+  explicit PickElement(const std::initializer_list<VariableIndex>& a, const std::vector<unsigned>* pv, unsigned d = 0) : Node(a), val(), pval(), vals(), pvals(pv), dimension(d) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  virtual bool supports_multibatch() const override { return true; }
+  unsigned val;
+  const unsigned* pval;
+  std::vector<unsigned> vals;
+  const std::vector<unsigned>* pvals;
+  unsigned dimension;
+};
+
+// x_1 is a tensor
+// y = x_1[start:end] along dimension d
+// (start inclusive, end exclusive)
+struct PickRange : public Node {
+  explicit PickRange(const std::initializer_list<VariableIndex>& a, unsigned s, unsigned e, unsigned d = 0) : Node(a), start(s), end(e), dim(d) {}
+  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override;
+  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override { return std::vector<int>(1, 1); }  
+  virtual void autobatch_reshape(const ComputationGraph & cg,
+                                 const std::vector<VariableIndex> & batch_ids,
+                                 const std::vector<int> & concat,
+                                 std::vector<const Tensor*>& xs,
+                                 Tensor& fx) const override {
+    autobatch_reshape_concatonly(cg, batch_ids, concat, xs, fx);
+  }
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  virtual bool supports_multibatch() const override { return true; }
+  unsigned start, end, dim;
+};
+
+// x is a batched tensor
+// y = (x)_{[*pval]}
+struct PickBatchElements : public Node {
+  explicit PickBatchElements(const std::initializer_list<VariableIndex>& a, unsigned v) : Node(a), val(v), pval(&val), vals(), pvals() {}
+  explicit PickBatchElements(const std::initializer_list<VariableIndex>& a, const std::vector<unsigned>& v) : Node(a), val(), pval(), vals(v), pvals(&vals) {}
+  explicit PickBatchElements(const std::initializer_list<VariableIndex>& a, const unsigned* pv) : Node(a), val(), pval(pv), vals(), pvals() {}
+  explicit PickBatchElements(const std::initializer_list<VariableIndex>& a, const std::vector<unsigned>* pv) : Node(a), val(), pval(), vals(), pvals(pv) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  virtual bool supports_multibatch() const override { return true; }
+  unsigned val;
+  const unsigned* pval;
+  std::vector<unsigned> vals;
+  const std::vector<unsigned>* pvals;
+};
+
+} // namespace dynet
+
+#endif
diff --git a/dynet/nodes-similarities.cc b/dynet/nodes-similarities.cc
index 51bd8e367..275485da5 100644
--- a/dynet/nodes-similarities.cc
+++ b/dynet/nodes-similarities.cc
@@ -1,4 +1,4 @@
-#include "dynet/nodes.h"
+#include "dynet/nodes-similarities.h"
 
 #include "dynet/nodes-macros.h"
 #include "dynet/functors.h"
diff --git a/dynet/nodes-similarities.h b/dynet/nodes-similarities.h
new file mode 100644
index 000000000..9ec500c73
--- /dev/null
+++ b/dynet/nodes-similarities.h
@@ -0,0 +1,47 @@
+#ifndef DYNET_NODES_SIMILARITIES_H_
+#define DYNET_NODES_SIMILARITIES_H_
+
+#include "dynet/dynet.h"
+#include "dynet/nodes-macros.h"
+
+namespace dynet {
+
+// y = x_1^T . x_2
+struct DotProduct : public Node {
+  explicit DotProduct(const std::initializer_list<VariableIndex>& a) : Node(a) {}
+  virtual bool supports_multibatch() const override { return true; }
+  DYNET_NODE_DEFINE_DEV_IMPL()
+};
+
+// y = || x_1 - x_2 ||_H(d)
+struct HuberDistance : public Node {
+  explicit HuberDistance(const std::initializer_list<VariableIndex>& a, float d = 1.345f) : Node(a), d(d) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  float d;
+};
+
+// y = || x_1 - x_2 ||_1
+struct L1Distance : public Node {
+  explicit L1Distance(const std::initializer_list<VariableIndex>& a) : Node(a) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+};
+
+// y = || x_1 - x_2 ||^2
+struct SquaredEuclideanDistance : public Node {
+  explicit SquaredEuclideanDistance(const std::initializer_list<VariableIndex>& a) : Node(a) {}
+  virtual bool supports_multibatch() const override { return true; }
+  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override;
+  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override;
+  virtual void autobatch_reshape(const ComputationGraph & cg,
+                                 const std::vector<VariableIndex> & batch_ids,
+                                 const std::vector<int> & concat,
+                                 std::vector<const Tensor*>& xs,
+                                 Tensor& fx) const override {
+    autobatch_reshape_concatonly(cg, batch_ids, concat, xs, fx);
+  }
+  DYNET_NODE_DEFINE_DEV_IMPL()
+};
+
+} // namespace dynet
+
+#endif
diff --git a/dynet/nodes-softmaxes.cc b/dynet/nodes-softmaxes.cc
index e8b672757..fb222c903 100644
--- a/dynet/nodes-softmaxes.cc
+++ b/dynet/nodes-softmaxes.cc
@@ -1,4 +1,4 @@
-#include "dynet/nodes.h"
+#include "dynet/nodes-softmaxes.h"
 
 #include "dynet/nodes-macros.h"
 #include "dynet/functors.h"
diff --git a/dynet/nodes-softmaxes.h b/dynet/nodes-softmaxes.h
new file mode 100644
index 000000000..2775fe049
--- /dev/null
+++ b/dynet/nodes-softmaxes.h
@@ -0,0 +1,66 @@
+#ifndef DYNET_NODES_SOFTMAXES_H_
+#define DYNET_NODES_SOFTMAXES_H_
+
+#include "dynet/dynet.h"
+#include "dynet/nodes-macros.h"
+
+namespace dynet {
+
+// z = \sum_j \exp (x_1)_j
+// y_i = \exp (x_1)_i / z
+struct Softmax : public Node {
+  explicit Softmax(const std::initializer_list<VariableIndex>& a) : Node(a) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  size_t aux_storage_size() const override;
+  virtual bool supports_multibatch() const override { return true; }
+  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override;
+  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override;
+  virtual void autobatch_reshape(const ComputationGraph & cg,
+                                 const std::vector<VariableIndex> & batch_ids,
+                                 const std::vector<int> & concat,
+                                 std::vector<const Tensor*>& xs,
+                                 Tensor& fx) const override {
+    autobatch_reshape_concatonly(cg, batch_ids, concat, xs, fx);
+  }
+};
+
+// z = \sum_j \exp (x_1)_j
+// y_i = (x_1)_i - \log z
+struct LogSoftmax : public Node {
+  explicit LogSoftmax(const std::initializer_list<VariableIndex>& a) : Node(a) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  size_t aux_storage_size() const override;
+  virtual bool supports_multibatch() const override { return true; }
+};
+
+// z = \sum_{j \in denom} \exp (x_1)_j
+// y_i = (x_1)_i - \log z
+struct RestrictedLogSoftmax : public Node {
+  explicit RestrictedLogSoftmax(const std::initializer_list<VariableIndex>& a, const std::vector<unsigned>& d) : Node(a), denom(d) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  std::vector<unsigned> denom;
+};
+
+// y = sparsemax(x)
+// y = arg min_y ||y - x||^2
+struct Sparsemax : public Node {
+  explicit Sparsemax(const std::initializer_list<VariableIndex>& a) : Node(a) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  size_t aux_storage_size() const override;
+};
+
+// y = L_sparsemax(x_0; q)
+// where x_0 is a vector of "unnormalized" probabilities
+// q is the vector of labels
+struct SparsemaxLoss : public Node {
+  explicit SparsemaxLoss(const std::initializer_list<VariableIndex>& a, const std::vector<unsigned>& target) : Node(a), q(target), pq(&q) {}
+  explicit SparsemaxLoss(const std::initializer_list<VariableIndex>& a, const std::vector<unsigned>* ptarget) : Node(a), q(), pq(ptarget) {}
+  DYNET_NODE_DEFINE_DEV_IMPL()
+  size_t aux_storage_size() const override;
+  const std::vector<unsigned> q;
+  const std::vector<unsigned>* pq;
+};
+
+} // namespace dynet
+
+#endif
diff --git a/dynet/nodes-trig.cc b/dynet/nodes-trig.cc
index c5965879b..f9c75758f 100644
--- a/dynet/nodes-trig.cc
+++ b/dynet/nodes-trig.cc
@@ -1,4 +1,4 @@
-#include "dynet/nodes.h"
+#include "dynet/nodes-trig.h"
 
 #include "dynet/nodes-macros.h"
 #include "dynet/simd-functors.h"
@@ -7,7 +7,7 @@ using namespace std;
 
 namespace dynet {
 
-// ************* *************
+// ************* Tanh *************
 
 #ifndef __CUDACC__
 
diff --git a/dynet/nodes-trig.h b/dynet/nodes-trig.h
new file mode 100644
index 000000000..8212661dd
--- /dev/null
+++ b/dynet/nodes-trig.h
@@ -0,0 +1,20 @@
+#ifndef DYNET_NODES_TRIG_H_
+#define DYNET_NODES_TRIG_H_
+
+#include "dynet/dynet.h"
+#include "dynet/nodes-macros.h"
+
+namespace dynet {
+
+// y = tanh x_1
+struct Tanh : public Node {
+  explicit Tanh(const std::initializer_list<VariableIndex>& a) : Node(a) {}
+  virtual bool supports_multibatch() const override { return true; }
+  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override { Sig s(nt::tanh); return sm.get_idx(s); }
+  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override { return std::vector<int>(1, 1); }  
+  DYNET_NODE_DEFINE_DEV_IMPL()
+};
+
+} // namespace dynet
+
+#endif
diff --git a/dynet/nodes.h b/dynet/nodes.h
index ef421418b..7d6a13211 100644
--- a/dynet/nodes.h
+++ b/dynet/nodes.h
@@ -1,870 +1,31 @@
-#ifndef DYNET_NODES_H_
-#define DYNET_NODES_H_
-
-#include "dynet/dynet.h"
-#include "dynet/devices.h"
+#include "dynet/nodes-activations.h"
+#include "dynet/nodes-affinetransform.h"
+#include "dynet/nodes-arith-const.h"
+#include "dynet/nodes-arith-cwise.h"
+#include "dynet/nodes-arith-scalar.h"
+#include "dynet/nodes-arith-sum.h"
+#include "dynet/nodes-arith-unary.h"
+#include "dynet/nodes-concat.h"
+#include "dynet/nodes-const.h"
+#include "dynet/nodes-contract.h"
+#include "dynet/nodes-conv.h"
+#include "dynet/nodes-conv2d.h"
+#include "dynet/nodes-dropout.h"
+#include "dynet/nodes-flow.h"
+#include "dynet/nodes-hinge.h"
+#include "dynet/nodes-linalg.h"
+#include "dynet/nodes-logsumexp.h"
+#include "dynet/nodes-losses.h"
 #include "dynet/nodes-macros.h"
-
-// See nodes-macros.h for more details about DYNET_NODE_DEFINE_DEV_IMPL().
-
-namespace dynet {
-
-// M = x_0, v = x_1
-// y = M + v (broadcasting over columns)
-struct AddVectorToAllColumns : public Node {
-  explicit AddVectorToAllColumns(const std::initializer_list<VariableIndex>& a) : Node(a) {}
-  virtual bool supports_multibatch() const override { return true; }
-  DYNET_NODE_DEFINE_DEV_IMPL()
-};
-
-// with a single argument x \in R^{n x m}
-// y_i = \sum_j x_i,j / m
-struct AverageColumns : public Node {
-  template <typename T> explicit AverageColumns(const T& a) : Node(a) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-};
-
-// sum along a single dimension
-struct SumDimension : public Node {
-  template <typename T> explicit SumDimension(const T& a, unsigned d) : Node(a), dimension(d) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  unsigned dimension;
-};
-
-// y = L_sparsemax(x_0; q)
-// where x_0 is a std::vector of "unnormalized" probabilities
-// q are the std::vector of labels
-struct SparsemaxLoss : public Node {
-  explicit SparsemaxLoss(const std::initializer_list<VariableIndex>& a, const std::vector<unsigned>& target) : Node(a), q(target), pq(&q) {}
-  explicit SparsemaxLoss(const std::initializer_list<VariableIndex>& a, const std::vector<unsigned>* ptarget) : Node(a), q(), pq(ptarget) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  size_t aux_storage_size() const override;
-  const std::vector<unsigned> q;
-  const std::vector<unsigned>* pq;
-};
-
-// y = sparsemax(x)
-// y = arg min_y ||y - x||^2
-struct Sparsemax : public Node {
-  explicit Sparsemax(const std::initializer_list<VariableIndex>& a) : Node(a) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  size_t aux_storage_size() const override;
-};
-
-// y = inv(x)
-// x = an invertible matrix
-struct MatrixInverse : public Node {
-  explicit MatrixInverse(const std::initializer_list<VariableIndex>& a) : Node(a) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-};
-
-// y = select_rows(x, rows)
-// x = a matrix
-struct SelectRows : public Node {
-  explicit SelectRows(const std::initializer_list<VariableIndex>& a, const std::vector<unsigned>& r) : Node(a), rows(r), prows(&rows) {}
-  explicit SelectRows(const std::initializer_list<VariableIndex>& a, const std::vector<unsigned>* pr) : Node(a), prows(pr) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  std::vector<unsigned> rows;
-  const std::vector<unsigned>* prows;
-};
-
-// y = select_cols(x, cols)
-// x = a matrix
-struct SelectCols : public Node {
-  explicit SelectCols(const std::initializer_list<VariableIndex>& a, const std::vector<unsigned>& c) : Node(a), cols(c), pcols(&cols) {}
-  explicit SelectCols(const std::initializer_list<VariableIndex>& a, const std::vector<unsigned>* pc) : Node(a), pcols(pc) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  std::vector<unsigned> cols;
-  const std::vector<unsigned>* pcols;
-};
-
-// y = pow(x_1, x_2)
-// x_2 raise every element in x_1 to the power of scalar x_2
-struct Pow : public Node {
-  explicit Pow(const std::initializer_list<VariableIndex>& a) : Node(a) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-};
-
-// y = min{x_1, x_2}
-struct Min : public Node {
-  explicit Min(const std::initializer_list<VariableIndex>& a) : Node(a) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  virtual bool supports_multibatch() const override { return true; }
-  size_t aux_storage_size() const override;
-};
-
-// y = max{x_1, x_2}
-struct Max : public Node {
-  template <typename T> explicit Max(const T& a) : Node(a) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  virtual bool supports_multibatch() const override { return true; }
-  size_t aux_storage_size() const override;
-};
-
-// y = Tr(x_1 * x_2^T)
-struct TraceOfProduct : public Node {
-  explicit TraceOfProduct(const std::initializer_list<VariableIndex>& a) : Node(a) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-};
-
-// y = alpha * x_1
-struct ConstScalarMultiply : public Node {
-  explicit ConstScalarMultiply(const std::initializer_list<VariableIndex>& a, float alpha) : Node(a), alpha(alpha) {}
-  virtual bool supports_multibatch() const override { return true; }
-  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override { Sig s(nt::scalar_mult); s.add_float(alpha); return sm.get_idx(s); }
-  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override { return std::vector<int>(1, 1); }
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  float alpha;
-};
-
-// y = x_1^T . x_2
-struct DotProduct : public Node {
-  explicit DotProduct(const std::initializer_list<VariableIndex>& a) : Node(a) {}
-  virtual bool supports_multibatch() const override { return true; }
-  DYNET_NODE_DEFINE_DEV_IMPL()
-};
-
-// y = x_1^T
-// NOTE: if you have a column or row std::vector as input, runtime is constant
-// if you have a matrix as input, the runtime is O(mn) - try to avoid using this
-struct Transpose : public Node {
-  explicit Transpose(const std::initializer_list<VariableIndex>& a, const std::vector<unsigned> & dims) : Node(a), dims(dims) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  virtual bool supports_multibatch() const override { return true; }
-  std::vector<unsigned> dims;
-};
-
-// y = reshape(x_1, --> to)
-struct Reshape : public Node {
-  explicit Reshape(const std::initializer_list<VariableIndex>& a, const Dim& to) : Node(a), to(to) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  virtual bool supports_multibatch() const override { return true; }
-  Dim to;
-};
-
-// n_{i,j} ~ N(0,stddev)
-// y = x + n
-struct GaussianNoise : public Node {
-  explicit GaussianNoise(const std::initializer_list<VariableIndex>& a, real stddev) : Node(a), stddev(stddev) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  size_t aux_storage_size() const override;
-  virtual bool supports_multibatch() const override { return true; }
-  real stddev;
-};
-
-// y = dropout(x,p) where p specifies the dropout probability
-struct Dropout : public Node {
-  explicit Dropout(const std::initializer_list<VariableIndex>& a, real p) : Node(a), p(p) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  size_t aux_storage_size() const override;
-  virtual bool supports_multibatch() const override { return true; }
-  real p;
-};
-
-// y = dropout(x,p) where p specifies the dropout probability
-struct DropoutDim : public Node {
-  explicit DropoutDim(const std::initializer_list<VariableIndex>& a, unsigned d,real p) : Node(a), dimension(d), p(p) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  size_t aux_storage_size() const override;
-  virtual bool supports_multibatch() const override { return true; }
-  unsigned dimension;
-  real p;
-};
-
-// y = dropout(x,p) where p specifies the dropout probability
-struct DropoutBatch : public Node {
-  explicit DropoutBatch(const std::initializer_list<VariableIndex>& a, real p) : Node(a), p(p) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  size_t aux_storage_size() const override;
-  virtual bool supports_multibatch() const override { return true; }
-  real p;
-};
-
-// y = block_dropout(x,p) where p specifies the probability for dropping-out the entire block
-struct BlockDropout : public Node {
-  explicit BlockDropout(const std::initializer_list<VariableIndex>& a, real p) : Node(a), dropout_probability(p) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  size_t aux_storage_size() const override;
-  real dropout_probability;
-};
-
-// y = c + x_1
-// (c is a std::vector or matrix of the constant, usually 1, but can be configured)
-struct ConstantPlusX : public Node {
-  explicit ConstantPlusX(const std::initializer_list<VariableIndex>& a, real o) : Node(a), c(o) {}
-  virtual bool supports_multibatch() const override { return true; }
-  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override { Sig s(nt::plus_const); s.add_float(c); return sm.get_idx(s); }
-  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override { return std::vector<int>(1, 1); }  
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  real c;
-};
-
-// y = c - x_1
-// (c is a std::vector or matrix of the constant, usually 1, but can be configured)
-struct ConstantMinusX : public Node {
-  explicit ConstantMinusX(const std::initializer_list<VariableIndex>& a, real o) : Node(a), c(o) {}
-  virtual bool supports_multibatch() const override { return true; }
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  real c;
-};
-
-// y = sqrt x_1
-struct Sqrt : public Node {
-  explicit Sqrt(const std::initializer_list<VariableIndex>& a) : Node(a) {}
-  virtual bool supports_multibatch() const override { return true; }
-  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override { Sig s(nt::sqrt); return sm.get_idx(s); }
-  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override { return std::vector<int>(1, 1); }  
-  DYNET_NODE_DEFINE_DEV_IMPL()
-};
-
-// y = abs x_1
-struct Abs : public Node {
-  explicit Abs(const std::initializer_list<VariableIndex>& a) : Node(a) {}
-  virtual bool supports_multibatch() const override { return true; }
-  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override { Sig s(nt::abs); return sm.get_idx(s); }
-  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override { return std::vector<int>(1, 1); }  
-  DYNET_NODE_DEFINE_DEV_IMPL()
-};
-
-// y = erf x_1
-struct Erf : public Node {
-  explicit Erf(const std::initializer_list<VariableIndex>& a) : Node(a) {}
-  virtual bool supports_multibatch() const override { return true; }
-  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override { Sig s(nt::erf); return sm.get_idx(s); }
-  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override { return std::vector<int>(1, 1); }  
-  DYNET_NODE_DEFINE_DEV_IMPL()
-};
-
-// y = tanh x_1
-struct Tanh : public Node {
-  explicit Tanh(const std::initializer_list<VariableIndex>& a) : Node(a) {}
-  virtual bool supports_multibatch() const override { return true; }
-  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override { Sig s(nt::tanh); return sm.get_idx(s); }
-  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override { return std::vector<int>(1, 1); }  
-  DYNET_NODE_DEFINE_DEV_IMPL()
-};
-
-// y = x_1 \odot x_1
-struct Square : public Node {
-  explicit Square(const std::initializer_list<VariableIndex>& a) : Node(a) {}
-  virtual bool supports_multibatch() const override { return true; }
-  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override { Sig s(nt::square); return sm.get_idx(s); }
-  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override { return std::vector<int>(1, 1); }  
-  DYNET_NODE_DEFINE_DEV_IMPL()
-};
-
-// y = x_1 \odot x_1 \odot x_1
-struct Cube : public Node {
-  explicit Cube(const std::initializer_list<VariableIndex>& a) : Node(a) {}
-  virtual bool supports_multibatch() const override { return true; }
-  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override { Sig s(nt::cube); return sm.get_idx(s); }
-  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override { return std::vector<int>(1, 1); }  
-  DYNET_NODE_DEFINE_DEV_IMPL()
-};
-
-// y = exp x_1
-struct Exp : public Node {
-  explicit Exp(const std::initializer_list<VariableIndex>& a) : Node(a) {}
-  virtual bool supports_multibatch() const override { return true; }
-  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override { Sig s(nt::exp); return sm.get_idx(s); }
-  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override { return std::vector<int>(1, 1); }  
-  DYNET_NODE_DEFINE_DEV_IMPL()
-};
-
-// y = lgamma x_1
-struct LogGamma : public Node {
-  explicit LogGamma(const std::initializer_list<VariableIndex>& a) : Node(a) {}
-  virtual bool supports_multibatch() const override { return true; }
-  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override { Sig s(nt::loggamma); return sm.get_idx(s); }
-  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override { return std::vector<int>(1, 1); }  
-  DYNET_NODE_DEFINE_DEV_IMPL()
-};
-
-// y = log x_1  (base e, i.e., natural log)
-struct Log : public Node {
-  explicit Log(const std::initializer_list<VariableIndex>& a) : Node(a) {}
-  virtual bool supports_multibatch() const override { return true; }
-  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override { Sig s(nt::log); return sm.get_idx(s); }
-  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override { return std::vector<int>(1, 1); }  
-  DYNET_NODE_DEFINE_DEV_IMPL()
-};
-
-// concatenate rows
-struct Concatenate : public Node {
-  template <typename T> explicit Concatenate(const T& a, unsigned d) : Node(a), dimension(d) {}
-  virtual bool supports_multibatch() const override { return true; }
-  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override;
-  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override { return std::vector<int>(args.size(), 1); }  
-  virtual void autobatch_reshape(const ComputationGraph & cg,
-                                 const std::vector<VariableIndex> & batch_ids,
-                                 const std::vector<int> & concat,
-                                 std::vector<const Tensor*>& xs,
-                                 Tensor& fx) const override {
-    autobatch_reshape_concatonly(cg, batch_ids, concat, xs, fx);
-  }
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  // src_row_indices[i] says what row in fx the ith x std::vector was assigned to
-  // used to simplify backprop
-  mutable std::vector<unsigned> src_indices;
-  unsigned dimension;
-};
-
-// concatenate different batched experssions into one single batched tensor
-struct ConcatenateToBatch : public Node {
-  template <typename T> explicit ConcatenateToBatch(const T& a) : Node(a) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  virtual bool supports_multibatch() const override {return true;}
-  mutable std::vector<unsigned> src_element_indices;
-};
-
-// x_1 is a scalar (or row std::vector)
-// x_2 is a scalar (or row std::vector)
-// y = max(0, margin - x_1 + x_2)
-struct PairwiseRankLoss : public Node {
-  explicit PairwiseRankLoss(const std::initializer_list<VariableIndex>& a, real m = 1.0) : Node(a), margin(m) {}
-  virtual bool supports_multibatch() const override { return true; }
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  real margin;
-};
-
-// Let x be a std::vector-valued input, x_i represents the score of the ith element, then
-// y = \sum{i != element} max{0, margin - x_element + x_i}
-struct Hinge : public Node {
-  explicit Hinge(const std::initializer_list<VariableIndex>& a, unsigned e, real m = 1.0) : Node(a), element(e), pelement(&element), margin(m) {}
-  explicit Hinge(const std::initializer_list<VariableIndex>& a, const unsigned* pe, real m = 1.0) : Node(a), element(), pelement(pe), margin(m) {}
-  explicit Hinge(const std::initializer_list<VariableIndex>& a, const std::vector<unsigned>& e, real m = 1.0) : Node(a), element(), pelement(), elements(e), pelements(&elements), margin(m) {}
-  explicit Hinge(const std::initializer_list<VariableIndex>& a, const std::vector<unsigned>* pe, real m = 1.0) : Node(a), element(), pelement(), elements(), pelements(pe), margin(m) {}
-  virtual bool supports_multibatch() const override { return true; }
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  size_t aux_storage_size() const override;
-  unsigned element;
-  const unsigned* pelement;
-  std::vector<unsigned> elements;
-  const std::vector<unsigned>* pelements;
-  real margin;
-};
-
-// y = x_1, but dy/dx is set to 0
-struct NoBackprop : public Node {
-  explicit NoBackprop(const std::initializer_list<VariableIndex>& a) : Node(a) {}
-  virtual bool supports_multibatch() const override { return true; }
-  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override { Sig s(nt::nobackprop); return sm.get_idx(s); }
-  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override { return std::vector<int>(1, 1); }  
-  DYNET_NODE_DEFINE_DEV_IMPL()
-};
-
-// y = x_1, dy/dx is set to negative. 
-struct FlipGradient : public Node {
-  explicit FlipGradient(const std::initializer_list<VariableIndex>& a) : Node(a) {}
-  virtual bool supports_multibatch() const override { return true; }
-  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override { Sig s(nt::flipgradient); return sm.get_idx(s); }
-  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override { return std::vector<int>(1, 1); }  
-  DYNET_NODE_DEFINE_DEV_IMPL()
-};  
-  
-// y = x_1
-struct Identity : public Node {
-  explicit Identity(const std::initializer_list<VariableIndex>& a) : Node(a) {}
-  virtual bool supports_multibatch() const override { return true; }
-  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override { Sig s(nt::identity); return sm.get_idx(s); }
-  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override { return std::vector<int>(1, 1); }  
-  DYNET_NODE_DEFINE_DEV_IMPL()
-};
-
-// hyperparameter: width > 1
-// x_1 is a std::vector in R^n, which we write x
-// y is a std::vector in R^{n / width}
-// y_i = max_{x_{i * width - width + 1}, ..., x_{i * width}}
-struct MaxPooling1D : public Node {
-  MaxPooling1D(const std::initializer_list<VariableIndex>& a, unsigned w) : Node(a), width(w) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  unsigned width;
-  mutable std::vector<unsigned> ind;
-};
-
-// y = x_1 * x_2
-struct MatrixMultiply : public Node {
-  explicit MatrixMultiply(const std::initializer_list<VariableIndex>& a) : Node(a) {}
-  virtual bool supports_multibatch() const override { return true; }
-  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override;
-  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override;
-  virtual void autobatch_reshape(const ComputationGraph & cg,
-                                 const std::vector<VariableIndex> & batch_ids,
-                                 const std::vector<int> & concat,
-                                 std::vector<const Tensor*>& xs,
-                                 Tensor& fx) const override {
-    autobatch_reshape_concatonly(cg, batch_ids, concat, xs, fx);
-  }
-  DYNET_NODE_DEFINE_DEV_IMPL()
-};
-
-// y = x_1 \cdot x_2  (Hadamard product)
-struct CwiseMultiply : public Node {
-  explicit CwiseMultiply(const std::initializer_list<VariableIndex>& a) : Node(a) {}
-  virtual bool supports_multibatch() const override { return true; }
-  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override;
-  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override;
-  DYNET_NODE_DEFINE_DEV_IMPL()
-};
-
-// y = x_1 + x_2  (Addition where x_2 is a scalar)
-struct ScalarAdd : public Node {
-  explicit ScalarAdd(const std::initializer_list<VariableIndex>& a) : Node(a) {}
-  virtual bool supports_multibatch() const override { return true; }
-  DYNET_NODE_DEFINE_DEV_IMPL()
-};
-
-// y = x_1 \cdot x_2  (Hadamard product where x_1 is a scalar)
-struct ScalarMultiply : public Node {
-  explicit ScalarMultiply(const std::initializer_list<VariableIndex>& a) : Node(a) {}
-  virtual bool supports_multibatch() const override { return true; }
-  DYNET_NODE_DEFINE_DEV_IMPL()
-};
-
-// y = x_1 / x_2  (Elementwise division where x_2 is a scalar)
-struct ScalarQuotient : public Node {
-  explicit ScalarQuotient(const std::initializer_list<VariableIndex>& a) : Node(a) {}
-  virtual bool supports_multibatch() const override { return true; }
-  DYNET_NODE_DEFINE_DEV_IMPL()
-};
-
-// y = x_1 / x_2  (cwiseQuotient)
-struct CwiseQuotient : public Node {
-  explicit CwiseQuotient(const std::initializer_list<VariableIndex>& a) : Node(a) {}
-  virtual bool supports_multibatch() const override { return true; }
-  DYNET_NODE_DEFINE_DEV_IMPL()
-};
-
-// y = x_1 \sum_{i=2, 4 ...} A_i * x_{i+1}
-struct AffineTransform : public Node {
-  template <typename T> explicit AffineTransform(const T& a) : Node(a) {}
-  virtual bool supports_multibatch() const override { return true; }
-  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override;
-  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override;
-  virtual void autobatch_reshape(const ComputationGraph & cg,
-                                 const std::vector<VariableIndex> & batch_ids,
-                                 const std::vector<int> & concat,
-                                 std::vector<const Tensor*>& xs,
-                                 Tensor& fx) const override {
-    autobatch_reshape_concatonly(cg, batch_ids, concat, xs, fx);
-  }
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  mutable float* dEdf_mem;
-};
-
-// y = -x_1
-struct Negate : public Node {
-  explicit Negate(const std::initializer_list<VariableIndex>& a) : Node(a) {}
-  virtual bool supports_multibatch() const override { return true; } 
-  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override { Sig s(nt::negate); return sm.get_idx(s); }
-  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override { return std::vector<int>(1, 1); }  
-  DYNET_NODE_DEFINE_DEV_IMPL()
-};
-
-// y = max(0,x)
-struct Rectify : public Node {
-  explicit Rectify(const std::initializer_list<VariableIndex>& a) : Node(a) {}
-  virtual bool supports_multibatch() const override { return true; }
-  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override { Sig s(nt::rectify); return sm.get_idx(s); }
-  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override { return std::vector<int>(1, 1); }  
-  DYNET_NODE_DEFINE_DEV_IMPL()
-};
-
-// y = ELU(0,x)
-struct ExponentialLinearUnit : public Node {
-  explicit ExponentialLinearUnit(const std::initializer_list<VariableIndex>& a, float lambda=1.f, float alpha=1.f) : Node(a), lambda(lambda), alpha(alpha) {}
-  virtual bool supports_multibatch() const override { return true; }
-  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override { Sig s(nt::rectify); return sm.get_idx(s); }
-  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override { return std::vector<int>(1, 1); }  
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  float lambda, alpha;
-};
-
-// you could do this with LogisticSigmoid, Softmax or a variety of other
-// functions, but this is often useful.
-// x_1 must be a std::vector with values between 0 and 1
-// target_y is an equivalently sized std::vector w values between 0 and 1
-// y = ty * log(x_1) + (1 - ty) * log(x_1)
-struct BinaryLogLoss : public Node {
-  BinaryLogLoss(const std::initializer_list<VariableIndex>& a) : Node(a) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-};
-
-// y = \log \sum_i \exp x_i
-// done in log space carefully to avoid over/underflow issues
-struct LogSumExp : public Node {
-  template <typename T> explicit LogSumExp(const T& a) : Node(a) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  virtual bool supports_multibatch() const override { return true; }
-  size_t aux_storage_size() const override;
-};
-
-struct LogDet : public Node {
-  template <typename T> explicit LogDet(const T& a) : Node(a) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-};
-
-// y = \sum_i x_i
-struct Sum : public Node {
-  template <typename T> explicit Sum(const T& a) : Node(a) {}
-  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override;
-  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override;
-  virtual void autobatch_reshape(const ComputationGraph & cg,
-                                 const std::vector<VariableIndex> & batch_ids,
-                                 const std::vector<int> & concat,
-                                 std::vector<const Tensor*>& xs,
-                                 Tensor& fx) const override {
-    if(dim.bd != 1)
-      autobatch_reshape_concatonly(cg, batch_ids, concat, xs, fx);
-  }
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  virtual bool supports_multibatch() const override { return true; }
-};
-
-// y = \sum_i,j,... x[i,j,...]
-struct SumElements : public Node {
-  template <typename T> explicit SumElements(const T& a) : Node(a) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  virtual bool supports_multibatch() const override { return true; }
-};
-
-// y = \sum_i x_i
-struct SumBatches : public Node {
-  template <typename T> explicit SumBatches(const T& a) : Node(a) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  virtual bool supports_multibatch() const override { return true; }
-};
-
-// y = \sum_i,j,... x[i,j,...]
-struct StdElements : public Node {
-  template <typename T> explicit StdElements(const T& a) : Node(a) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  virtual bool supports_multibatch() const override { return true; }
-};
-
-// y = \sum_i x_i
-struct StdBatches : public Node {
-  template <typename T> explicit StdBatches(const T& a) : Node(a) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  virtual bool supports_multibatch() const override { return true; }
-};
-
-//y = \sum_i x_i
-struct StdDimension : public Node {
-  template <typename T> explicit StdDimension(const T& a, unsigned d) : Node(a), dimension(d) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  virtual bool supports_multibatch() const override { return true; }
-private:
-  unsigned dimension;
-};
-
-// y = \sum_i,j,... x[i,j,...]
-struct MomentElements : public Node {
-  template <typename T> explicit MomentElements(const T& a, unsigned o) : Node(a), order(o) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  virtual bool supports_multibatch() const override { return true; }
-private:
-  unsigned order;
-};
-
-// y = \sum_i x_i
-struct MomentBatches : public Node {
-  template <typename T> explicit MomentBatches(const T& a, unsigned o) : Node(a), order(o) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  virtual bool supports_multibatch() const override { return true; }
-private:
-  unsigned order;
-};
-
-//y = \sum_i x_i
-struct MomentDimension : public Node {
-  template <typename T> explicit MomentDimension(const T& a, unsigned d, unsigned o) : Node(a), dimension(d), order(o) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  virtual bool supports_multibatch() const override { return true; }
-private:
-  unsigned dimension;
-  unsigned order;
-};
-
-// y = ( \sum_i x_i ) / |x|
-struct Average : public Node {
-  template <typename T> explicit Average(const T& a) : Node(a) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  virtual bool supports_multibatch() const override { return true; }
-};
-
-// this is used to implement poisson regression
-// x_1 = log predicted mean
-// ty = true y (this is not a VariableIndex since it has to be a nonnegative integer and
-//              is therefore nondifferentiable. There are various continuous extensions
-//              using the incomplete gamma function that could be used, but meh)
-// y = log Poisson(ty; \lambda = \exp x_1)
-//   = ty*x_1 - exp(x_1) - log(ty!)
-struct PoissonRegressionLoss : public Node {
-  explicit PoissonRegressionLoss(const std::initializer_list<VariableIndex>& a, unsigned true_y) : Node(a), ty(true_y), pty(&ty) {}
-  explicit PoissonRegressionLoss(const std::initializer_list<VariableIndex>& a, const unsigned* ptrue_y) : Node(a), ty(), pty(ptrue_y) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
- private:
-  unsigned ty;
-  const unsigned* pty;
-};
-
-// y = || x_1 ||^2
-struct SquaredNorm : public Node {
-  explicit SquaredNorm(const std::initializer_list<VariableIndex>& a) : Node(a) {}
-  virtual bool supports_multibatch() const override { return true; }
-  DYNET_NODE_DEFINE_DEV_IMPL()
-};
-
-// y = || x_1 ||
-struct L2Norm : public Node {
-  explicit L2Norm(const std::initializer_list<VariableIndex>& a) : Node(a) {}
-  virtual bool supports_multibatch() const override { return true; }
-  DYNET_NODE_DEFINE_DEV_IMPL()
-};
-
-// y = || x_1 - x_2 ||^2
-struct SquaredEuclideanDistance : public Node {
-  explicit SquaredEuclideanDistance(const std::initializer_list<VariableIndex>& a) : Node(a) {}
-  virtual bool supports_multibatch() const override { return true; }
-  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override;
-  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override;
-  virtual void autobatch_reshape(const ComputationGraph & cg,
-                                 const std::vector<VariableIndex> & batch_ids,
-                                 const std::vector<int> & concat,
-                                 std::vector<const Tensor*>& xs,
-                                 Tensor& fx) const override {
-    autobatch_reshape_concatonly(cg, batch_ids, concat, xs, fx);
-  }
-  DYNET_NODE_DEFINE_DEV_IMPL()
-};
-
-// y = || x_1 - x_2 ||_H(d)
-struct HuberDistance : public Node {
-  explicit HuberDistance(const std::initializer_list<VariableIndex>& a, float d = 1.345f) : Node(a), d(d) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  float d;
-};
-
-// y = || x_1 - x_2 ||_1
-struct L1Distance : public Node {
-  explicit L1Distance(const std::initializer_list<VariableIndex>& a) : Node(a) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-};
-
-// y = \sigma(x_1)
-struct LogisticSigmoid : public Node {
-  explicit LogisticSigmoid(const std::initializer_list<VariableIndex>& a) : Node(a) {}
-  virtual bool supports_multibatch() const override { return true; }
-  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override { Sig s(nt::logistic); return sm.get_idx(s); }
-  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override { return std::vector<int>(1, 1); }  
-  DYNET_NODE_DEFINE_DEV_IMPL()
-};
-
-// y = x / (1 + |x|)
-struct SoftSign : public Node {
-  explicit SoftSign(const std::initializer_list<VariableIndex>& a) : Node(a) {}
-  virtual bool supports_multibatch() const override { return true; }
-  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override { Sig s(nt::softsign); return sm.get_idx(s); }
-  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override { return std::vector<int>(1, 1); }  
-  DYNET_NODE_DEFINE_DEV_IMPL()
-};
-
-// z = \sum_j \exp (x_i)_j
-// y_i = (x_1)_i / z
-struct Softmax : public Node {
-  explicit Softmax(const std::initializer_list<VariableIndex>& a) : Node(a) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  size_t aux_storage_size() const override;
-  virtual bool supports_multibatch() const override { return true; }
-  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override;
-  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override;
-  virtual void autobatch_reshape(const ComputationGraph & cg,
-                                 const std::vector<VariableIndex> & batch_ids,
-                                 const std::vector<int> & concat,
-                                 std::vector<const Tensor*>& xs,
-                                 Tensor& fx) const override {
-    autobatch_reshape_concatonly(cg, batch_ids, concat, xs, fx);
-  }
-};
-
-// z = \sum_j \exp (x_i)_j
-// y_i = (x_1)_i - \log z
-struct LogSoftmax : public Node {
-  explicit LogSoftmax(const std::initializer_list<VariableIndex>& a) : Node(a) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  size_t aux_storage_size() const override;
-  virtual bool supports_multibatch() const override { return true; }
-};
-
-// z = \sum_j \exp (x_i)_j
-// y = (x_1)_element - \log z
-struct PickNegLogSoftmax : public Node {
-  explicit PickNegLogSoftmax(const std::initializer_list<VariableIndex>& a, unsigned v) : Node(a), val(v), pval(&val), vals(), pvals() {}
-  // use this constructor if you want to perform mini-batching
-  explicit PickNegLogSoftmax(const std::initializer_list<VariableIndex>& a, const std::vector<unsigned>& v) : Node(a), val(), pval(), vals(v), pvals(&vals) {}
-  // use these constructors if you want to change the value after the graph is constructed
-  explicit PickNegLogSoftmax(const std::initializer_list<VariableIndex>& a, const unsigned* pv) : Node(a), val(), pval(pv), vals(), pvals() {}
-  explicit PickNegLogSoftmax(const std::initializer_list<VariableIndex>& a, const std::vector<unsigned>* pv) : Node(a), val(), pval(), vals(), pvals(pv) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  virtual bool supports_multibatch() const override { return true; }
-  size_t aux_storage_size() const override;
-  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override;
-  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override;
-  virtual Node* autobatch_pseudo_node(const ComputationGraph & cg,
-                                      const std::vector<VariableIndex> & batch_ids) const override;
-  virtual void autobatch_reshape(const ComputationGraph & cg,
-                                 const std::vector<VariableIndex> & batch_ids,
-                                 const std::vector<int> & concat,
-                                 std::vector<const Tensor*>& xs,
-                                 Tensor& fx) const override {
-    autobatch_reshape_concatonly(cg, batch_ids, concat, xs, fx);
-  }
-  unsigned val;
-  const unsigned* pval;
-  std::vector<unsigned> vals;
-  const std::vector<unsigned>* pvals;
-};
-
-// z = \sum_{j \in denom} \exp (x_i)_j
-// y_i = (x_1)_i - \log z
-struct RestrictedLogSoftmax : public Node {
-  explicit RestrictedLogSoftmax(const std::initializer_list<VariableIndex>& a, const std::vector<unsigned>& d) : Node(a), denom(d) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  std::vector<unsigned> denom;
-};
-
-// x_1 is a std::vector
-// y = (x_1)_{*pval}
-// this is used to implement cross-entropy training
-struct PickElement : public Node {
-  explicit PickElement(const std::initializer_list<VariableIndex>& a, unsigned v, unsigned d = 0) : Node(a), val(v), pval(&val), vals(), pvals(), dimension(d) {}
-  // use this constructor if you want to perform mini-batching
-  explicit PickElement(const std::initializer_list<VariableIndex>& a, const std::vector<unsigned>& v, unsigned d = 0) : Node(a), val(), pval(), vals(v), pvals(&vals), dimension(d) {}
-  // use these constructors if you want to change the value after the graph is constructed
-  explicit PickElement(const std::initializer_list<VariableIndex>& a, const unsigned* pv, unsigned d = 0) : Node(a), val(), pval(pv), vals(), pvals(), dimension(d) {}
-  explicit PickElement(const std::initializer_list<VariableIndex>& a, const std::vector<unsigned>* pv, unsigned d = 0) : Node(a), val(), pval(), vals(), pvals(pv), dimension(d) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  virtual bool supports_multibatch() const override { return true; }
-  unsigned val;
-  const unsigned* pval;
-  std::vector<unsigned> vals;
-  const std::vector<unsigned>* pvals;
-  unsigned dimension;
-};
-
-// x_1 is a tensor
-// y = x_1[start:end] along dimension d
-// (start inclusive, end exclusive)
-struct PickRange : public Node {
-  explicit PickRange(const std::initializer_list<VariableIndex>& a, unsigned s, unsigned e, unsigned d = 0) : Node(a), start(s), end(e), dim(d) {}
-  virtual int autobatch_sig(const ComputationGraph &cg, SigMap &sm) const override;
-  virtual std::vector<int> autobatch_concat(const ComputationGraph & cg) const override { return std::vector<int>(1, 1); }  
-  virtual void autobatch_reshape(const ComputationGraph & cg,
-                                 const std::vector<VariableIndex> & batch_ids,
-                                 const std::vector<int> & concat,
-                                 std::vector<const Tensor*>& xs,
-                                 Tensor& fx) const override {
-    autobatch_reshape_concatonly(cg, batch_ids, concat, xs, fx);
-  }
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  virtual bool supports_multibatch() const override { return true; }
-  unsigned start, end, dim;
-};
-
-// x is a batched tensor
-// y = (x)_{[*pval]}
-struct PickBatchElements : public Node {
-  explicit PickBatchElements(const std::initializer_list<VariableIndex>& a, unsigned v) : Node(a), val(v), pval(&val), vals(), pvals() {}
-  explicit PickBatchElements(const std::initializer_list<VariableIndex>& a, const std::vector<unsigned>& v) : Node(a), val(), pval(), vals(v), pvals(&vals) {}
-  explicit PickBatchElements(const std::initializer_list<VariableIndex>& a, const unsigned* pv) : Node(a), val(), pval(pv), vals(), pvals() {}
-  explicit PickBatchElements(const std::initializer_list<VariableIndex>& a, const std::vector<unsigned>* pv) : Node(a), val(), pval(), vals(), pvals(pv) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  virtual bool supports_multibatch() const override { return true; }
-  unsigned val;
-  const unsigned* pval;
-  std::vector<unsigned> vals;
-  const std::vector<unsigned>* pvals;
-};
-
-// represents a simple std::vector of 0s
-struct Zeroes : public Node {
-  explicit Zeroes(const Dim& d) : dim(d) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  Dim dim;
-};
-
-// draw random noise from Normal(0, 1)
-struct RandomNormal : public Node {
-  explicit RandomNormal(const Dim& d) : dim(d) {}
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  Dim dim;
-};
-
-// draw from Bernoulli(p)
-struct RandomBernoulli : public Node {
-  explicit RandomBernoulli(const std::initializer_list<VariableIndex>& a, const Dim& d, real p, real scale = 1.0f) : dim(d), p(p), scale(scale) {
-    DYNET_ASSERT(a.size() == 0, "RandomBernoulli doesn't accept nodes as input");
-  }
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  Dim dim;
-  real p;
-  real scale;
-};
-
-// draw a random real from Uniform(left, right)
-struct RandomUniform : public Node {
-  explicit RandomUniform(const std::initializer_list<VariableIndex>& a, const Dim& d, real left, real right) : dim(d), left(left), right(right) {
-    DYNET_ASSERT(a.size() == 0, "RandomUniform doesn't accept nodes as input");
-  }
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  Dim dim;
-  real left, right;
-};
-
-// draw a random real from Uniform(left, right)
-struct RandomGumbel : public Node {
-  explicit RandomGumbel(const std::initializer_list<VariableIndex>& a, const Dim& d, real mu, real beta) : dim(d), mu(mu), beta(beta) {
-    DYNET_ASSERT(a.size() == 0, "RandomGumbel doesn't accept nodes as input");
-  }
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  Dim dim;
-  real mu, beta;
-};
-
-struct MaxDimension : public Node {
-  explicit MaxDimension(const std::initializer_list<VariableIndex>& a, unsigned dimension = 0) : Node(a), reduced_dim(dimension) {
-    first_dim = reduced_dim == 0 ? 1 : 0;
-    second_dim = first_dim + 1 == reduced_dim ? first_dim + 2 : first_dim + 1;
-  }
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  virtual bool supports_multibatch() const override { return true; }
-  size_t aux_storage_size() const override;
-  unsigned reduced_dim;
-  unsigned first_dim;
-  unsigned second_dim;
-};
-
-struct MinDimension : public Node {
-  explicit MinDimension(const std::initializer_list<VariableIndex>& a, unsigned dimension = 0) : Node(a), reduced_dim(dimension) {
-    first_dim = reduced_dim == 0 ? 1 : 0;
-    second_dim = first_dim + 1 == reduced_dim ? first_dim + 2 : first_dim + 1;
-  }
-  DYNET_NODE_DEFINE_DEV_IMPL()
-  virtual bool supports_multibatch() const override { return true; }
-  size_t aux_storage_size() const override;
-  unsigned reduced_dim;
-  unsigned first_dim;
-  unsigned second_dim;
-};
-
-// y = x_1 * x_2
-struct WeightNormalization : public Node {
-  explicit WeightNormalization(const std::initializer_list<VariableIndex>& a) : Node(a) {}
-  virtual bool supports_multibatch() const override { return false; }
-  DYNET_NODE_DEFINE_DEV_IMPL()
-};
-
-} // namespace dynet
-
-#endif
+#include "dynet/nodes-matrixmultiply.h"
+#include "dynet/nodes-maxpooling2d.h"
+#include "dynet/nodes-minmax.h"
+#include "dynet/nodes-moments.h"
+#include "dynet/nodes-normalization.h"
+#include "dynet/nodes-norms.h"
+#include "dynet/nodes-pickneglogsoftmax.h"
+#include "dynet/nodes-random.h"
+#include "dynet/nodes-select.h"
+#include "dynet/nodes-similarities.h"
+#include "dynet/nodes-softmaxes.h"
+#include "dynet/nodes-trig.h"

From 786d3a26bb1477597230760c085be755b86ed8ac Mon Sep 17 00:00:00 2001
From: Graham Neubig <neubig@gmail.com>
Date: Mon, 10 Jul 2017 11:13:02 -0400
Subject: [PATCH 3/3] Fixed compile on GPU

Former-commit-id: 3a93e2ba017f5eb6d3696aa9a0ec7f0aec783b9a
---
 dynet/CMakeLists.txt             | 45 +++++++++++++++++++++++++++++---
 dynet/gpu-nodes-activations.cu   |  3 +++
 dynet/gpu-nodes-arith-const.cu   |  3 +++
 dynet/gpu-nodes-arith-cwise.cu   |  3 +++
 dynet/gpu-nodes-arith-scalar.cu  |  3 +++
 dynet/gpu-nodes-arith-sum.cu     |  3 +++
 dynet/gpu-nodes-arith-unary.cu   |  2 +-
 dynet/gpu-nodes-concat.cu        |  3 +++
 dynet/gpu-nodes-const.cu         |  3 +++
 dynet/gpu-nodes-conv.cu          |  2 +-
 dynet/gpu-nodes-conv2d.cu        |  2 ++
 dynet/gpu-nodes-dropout.cu       |  3 +++
 dynet/gpu-nodes-flow.cu          |  3 +++
 dynet/gpu-nodes-linalg.cu        |  3 +++
 dynet/gpu-nodes-logsumexp.cu     |  3 +++
 dynet/gpu-nodes-losses.cu        |  3 +++
 dynet/gpu-nodes-maxpooling2d.cu  |  2 ++
 dynet/gpu-nodes-minmax.cu        |  3 +++
 dynet/gpu-nodes-moments.cu       |  3 +++
 dynet/gpu-nodes-normalization.cu |  3 +++
 dynet/gpu-nodes-norms.cu         |  2 +-
 dynet/gpu-nodes-random.cu        |  3 +++
 dynet/gpu-nodes-select.cu        |  3 +++
 dynet/gpu-nodes-similarities.cu  |  2 +-
 dynet/gpu-nodes-softmaxes.cu     |  3 +++
 dynet/gpu-nodes-trig.cu          |  3 +++
 dynet/nodes-conv2d.cc            |  1 +
 dynet/nodes-conv2d.h             |  4 +++
 dynet/nodes-losses.cc            |  4 +--
 dynet/nodes-maxpooling2d.h       |  4 +++
 30 files changed, 118 insertions(+), 9 deletions(-)
 create mode 100644 dynet/gpu-nodes-activations.cu
 create mode 100644 dynet/gpu-nodes-arith-const.cu
 create mode 100644 dynet/gpu-nodes-arith-cwise.cu
 create mode 100644 dynet/gpu-nodes-arith-scalar.cu
 create mode 100644 dynet/gpu-nodes-arith-sum.cu
 create mode 100644 dynet/gpu-nodes-concat.cu
 create mode 100644 dynet/gpu-nodes-const.cu
 create mode 100644 dynet/gpu-nodes-dropout.cu
 create mode 100644 dynet/gpu-nodes-flow.cu
 create mode 100644 dynet/gpu-nodes-linalg.cu
 create mode 100644 dynet/gpu-nodes-logsumexp.cu
 create mode 100644 dynet/gpu-nodes-losses.cu
 create mode 100644 dynet/gpu-nodes-minmax.cu
 create mode 100644 dynet/gpu-nodes-moments.cu
 create mode 100644 dynet/gpu-nodes-normalization.cu
 create mode 100644 dynet/gpu-nodes-random.cu
 create mode 100644 dynet/gpu-nodes-select.cu
 create mode 100644 dynet/gpu-nodes-softmaxes.cu
 create mode 100644 dynet/gpu-nodes-trig.cu

diff --git a/dynet/CMakeLists.txt b/dynet/CMakeLists.txt
index 5888c7332..28283050a 100644
--- a/dynet/CMakeLists.txt
+++ b/dynet/CMakeLists.txt
@@ -123,6 +123,46 @@ if(ENABLE_BOOST)
   list(APPEND dynet_library_HDRS mp.h)
 endif()
 
+set(dynet_gpu_SRCS
+    cuda.cc
+    cudnn-ops.cu
+    gpu-ops.cu 
+    gpu-nodes-activations.cu
+    gpu-nodes-affinetransform.cu
+    gpu-nodes-arith-const.cu
+    gpu-nodes-arith-cwise.cu
+    gpu-nodes-arith-scalar.cu
+    gpu-nodes-arith-sum.cu
+    gpu-nodes-arith-unary.cu
+    gpu-nodes-concat.cu
+    gpu-nodes-const.cu
+    gpu-nodes-contract.cu
+    gpu-nodes-conv2d.cu
+    gpu-nodes-conv.cu
+    gpu-nodes-dropout.cu
+    gpu-nodes-flow.cu
+    gpu-nodes-hinge.cu
+    gpu-nodes-linalg.cu
+    gpu-nodes-logsumexp.cu
+    gpu-nodes-losses.cu
+    gpu-nodes-matrixmultiply.cu
+    gpu-nodes-maxpooling2d.cu
+    gpu-nodes-minmax.cu
+    gpu-nodes-moments.cu
+    gpu-nodes-normalization.cu
+    gpu-nodes-norms.cu
+    gpu-nodes-pickneglogsoftmax.cu
+    gpu-nodes-random.cu
+    gpu-nodes-select.cu
+    gpu-nodes-similarities.cu
+    gpu-nodes-softmaxes.cu
+    gpu-nodes-trig.cu
+    gpu-param-nodes.cu
+    gpu-tensor.cu
+    gpu-training.cu
+    gpu-model.cu
+)
+
 file(GLOB TEST_SRCS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} tests/*.cc)
 if (NOT MSVC)
   set(BUILD_SHARED_LIBS ON)
@@ -175,10 +215,10 @@ if(WITH_CUDA_BACKEND)
     list(APPEND CUDA_NVCC_FLAGS_DEBUG "--compiler-options \"/MDd\"")
     list(APPEND CUDA_NVCC_FLAGS_RELEASE "--compiler-options \"/MD\"")
     SET(CUDA_PROPAGATE_HOST_FLAGS OFF)
-    cuda_add_library(gdynet ${dynet_library_SRCS} ${dynet_library_HDRS} cuda.cc cudnn-ops.cu gpu-ops.cu gpu-nodes.cu gpu-nodes-contract.cu gpu-nodes-conv.cu gpu-nodes-conv2d.cu gpu-nodes-maxpooling2d.cu gpu-param-nodes.cu gpu-tensor.cu gpu-training.cu gpu-model.cu gpu-nodes-pickneglogsoftmax.cu gpu-nodes-matrixmultiply.cu gpu-nodes-hinge.cu gpu-nodes-affinetransform.cu gpu-nodes-similarities.cu gpu-nodes-norms.cu gpu-nodes-unary-arith.cu)
+    cuda_add_library(gdynet ${dynet_library_SRCS} ${dynet_library_HDRS} ${dynet_gpu_SRCS})
   else()
     SET(CUDA_PROPAGATE_HOST_FLAGS OFF)
-    cuda_add_library(gdynet ${dynet_library_SRCS} ${dynet_library_HDRS} cuda.cc cudnn-ops.cu gpu-ops.cu gpu-nodes.cu gpu-nodes-contract.cu gpu-nodes-conv.cu gpu-nodes-conv2d.cu gpu-nodes-maxpooling2d.cu gpu-param-nodes.cu gpu-tensor.cu gpu-training.cu gpu-model.cu gpu-nodes-pickneglogsoftmax.cu gpu-nodes-matrixmultiply.cu gpu-nodes-hinge.cu gpu-nodes-affinetransform.cu gpu-nodes-similarities.cu gpu-nodes-norms.cu gpu-nodes-unary-arith.cu OPTIONS --compiler-options "-fPIC")
+    cuda_add_library(gdynet ${dynet_library_SRCS} ${dynet_library_HDRS} ${dynet_gpu_SRCS} OPTIONS --compiler-options "-fPIC")
   endif()
   set_target_properties(gdynet PROPERTIES
                         COMPILE_DEFINITIONS HAVE_CUDA)
@@ -197,4 +237,3 @@ if(WITH_CUDA_BACKEND)
 endif(WITH_CUDA_BACKEND)
 
 # target_compile_features(dynet PRIVATE cxx_range_for)
-
diff --git a/dynet/gpu-nodes-activations.cu b/dynet/gpu-nodes-activations.cu
new file mode 100644
index 000000000..b50d5a62c
--- /dev/null
+++ b/dynet/gpu-nodes-activations.cu
@@ -0,0 +1,3 @@
+// This is a dummy file that contains the same content as nodes-activations.cc but compiled
+// on CUDA
+#include "nodes-activations.cc"
diff --git a/dynet/gpu-nodes-arith-const.cu b/dynet/gpu-nodes-arith-const.cu
new file mode 100644
index 000000000..2abc645f9
--- /dev/null
+++ b/dynet/gpu-nodes-arith-const.cu
@@ -0,0 +1,3 @@
+// This is a dummy file that contains the same content as nodes-arith-const.cc but compiled
+// on CUDA
+#include "nodes-arith-const.cc"
diff --git a/dynet/gpu-nodes-arith-cwise.cu b/dynet/gpu-nodes-arith-cwise.cu
new file mode 100644
index 000000000..93e62b7df
--- /dev/null
+++ b/dynet/gpu-nodes-arith-cwise.cu
@@ -0,0 +1,3 @@
+// This is a dummy file that contains the same content as nodes-arith-cwise.cc but compiled
+// on CUDA
+#include "nodes-arith-cwise.cc"
diff --git a/dynet/gpu-nodes-arith-scalar.cu b/dynet/gpu-nodes-arith-scalar.cu
new file mode 100644
index 000000000..2e4ff0c0e
--- /dev/null
+++ b/dynet/gpu-nodes-arith-scalar.cu
@@ -0,0 +1,3 @@
+// This is a dummy file that contains the same content as nodes-arith-scalar.cc but compiled
+// on CUDA
+#include "nodes-arith-scalar.cc"
diff --git a/dynet/gpu-nodes-arith-sum.cu b/dynet/gpu-nodes-arith-sum.cu
new file mode 100644
index 000000000..c80bdfe49
--- /dev/null
+++ b/dynet/gpu-nodes-arith-sum.cu
@@ -0,0 +1,3 @@
+// This is a dummy file that contains the same content as nodes-arith-sum.cc but compiled
+// on CUDA
+#include "nodes-arith-sum.cc"
diff --git a/dynet/gpu-nodes-arith-unary.cu b/dynet/gpu-nodes-arith-unary.cu
index 15198bef2..d5e6c6917 100644
--- a/dynet/gpu-nodes-arith-unary.cu
+++ b/dynet/gpu-nodes-arith-unary.cu
@@ -1,3 +1,3 @@
-// This is a dummy file that contains the same content as nodes-unary-arith.cc but compiled
+// This is a dummy file that contains the same content as nodes-arith-unary.cc but compiled
 // on CUDA
 #include "nodes-arith-unary.cc"
diff --git a/dynet/gpu-nodes-concat.cu b/dynet/gpu-nodes-concat.cu
new file mode 100644
index 000000000..2fcfc98c4
--- /dev/null
+++ b/dynet/gpu-nodes-concat.cu
@@ -0,0 +1,3 @@
+// This is a dummy file that contains the same content as nodes-concat.cc but compiled
+// on CUDA
+#include "nodes-concat.cc"
diff --git a/dynet/gpu-nodes-const.cu b/dynet/gpu-nodes-const.cu
new file mode 100644
index 000000000..8a28ebe56
--- /dev/null
+++ b/dynet/gpu-nodes-const.cu
@@ -0,0 +1,3 @@
+// This is a dummy file that contains the same content as nodes-const.cc but compiled
+// on CUDA
+#include "nodes-const.cc"
diff --git a/dynet/gpu-nodes-conv.cu b/dynet/gpu-nodes-conv.cu
index 451f71b36..0bff1eca4 100644
--- a/dynet/gpu-nodes-conv.cu
+++ b/dynet/gpu-nodes-conv.cu
@@ -1,3 +1,3 @@
-// This is a dummy file that contains the same content as nodes-conv.cc but compiled
+// This is a dummy file that contains the same content as nodes-conv.cc but compiled
 // on CUDA
 #include "nodes-conv.cc"
diff --git a/dynet/gpu-nodes-conv2d.cu b/dynet/gpu-nodes-conv2d.cu
index 347aaadcf..cc2f78e4f 100644
--- a/dynet/gpu-nodes-conv2d.cu
+++ b/dynet/gpu-nodes-conv2d.cu
@@ -1 +1,3 @@
+// This is a dummy file that contains the same content as nodes-conv2d.cc but compiled
+// on CUDA
 #include "nodes-conv2d.cc"
diff --git a/dynet/gpu-nodes-dropout.cu b/dynet/gpu-nodes-dropout.cu
new file mode 100644
index 000000000..3911d2bc1
--- /dev/null
+++ b/dynet/gpu-nodes-dropout.cu
@@ -0,0 +1,3 @@
+// This is a dummy file that contains the same content as nodes-dropout.cc but compiled
+// on CUDA
+#include "nodes-dropout.cc"
diff --git a/dynet/gpu-nodes-flow.cu b/dynet/gpu-nodes-flow.cu
new file mode 100644
index 000000000..27cfed8c8
--- /dev/null
+++ b/dynet/gpu-nodes-flow.cu
@@ -0,0 +1,3 @@
+// This is a dummy file that contains the same content as nodes-flow.cc but compiled
+// on CUDA
+#include "nodes-flow.cc"
diff --git a/dynet/gpu-nodes-linalg.cu b/dynet/gpu-nodes-linalg.cu
new file mode 100644
index 000000000..cbebed454
--- /dev/null
+++ b/dynet/gpu-nodes-linalg.cu
@@ -0,0 +1,3 @@
+// This is a dummy file that contains the same content as nodes-linalg.cc but compiled
+// on CUDA
+#include "nodes-linalg.cc"
diff --git a/dynet/gpu-nodes-logsumexp.cu b/dynet/gpu-nodes-logsumexp.cu
new file mode 100644
index 000000000..f7abe4950
--- /dev/null
+++ b/dynet/gpu-nodes-logsumexp.cu
@@ -0,0 +1,3 @@
+// This is a dummy file that contains the same content as nodes-logsumexp.cc but compiled
+// on CUDA
+#include "nodes-logsumexp.cc"
diff --git a/dynet/gpu-nodes-losses.cu b/dynet/gpu-nodes-losses.cu
new file mode 100644
index 000000000..4bb8863fd
--- /dev/null
+++ b/dynet/gpu-nodes-losses.cu
@@ -0,0 +1,3 @@
+// This is a dummy file that contains the same content as nodes-losses.cc but compiled
+// on CUDA
+#include "nodes-losses.cc"
diff --git a/dynet/gpu-nodes-maxpooling2d.cu b/dynet/gpu-nodes-maxpooling2d.cu
index ea93114c9..090ef624e 100644
--- a/dynet/gpu-nodes-maxpooling2d.cu
+++ b/dynet/gpu-nodes-maxpooling2d.cu
@@ -1 +1,3 @@
+// This is a dummy file that contains the same content as nodes-maxpooling2d.cc but compiled
+// on CUDA
 #include "nodes-maxpooling2d.cc"
diff --git a/dynet/gpu-nodes-minmax.cu b/dynet/gpu-nodes-minmax.cu
new file mode 100644
index 000000000..dcac97cc4
--- /dev/null
+++ b/dynet/gpu-nodes-minmax.cu
@@ -0,0 +1,3 @@
+// This is a dummy file that contains the same content as nodes-minmax.cc but compiled
+// on CUDA
+#include "nodes-minmax.cc"
diff --git a/dynet/gpu-nodes-moments.cu b/dynet/gpu-nodes-moments.cu
new file mode 100644
index 000000000..253a0860f
--- /dev/null
+++ b/dynet/gpu-nodes-moments.cu
@@ -0,0 +1,3 @@
+// This is a dummy file that contains the same content as nodes-moments.cc but compiled
+// on CUDA
+#include "nodes-moments.cc"
diff --git a/dynet/gpu-nodes-normalization.cu b/dynet/gpu-nodes-normalization.cu
new file mode 100644
index 000000000..16d4a3048
--- /dev/null
+++ b/dynet/gpu-nodes-normalization.cu
@@ -0,0 +1,3 @@
+// This is a dummy file that contains the same content as nodes-normalization.cc but compiled
+// on CUDA
+#include "nodes-normalization.cc"
diff --git a/dynet/gpu-nodes-norms.cu b/dynet/gpu-nodes-norms.cu
index 4fa94dc81..470e1f97c 100644
--- a/dynet/gpu-nodes-norms.cu
+++ b/dynet/gpu-nodes-norms.cu
@@ -1,3 +1,3 @@
-// This is a dummy file that contains the same content as nodes-norms.cc but compiled
+// This is a dummy file that contains the same content as nodes.cc but compiled
 // on CUDA
 #include "nodes-norms.cc"
diff --git a/dynet/gpu-nodes-random.cu b/dynet/gpu-nodes-random.cu
new file mode 100644
index 000000000..7ef0d2564
--- /dev/null
+++ b/dynet/gpu-nodes-random.cu
@@ -0,0 +1,3 @@
+// This is a dummy file that contains the same content as nodes-random.cc but compiled
+// on CUDA
+#include "nodes-random.cc"
diff --git a/dynet/gpu-nodes-select.cu b/dynet/gpu-nodes-select.cu
new file mode 100644
index 000000000..25871a1bb
--- /dev/null
+++ b/dynet/gpu-nodes-select.cu
@@ -0,0 +1,3 @@
+// This is a dummy file that contains the same content as nodes-select.cc but compiled
+// on CUDA
+#include "nodes-select.cc"
diff --git a/dynet/gpu-nodes-similarities.cu b/dynet/gpu-nodes-similarities.cu
index 068d0cec7..933edb421 100644
--- a/dynet/gpu-nodes-similarities.cu
+++ b/dynet/gpu-nodes-similarities.cu
@@ -1,3 +1,3 @@
-// This is a dummy file that contains the same content as nodes-similarities.cc but compiled
+// This is a dummy file that contains the same content as nodes.cc but compiled
 // on CUDA
 #include "nodes-similarities.cc"
diff --git a/dynet/gpu-nodes-softmaxes.cu b/dynet/gpu-nodes-softmaxes.cu
new file mode 100644
index 000000000..43730a67b
--- /dev/null
+++ b/dynet/gpu-nodes-softmaxes.cu
@@ -0,0 +1,3 @@
+// This is a dummy file that contains the same content as nodes-softmaxes.cc but compiled
+// on CUDA
+#include "nodes-softmaxes.cc"
diff --git a/dynet/gpu-nodes-trig.cu b/dynet/gpu-nodes-trig.cu
new file mode 100644
index 000000000..47449a016
--- /dev/null
+++ b/dynet/gpu-nodes-trig.cu
@@ -0,0 +1,3 @@
+// This is a dummy file that contains the same content as nodes-trig.cc but compiled
+// on CUDA
+#include "nodes-trig.cc"
diff --git a/dynet/nodes-conv2d.cc b/dynet/nodes-conv2d.cc
index 0a7cdedc9..b5f554152 100644
--- a/dynet/nodes-conv2d.cc
+++ b/dynet/nodes-conv2d.cc
@@ -16,6 +16,7 @@
 #if HAVE_CUDA
 #include "dynet/cuda.h"
 #include "dynet/gpu-ops.h"
+#include "dynet/cudnn-ops.h"
 #endif
 
 using namespace std;
diff --git a/dynet/nodes-conv2d.h b/dynet/nodes-conv2d.h
index 484c3222a..fc7140cf5 100644
--- a/dynet/nodes-conv2d.h
+++ b/dynet/nodes-conv2d.h
@@ -4,6 +4,10 @@
 #include "dynet/dynet.h"
 #include "dynet/nodes-macros.h"
 
+#if HAVE_CUDNN
+#include "dynet/cudnn-ops.h"
+#endif
+
 namespace dynet {
 
 // conv2d 
diff --git a/dynet/nodes-losses.cc b/dynet/nodes-losses.cc
index c67a3da2d..a0d498832 100644
--- a/dynet/nodes-losses.cc
+++ b/dynet/nodes-losses.cc
@@ -26,6 +26,8 @@ Dim PairwiseRankLoss::dim_forward(const vector<Dim>& xs) const {
   return xs[0].bd >= xs[1].bd ? xs[0] : xs[1];
 }
 
+#endif
+
 template<class MyDevice>
 void PairwiseRankLoss::forward_dev_impl(const MyDevice & dev, const vector<const Tensor*>& xs, Tensor& fx) const {
   fx.tvec().device(*dev.edevice) = xs[0]->tvec().binaryExpr(xs[1]->tvec(), FPairwiseRankLoss(margin));
@@ -46,8 +48,6 @@ void PairwiseRankLoss::backward_dev_impl(const MyDevice & dev,
 }
 DYNET_NODE_INST_DEV_IMPL(PairwiseRankLoss)
 
-#endif
-
 // ************* BinaryLogLoss *************
 
 #ifndef __CUDACC__
diff --git a/dynet/nodes-maxpooling2d.h b/dynet/nodes-maxpooling2d.h
index 20bfff67d..1172b14dc 100644
--- a/dynet/nodes-maxpooling2d.h
+++ b/dynet/nodes-maxpooling2d.h
@@ -4,6 +4,10 @@
 #include "dynet/dynet.h"
 #include "dynet/nodes-macros.h"
 
+#if HAVE_CUDNN
+#include "dynet/cudnn-ops.h"
+#endif
+
 namespace dynet {
 
 // maxpooling2d